Commit 9aff3c14 authored by luxuhui

refactor: refactor the delegators for arm

N/A
Signed-off-by: Luxuhui <luxuhui@xiaomi.com>
Parent fbd0ff09
@@ -60,6 +60,7 @@ MaceStatus OpDelegatorRegistry::Register(const DelegatorInfo &key,
DelegatorCreator OpDelegatorRegistry::GetCreator(
    const DelegatorInfo &key) const {
  if (registry_.count(key) > 0) {
    VLOG(3) << "find delegator creator: " << key.ToString();
    return registry_.at(key);
  }
...
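For reference, the registry touched above is essentially a creator map keyed by delegator info (op, device, data type, implementation). The standalone sketch below is not from this commit and uses invented names (DelegatorKey, CreatorRegistry) rather than MACE's real DelegatorInfo/OpDelegatorRegistry; it only mirrors the lookup pattern the hunk adds logging to.

// Standalone sketch of a keyed creator registry; names are illustrative only.
#include <functional>
#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

struct DelegatorKey {
  std::string op, device, dtype, impl;
  bool operator<(const DelegatorKey &o) const {
    return std::tie(op, device, dtype, impl) <
           std::tie(o.op, o.device, o.dtype, o.impl);
  }
  std::string ToString() const {
    return op + "/" + device + "/" + dtype + "/" + impl;
  }
};

struct Delegator { virtual ~Delegator() = default; };
using Creator = std::function<std::unique_ptr<Delegator>()>;

class CreatorRegistry {
 public:
  bool Register(const DelegatorKey &key, Creator creator) {
    return registry_.emplace(key, std::move(creator)).second;
  }
  Creator GetCreator(const DelegatorKey &key) const {
    auto it = registry_.find(key);
    if (it != registry_.end()) {
      std::cout << "find delegator creator: " << key.ToString() << "\n";
      return it->second;
    }
    return nullptr;  // the real registry handles a miss elsewhere (not shown in this hunk)
  }
 private:
  std::map<DelegatorKey, Creator> registry_;
};

int main() {
  CreatorRegistry registry;
  registry.Register({"Activation", "CPU", "float", "NEON"},
                    [] { return std::unique_ptr<Delegator>(new Delegator()); });
  Creator creator = registry.GetCreator({"Activation", "CPU", "float", "NEON"});
  return creator ? 0 : 1;
}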
@@ -105,6 +105,7 @@ cc_library(
    name = "arm_neon_kernels",
    srcs = glob(
        [
            "arm/base/*.cc",
            "arm/fp32/*.cc",
            "arm/fp16/gemv.h",
        ],
@@ -121,6 +122,7 @@ cc_library(
    )),
    hdrs = glob(
        [
            "arm/base/*.h",
            "arm/fp32/*.h",
        ],
    ) + if_quantize_enabled(glob(
...
@@ -5,6 +5,9 @@ file(GLOB OPS_REF_Q8_KERNELS_SRCS
  ref/q8/*.cc
)
file(GLOB OPS_ARM_NEON_BASE_KERNELS_SRCS
  arm/base/*.cc
)
file(GLOB OPS_ARM_NEON_FP32_KERNELS_SRCS
  arm/fp32/*.cc
)
@@ -32,7 +35,7 @@ if(MACE_ENABLE_QUANTIZE)
endif(MACE_ENABLE_QUANTIZE)
if(MACE_ENABLE_NEON)
-  set(OPS_SRCS ${OPS_SRCS} ${OPS_ARM_NEON_FP32_KERNELS_SRCS})
  set(OPS_SRCS ${OPS_SRCS} ${OPS_ARM_NEON_BASE_KERNELS_SRCS} ${OPS_ARM_NEON_FP32_KERNELS_SRCS})
  if(MACE_ENABLE_QUANTIZE)
    set(OPS_SRCS ${OPS_SRCS} ${OPS_ARM_NEON_Q8_KERNELS_SRCS})
  endif(MACE_ENABLE_QUANTIZE)
...
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/arm/base/activation.h"
namespace mace {
namespace ops {
namespace arm {
template<typename T>
MaceStatus Activation<T>::Compute(const OpContext *context,
const Tensor *input, Tensor *output) {
Tensor::MappingGuard input_guard(input);
if (input != output) {
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
Tensor::MappingGuard output_guard(output);
DoActivation(context, input, output);
} else {
DoActivation(context, input, output);
}
return MaceStatus::MACE_SUCCESS;
}
template<typename T>
void Activation<T>::DoActivation(const OpContext *context,
const Tensor *input,
Tensor *output) {
const T *input_data = input->data<T>();
T *output_data = output->mutable_data<T>();
const index_t size = input->size();
utils::ThreadPool &thread_pool =
context->device()->cpu_runtime()->thread_pool();
switch (type_) {
case RELU: {
ActivateRelu(&thread_pool, input_data, size, output_data);
break;
}
case RELUX: {
ActivateRelux(&thread_pool, input_data, size, output_data);
break;
}
case LEAKYRELU: {
ActivateLeakyRelu(&thread_pool, input_data, size, output_data);
break;
}
case TANH: {
ActivateTanh(&thread_pool, input_data, size, output_data);
break;
}
case SIGMOID: {
ActivateSigmoid(&thread_pool, input_data, size, output_data);
break;
}
case NOOP: {
break;
}
default: {
MACE_NOT_IMPLEMENTED;
}
}
}
void RegisterActivationDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Activation<float>, delegator::ActivationParam,
MACE_DELEGATOR_KEY(Activation, DeviceType::CPU, float, ImplType::NEON));
}
} // namespace arm
} // namespace ops
} // namespace mace
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_BASE_ACTIVATION_H_
#define MACE_OPS_ARM_BASE_ACTIVATION_H_
#include "mace/ops/delegator/activation.h"
namespace mace {
namespace ops {
namespace arm {
template<typename T>
class Activation : public delegator::Activation {
public:
explicit Activation(const delegator::ActivationParam &param)
: delegator::Activation(param) {}
~Activation() = default;
MaceStatus Compute(const OpContext *context,
const Tensor *input, Tensor *output) override;
private:
void DoActivation(const OpContext *context,
const Tensor *input, Tensor *output);
void ActivateRelu(utils::ThreadPool *thread_pool, const T *input_data,
const index_t input_size, T *output_data);
void ActivateRelux(utils::ThreadPool *thread_pool, const T *input_data,
const index_t input_size, T *output_data);
void ActivateLeakyRelu(utils::ThreadPool *thread_pool, const T *input_data,
const index_t input_size, T *output_data);
void ActivateTanh(utils::ThreadPool *thread_pool, const T *input_data,
const index_t input_size, T *output_data);
void ActivateSigmoid(utils::ThreadPool *thread_pool, const T *input_data,
const index_t input_size, T *output_data);
};
} // namespace arm
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARM_BASE_ACTIVATION_H_
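The new arm/base layer keeps the type-independent control flow (mapping, resize, dispatch on the activation type) in Activation<T>, while the per-type Activate* bodies stay in the fp32/fp16 files. The standalone sketch below is not MACE code: the thread pool and NEON intrinsics are simplified away, and the class and enum names are invented; it only mirrors the dispatch shape.

// Illustrative dispatch-only sketch of a templated elementwise activation.
#include <algorithm>
#include <cstddef>
#include <vector>

enum class ActType { RELU, LEAKYRELU, NOOP };

template <typename T>
class SimpleActivation {
 public:
  explicit SimpleActivation(ActType type, T alpha = T(0.01))
      : type_(type), alpha_(alpha) {}

  void Compute(const T *input, std::size_t size, T *output) const {
    switch (type_) {
      case ActType::RELU:
        for (std::size_t i = 0; i < size; ++i)
          output[i] = std::max(input[i], T(0));
        break;
      case ActType::LEAKYRELU:
        for (std::size_t i = 0; i < size; ++i)
          output[i] = input[i] > T(0) ? input[i] : alpha_ * input[i];
        break;
      case ActType::NOOP:
        // The committed kernel simply falls through here; this sketch copies.
        std::copy(input, input + size, output);
        break;
    }
  }

 private:
  ActType type_;
  T alpha_;
};

int main() {
  std::vector<float> in = {-1.f, 0.f, 2.f}, out(3);
  SimpleActivation<float>(ActType::RELU).Compute(in.data(), in.size(), out.data());
  return (out[0] == 0.f && out[2] == 2.f) ? 0 : 1;
}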
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/arm/base/bias_add.h"
namespace mace {
namespace ops {
namespace arm {
template<typename T>
MaceStatus BiasAdd<T>::Compute(const OpContext *context, const Tensor *input,
const Tensor *bias, Tensor *output) {
if (input != output) {
if (bias == nullptr) {
output->Copy(*input);
} else {
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard bias_guard(bias);
Tensor::MappingGuard output_guard(output);
AddBias(context, input, bias, output);
}
} else {
if (bias != nullptr) {
Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard bias_guard(bias);
AddBias(context, input, bias, output);
}
}
return MaceStatus::MACE_SUCCESS;
}
template<typename T>
void BiasAdd<T>::AddBias(const OpContext *context, const Tensor *input,
const Tensor *bias, mace::Tensor *output) {
auto input_data = input->data<T>();
auto bias_data = bias->data<T>();
auto output_data = output->mutable_data<T>();
const index_t batch = input->dim(0);
const index_t channels = input->dim(1);
const index_t height = input->dim(2);
const index_t width = input->dim(3);
const index_t image_size = height * width;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
if (bias->dim_size() == 1) {
Add1DimBias(&thread_pool, input_data, bias_data,
output_data, batch, channels, image_size);
} else {
Add2DimsBias(&thread_pool, input_data, bias_data,
output_data, batch, channels, image_size);
}
}
void RegisterBiasAddDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, BiasAdd<float>, DelegatorParam,
MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, float, ImplType::NEON));
}
} // namespace arm
} // namespace ops
} // namespace mace
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_BASE_BIAS_ADD_H_
#define MACE_OPS_ARM_BASE_BIAS_ADD_H_
#include "mace/ops/delegator/bias_add.h"
namespace mace {
namespace ops {
namespace arm {
template<typename T>
class BiasAdd : public delegator::BiasAdd {
public:
explicit BiasAdd(const DelegatorParam &param) : delegator::BiasAdd(param) {}
~BiasAdd() = default;
MaceStatus Compute(const OpContext *context, const Tensor *input,
const Tensor *bias, Tensor *output) override;
private:
void AddBias(const OpContext *context, const Tensor *input,
const Tensor *bias, Tensor *output);
void Add1DimBias(utils::ThreadPool *thread_pool, const T *input_data,
const T *bias_data, T *output_data,
const index_t batch, const index_t channels,
const index_t image_size);
void Add2DimsBias(utils::ThreadPool *thread_pool, const T *input_data,
const T *bias_data, T *output_data,
const index_t batch, const index_t channels,
const index_t image_size);
};
} // namespace arm
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARM_BASE_BIAS_ADD_H_
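BiasAdd<T> distinguishes a 1-D bias (one value per output channel) from a 2-D bias which, judging by the batch index passed to Add2DimsBias, is assumed here to carry a separate value per (batch, channel) pair. A standalone NCHW sketch of both layouts, not from this commit (the real kernels additionally vectorize the inner image loop with NEON):

// Assumption: a 2-D bias is indexed as bias[b * channels + c].
#include <cstdint>
#include <vector>

using index_t = std::int64_t;

template <typename T>
void AddBiasNCHW(const T *input, const T *bias, T *output,
                 index_t batch, index_t channels, index_t image_size,
                 bool bias_per_batch) {
  for (index_t b = 0; b < batch; ++b) {
    for (index_t c = 0; c < channels; ++c) {
      const T beta = bias_per_batch ? bias[b * channels + c] : bias[c];
      const T *in = input + (b * channels + c) * image_size;
      T *out = output + (b * channels + c) * image_size;
      for (index_t i = 0; i < image_size; ++i) out[i] = in[i] + beta;
    }
  }
}

int main() {
  std::vector<float> in(2 * 3 * 4, 1.f), bias = {0.f, 1.f, 2.f}, out(in.size());
  AddBiasNCHW(in.data(), bias.data(), out.data(), 2, 3, 4, /*bias_per_batch=*/false);
  return out[4] == 2.f ? 0 : 1;  // channel 1 of batch 0 gets +1
}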
-// Copyright 2019 The MACE Authors. All Rights Reserved.
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -12,18 +12,17 @@
// See the License for the specific language governing permissions and
// limitations under the License.

-#include "mace/ops/arm/fp32/conv_2d.h"
#include "mace/ops/arm/base/conv_2d.h"

#include <algorithm>
#include <memory>
#include <utility>
-#include <algorithm>

#include "mace/utils/memory.h"

namespace mace {
namespace ops {
namespace arm {
-namespace fp32 {

void Conv2dBase::CalOutputShapeAndInputPadSize(
    const std::vector<index_t> &input_shape,
@@ -164,10 +163,10 @@ MaceStatus Conv2dBase::ResizeOutAndPadInOut(const OpContext *context,
  auto scratch_buffer = context->device()->scratch_buffer();
  const index_t padded_in_size =
      MACE_EXTRA_BUFFER_PAD_SIZE + (is_in_padded ? PadAlignSize(
-          sizeof(float) * batch * in_channels * padded_in_height
          type_size_ * batch * in_channels * padded_in_height
              * padded_in_width) : 0);
  const index_t padded_out_size = is_out_padded ? PadAlignSize(
-      sizeof(float) * batch * out_channels * padded_out_height
      type_size_ * batch * out_channels * padded_out_height
          * padded_out_width) : 0;
  scratch_buffer->Rewind();
@@ -176,7 +175,7 @@ MaceStatus Conv2dBase::ResizeOutAndPadInOut(const OpContext *context,
    std::unique_ptr<Tensor>
        padded_in =
        make_unique<Tensor>(scratch_buffer->Scratch(padded_in_size),
-                           DataType::DT_FLOAT);
                            input->dtype());
    padded_in->Resize({batch, in_channels, padded_in_height, padded_in_width});
    PadInput(*input, in_pad_size[0], in_pad_size[2], padded_in.get());
    *padded_input = std::move(padded_in);
@@ -185,7 +184,7 @@ MaceStatus Conv2dBase::ResizeOutAndPadInOut(const OpContext *context,
    std::unique_ptr<Tensor>
        padded_out =
        make_unique<Tensor>(scratch_buffer->Scratch(padded_out_size),
-                           DataType::DT_FLOAT);
                            output->dtype());
    padded_out->Resize({batch, out_channels, padded_out_height,
                        padded_out_width});
    *padded_output = std::move(padded_out);
@@ -206,8 +205,8 @@ void Conv2dBase::PadInput(const Tensor &src,
  const index_t padded_width = dst->dim(3);
  const int pad_bottom = static_cast<int>(padded_height - height - pad_top);
  const int pad_right = static_cast<int>(padded_width - width - pad_left);
-  auto in_data = src.data<float>();
-  auto padded_in_data = dst->mutable_data<float>();
  auto in_data = src.data<uint8_t>();
  auto padded_in_data = dst->mutable_data<uint8_t>();
  const index_t img_size = height * width;
  const index_t padded_img_size = padded_height * padded_width;
@@ -215,25 +214,26 @@ void Conv2dBase::PadInput(const Tensor &src,
  for (index_t b = 0; b < batch; ++b) {
    for (index_t c = 0; c < channels; ++c) {
      const index_t bc = b * channels + c;
-      const float *in_base = in_data + bc * img_size;
-      float *padded_in_base = padded_in_data + bc * padded_img_size;
      const uint8_t *in_base = in_data + bc * img_size * type_size_;
      uint8_t *padded_in_base =
          padded_in_data + bc * padded_img_size * type_size_;
-      memset(padded_in_base, 0, sizeof(float) * pad_top * padded_width);
-      padded_in_base += pad_top * padded_width;
      memset(padded_in_base, 0, type_size_ * pad_top * padded_width);
      padded_in_base += pad_top * padded_width * type_size_;
      for (index_t h = 0; h < height; ++h) {
        memset(padded_in_base,
               0,
-               sizeof(float) * pad_left);
               type_size_ * pad_left);
-        memcpy(padded_in_base + pad_left,
        memcpy(padded_in_base + pad_left * type_size_,
               in_base,
-               sizeof(float) * width);
               type_size_ * width);
-        memset(padded_in_base + pad_left + width,
        memset(padded_in_base + (pad_left + width) * type_size_,
               0,
-               sizeof(float) * pad_right);
               type_size_ * pad_right);
-        in_base += width;
-        padded_in_base += padded_width;
        in_base += width * type_size_;
        padded_in_base += padded_width * type_size_;
      }
-      memset(padded_in_base, 0, sizeof(float) * pad_bottom * padded_width);
      memset(padded_in_base, 0, type_size_ * pad_bottom * padded_width);
    }
  }
}
@@ -247,8 +247,8 @@ void Conv2dBase::UnPadOutput(const Tensor &src, Tensor *dst) {
  const index_t padded_height = src.dim(2);
  const index_t padded_width = src.dim(3);
-  auto padded_out_data = src.data<float>();
-  auto out_data = dst->mutable_data<float>();
  auto padded_out_data = src.data<uint8_t>();
  auto out_data = dst->mutable_data<uint8_t>();
  const index_t img_size = height * width;
  const index_t padded_img_size = padded_height * padded_width;
@@ -256,21 +256,93 @@ void Conv2dBase::UnPadOutput(const Tensor &src, Tensor *dst) {
  for (index_t b = 0; b < batch; ++b) {
    for (index_t c = 0; c < channels; ++c) {
      const index_t bc = (b * channels + c);
-      float *out_base = out_data + bc * img_size;
-      const float *padded_out_base = padded_out_data + bc * padded_img_size;
      uint8_t *out_base = out_data + bc * img_size * type_size_;
      const uint8_t *padded_out_base =
          padded_out_data + bc * padded_img_size * type_size_;
      for (index_t h = 0; h < height; ++h) {
-        memcpy(out_base,
-               padded_out_base,
-               sizeof(float) * width);
-        out_base += width;
-        padded_out_base += padded_width;
        memcpy(out_base, padded_out_base, type_size_ * width);
        out_base += width * type_size_;
        padded_out_base += padded_width * type_size_;
      }  // h
    }  // c
  }  // b
}

-} // namespace fp32
ConvComputeParam Conv2dBase::PreWorkAndGetConv2DParam(
const OpContext *context, const Tensor *in_tensor, Tensor *out_tensor) {
auto &in_shape = in_tensor->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t in_channels = in_shape[1];
const index_t in_height = in_shape[2];
const index_t in_width = in_shape[3];
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
return ConvComputeParam(batch, in_channels, in_height, in_width,
out_channels, out_height, out_width,
in_image_size, out_image_size,
in_batch_size, out_batch_size, &thread_pool);
}
DepthwiseConvComputeParam Conv2dBase::PreWorkAndGetDepthwiseConv2DParam(
const OpContext *context, const Tensor *input,
const Tensor *filter, Tensor *output) {
std::vector<index_t> out_shape(4);
std::vector<int> paddings(2);
auto &in_shape = input->shape();
auto &filter_shape = filter->shape();
CalOutputShapeAndInputPadSize(in_shape, filter_shape, &out_shape, &paddings);
out_shape[1] *= filter_shape[1];
MACE_CHECK(output->Resize(out_shape) == MaceStatus::MACE_SUCCESS,
"Resize failed.");
output->Clear();
const int pad_top = paddings[0] / 2;
const int pad_left = paddings[1] / 2;
const index_t batch = in_shape[0];
const index_t in_channels = in_shape[1];
const index_t in_height = in_shape[2];
const index_t in_width = in_shape[3];
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
const index_t multiplier = out_channels / in_channels;
std::vector<index_t> out_bounds;
CalOutputBoundaryWithoutUsingInputPad(out_shape, paddings, &out_bounds);
const index_t valid_h_start = out_bounds[0];
const index_t valid_h_stop = out_bounds[1];
const index_t valid_w_start = out_bounds[2];
const index_t valid_w_stop = out_bounds[3];
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
return DepthwiseConvComputeParam(
batch, in_channels, in_height, in_width, out_channels, out_height,
out_width, in_image_size, out_image_size, in_batch_size, out_batch_size,
&thread_pool, pad_top, pad_left, multiplier, valid_h_start, valid_h_stop,
valid_w_start, valid_w_stop);
}
} // namespace arm
} // namespace ops
} // namespace mace
...
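The main change in Conv2dBase above is that padding and unpadding now work on raw bytes scaled by a runtime type_size_ instead of a hard-coded sizeof(float), so one code path can serve fp32 and fp16 tensors. Below is a standalone sketch of that byte-wise plane padding, not from this commit; the helper name is invented and type_size stands in for Conv2dBase::type_size_.

// Sketch: zero-pad one H x W plane of elements of size type_size bytes.
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

void PadPlaneBytes(const uint8_t *src, uint8_t *dst,
                   int height, int width, int padded_width,
                   int pad_top, int pad_bottom, int pad_left, int pad_right,
                   int type_size) {
  // Top border rows.
  std::memset(dst, 0, static_cast<std::size_t>(type_size) * pad_top * padded_width);
  dst += static_cast<std::size_t>(pad_top) * padded_width * type_size;
  for (int h = 0; h < height; ++h) {
    std::memset(dst, 0, static_cast<std::size_t>(type_size) * pad_left);
    std::memcpy(dst + static_cast<std::size_t>(pad_left) * type_size, src,
                static_cast<std::size_t>(type_size) * width);
    std::memset(dst + static_cast<std::size_t>(pad_left + width) * type_size, 0,
                static_cast<std::size_t>(type_size) * pad_right);
    src += static_cast<std::size_t>(width) * type_size;
    dst += static_cast<std::size_t>(padded_width) * type_size;
  }
  // Bottom border rows.
  std::memset(dst, 0, static_cast<std::size_t>(type_size) * pad_bottom * padded_width);
}

int main() {
  const int h = 2, w = 2, pad = 1, pw = w + 2 * pad, ph = h + 2 * pad;
  std::vector<float> in = {1.f, 2.f, 3.f, 4.f}, out(ph * pw, -1.f);
  PadPlaneBytes(reinterpret_cast<const uint8_t *>(in.data()),
                reinterpret_cast<uint8_t *>(out.data()),
                h, w, pw, pad, pad, pad, pad, sizeof(float));
  return (out[0] == 0.f && out[pw + 1] == 1.f) ? 0 : 1;
}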
-// Copyright 2019 The MACE Authors. All Rights Reserved.
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -12,28 +12,97 @@
// See the License for the specific language governing permissions and
// limitations under the License.

-#ifndef MACE_OPS_ARM_FP32_CONV_2D_H_
#ifndef MACE_OPS_ARM_BASE_CONV_2D_H_
-#define MACE_OPS_ARM_FP32_CONV_2D_H_
#define MACE_OPS_ARM_BASE_CONV_2D_H_

-#include <vector>
#include <memory>
#include <vector>

#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
-#include "mace/ops/delegator/conv_2d.h"
#include "mace/ops/arm/base/gemm.h"
-#include "mace/ops/arm/fp32/gemm.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/delegator/conv_2d.h"
#include "mace/public/mace.h"

namespace mace {
namespace ops {
namespace arm {
-namespace fp32 {
struct ConvComputeParam {
const index_t batch;
const index_t in_channels;
const index_t in_height;
const index_t in_width;
const index_t out_channels;
const index_t out_height;
const index_t out_width;
const index_t in_image_size;
const index_t out_image_size;
const index_t in_batch_size;
const index_t out_batch_size;
utils::ThreadPool &thread_pool;
ConvComputeParam(const index_t b,
const index_t in_c,
const index_t in_h,
const index_t in_w,
const index_t out_c,
const index_t out_h,
const index_t out_w,
const index_t in_size,
const index_t out_size,
const index_t in_b_size,
const index_t out_b_size,
utils::ThreadPool *thrd_pool)
: batch(b), in_channels(in_c), in_height(in_h), in_width(in_w),
out_channels(out_c), out_height(out_h), out_width(out_w),
in_image_size(in_size), out_image_size(out_size),
in_batch_size(in_b_size), out_batch_size(out_b_size),
thread_pool(*thrd_pool) {}
};
struct DepthwiseConvComputeParam : public ConvComputeParam {
const int pad_top;
const int pad_left;
const index_t multiplier;
const index_t valid_h_start;
const index_t valid_h_stop;
const index_t valid_w_start;
const index_t valid_w_stop;
DepthwiseConvComputeParam(const index_t b,
const index_t in_c,
const index_t in_h,
const index_t in_w,
const index_t out_c,
const index_t out_h,
const index_t out_w,
const index_t in_size,
const index_t out_size,
const index_t in_b_size,
const index_t out_b_size,
utils::ThreadPool *thrd_pool,
const int pad_top_data,
const int pad_left_data,
const index_t multiplier_data,
const index_t valid_height_start,
const index_t valid_height_stop,
const index_t valid_width_start,
const index_t valid_width_stop)
: ConvComputeParam(b, in_c, in_h, in_w, out_c, out_h, out_w,
in_size, out_size, in_b_size, out_b_size, thrd_pool),
pad_top(pad_top_data), pad_left(pad_left_data),
multiplier(multiplier_data),
valid_h_start(valid_height_start), valid_h_stop(valid_height_stop),
valid_w_start(valid_width_start), valid_w_stop(valid_width_stop) {}
};
class Conv2dBase : public delegator::Conv2d {
 public:
-  explicit Conv2dBase(const delegator::Conv2dParam &param)
-      : delegator::Conv2d(param) {}
  explicit Conv2dBase(const delegator::Conv2dParam &param, int type_size)
      : delegator::Conv2d(param), type_size_(type_size) {}
  virtual ~Conv2dBase() = default;
@@ -72,11 +141,19 @@ class Conv2dBase : public delegator::Conv2d {
                const int pad_left,
                Tensor *dst);
  void UnPadOutput(const Tensor &src, Tensor *dst);
ConvComputeParam PreWorkAndGetConv2DParam(
const OpContext *context, const Tensor *in_tensor, Tensor *out_tensor);
DepthwiseConvComputeParam PreWorkAndGetDepthwiseConv2DParam(
const OpContext *context, const Tensor *input,
const Tensor *filter, Tensor *output);
private:
int type_size_;
};

-} // namespace fp32
} // namespace arm
} // namespace ops
} // namespace mace

-#endif // MACE_OPS_ARM_FP32_CONV_2D_H_
#endif // MACE_OPS_ARM_BASE_CONV_2D_H_
-// Copyright 2019 The MACE Authors. All Rights Reserved.
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -12,33 +12,16 @@
// See the License for the specific language governing permissions and
// limitations under the License.

-#include "mace/ops/arm/fp32/conv_2d.h"
#include "mace/ops/arm/base/conv_2d_1x1.h"
-#include "mace/ops/arm/fp32/gemm.h"
-#include "mace/ops/delegator/conv_2d.h"

#include <vector>

namespace mace {
namespace ops {
namespace arm {
-namespace fp32 {

-class Conv2dK1x1 : public Conv2dBase {
- public:
-  explicit Conv2dK1x1(const delegator::Conv2dParam &param)
-      : Conv2dBase(param),
-        gemm_(delegator::GemmParam()) {}
-  virtual ~Conv2dK1x1() {}
-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      Tensor *output) override;
- private:
-  Gemm gemm_;
-};
-
-MaceStatus Conv2dK1x1::Compute(const OpContext *context,
template<typename T>
MaceStatus Conv2dK1x1<T>::Compute(const OpContext *context,
                                  const Tensor *input,
                                  const Tensor *filter,
                                  Tensor *output) {
@@ -50,13 +33,8 @@ MaceStatus Conv2dK1x1::Compute(const OpContext *context,
  std::vector<index_t> output_shape;
  std::vector<int> in_pad_size;
  std::vector<int> out_pad_size;
-  CalOutputShapeAndPadSize(input,
-                           filter,
-                           1,
-                           1,
-                           &output_shape,
-                           &in_pad_size,
-                           &out_pad_size);
  CalOutputShapeAndPadSize(input, filter, 1, 1,
                           &output_shape, &in_pad_size, &out_pad_size);
  MACE_RETURN_IF_ERROR(output->Resize(output_shape));
  const index_t out_channels = output_shape[1];
@@ -70,16 +48,16 @@ MaceStatus Conv2dK1x1::Compute(const OpContext *context,
      in_height != padded_in_height || in_width != padded_in_width;
  auto scratch_buffer = context->device()->scratch_buffer();
  const index_t padded_in_size = is_in_padded ? PadAlignSize(
-      sizeof(float) * batch * in_channels * padded_in_height
      sizeof(T) * batch * in_channels * padded_in_height
          * padded_in_width) : 0;
  const index_t pack_filter_size =
-      PadAlignSize(sizeof(float) * out_channels * in_channels);
      PadAlignSize(sizeof(T) * out_channels * in_channels);
  const index_t pack_input_size =
      PadAlignSize(
-          sizeof(float) * in_channels * padded_in_height * padded_in_width);
          sizeof(T) * in_channels * padded_in_height * padded_in_width);
  const index_t pack_output_size =
      PadAlignSize(
-          sizeof(float) * out_channels * padded_in_height * padded_in_width);
          sizeof(T) * out_channels * padded_in_height * padded_in_width);
  const index_t gemm_pack_size =
      pack_filter_size + pack_input_size + pack_output_size;
@@ -115,12 +93,11 @@ MaceStatus Conv2dK1x1::Compute(const OpContext *context,

void RegisterConv2dK1x1Delegator(OpDelegatorRegistry *registry) {
  MACE_REGISTER_DELEGATOR(
-      registry, Conv2dK1x1, delegator::Conv2dParam,
      registry, Conv2dK1x1<float>, delegator::Conv2dParam,
      MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
                            float, ImplType::NEON, K1x1));
}

-} // namespace fp32
} // namespace arm
} // namespace ops
} // namespace mace
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_BASE_CONV_2D_1X1_H_
#define MACE_OPS_ARM_BASE_CONV_2D_1X1_H_
#include "mace/ops/arm/base/conv_2d.h"
#include "mace/ops/arm/base/gemm.h"
namespace mace {
namespace ops {
namespace arm {
template<typename T>
class Conv2dK1x1 : public Conv2dBase {
public:
explicit Conv2dK1x1(const delegator::Conv2dParam &param)
: Conv2dBase(param, sizeof(T)),
gemm_(delegator::GemmParam()) {}
virtual ~Conv2dK1x1() {}
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) override;
private:
Gemm<T> gemm_;
};
} // namespace arm
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARM_BASE_CONV_2D_1X1_H_
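Conv2dK1x1 delegates its inner loop to Gemm<T> because a 1x1 convolution over NCHW data is exactly a matrix multiply: output[oc][h*w] = sum over ic of filter[oc][ic] * input[ic][h*w]. The naive standalone sketch below (no packing, padding, or threading, names invented) is not MACE code; it only demonstrates that equivalence.

// Naive reference: 1x1 convolution on one NCHW batch as a
// (out_channels x in_channels) * (in_channels x image_size) GEMM.
#include <vector>

template <typename T>
void Conv1x1AsGemm(const T *filter,  // [out_channels, in_channels]
                   const T *input,   // [in_channels, image_size]
                   T *output,        // [out_channels, image_size]
                   int out_channels, int in_channels, int image_size) {
  for (int oc = 0; oc < out_channels; ++oc) {
    for (int i = 0; i < image_size; ++i) {
      T acc = T(0);
      for (int ic = 0; ic < in_channels; ++ic) {
        acc += filter[oc * in_channels + ic] * input[ic * image_size + i];
      }
      output[oc * image_size + i] = acc;
    }
  }
}

int main() {
  // 2 input channels, 1 output channel, 2x2 image.
  std::vector<float> filter = {1.f, 2.f};
  std::vector<float> input = {1.f, 1.f, 1.f, 1.f,   // channel 0
                              2.f, 2.f, 2.f, 2.f};  // channel 1
  std::vector<float> output(4, 0.f);
  Conv1x1AsGemm(filter.data(), input.data(), output.data(), 1, 2, 4);
  return output[0] == 5.f ? 0 : 1;  // 1*1 + 2*2
}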
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/arm/base/conv_2d_1xn.h"
namespace mace {
namespace ops {
namespace arm {
void RegisterConv2dK1xNDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Conv2dK1x7S1<float>, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K1x7S1));
MACE_REGISTER_DELEGATOR(
registry, Conv2dK7x1S1<float>, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K7x1S1));
MACE_REGISTER_DELEGATOR(
registry, Conv2dK1x15S1<float>, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K1x15S1));
MACE_REGISTER_DELEGATOR(
registry, Conv2dK15x1S1<float>, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K15x1S1));
}
} // namespace arm
} // namespace ops
} // namespace mace
-// Copyright 2019 The MACE Authors. All Rights Reserved.
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -12,76 +12,66 @@
// See the License for the specific language governing permissions and
// limitations under the License.

-#ifndef MACE_OPS_ARM_FP32_CONV_2D_1XN_H_
#ifndef MACE_OPS_ARM_BASE_CONV_2D_1XN_H_
-#define MACE_OPS_ARM_FP32_CONV_2D_1XN_H_
#define MACE_OPS_ARM_BASE_CONV_2D_1XN_H_

#include <vector>

#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
-#include "mace/ops/arm/fp32/conv_2d.h"
#include "mace/ops/arm/base/conv_2d_mxn.h"
#include "mace/public/mace.h"

namespace mace {
namespace ops {
namespace arm {
-namespace fp32 {

-class Conv2dK1x7S1 : public Conv2dBase {
template<typename T>
class Conv2dK1x7S1 : public Conv2dKMxN<T> {
 public:
  explicit Conv2dK1x7S1(const delegator::Conv2dParam &param)
-      : Conv2dBase(param) {}
      : Conv2dKMxN<T>(param, 1, 4) {}
  virtual ~Conv2dK1x7S1() {}
-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      Tensor *output) override;
  MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
                       const T *input_data, T *output_data) override;
};

-class Conv2dK7x1S1 : public Conv2dBase {
template<typename T>
class Conv2dK7x1S1 : public Conv2dKMxN<T> {
 public:
  explicit Conv2dK7x1S1(const delegator::Conv2dParam &param)
-      : Conv2dBase(param) {}
      : Conv2dKMxN<T>(param, 4, 1) {}
  virtual ~Conv2dK7x1S1() {}
-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      Tensor *output) override;
  MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
                       const T *input_data, T *output_data) override;
};

-class Conv2dK1x15S1 : public Conv2dBase {
template<typename T>
class Conv2dK1x15S1 : public Conv2dKMxN<T> {
 public:
  explicit Conv2dK1x15S1(const delegator::Conv2dParam &param)
-      : Conv2dBase(param) {}
      : Conv2dKMxN<T>(param, 1, 4) {}
  virtual ~Conv2dK1x15S1() {}
-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      Tensor *output) override;
  MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
                       const T *input_data, T *output_data) override;
};

-class Conv2dK15x1S1 : public Conv2dBase {
template<typename T>
class Conv2dK15x1S1 : public Conv2dKMxN<T> {
 public:
  explicit Conv2dK15x1S1(const delegator::Conv2dParam &param)
-      : Conv2dBase(param) {}
      : Conv2dKMxN<T>(param, 4, 1) {}
  virtual ~Conv2dK15x1S1() {}
-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      Tensor *output) override;
  MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
                       const T *input_data, T *output_data) override;
};

-} // namespace fp32
} // namespace arm
} // namespace ops
} // namespace mace

-#endif // MACE_OPS_ARM_FP32_CONV_2D_1XN_H_
#endif // MACE_OPS_ARM_BASE_CONV_2D_1XN_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/arm/base/conv_2d_3x3.h"
namespace mace {
namespace ops {
namespace arm {
void RegisterConv2dK3x3Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Conv2dK3x3S1<float>, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S1));
MACE_REGISTER_DELEGATOR(
registry, Conv2dK3x3S2<float>, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S2));
}
} // namespace arm
} // namespace ops
} // namespace mace
-// Copyright 2019 The MACE Authors. All Rights Reserved.
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -12,50 +12,44 @@
// See the License for the specific language governing permissions and
// limitations under the License.

-#ifndef MACE_OPS_ARM_FP32_CONV_2D_3X3_H_
#ifndef MACE_OPS_ARM_BASE_CONV_2D_3X3_H_
-#define MACE_OPS_ARM_FP32_CONV_2D_3X3_H_
#define MACE_OPS_ARM_BASE_CONV_2D_3X3_H_

#include <vector>

#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
-#include "mace/ops/arm/fp32/conv_2d.h"
#include "mace/ops/arm/base/conv_2d_mxn.h"
#include "mace/public/mace.h"

namespace mace {
namespace ops {
namespace arm {
-namespace fp32 {

-class Conv2dK3x3S1 : public Conv2dBase {
template<typename T>
class Conv2dK3x3S1 : public Conv2dKMxN<T> {
 public:
  explicit Conv2dK3x3S1(const delegator::Conv2dParam &param)
-      : Conv2dBase(param) {}
      : Conv2dKMxN<T>(param, 2, 4) {}
  virtual ~Conv2dK3x3S1() {}
-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      Tensor *output) override;
  MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
                       const T *input_data, T *output_data) override;
};

-class Conv2dK3x3S2 : public Conv2dBase {
template<typename T>
class Conv2dK3x3S2 : public Conv2dKMxN<T> {
 public:
  explicit Conv2dK3x3S2(const delegator::Conv2dParam &param)
-      : Conv2dBase(param) {}
      : Conv2dKMxN<T>(param, 1, 4) {}
  virtual ~Conv2dK3x3S2() {}
-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      Tensor *output) override;
  MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
                       const T *input_data, T *output_data) override;
};

-} // namespace fp32
} // namespace arm
} // namespace ops
} // namespace mace

-#endif // MACE_OPS_ARM_FP32_CONV_2D_3X3_H_
#endif // MACE_OPS_ARM_BASE_CONV_2D_3X3_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/arm/base/conv_2d_5x5.h"
namespace mace {
namespace ops {
namespace arm {
void RegisterConv2dK5x5Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Conv2dK5x5S1<float>, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K5x5S1));
}
} // namespace arm
} // namespace ops
} // namespace mace
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_BASE_CONV_2D_5X5_H_
#define MACE_OPS_ARM_BASE_CONV_2D_5X5_H_
#include <vector>
#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/arm/base/conv_2d_mxn.h"
#include "mace/public/mace.h"
namespace mace {
namespace ops {
namespace arm {
template<typename T>
class Conv2dK5x5S1 : public Conv2dKMxN<T> {
public:
explicit Conv2dK5x5S1(const delegator::Conv2dParam &param)
: Conv2dKMxN<T>(param, 1, 4) {}
virtual ~Conv2dK5x5S1() {}
MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
const T *input_data, T *output_data) override;
};
} // namespace arm
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARM_BASE_CONV_2D_5X5_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/arm/base/conv_2d_7x7.h"
namespace mace {
namespace ops {
namespace arm {
void RegisterConv2dK7x7Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Conv2dK7x7S1<float>, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K7x7S1));
MACE_REGISTER_DELEGATOR(
registry, Conv2dK7x7S2<float>, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K7x7S2));
MACE_REGISTER_DELEGATOR(
registry, Conv2dK7x7S3<float>, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K7x7S3));
}
} // namespace arm
} // namespace ops
} // namespace mace
-// Copyright 2019 The MACE Authors. All Rights Reserved.
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -12,63 +12,55 @@
// See the License for the specific language governing permissions and
// limitations under the License.

-#ifndef MACE_OPS_ARM_FP32_CONV_2D_7X7_H_
#ifndef MACE_OPS_ARM_BASE_CONV_2D_7X7_H_
-#define MACE_OPS_ARM_FP32_CONV_2D_7X7_H_
#define MACE_OPS_ARM_BASE_CONV_2D_7X7_H_

#include <vector>

#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
-#include "mace/ops/arm/fp32/conv_2d.h"
#include "mace/ops/arm/base/conv_2d_mxn.h"
#include "mace/public/mace.h"

namespace mace {
namespace ops {
namespace arm {
-namespace fp32 {

-class Conv2dK7x7S1 : public Conv2dBase {
template<typename T>
class Conv2dK7x7S1 : public Conv2dKMxN<T> {
 public:
  explicit Conv2dK7x7S1(const delegator::Conv2dParam &param)
-      : Conv2dBase(param) {}
      : Conv2dKMxN<T>(param, 1, 4) {}
  virtual ~Conv2dK7x7S1() {}
-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      Tensor *output) override;
  MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
                       const T *input_data, T *output_data) override;
};

-class Conv2dK7x7S2 : public Conv2dBase {
template<typename T>
class Conv2dK7x7S2 : public Conv2dKMxN<T> {
 public:
  explicit Conv2dK7x7S2(const delegator::Conv2dParam &param)
-      : Conv2dBase(param) {}
      : Conv2dKMxN<T>(param, 1, 4) {}
  virtual ~Conv2dK7x7S2() {}
-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      Tensor *output) override;
  MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
                       const T *input_data, T *output_data) override;
};

-class Conv2dK7x7S3 : public Conv2dBase {
template<typename T>
class Conv2dK7x7S3 : public Conv2dKMxN<T> {
 public:
  explicit Conv2dK7x7S3(const delegator::Conv2dParam &param)
-      : Conv2dBase(param) {}
      : Conv2dKMxN<T>(param, 1, 4) {}
  virtual ~Conv2dK7x7S3() {}
-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      Tensor *output) override;
  MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
                       const T *input_data, T *output_data) override;
};

-} // namespace fp32
} // namespace arm
} // namespace ops
} // namespace mace

-#endif // MACE_OPS_ARM_FP32_CONV_2D_7X7_H_
#endif // MACE_OPS_ARM_BASE_CONV_2D_7X7_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/arm/base/conv_2d_general.h"
#include <memory>
namespace mace {
namespace ops {
namespace arm {
template<typename T>
MaceStatus Conv2dGeneral<T>::Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) {
std::unique_ptr<const Tensor> padded_input;
std::unique_ptr<Tensor> padded_output;
ResizeOutAndPadInOut(context, input, filter, output, 1, 4,
&padded_input, &padded_output);
const Tensor *in_tensor = input;
if (padded_input != nullptr) {
in_tensor = padded_input.get();
}
Tensor *out_tensor = output;
if (padded_output != nullptr) {
out_tensor = padded_output.get();
}
out_tensor->Clear();
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
const T *filter_data = filter->data<T>();
const T *input_data = in_tensor->data<T>();
T *output_data = out_tensor->mutable_data<T>();
const ConvComputeParam p =
PreWorkAndGetConv2DParam(context, in_tensor, out_tensor);
auto &filter_shape = filter->shape();
DoCompute(p, filter_data, input_data, output_data, filter_shape);
UnPadOutput(*out_tensor, output);
return MaceStatus::MACE_SUCCESS;
}
void RegisterConv2dGeneralDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Conv2dGeneral<float>, delegator::Conv2dParam,
MACE_DELEGATOR_KEY(Conv2d, DeviceType::CPU, float, ImplType::NEON));
}
} // namespace arm
} // namespace ops
} // namespace mace
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_BASE_CONV_2D_GENERAL_H_
#define MACE_OPS_ARM_BASE_CONV_2D_GENERAL_H_
#include <vector>
#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/arm/base/conv_2d.h"
#include "mace/public/mace.h"
namespace mace {
namespace ops {
namespace arm {
template<typename T>
class Conv2dGeneral : public Conv2dBase {
public:
explicit Conv2dGeneral(const delegator::Conv2dParam &param)
: Conv2dBase(param, sizeof(T)) {}
virtual ~Conv2dGeneral() {}
MaceStatus Compute(const OpContext *context, const Tensor *input,
const Tensor *filter, Tensor *output) override;
protected:
MaceStatus DoCompute(
const ConvComputeParam &p, const T *filter_data,
const T *input_data, T *output_data,
const std::vector<index_t> &filter_shape);
};
} // namespace arm
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARM_BASE_CONV_2D_GENERAL_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_BASE_CONV_2D_MXN_H_
#define MACE_OPS_ARM_BASE_CONV_2D_MXN_H_
#include <memory>
#include <vector>
#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/arm/base/conv_2d.h"
#include "mace/public/mace.h"
namespace mace {
namespace ops {
namespace arm {
template<typename T>
class Conv2dKMxN : public Conv2dBase {
public:
explicit Conv2dKMxN(const delegator::Conv2dParam &param,
const int tile_h, const int tile_w)
: Conv2dBase(param, sizeof(T)),
out_tile_h_(tile_h), out_tile_w_(tile_w) {}
virtual ~Conv2dKMxN() {}
MaceStatus Compute(const OpContext *context, const Tensor *input,
const Tensor *filter, Tensor *output) override {
std::unique_ptr<const Tensor> padded_input;
std::unique_ptr<Tensor> padded_output;
ResizeOutAndPadInOut(context, input, filter, output, out_tile_h_,
out_tile_w_, &padded_input, &padded_output);
const Tensor *in_tensor = input;
if (padded_input != nullptr) {
in_tensor = padded_input.get();
}
Tensor *out_tensor = output;
if (padded_output != nullptr) {
out_tensor = padded_output.get();
}
out_tensor->Clear();
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
const T *filter_data = filter->data<T>();
const T *input_data = in_tensor->data<T>();
T *output_data = out_tensor->mutable_data<T>();
const ConvComputeParam p =
PreWorkAndGetConv2DParam(context, in_tensor, out_tensor);
DoCompute(p, filter_data, input_data, output_data);
UnPadOutput(*out_tensor, output);
return MaceStatus::MACE_SUCCESS;
}
virtual MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
const T *input_data, T *output_data) = 0;
private:
const int out_tile_h_;
const int out_tile_w_;
};
} // namespace arm
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARM_BASE_CONV_2D_MXN_H_
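Conv2dKMxN is a template-method base: the shared Compute() handles resizing, padding, mapping and parameter setup, then hands the hot loop to a kernel-specific DoCompute(), and the (tile_h, tile_w) pair passed by each subclass records how many output rows and columns that kernel produces per iteration. The stripped-down sketch below is not MACE code; ComputeParam, TiledKernelBase and KernelK3x3S1 are invented stand-ins that only show the structure.

// Shape of the Conv2dKMxN pattern: fixed driver, virtual tile kernel.
#include <cstdio>

struct ComputeParam { int out_height, out_width; };

class TiledKernelBase {
 public:
  TiledKernelBase(int tile_h, int tile_w) : tile_h_(tile_h), tile_w_(tile_w) {}
  virtual ~TiledKernelBase() = default;

  // Shared driver: pre/post work around the kernel-specific inner loop.
  void Compute(const ComputeParam &p, const float *in, float *out) {
    // ... resize/pad/map would happen here in the real code ...
    for (int h = 0; h < p.out_height; h += tile_h_) {
      for (int w = 0; w < p.out_width; w += tile_w_) {
        DoComputeTile(p, h, w, in, out);
      }
    }
    // ... unpad would happen here ...
  }

 protected:
  virtual void DoComputeTile(const ComputeParam &p, int h, int w,
                             const float *in, float *out) = 0;
  const int tile_h_, tile_w_;
};

// A "3x3 stride-1"-style kernel that claims 2 rows x 4 columns per tile.
class KernelK3x3S1 : public TiledKernelBase {
 public:
  KernelK3x3S1() : TiledKernelBase(2, 4) {}

 protected:
  void DoComputeTile(const ComputeParam &, int h, int w,
                     const float *, float *) override {
    std::printf("tile at (%d, %d)\n", h, w);
  }
};

int main() {
  KernelK3x3S1 kernel;
  ComputeParam p{4, 8};
  kernel.Compute(p, nullptr, nullptr);
  return 0;
}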
-// Copyright 2019 The MACE Authors. All Rights Reserved.
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -12,17 +12,17 @@
// See the License for the specific language governing permissions and
// limitations under the License.

-#include "mace/ops/arm/fp32/deconv_2d.h"
#include "mace/ops/arm/base/deconv_2d.h"

-#include <utility>
#include <functional>
-#include "mace/utils/memory.h"
#include <utility>

#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/utils/memory.h"

namespace mace {
namespace ops {
namespace arm {
-namespace fp32 {

MaceStatus Deconv2dBase::ResizeOutAndPadOut(
    const OpContext *context,
@@ -67,7 +67,7 @@ MaceStatus Deconv2dBase::ResizeOutAndPadOut(
        std::accumulate(padded_out_shape.begin(),
                        padded_out_shape.end(),
                        1,
-                        std::multiplies<index_t>()) * sizeof(float);
                        std::multiplies<index_t>()) * type_size_;
    ScratchBuffer *scratch = context->device()->scratch_buffer();
    scratch->Rewind();
    index_t scratch_size = PadAlignSize(padded_out_size);
@@ -75,7 +75,7 @@ MaceStatus Deconv2dBase::ResizeOutAndPadOut(
    std::unique_ptr<Tensor>
        padded_out
-        (make_unique<Tensor>(scratch->Scratch(scratch_size), DT_FLOAT));
        (make_unique<Tensor>(scratch->Scratch(scratch_size), output->dtype()));
    padded_out->Reshape(padded_out_shape);
    *padded_output = std::move(padded_out);
  }
@@ -97,24 +97,97 @@ void Deconv2dBase::UnPadOutput(const Tensor &src,
  const index_t padded_height = src.dim(2);
  const index_t padded_width = src.dim(3);
-  auto padded_out_data = src.data<float>();
-  auto out_data = dst->mutable_data<float>();
  auto padded_out_data = src.data<uint8_t>();
  auto out_data = dst->mutable_data<uint8_t>();

  for (index_t i = 0; i < batch; ++i) {
    for (index_t j = 0; j < channels; ++j) {
      for (index_t k = 0; k < height; ++k) {
-        const float *input_base =
        const uint8_t *input_base =
            padded_out_data + ((i * channels + j) * padded_height
-                + (k + pad_h)) * padded_width;
                + (k + pad_h)) * padded_width * type_size_;
-        float *output_base =
        uint8_t *output_base =
-            out_data + ((i * channels + j) * height + k) * width;
            out_data + ((i * channels + j) * height + k) * width * type_size_;
-        memcpy(output_base, input_base + pad_w, width * sizeof(float));
        memcpy(output_base,
               input_base + pad_w * type_size_,
               width * type_size_);
      }
    }
  }
}

-} // namespace fp32
DeconvComputeParam Deconv2dBase::PreWorkAndGetDeconvParam(
const OpContext *context, const Tensor *input, Tensor *out_tensor) {
auto &in_shape = input->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t inch = in_shape[1];
const index_t h = in_shape[2];
const index_t w = in_shape[3];
const index_t outch = out_shape[1];
const index_t outh = out_shape[2];
const index_t outw = out_shape[3];
const index_t out_img_size = outh * outw;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
return DeconvComputeParam(batch, inch, h, w, outch, outh, outw,
out_img_size, &thread_pool);
}
DepthwiseDeconvComputeParam Deconv2dBase::PreWorkAndGetDepthwiseDeconvParam(
const OpContext *context, const Tensor *input, Tensor *out_tensor) {
auto &in_shape = input->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t channels = in_shape[1];
const index_t h = in_shape[2];
const index_t w = in_shape[3];
const index_t in_img_size = h * w;
const index_t outh = out_shape[2];
const index_t outw = out_shape[3];
const index_t out_img_size = outh * outw;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
return DepthwiseDeconvComputeParam(batch, channels, h, w, in_img_size,
outh, outw, out_img_size, &thread_pool);
}
GroupDeconvComputeParam Deconv2dBase::PreWorkAndGetGroupDeconvParam(
const OpContext *context, const Tensor *input, Tensor *out_tensor) {
auto &in_shape = input->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t inch = in_shape[1];
const index_t h = in_shape[2];
const index_t w = in_shape[3];
const index_t outch = out_shape[1];
const index_t outh = out_shape[2];
const index_t outw = out_shape[3];
const index_t in_img_size = h * w;
const index_t out_img_size = outh * outw;
const index_t inch_g = inch / group_;
const index_t outch_g = outch / group_;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
return GroupDeconvComputeParam(batch, inch, h, w, outch, outh, outw,
in_img_size, out_img_size, inch_g,
outch_g, &thread_pool);
}
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
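The rewritten UnPadOutput above walks the padded tensor through raw uint8_t pointers and scales every offset by type_size_, so one copy loop now serves any element width instead of hard-coding sizeof(float). A minimal standalone sketch of that byte arithmetic (values assumed purely for illustration; this is not MACE code):

#include <cstdint>
#include <cstring>

int main() {
  const int type_size = 2;                 // e.g. fp16; the float build uses 4
  const int width = 8, pad_w = 1, padded_width = 10;
  uint8_t padded_row[10 * 2] = {};         // one padded output row, 2 bytes per element
  uint8_t out_row[8 * 2];
  // Skip pad_w elements (= pad_w * type_size bytes), then copy width elements
  // (= width * type_size bytes); no float-specific sizeof appears anywhere.
  std::memcpy(out_row, padded_row + pad_w * type_size, width * type_size);
  (void)padded_width;
  return 0;
}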
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_BASE_DECONV_2D_H_
#define MACE_OPS_ARM_BASE_DECONV_2D_H_
#include <memory>
#include <vector>
#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
#include "mace/core/types.h"
#include "mace/ops/arm/base/gemm.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/delegator/deconv_2d.h"
#include "mace/public/mace.h"
namespace mace {
namespace ops {
namespace arm {
struct DeconvComputeParam {
const index_t batch;
const index_t in_channels;
const index_t in_height;
const index_t in_width;
const index_t out_channels;
const index_t out_height;
const index_t out_width;
const index_t out_img_size;
utils::ThreadPool &thread_pool;
DeconvComputeParam(const index_t b,
const index_t in_c,
const index_t in_h,
const index_t in_w,
const index_t out_c,
const index_t out_h,
const index_t out_w,
const index_t out_size,
utils::ThreadPool *thrd_pool)
: batch(b), in_channels(in_c), in_height(in_h), in_width(in_w),
out_channels(out_c), out_height(out_h), out_width(out_w),
out_img_size(out_size), thread_pool(*thrd_pool) {}
};
struct DepthwiseDeconvComputeParam {
const index_t batch;
const index_t in_channels;
const index_t in_height;
const index_t in_width;
const index_t in_img_size;
const index_t out_height;
const index_t out_width;
const index_t out_img_size;
utils::ThreadPool &thread_pool;
DepthwiseDeconvComputeParam(const index_t b,
const index_t in_c,
const index_t in_h,
const index_t in_w,
const index_t in_size,
const index_t out_h,
const index_t out_w,
const index_t out_size,
utils::ThreadPool *thrd_pool)
: batch(b),
in_channels(in_c),
in_height(in_h),
in_width(in_w),
in_img_size(in_size),
out_height(out_h),
out_width(out_w),
out_img_size(out_size),
thread_pool(*thrd_pool) {}
};
struct GroupDeconvComputeParam {
const index_t batch;
const index_t in_channels;
const index_t in_height;
const index_t in_width;
const index_t out_channels;
const index_t out_height;
const index_t out_width;
const index_t in_img_size;
const index_t out_img_size;
const index_t inch_g;
const index_t outch_g;
utils::ThreadPool &thread_pool;
GroupDeconvComputeParam(const index_t in_b,
const index_t in_ch,
const index_t in_h,
const index_t in_w,
const index_t out_ch,
const index_t out_h,
const index_t out_w,
const index_t in_size,
const index_t out_size,
const index_t in_ch_g,
const index_t out_ch_g,
utils::ThreadPool *thrd_pool)
: batch(in_b),
in_channels(in_ch),
in_height(in_h),
in_width(in_w),
out_channels(out_ch),
out_height(out_h),
out_width(out_w),
in_img_size(in_size),
out_img_size(out_size),
inch_g(in_ch_g),
outch_g(out_ch_g),
thread_pool(*thrd_pool) {}
};
class Deconv2dBase : public delegator::Deconv2d {
public:
explicit Deconv2dBase(const delegator::Deconv2dParam &param, int type_size)
: delegator::Deconv2d(param),
group_(param.group_), type_size_(type_size) {}
virtual ~Deconv2dBase() = default;
protected:
MaceStatus ResizeOutAndPadOut(const OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output,
std::vector<int> *out_pad_size,
std::unique_ptr<Tensor> *padded_output);
void UnPadOutput(const Tensor &src,
const std::vector<int> &out_pad_size,
Tensor *dst);
DeconvComputeParam PreWorkAndGetDeconvParam(
const OpContext *context, const Tensor *input, Tensor *out_tensor);
DepthwiseDeconvComputeParam PreWorkAndGetDepthwiseDeconvParam(
const OpContext *context, const Tensor *input, Tensor *out_tensor);
GroupDeconvComputeParam PreWorkAndGetGroupDeconvParam(
const OpContext *context, const Tensor *input, Tensor *out_tensor);
protected:
index_t group_;
private:
int type_size_;
};
} // namespace arm
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARM_BASE_DECONV_2D_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/arm/base/deconv_2d_2x2.h"
namespace mace {
namespace ops {
namespace arm {
void RegisterDeconv2dK2x2Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Deconv2dK2x2S1<float>, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
float, ImplType::NEON, K2x2S1));
MACE_REGISTER_DELEGATOR(
registry, Deconv2dK2x2S2<float>, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
float, ImplType::NEON, K2x2S2));
}
} // namespace arm
} // namespace ops
} // namespace mace
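With the kernels templated on the element type, each of these registration files stays a thin shim: one MACE_REGISTER_DELEGATOR call per kernel/stride variant, keyed by op, device, data type, impl type and kernel tag. As a hedged illustration, a hypothetical 5x5 stride-1 variant (not part of this change) would be wired up the same way:

void RegisterDeconv2dK5x5Delegator(OpDelegatorRegistry *registry) {
  // Deconv2dK5x5S1 and the K5x5S1 tag are assumed names, used only to show the
  // registration pattern; no such kernel exists in this patch.
  MACE_REGISTER_DELEGATOR(
      registry, Deconv2dK5x5S1<float>, delegator::Deconv2dParam,
      MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
                            float, ImplType::NEON, K5x5S1));
}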
// Copyright 2019 The MACE Authors. All Rights Reserved. // Copyright 2020 The MACE Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -12,8 +12,8 @@ ...@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#ifndef MACE_OPS_ARM_FP32_DECONV_2D_2X2_H_ #ifndef MACE_OPS_ARM_BASE_DECONV_2D_2X2_H_
#define MACE_OPS_ARM_FP32_DECONV_2D_2X2_H_ #define MACE_OPS_ARM_BASE_DECONV_2D_2X2_H_
#include <vector> #include <vector>
#include <memory> #include <memory>
...@@ -21,46 +21,38 @@ ...@@ -21,46 +21,38 @@
#include "mace/core/ops/op_context.h" #include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/types.h" #include "mace/core/types.h"
#include "mace/ops/arm/fp32/deconv_2d.h" #include "mace/ops/arm/base/deconv_2d_mxn.h"
#include "mace/ops/common/conv_pool_2d_util.h" #include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
class Deconv2dK2x2S1 : public Deconv2dBase { template<typename T>
class Deconv2dK2x2S1 : public Deconv2dKMxN<T> {
public: public:
explicit Deconv2dK2x2S1(const delegator::Deconv2dParam &param) explicit Deconv2dK2x2S1(const delegator::Deconv2dParam &param)
: Deconv2dBase(param) {} : Deconv2dKMxN<T>(param) {}
virtual ~Deconv2dK2x2S1() {} virtual ~Deconv2dK2x2S1() {}
MaceStatus Compute( MaceStatus DoCompute(const DeconvComputeParam &p, const T *filter,
const OpContext *context, const T *input_data, T *padded_out_data) override;
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
}; };
class Deconv2dK2x2S2 : public Deconv2dBase { template<typename T>
class Deconv2dK2x2S2 : public Deconv2dKMxN<T> {
public: public:
explicit Deconv2dK2x2S2(const delegator::Deconv2dParam &param) explicit Deconv2dK2x2S2(const delegator::Deconv2dParam &param)
: Deconv2dBase(param) {} : Deconv2dKMxN<T>(param) {}
virtual ~Deconv2dK2x2S2() {} virtual ~Deconv2dK2x2S2() {}
MaceStatus Compute( MaceStatus DoCompute(const DeconvComputeParam &p, const T *filter,
const OpContext *context, const T *input_data, T *padded_out_data) override;
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
}; };
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
#endif // MACE_OPS_ARM_FP32_DECONV_2D_2X2_H_ #endif // MACE_OPS_ARM_BASE_DECONV_2D_2X2_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/arm/base/deconv_2d_3x3.h"
namespace mace {
namespace ops {
namespace arm {
void RegisterDeconv2dK3x3Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Deconv2dK3x3S1<float>, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S1));
MACE_REGISTER_DELEGATOR(
registry, Deconv2dK3x3S2<float>, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S2));
}
} // namespace arm
} // namespace ops
} // namespace mace
// Copyright 2019 The MACE Authors. All Rights Reserved. // Copyright 2020 The MACE Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -12,8 +12,8 @@ ...@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#ifndef MACE_OPS_ARM_FP32_DECONV_2D_3X3_H_ #ifndef MACE_OPS_ARM_BASE_DECONV_2D_3X3_H_
#define MACE_OPS_ARM_FP32_DECONV_2D_3X3_H_ #define MACE_OPS_ARM_BASE_DECONV_2D_3X3_H_
#include <vector> #include <vector>
#include <memory> #include <memory>
...@@ -21,46 +21,38 @@ ...@@ -21,46 +21,38 @@
#include "mace/core/ops/op_context.h" #include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/types.h" #include "mace/core/types.h"
#include "mace/ops/arm/fp32/deconv_2d.h" #include "mace/ops/arm/base/deconv_2d_mxn.h"
#include "mace/ops/common/conv_pool_2d_util.h" #include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
class Deconv2dK3x3S1 : public Deconv2dBase { template<typename T>
class Deconv2dK3x3S1 : public Deconv2dKMxN<T> {
public: public:
explicit Deconv2dK3x3S1(const delegator::Deconv2dParam &param) explicit Deconv2dK3x3S1(const delegator::Deconv2dParam &param)
: Deconv2dBase(param) {} : Deconv2dKMxN<T>(param) {}
virtual ~Deconv2dK3x3S1() {} virtual ~Deconv2dK3x3S1() {}
MaceStatus Compute( MaceStatus DoCompute(const DeconvComputeParam &p, const T *filter,
const OpContext *context, const T *input_data, T *padded_out_data) override;
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
}; };
class Deconv2dK3x3S2 : public Deconv2dBase { template<typename T>
class Deconv2dK3x3S2 : public Deconv2dKMxN<T> {
public: public:
explicit Deconv2dK3x3S2(const delegator::Deconv2dParam &param) explicit Deconv2dK3x3S2(const delegator::Deconv2dParam &param)
: Deconv2dBase(param) {} : Deconv2dKMxN<T>(param) {}
virtual ~Deconv2dK3x3S2() {} virtual ~Deconv2dK3x3S2() {}
MaceStatus Compute( MaceStatus DoCompute(const DeconvComputeParam &p, const T *filter,
const OpContext *context, const T *input_data, T *padded_out_data) override;
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
}; };
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
#endif // MACE_OPS_ARM_FP32_DECONV_2D_3X3_H_ #endif // MACE_OPS_ARM_BASE_DECONV_2D_3X3_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/arm/base/deconv_2d_4x4.h"
namespace mace {
namespace ops {
namespace arm {
void RegisterDeconv2dK4x4Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Deconv2dK4x4S1<float>, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
float, ImplType::NEON, K4x4S1));
MACE_REGISTER_DELEGATOR(
registry, Deconv2dK4x4S2<float>, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
float, ImplType::NEON, K4x4S2));
}
} // namespace arm
} // namespace ops
} // namespace mace
// Copyright 2019 The MACE Authors. All Rights Reserved. // Copyright 2020 The MACE Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -12,55 +12,47 @@ ...@@ -12,55 +12,47 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#ifndef MACE_OPS_ARM_FP32_DECONV_2D_4X4_H_ #ifndef MACE_OPS_ARM_BASE_DECONV_2D_4X4_H_
#define MACE_OPS_ARM_FP32_DECONV_2D_4X4_H_ #define MACE_OPS_ARM_BASE_DECONV_2D_4X4_H_
#include <vector>
#include <memory> #include <memory>
#include <vector>
#include "mace/core/ops/op_context.h" #include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/types.h" #include "mace/core/types.h"
#include "mace/ops/arm/fp32/deconv_2d.h" #include "mace/ops/arm/base/deconv_2d_mxn.h"
#include "mace/ops/common/conv_pool_2d_util.h" #include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
class Deconv2dK4x4S1 : public Deconv2dBase { template<typename T>
class Deconv2dK4x4S1 : public Deconv2dKMxN<T> {
public: public:
explicit Deconv2dK4x4S1(const delegator::Deconv2dParam &param) explicit Deconv2dK4x4S1(const delegator::Deconv2dParam &param)
: Deconv2dBase(param) {} : Deconv2dKMxN<T>(param) {}
virtual ~Deconv2dK4x4S1() {} virtual ~Deconv2dK4x4S1() {}
MaceStatus Compute( MaceStatus DoCompute(const DeconvComputeParam &p, const T *filter,
const OpContext *context, const T *input_data, T *padded_out_data) override;
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
}; };
class Deconv2dK4x4S2 : public Deconv2dBase { template<typename T>
class Deconv2dK4x4S2 : public Deconv2dKMxN<T> {
public: public:
explicit Deconv2dK4x4S2(const delegator::Deconv2dParam &param) explicit Deconv2dK4x4S2(const delegator::Deconv2dParam &param)
: Deconv2dBase(param) {} : Deconv2dKMxN<T>(param) {}
virtual ~Deconv2dK4x4S2() {} virtual ~Deconv2dK4x4S2() {}
MaceStatus Compute( MaceStatus DoCompute(const DeconvComputeParam &p, const T *filter,
const OpContext *context, const T *input_data, T *padded_out_data) override;
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
}; };
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
#endif // MACE_OPS_ARM_FP32_DECONV_2D_4X4_H_ #endif // MACE_OPS_ARM_BASE_DECONV_2D_4X4_H_
// Copyright 2019 The MACE Authors. All Rights Reserved. // Copyright 2020 The MACE Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -12,30 +12,17 @@ ...@@ -12,30 +12,17 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/arm/fp32/deconv_2d.h" #include "mace/ops/arm/base/deconv_2d_general.h"
// TODO(liutuo): optimize it #include <memory>
#include <vector>
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
class Deconv2dGeneral : public Deconv2dBase { template<typename T>
public: MaceStatus Deconv2dGeneral<T>::Compute(const OpContext *context,
explicit Deconv2dGeneral(const delegator::Deconv2dParam &param)
: Deconv2dBase(param) {}
virtual ~Deconv2dGeneral() {}
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
};
MaceStatus Deconv2dGeneral::Compute(const OpContext *context,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *output_shape, const Tensor *output_shape,
...@@ -60,9 +47,9 @@ MaceStatus Deconv2dGeneral::Compute(const OpContext *context, ...@@ -60,9 +47,9 @@ MaceStatus Deconv2dGeneral::Compute(const OpContext *context,
Tensor::MappingGuard filter_mapper(filter); Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output); Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>(); auto input_data = input->data<T>();
auto filter_data = filter->data<float>(); auto filter_data = filter->data<T>();
auto padded_out_data = out_tensor->mutable_data<float>(); auto padded_out_data = out_tensor->mutable_data<T>();
auto &in_shape = input->shape(); auto &in_shape = input->shape();
auto &out_shape = out_tensor->shape(); auto &out_shape = out_tensor->shape();
...@@ -95,7 +82,7 @@ MaceStatus Deconv2dGeneral::Compute(const OpContext *context, ...@@ -95,7 +82,7 @@ MaceStatus Deconv2dGeneral::Compute(const OpContext *context,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t oc = start1; oc < end1; oc += step1) { for (index_t oc = start1; oc < end1; oc += step1) {
float *out_base = T *out_base =
padded_out_data + (b * out_channels + oc) * out_img_size; padded_out_data + (b * out_channels + oc) * out_img_size;
for (index_t i = 0; i < in_height; ++i) { for (index_t i = 0; i < in_height; ++i) {
for (index_t j = 0; j < in_width; ++j) { for (index_t j = 0; j < in_width; ++j) {
...@@ -104,7 +91,7 @@ MaceStatus Deconv2dGeneral::Compute(const OpContext *context, ...@@ -104,7 +91,7 @@ MaceStatus Deconv2dGeneral::Compute(const OpContext *context,
for (int ic = 0; ic < in_channels; ++ic) { for (int ic = 0; ic < in_channels; ++ic) {
const index_t input_idx = const index_t input_idx =
(b * in_channels + ic) * in_img_size + i * in_width + j; (b * in_channels + ic) * in_img_size + i * in_width + j;
const float val = input_data[input_idx]; const T val = input_data[input_idx];
const index_t kernel_offset = const index_t kernel_offset =
(oc * in_channels + ic) * kernel_size; (oc * in_channels + ic) * kernel_size;
for (int k = 0; k < kernel_size; ++k) { for (int k = 0; k < kernel_size; ++k) {
...@@ -126,11 +113,10 @@ MaceStatus Deconv2dGeneral::Compute(const OpContext *context, ...@@ -126,11 +113,10 @@ MaceStatus Deconv2dGeneral::Compute(const OpContext *context,
void RegisterDeconv2dGeneralDelegator(OpDelegatorRegistry *registry) { void RegisterDeconv2dGeneralDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR( MACE_REGISTER_DELEGATOR(
registry, Deconv2dGeneral, delegator::Deconv2dParam, registry, Deconv2dGeneral<float>, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY(Deconv2d, DeviceType::CPU, float, ImplType::NEON)); MACE_DELEGATOR_KEY(Deconv2d, DeviceType::CPU, float, ImplType::NEON));
} }
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
......
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_BASE_DECONV_2D_GENERAL_H_
#define MACE_OPS_ARM_BASE_DECONV_2D_GENERAL_H_
#include "mace/ops/arm/base/deconv_2d.h"
// TODO(liutuo): optimize it
namespace mace {
namespace ops {
namespace arm {
template<typename T>
class Deconv2dGeneral : public Deconv2dBase {
public:
explicit Deconv2dGeneral(const delegator::Deconv2dParam &param)
: Deconv2dBase(param, sizeof(T)) {}
virtual ~Deconv2dGeneral() {}
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
};
} // namespace arm
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARM_BASE_DECONV_2D_GENERAL_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_BASE_DECONV_2D_MXN_H_
#define MACE_OPS_ARM_BASE_DECONV_2D_MXN_H_
#include <memory>
#include <vector>
#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/arm/base/deconv_2d.h"
#include "mace/public/mace.h"
namespace mace {
namespace ops {
namespace arm {
template<typename T>
class Deconv2dKMxN : public Deconv2dBase {
public:
explicit Deconv2dKMxN(const delegator::Deconv2dParam &param)
: Deconv2dBase(param, sizeof(T)) {}
virtual ~Deconv2dKMxN() {}
MaceStatus Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) {
std::unique_ptr<Tensor> padded_out;
std::vector<int> out_pad_size;
ResizeOutAndPadOut(context, input, filter, output_shape,
output, &out_pad_size, &padded_out);
Tensor *out_tensor = output;
if (padded_out != nullptr) {
out_tensor = padded_out.get();
}
out_tensor->Clear();
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output);
const T *input_data = input->data<T>();
const T *filter_data = filter->data<T>();
T *padded_out_data = out_tensor->mutable_data<T>();
const DeconvComputeParam p =
PreWorkAndGetDeconvParam(context, input, out_tensor);
DoCompute(p, filter_data, input_data, padded_out_data);
UnPadOutput(*out_tensor, out_pad_size, output);
return MaceStatus::MACE_SUCCESS;
}
virtual MaceStatus DoCompute(const DeconvComputeParam &p, const T *filter,
const T *input_data, T *padded_out_data) = 0;
};
} // namespace arm
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARM_BASE_DECONV_2D_MXN_H_
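Deconv2dKMxN turns the per-kernel classes into a template-method pattern: Compute performs the shared resize/pad, clear, mapping and UnPadOutput steps, and each kernel overrides only DoCompute with its arithmetic. A simplified scalar sketch of such an override, using a hypothetical 1x1 stride-1 kernel (it assumes the no-extra-padding case where the padded output matches the input spatial size; the real K2x2/K3x3/K4x4 kernels instead parallelize over batch and output channels with p.thread_pool and use NEON intrinsics):

template<typename T>
class Deconv2dK1x1S1 : public Deconv2dKMxN<T> {  // illustrative only, not in this patch
 public:
  explicit Deconv2dK1x1S1(const delegator::Deconv2dParam &param)
      : Deconv2dKMxN<T>(param) {}
  virtual ~Deconv2dK1x1S1() {}

  MaceStatus DoCompute(const DeconvComputeParam &p, const T *filter,
                       const T *input_data, T *padded_out_data) override {
    // Assumes p.out_height == p.in_height and p.out_width == p.in_width
    // (1x1 kernel, stride 1, no extra padding).
    const index_t in_img_size = p.in_height * p.in_width;
    for (index_t b = 0; b < p.batch; ++b) {
      for (index_t oc = 0; oc < p.out_channels; ++oc) {
        T *out_base =
            padded_out_data + (b * p.out_channels + oc) * p.out_img_size;
        for (index_t ic = 0; ic < p.in_channels; ++ic) {
          // Filter layout is [out_channels][in_channels][kh][kw]; the 1x1 case
          // collapses the spatial part, so this is a per-pixel channel mix
          // accumulated into the already-cleared padded output.
          const T w = filter[oc * p.in_channels + ic];
          const T *in_base =
              input_data + (b * p.in_channels + ic) * in_img_size;
          for (index_t i = 0; i < in_img_size; ++i) {
            out_base[i] += w * in_base[i];
          }
        }
      }
    }
    return MaceStatus::MACE_SUCCESS;
  }
};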
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/arm/base/depthwise_conv_2d_3x3.h"
namespace mace {
namespace ops {
namespace arm {
void RegisterDepthwiseConv2dK3x3Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, DepthwiseConv2dK3x3S1<float>, delegator::DepthwiseConv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S1));
MACE_REGISTER_DELEGATOR(
registry, DepthwiseConv2dK3x3S2<float>, delegator::DepthwiseConv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S2));
}
} // namespace arm
} // namespace ops
} // namespace mace
// Copyright 2019 The MACE Authors. All Rights Reserved. // Copyright 2020 The MACE Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -12,51 +12,47 @@ ...@@ -12,51 +12,47 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#ifndef MACE_OPS_ARM_FP32_DEPTHWISE_CONV_2D_3X3_H_ #ifndef MACE_OPS_ARM_BASE_DEPTHWISE_CONV_2D_3X3_H_
#define MACE_OPS_ARM_FP32_DEPTHWISE_CONV_2D_3X3_H_ #define MACE_OPS_ARM_BASE_DEPTHWISE_CONV_2D_3X3_H_
#include <vector> #include <vector>
#include "mace/core/ops/op_context.h" #include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/ops/arm/fp32/conv_2d.h" #include "mace/ops/arm/base/depthwise_conv_2d_mxn.h"
#include "mace/ops/delegator/depthwise_conv_2d.h" #include "mace/ops/delegator/depthwise_conv_2d.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
class DepthwiseConv2dK3x3S1 : public Conv2dBase { template<typename T>
class DepthwiseConv2dK3x3S1 : public DepthwiseConv2dKMxN<T> {
public: public:
explicit DepthwiseConv2dK3x3S1(const delegator::DepthwiseConv2dParam &param) explicit DepthwiseConv2dK3x3S1(const delegator::DepthwiseConv2dParam &param)
: Conv2dBase(param) {} : DepthwiseConv2dKMxN<T>(param) {}
virtual ~DepthwiseConv2dK3x3S1() {} virtual ~DepthwiseConv2dK3x3S1() {}
MaceStatus Compute( MaceStatus DoCompute(
const OpContext *context, const DepthwiseConvComputeParam &p, const T *filter,
const Tensor *input, const T *input_data, T *output_data) override;
const Tensor *filter,
Tensor *output) override;
}; };
class DepthwiseConv2dK3x3S2 : public Conv2dBase { template<typename T>
class DepthwiseConv2dK3x3S2 : public DepthwiseConv2dKMxN<T> {
public: public:
explicit DepthwiseConv2dK3x3S2(const delegator::DepthwiseConv2dParam &param) explicit DepthwiseConv2dK3x3S2(const delegator::DepthwiseConv2dParam &param)
: Conv2dBase(param) {} : DepthwiseConv2dKMxN<T>(param) {}
virtual ~DepthwiseConv2dK3x3S2() {} virtual ~DepthwiseConv2dK3x3S2() {}
MaceStatus Compute( MaceStatus DoCompute(
const OpContext *context, const DepthwiseConvComputeParam &p, const T *filter,
const Tensor *input, const T *input_data, T *output_data) override;
const Tensor *filter,
Tensor *output) override;
}; };
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
#endif // MACE_OPS_ARM_FP32_DEPTHWISE_CONV_2D_3X3_H_ #endif // MACE_OPS_ARM_BASE_DEPTHWISE_CONV_2D_3X3_H_
// Copyright 2019 The MACE Authors. All Rights Reserved. // Copyright 2020 The MACE Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -12,51 +12,53 @@ ...@@ -12,51 +12,53 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#ifndef MACE_OPS_ARM_FP32_DECONV_2D_H_ #ifndef MACE_OPS_ARM_BASE_DEPTHWISE_CONV_2D_MXN_H_
#define MACE_OPS_ARM_FP32_DECONV_2D_H_ #define MACE_OPS_ARM_BASE_DEPTHWISE_CONV_2D_MXN_H_
#include <vector> #include <vector>
#include <memory>
#include "mace/core/ops/op_context.h" #include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/types.h" #include "mace/ops/arm/base/conv_2d.h"
#include "mace/ops/arm/fp32/gemm.h" #include "mace/ops/delegator/depthwise_conv_2d.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/delegator/deconv_2d.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
class Deconv2dBase : public delegator::Deconv2d { template<typename T>
class DepthwiseConv2dKMxN : public Conv2dBase {
public: public:
explicit Deconv2dBase(const delegator::Deconv2dParam &param) explicit DepthwiseConv2dKMxN(const delegator::DepthwiseConv2dParam &param)
: delegator::Deconv2d(param), : Conv2dBase(param, sizeof(T)) {}
group_(param.group_) {} virtual ~DepthwiseConv2dKMxN() {}
virtual ~Deconv2dBase() = default; MaceStatus Compute(const OpContext *context, const Tensor *input,
const Tensor *filter, Tensor *output) {
DepthwiseConvComputeParam p =
PreWorkAndGetDepthwiseConv2DParam(context, input, filter, output);
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
const T *filter_data = filter->data<T>();
const T *input_data = input->data<T>();
T *output_data = output->mutable_data<T>();
DoCompute(p, filter_data, input_data, output_data);
return MaceStatus::MACE_SUCCESS;
}
protected: protected:
MaceStatus ResizeOutAndPadOut(const OpContext *context, virtual MaceStatus DoCompute(
const Tensor *input, const DepthwiseConvComputeParam &p, const T *filter,
const Tensor *filter, const T *input_data, T *output_data) = 0;
const Tensor *output_shape,
Tensor *output,
std::vector<int> *out_pad_size,
std::unique_ptr<Tensor> *padded_output);
void UnPadOutput(const Tensor &src,
const std::vector<int> &out_pad_size,
Tensor *dst);
index_t group_;
}; };
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
#endif // MACE_OPS_ARM_FP32_DECONV_2D_H_ #endif // MACE_OPS_ARM_BASE_DEPTHWISE_CONV_2D_MXN_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/arm/base/depthwise_deconv_2d_3x3.h"
namespace mace {
namespace ops {
namespace arm {
void RegisterDepthwiseDeconv2dK3x3Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, DepthwiseDeconv2dK3x3S1<float>,
delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S1));
MACE_REGISTER_DELEGATOR(
registry, DepthwiseDeconv2dK3x3S2<float>,
delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S2));
}
void RegisterGroupDeconv2dK3x3Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, GroupDeconv2dK3x3S1<float>, delegator::GroupDeconv2dParam,
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S1));
MACE_REGISTER_DELEGATOR(
registry, GroupDeconv2dK3x3S2<float>, delegator::GroupDeconv2dParam,
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S2));
}
} // namespace arm
} // namespace ops
} // namespace mace
// Copyright 2019 The MACE Authors. All Rights Reserved. // Copyright 2020 The MACE Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -12,8 +12,8 @@ ...@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#ifndef MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_3X3_H_ #ifndef MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_3X3_H_
#define MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_3X3_H_ #define MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_3X3_H_
#include <vector> #include <vector>
#include <memory> #include <memory>
...@@ -21,7 +21,7 @@ ...@@ -21,7 +21,7 @@
#include "mace/core/ops/op_context.h" #include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/types.h" #include "mace/core/types.h"
#include "mace/ops/arm/fp32/deconv_2d.h" #include "mace/ops/arm/base/depthwise_deconv_2d_mxn.h"
#include "mace/ops/common/conv_pool_2d_util.h" #include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/delegator/depthwise_deconv_2d.h" #include "mace/ops/delegator/depthwise_deconv_2d.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
...@@ -29,70 +29,56 @@ ...@@ -29,70 +29,56 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
class DepthwiseDeconv2dK3x3S1 : public Deconv2dBase { template<typename T>
class DepthwiseDeconv2dK3x3S1 : public DepthwiseDeconv2dKMxN<T> {
public: public:
explicit DepthwiseDeconv2dK3x3S1( explicit DepthwiseDeconv2dK3x3S1(
const delegator::DepthwiseDeconv2dParam &param) const delegator::DepthwiseDeconv2dParam &param)
: Deconv2dBase(param) {} : DepthwiseDeconv2dKMxN<T>(param) {}
virtual ~DepthwiseDeconv2dK3x3S1() {} virtual ~DepthwiseDeconv2dK3x3S1() {}
MaceStatus Compute( MaceStatus DoCompute(const DepthwiseDeconvComputeParam &p, const T *filter,
const OpContext *context, const T *input_data, T *padded_out_data) override;
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
}; };
class DepthwiseDeconv2dK3x3S2 : public Deconv2dBase { template<typename T>
class DepthwiseDeconv2dK3x3S2 : public DepthwiseDeconv2dKMxN<T> {
public: public:
explicit DepthwiseDeconv2dK3x3S2( explicit DepthwiseDeconv2dK3x3S2(
const delegator::DepthwiseDeconv2dParam &param) const delegator::DepthwiseDeconv2dParam &param)
: Deconv2dBase(param) {} : DepthwiseDeconv2dKMxN<T>(param) {}
virtual ~DepthwiseDeconv2dK3x3S2() {} virtual ~DepthwiseDeconv2dK3x3S2() {}
MaceStatus Compute( MaceStatus DoCompute(const DepthwiseDeconvComputeParam &p, const T *filter,
const OpContext *context, const T *input_data, T *padded_out_data) override;
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
}; };
class GroupDeconv2dK3x3S1 : public Deconv2dBase { template<typename T>
class GroupDeconv2dK3x3S1 : public GroupDeconv2dKMxN<T> {
public: public:
explicit GroupDeconv2dK3x3S1( explicit GroupDeconv2dK3x3S1(
const delegator::GroupDeconv2dParam &param) const delegator::GroupDeconv2dParam &param)
: Deconv2dBase(param) {} : GroupDeconv2dKMxN<T>(param) {}
virtual ~GroupDeconv2dK3x3S1() {} virtual ~GroupDeconv2dK3x3S1() {}
MaceStatus Compute( MaceStatus DoCompute(const GroupDeconvComputeParam &p, const T *filter,
const OpContext *context, const T *input_data, T *padded_out_data) override;
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
}; };
class GroupDeconv2dK3x3S2 : public Deconv2dBase { template<typename T>
class GroupDeconv2dK3x3S2 : public GroupDeconv2dKMxN<T> {
public: public:
explicit GroupDeconv2dK3x3S2(const delegator::GroupDeconv2dParam &param) explicit GroupDeconv2dK3x3S2(const delegator::GroupDeconv2dParam &param)
: Deconv2dBase(param) {} : GroupDeconv2dKMxN<T>(param) {}
virtual ~GroupDeconv2dK3x3S2() {} virtual ~GroupDeconv2dK3x3S2() {}
MaceStatus Compute( MaceStatus DoCompute(const GroupDeconvComputeParam &p, const T *filter,
const OpContext *context, const T *input_data, T *padded_out_data) override;
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
}; };
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
#endif // MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_3X3_H_ #endif // MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_3X3_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/arm/base/depthwise_deconv_2d_4x4.h"
namespace mace {
namespace ops {
namespace arm {
void RegisterDepthwiseDeconv2dK4x4Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, DepthwiseDeconv2dK4x4S1<float>,
delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K4x4S1));
MACE_REGISTER_DELEGATOR(
registry, DepthwiseDeconv2dK4x4S2<float>,
delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K4x4S2));
}
void RegisterGroupDeconv2dK4x4Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, GroupDeconv2dK4x4S1<float>, delegator::GroupDeconv2dParam,
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K4x4S1));
MACE_REGISTER_DELEGATOR(
registry, GroupDeconv2dK4x4S2<float>, delegator::GroupDeconv2dParam,
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K4x4S2));
}
} // namespace arm
} // namespace ops
} // namespace mace
// Copyright 2019 The MACE Authors. All Rights Reserved. // Copyright 2020 The MACE Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -12,8 +12,8 @@ ...@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#ifndef MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_4X4_H_ #ifndef MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_4X4_H_
#define MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_4X4_H_ #define MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_4X4_H_
#include <vector> #include <vector>
#include <memory> #include <memory>
...@@ -21,7 +21,7 @@ ...@@ -21,7 +21,7 @@
#include "mace/core/ops/op_context.h" #include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/types.h" #include "mace/core/types.h"
#include "mace/ops/arm/fp32/deconv_2d.h" #include "mace/ops/arm/base/depthwise_deconv_2d_mxn.h"
#include "mace/ops/common/conv_pool_2d_util.h" #include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/delegator/depthwise_deconv_2d.h" #include "mace/ops/delegator/depthwise_deconv_2d.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
...@@ -29,69 +29,55 @@ ...@@ -29,69 +29,55 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
class DepthwiseDeconv2dK4x4S1 : public Deconv2dBase { template<typename T>
class DepthwiseDeconv2dK4x4S1 : public DepthwiseDeconv2dKMxN<T> {
public: public:
explicit DepthwiseDeconv2dK4x4S1( explicit DepthwiseDeconv2dK4x4S1(
const delegator::DepthwiseDeconv2dParam &param) const delegator::DepthwiseDeconv2dParam &param)
: Deconv2dBase(param) {} : DepthwiseDeconv2dKMxN<T>(param) {}
virtual ~DepthwiseDeconv2dK4x4S1() {} virtual ~DepthwiseDeconv2dK4x4S1() {}
MaceStatus Compute( MaceStatus DoCompute(const DepthwiseDeconvComputeParam &p, const T *filter,
const OpContext *context, const T *input_data, T *padded_out_data) override;
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
}; };
class DepthwiseDeconv2dK4x4S2 : public Deconv2dBase { template<typename T>
class DepthwiseDeconv2dK4x4S2 : public DepthwiseDeconv2dKMxN<T> {
public: public:
explicit DepthwiseDeconv2dK4x4S2( explicit DepthwiseDeconv2dK4x4S2(
const delegator::DepthwiseDeconv2dParam &param) const delegator::DepthwiseDeconv2dParam &param)
: Deconv2dBase(param) {} : DepthwiseDeconv2dKMxN<T>(param) {}
virtual ~DepthwiseDeconv2dK4x4S2() {} virtual ~DepthwiseDeconv2dK4x4S2() {}
MaceStatus Compute( MaceStatus DoCompute(const DepthwiseDeconvComputeParam &p, const T *filter,
const OpContext *context, const T *input_data, T *padded_out_data) override;
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
}; };
class GroupDeconv2dK4x4S1 : public Deconv2dBase { template<typename T>
class GroupDeconv2dK4x4S1 : public GroupDeconv2dKMxN<T> {
public: public:
explicit GroupDeconv2dK4x4S1(const delegator::GroupDeconv2dParam &param) explicit GroupDeconv2dK4x4S1(const delegator::GroupDeconv2dParam &param)
: Deconv2dBase(param) {} : GroupDeconv2dKMxN<T>(param) {}
virtual ~GroupDeconv2dK4x4S1() {} virtual ~GroupDeconv2dK4x4S1() {}
MaceStatus Compute( MaceStatus DoCompute(const GroupDeconvComputeParam &p, const T *filter,
const OpContext *context, const T *input_data, T *padded_out_data) override;
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
}; };
class GroupDeconv2dK4x4S2 : public Deconv2dBase { template<typename T>
class GroupDeconv2dK4x4S2 : public GroupDeconv2dKMxN<T> {
public: public:
explicit GroupDeconv2dK4x4S2(const delegator::GroupDeconv2dParam &param) explicit GroupDeconv2dK4x4S2(const delegator::GroupDeconv2dParam &param)
: Deconv2dBase(param) {} : GroupDeconv2dKMxN<T>(param) {}
virtual ~GroupDeconv2dK4x4S2() {} virtual ~GroupDeconv2dK4x4S2() {}
MaceStatus Compute( MaceStatus DoCompute(const GroupDeconvComputeParam &p, const T *filter,
const OpContext *context, const T *input_data, T *padded_out_data) override;
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
}; };
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
#endif // MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_4X4_H_ #endif // MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_4X4_H_
// Copyright 2019 The MACE Authors. All Rights Reserved. // Copyright 2020 The MACE Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -12,14 +12,14 @@ ...@@ -12,14 +12,14 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/arm/fp32/depthwise_deconv_2d_general.h" #include "mace/ops/arm/base/depthwise_deconv_2d_general.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
MaceStatus DepthwiseDeconv2dGeneral::Compute(const OpContext *context, template<typename T>
MaceStatus DepthwiseDeconv2dGeneral<T>::Compute(const OpContext *context,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *output_shape, const Tensor *output_shape,
...@@ -46,9 +46,9 @@ MaceStatus DepthwiseDeconv2dGeneral::Compute(const OpContext *context, ...@@ -46,9 +46,9 @@ MaceStatus DepthwiseDeconv2dGeneral::Compute(const OpContext *context,
Tensor::MappingGuard filter_mapper(filter); Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output); Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>(); const T *input_data = input->data<T>();
auto filter_data = filter->data<float>(); const T *filter_data = filter->data<T>();
auto padded_out_data = out_tensor->mutable_data<float>(); T *padded_out_data = out_tensor->mutable_data<T>();
auto &in_shape = input->shape(); auto &in_shape = input->shape();
auto &out_shape = out_tensor->shape(); auto &out_shape = out_tensor->shape();
...@@ -79,7 +79,7 @@ MaceStatus DepthwiseDeconv2dGeneral::Compute(const OpContext *context, ...@@ -79,7 +79,7 @@ MaceStatus DepthwiseDeconv2dGeneral::Compute(const OpContext *context,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t c = start1; c < end1; c += step1) { for (index_t c = start1; c < end1; c += step1) {
float *out_base = T *out_base =
padded_out_data + (b * channels + c) * out_img_size; padded_out_data + (b * channels + c) * out_img_size;
for (index_t i = 0; i < in_height; ++i) { for (index_t i = 0; i < in_height; ++i) {
for (index_t j = 0; j < in_width; ++j) { for (index_t j = 0; j < in_width; ++j) {
...@@ -105,7 +105,8 @@ MaceStatus DepthwiseDeconv2dGeneral::Compute(const OpContext *context, ...@@ -105,7 +105,8 @@ MaceStatus DepthwiseDeconv2dGeneral::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MaceStatus GroupDeconv2dGeneral::Compute(const OpContext *context, template<typename T>
MaceStatus GroupDeconv2dGeneral<T>::Compute(const OpContext *context,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *output_shape, const Tensor *output_shape,
...@@ -131,9 +132,9 @@ MaceStatus GroupDeconv2dGeneral::Compute(const OpContext *context, ...@@ -131,9 +132,9 @@ MaceStatus GroupDeconv2dGeneral::Compute(const OpContext *context,
Tensor::MappingGuard filter_mapper(filter); Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output); Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>(); const T *input_data = input->data<T>();
auto filter_data = filter->data<float>(); const T *filter_data = filter->data<T>();
auto padded_out_data = out_tensor->mutable_data<float>(); T *padded_out_data = out_tensor->mutable_data<T>();
auto &in_shape = input->shape(); auto &in_shape = input->shape();
auto &out_shape = out_tensor->shape(); auto &out_shape = out_tensor->shape();
...@@ -209,19 +210,19 @@ MaceStatus GroupDeconv2dGeneral::Compute(const OpContext *context, ...@@ -209,19 +210,19 @@ MaceStatus GroupDeconv2dGeneral::Compute(const OpContext *context,
void RegisterDepthwiseDeconv2dGeneralDelegator(OpDelegatorRegistry *registry) { void RegisterDepthwiseDeconv2dGeneralDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR( MACE_REGISTER_DELEGATOR(
registry, DepthwiseDeconv2dGeneral, delegator::DepthwiseDeconv2dParam, registry, DepthwiseDeconv2dGeneral<float>,
delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY(DepthwiseDeconv2d, DeviceType::CPU, MACE_DELEGATOR_KEY(DepthwiseDeconv2d, DeviceType::CPU,
float, ImplType::NEON)); float, ImplType::NEON));
} }
void RegisterGroupDeconv2dGeneralDelegator(OpDelegatorRegistry *registry) { void RegisterGroupDeconv2dGeneralDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR( MACE_REGISTER_DELEGATOR(
registry, GroupDeconv2dGeneral, delegator::GroupDeconv2dParam, registry, GroupDeconv2dGeneral<float>, delegator::GroupDeconv2dParam,
MACE_DELEGATOR_KEY(GroupDeconv2d, DeviceType::CPU, MACE_DELEGATOR_KEY(GroupDeconv2d, DeviceType::CPU,
float, ImplType::NEON)); float, ImplType::NEON));
} }
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
// Copyright 2019 The MACE Authors. All Rights Reserved. // Copyright 2020 The MACE Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -12,8 +12,8 @@ ...@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#ifndef MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_GENERAL_H_ #ifndef MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_GENERAL_H_
#define MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_GENERAL_H_ #define MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_GENERAL_H_
#include <vector> #include <vector>
#include <memory> #include <memory>
...@@ -21,7 +21,7 @@ ...@@ -21,7 +21,7 @@
#include "mace/core/ops/op_context.h" #include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/types.h" #include "mace/core/types.h"
#include "mace/ops/arm/fp32/deconv_2d.h" #include "mace/ops/arm/base/deconv_2d.h"
#include "mace/ops/common/conv_pool_2d_util.h" #include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/delegator/depthwise_deconv_2d.h" #include "mace/ops/delegator/depthwise_deconv_2d.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
...@@ -29,13 +29,13 @@ ...@@ -29,13 +29,13 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
template<typename T>
class DepthwiseDeconv2dGeneral : public Deconv2dBase { class DepthwiseDeconv2dGeneral : public Deconv2dBase {
public: public:
explicit DepthwiseDeconv2dGeneral( explicit DepthwiseDeconv2dGeneral(
const delegator::DepthwiseDeconv2dParam &param) const delegator::DepthwiseDeconv2dParam &param)
: Deconv2dBase(param) {} : Deconv2dBase(param, sizeof(T)) {}
virtual ~DepthwiseDeconv2dGeneral() {} virtual ~DepthwiseDeconv2dGeneral() {}
MaceStatus Compute( MaceStatus Compute(
...@@ -46,10 +46,11 @@ class DepthwiseDeconv2dGeneral : public Deconv2dBase { ...@@ -46,10 +46,11 @@ class DepthwiseDeconv2dGeneral : public Deconv2dBase {
Tensor *output) override; Tensor *output) override;
}; };
template<typename T>
class GroupDeconv2dGeneral : public Deconv2dBase { class GroupDeconv2dGeneral : public Deconv2dBase {
public: public:
explicit GroupDeconv2dGeneral(const delegator::GroupDeconv2dParam &param) explicit GroupDeconv2dGeneral(const delegator::GroupDeconv2dParam &param)
: Deconv2dBase(param) {} : Deconv2dBase(param, sizeof(T)) {}
virtual ~GroupDeconv2dGeneral() {} virtual ~GroupDeconv2dGeneral() {}
MaceStatus Compute( MaceStatus Compute(
...@@ -60,9 +61,8 @@ class GroupDeconv2dGeneral : public Deconv2dBase { ...@@ -60,9 +61,8 @@ class GroupDeconv2dGeneral : public Deconv2dBase {
Tensor *output) override; Tensor *output) override;
}; };
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
#endif // MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_GENERAL_H_ #endif // MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_GENERAL_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_MXN_H_
#define MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_MXN_H_
#include <vector>
#include <memory>
#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
#include "mace/core/types.h"
#include "mace/ops/arm/base/deconv_2d.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/delegator/depthwise_deconv_2d.h"
#include "mace/public/mace.h"
namespace mace {
namespace ops {
namespace arm {
template<typename T>
class DepthwiseDeconv2dKMxN : public Deconv2dBase {
public:
explicit DepthwiseDeconv2dKMxN(
const delegator::DepthwiseDeconv2dParam &param)
: Deconv2dBase(param, sizeof(T)) {}
virtual ~DepthwiseDeconv2dKMxN() {}
MaceStatus Compute(
const OpContext *context, const Tensor *input, const Tensor *filter,
const Tensor *output_shape, Tensor *output) override {
std::unique_ptr<Tensor> padded_out;
std::vector<int> out_pad_size;
group_ = input->dim(1);
ResizeOutAndPadOut(context,
input,
filter,
output_shape,
output,
&out_pad_size,
&padded_out);
Tensor *out_tensor = output;
if (padded_out != nullptr) {
out_tensor = padded_out.get();
}
out_tensor->Clear();
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output);
const T *input_data = input->data<T>();
const T *filter_data = filter->data<T>();
T *padded_out_data = out_tensor->mutable_data<T>();
DepthwiseDeconvComputeParam p =
PreWorkAndGetDepthwiseDeconvParam(context, input, out_tensor);
DoCompute(p, filter_data, input_data, padded_out_data);
UnPadOutput(*out_tensor, out_pad_size, output);
return MaceStatus::MACE_SUCCESS;
}
virtual MaceStatus DoCompute(
const DepthwiseDeconvComputeParam &p, const T *filter,
const T *input_data, T *padded_out_data) = 0;
};
template<typename T>
class GroupDeconv2dKMxN : public Deconv2dBase {
public:
explicit GroupDeconv2dKMxN(
const delegator::GroupDeconv2dParam &param)
: Deconv2dBase(param, sizeof(T)) {}
virtual ~GroupDeconv2dKMxN() {}
MaceStatus Compute(
const OpContext *context, const Tensor *input, const Tensor *filter,
const Tensor *output_shape, Tensor *output) override {
std::unique_ptr<Tensor> padded_out;
std::vector<int> out_pad_size;
ResizeOutAndPadOut(context,
input,
filter,
output_shape,
output,
&out_pad_size,
&padded_out);
Tensor *out_tensor = output;
if (padded_out != nullptr) {
out_tensor = padded_out.get();
}
out_tensor->Clear();
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output);
const T *input_data = input->data<T>();
const T *filter_data = filter->data<T>();
T *padded_out_data = out_tensor->mutable_data<T>();
GroupDeconvComputeParam p =
PreWorkAndGetGroupDeconvParam(context, input, out_tensor);
DoCompute(p, filter_data, input_data, padded_out_data);
UnPadOutput(*out_tensor, out_pad_size, output);
return MaceStatus::MACE_SUCCESS;
}
virtual MaceStatus DoCompute(
const GroupDeconvComputeParam &p, const T *filter,
const T *input_data, T *padded_out_data) = 0;
};
} // namespace arm
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_MXN_H_
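The templated bases above own all of the output resizing, padding and un-padding; a concrete kernel only supplies the pure-virtual DoCompute. For illustration only (this class is not part of the hunk shown here, and the name is hypothetical), a stride-1 3x3 depthwise kernel built on this base would look roughly like:

template<typename T>
class DepthwiseDeconv2dK3x3S1 : public DepthwiseDeconv2dKMxN<T> {
 public:
  explicit DepthwiseDeconv2dK3x3S1(
      const delegator::DepthwiseDeconv2dParam &param)
      : DepthwiseDeconv2dKMxN<T>(param) {}

  // Only the NEON inner loop is kernel-specific; everything else is inherited.
  MaceStatus DoCompute(const DepthwiseDeconvComputeParam &p, const T *filter,
                       const T *input_data, T *padded_out_data) override;
};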
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/arm/base/gemm.h"
namespace mace {
namespace ops {
namespace arm {
void RegisterGemmDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Gemm<float>, delegator::GemmParam,
MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, float, ImplType::NEON));
}
} // namespace arm
} // namespace ops
} // namespace mace
// Copyright 2019 The MACE Authors. All Rights Reserved. // Copyright 2020 The MACE Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -12,8 +12,8 @@ ...@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#ifndef MACE_OPS_ARM_FP32_GEMM_H_ #ifndef MACE_OPS_ARM_BASE_GEMM_H_
#define MACE_OPS_ARM_FP32_GEMM_H_ #define MACE_OPS_ARM_BASE_GEMM_H_
#include "mace/core/ops/op_context.h" #include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
...@@ -28,8 +28,10 @@ ...@@ -28,8 +28,10 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
enum { kNoCache, kCacheLhs, kCacheRhs };
template<typename T>
class Gemm : public delegator::Gemm { class Gemm : public delegator::Gemm {
public: public:
explicit Gemm(const delegator::GemmParam &param) explicit Gemm(const delegator::GemmParam &param)
...@@ -68,26 +70,49 @@ class Gemm : public delegator::Gemm { ...@@ -68,26 +70,49 @@ class Gemm : public delegator::Gemm {
const bool transpose_out, const bool transpose_out,
const bool lhs_batched, const bool lhs_batched,
const bool rhs_batched, const bool rhs_batched,
Tensor *output) override; Tensor *output) override {
index_t rows = transpose_lhs ? lhs_cols : lhs_rows;
index_t depth = transpose_lhs ? lhs_rows : lhs_cols;
index_t cols = transpose_rhs ? rhs_rows : rhs_cols;
index_t depth2 = transpose_rhs ? rhs_cols : rhs_rows;
MACE_CHECK(depth == depth2,
"Matrices that multiply have inconsistent depth dim: ",
depth,
" vs. ",
depth2);
return Compute(context,
lhs,
rhs,
batch,
rows,
cols,
depth,
transpose_lhs ? ColMajor : RowMajor,
transpose_rhs ? ColMajor : RowMajor,
transpose_out ? ColMajor : RowMajor,
lhs_batched,
rhs_batched,
output);
}
private: protected:
void ComputeBlock(const float *packed_lhs_data, void ComputeBlock(const T *packed_lhs_data,
const float *packed_rhs_data, const T *packed_rhs_data,
const index_t depth_padded, const index_t depth_padded,
float *packed_output_data); T *packed_output_data);
void PackLhs(const MatrixMap<const float> &lhs, void PackLhs(const MatrixMap<const T> &lhs,
float *packed_lhs); T *packed_lhs);
void PackRhs(const MatrixMap<const float> &rhs, void PackRhs(const MatrixMap<const T> &rhs,
float *packed_rhs); T *packed_rhs);
void UnpackOutput(const float *packed_output,
MatrixMap<float> *output);
void UnpackOutput(const T *packed_output,
MatrixMap<T> *output);
template<int RowBlockSize, int ColBlockSize> template<int RowBlockSize, int ColBlockSize>
void Unpack(const float *packed_output, void Unpack(const T *packed_output,
MatrixMap<float> *output) { MatrixMap<T> *output) {
const index_t rows = output->rows(); const index_t rows = output->rows();
const index_t cols = output->cols(); const index_t cols = output->cols();
for (index_t r = 0; r < rows; ++r) { for (index_t r = 0; r < rows; ++r) {
...@@ -98,9 +123,9 @@ class Gemm : public delegator::Gemm { ...@@ -98,9 +123,9 @@ class Gemm : public delegator::Gemm {
} }
template<int WidthBlockSize, int DepthBlockSize> template<int WidthBlockSize, int DepthBlockSize>
void Pack(const MatrixMap<const float> &matrix, void Pack(const MatrixMap<const T> &matrix,
MatrixMajor dst_major, MatrixMajor dst_major,
float *packed_matrix) { T *packed_matrix) {
const index_t rows = matrix.rows(); const index_t rows = matrix.rows();
const index_t cols = matrix.cols(); const index_t cols = matrix.cols();
index_t depth = cols; index_t depth = cols;
...@@ -109,7 +134,7 @@ class Gemm : public delegator::Gemm { ...@@ -109,7 +134,7 @@ class Gemm : public delegator::Gemm {
depth = rows; depth = rows;
} }
const index_t depth_padded = RoundUp(depth, static_cast<index_t>(4)); const index_t depth_padded = RoundUp(depth, static_cast<index_t>(4));
memset(packed_matrix, 0, sizeof(float) * WidthBlockSize * depth_padded); memset(packed_matrix, 0, sizeof(T) * WidthBlockSize * depth_padded);
if (dst_major == ColMajor) { if (dst_major == ColMajor) {
for (index_t c = 0; c < cols; ++c) { for (index_t c = 0; c < cols; ++c) {
for (index_t r = 0; r < rows; ++r) { for (index_t r = 0; r < rows; ++r) {
...@@ -125,31 +150,14 @@ class Gemm : public delegator::Gemm { ...@@ -125,31 +150,14 @@ class Gemm : public delegator::Gemm {
} }
} }
private:
Buffer pack_cache_; Buffer pack_cache_;
bool should_cache_pack_; bool should_cache_pack_;
int cached_; int cached_;
}; };
template<>
void Gemm::Pack<4, 4>(const MatrixMap<const float> &matrix,
MatrixMajor dst_major,
float *packed_matrix);
template<>
void Gemm::Pack<8, 4>(const MatrixMap<const float> &matrix,
MatrixMajor dst_major,
float *packed_matrix);
template<>
void Gemm::Unpack<4, 8>(const float *packed_output, MatrixMap<float> *output);
template<>
void Gemm::Unpack<8, 8>(const float *packed_output, MatrixMap<float> *output);
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
#endif // MACE_OPS_ARM_FP32_GEMM_H_ #endif // MACE_OPS_ARM_BASE_GEMM_H_
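As a quick sanity check of the dimension mapping done by the new Compute overload above (illustrative only; the numbers are made up):

#include <cassert>
#include <cstdint>

void CheckGemmDims() {
  // LHS stored transposed as 128x64, RHS stored as-is as 128x256.
  const int64_t lhs_rows = 128, lhs_cols = 64;
  const int64_t rhs_rows = 128, rhs_cols = 256;
  const bool transpose_lhs = true, transpose_rhs = false;
  const int64_t rows = transpose_lhs ? lhs_cols : lhs_rows;    // 64
  const int64_t depth = transpose_lhs ? lhs_rows : lhs_cols;   // 128
  const int64_t cols = transpose_rhs ? rhs_rows : rhs_cols;    // 256
  const int64_t depth2 = transpose_rhs ? rhs_cols : rhs_rows;  // 128
  assert(depth == depth2);  // mirrors the MACE_CHECK in the wrapper
  (void)rows; (void)cols;
}

With these flags the call is forwarded with ColMajor for the LHS and RowMajor for the RHS and output.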
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/arm/base/gemv.h"
namespace mace {
namespace ops {
namespace arm {
void RegisterGemvDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Gemv<float>, DelegatorParam,
MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, float, ImplType::NEON));
}
} // namespace arm
} // namespace ops
} // namespace mace
// Copyright 2019 The MACE Authors. All Rights Reserved. // Copyright 2020 The MACE Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -12,8 +12,8 @@ ...@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#ifndef MACE_OPS_ARM_FP32_GEMV_H_ #ifndef MACE_OPS_ARM_BASE_GEMV_H_
#define MACE_OPS_ARM_FP32_GEMV_H_ #define MACE_OPS_ARM_BASE_GEMV_H_
#include "mace/core/ops/op_context.h" #include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
...@@ -23,8 +23,8 @@ ...@@ -23,8 +23,8 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
template<typename T>
class Gemv : public delegator::Gemv { class Gemv : public delegator::Gemv {
public: public:
explicit Gemv(const DelegatorParam &param) : delegator::Gemv(param) {} explicit Gemv(const DelegatorParam &param) : delegator::Gemv(param) {}
...@@ -43,9 +43,8 @@ class Gemv : public delegator::Gemv { ...@@ -43,9 +43,8 @@ class Gemv : public delegator::Gemv {
Tensor *output) override; Tensor *output) override;
}; };
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
#endif // MACE_OPS_ARM_FP32_GEMV_H_ #endif // MACE_OPS_ARM_BASE_GEMV_H_
...@@ -12,60 +12,24 @@ ...@@ -12,60 +12,24 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/delegator/activation.h"
#include <arm_neon.h> #include <arm_neon.h>
#include <algorithm> #include <algorithm>
#include "mace/ops/arm/base/activation.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
class Activation : public delegator::Activation {
public:
explicit Activation(const delegator::ActivationParam &param)
: delegator::Activation(param) {}
~Activation() = default;
MaceStatus Compute(const OpContext *context,
const Tensor *input, Tensor *output) override;
private:
void DoActivation(const OpContext *context,
const Tensor *input, Tensor *output);
};
MaceStatus Activation::Compute(const OpContext *context,
const Tensor *input, Tensor *output) {
Tensor::MappingGuard input_guard(input);
if (input != output) {
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
Tensor::MappingGuard output_guard(output);
DoActivation(context, input, output);
} else {
DoActivation(context, input, output);
}
return MaceStatus::MACE_SUCCESS;
}
void Activation::DoActivation(const OpContext *context,
const Tensor *input,
Tensor *output) {
auto input_data = input->data<float>();
auto output_data = output->mutable_data<float>();
const index_t size = input->size();
utils::ThreadPool &thread_pool =
context->device()->cpu_runtime()->thread_pool();
switch (type_) { template<>
case RELU: { void Activation<float>::ActivateRelu(utils::ThreadPool *thread_pool,
const float *input_data,
const index_t input_size,
float *output_data) {
const float32x4_t vzero = vdupq_n_f32(0.f); const float32x4_t vzero = vdupq_n_f32(0.f);
const index_t block_count = size / 4; const index_t block_count = input_size / 4;
thread_pool.Compute1D( thread_pool->Compute1D(
[=](index_t start, index_t end, index_t step) { [=](index_t start, index_t end, index_t step) {
auto input_ptr = input_data + start * 4; auto input_ptr = input_data + start * 4;
auto output_ptr = output_data + start * 4; auto output_ptr = output_data + start * 4;
...@@ -82,19 +46,21 @@ void Activation::DoActivation(const OpContext *context, ...@@ -82,19 +46,21 @@ void Activation::DoActivation(const OpContext *context,
0, block_count, 1); 0, block_count, 1);
// remain // remain
for (index_t i = block_count * 4; i < size; ++i) { for (index_t i = block_count * 4; i < input_size; ++i) {
output_data[i] = std::max(0.f, input_data[i]); output_data[i] = std::max(0.f, input_data[i]);
} }
}
break; template<>
} void Activation<float>::ActivateRelux(utils::ThreadPool *thread_pool,
const float *input_data,
case RELUX: { const index_t input_size,
float *output_data) {
const float32x4_t vzero = vdupq_n_f32(0.f); const float32x4_t vzero = vdupq_n_f32(0.f);
const float32x4_t vlimit = vdupq_n_f32(limit_); const float32x4_t vlimit = vdupq_n_f32(limit_);
const index_t block_count = size / 4; const index_t block_count = input_size / 4;
thread_pool.Compute1D( thread_pool->Compute1D(
[=](index_t start, index_t end, index_t step) { [=](index_t start, index_t end, index_t step) {
auto input_ptr = input_data + start * 4; auto input_ptr = input_data + start * 4;
auto output_ptr = output_data + start * 4; auto output_ptr = output_data + start * 4;
...@@ -112,19 +78,21 @@ void Activation::DoActivation(const OpContext *context, ...@@ -112,19 +78,21 @@ void Activation::DoActivation(const OpContext *context,
0, block_count, 1); 0, block_count, 1);
// remain // remain
for (index_t i = block_count * 4; i < size; ++i) { for (index_t i = block_count * 4; i < input_size; ++i) {
output_data[i] = std::max(0.f, std::min(limit_, input_data[i])); output_data[i] = std::max(0.f, std::min(limit_, input_data[i]));
} }
}
break; template<>
} void Activation<float>::ActivateLeakyRelu(utils::ThreadPool *thread_pool,
const float *input_data,
case LEAKYRELU: { const index_t input_size,
float *output_data) {
const float32x4_t vzero = vdupq_n_f32(0.f); const float32x4_t vzero = vdupq_n_f32(0.f);
const float32x4_t valpha = vdupq_n_f32(leakyrelu_coefficient_); const float32x4_t valpha = vdupq_n_f32(leakyrelu_coefficient_);
const index_t block_count = size / 4; const index_t block_count = input_size / 4;
thread_pool.Compute1D( thread_pool->Compute1D(
[=](index_t start, index_t end, index_t step) { [=](index_t start, index_t end, index_t step) {
auto input_ptr = input_data + start * 4; auto input_ptr = input_data + start * 4;
auto output_ptr = output_data + start * 4; auto output_ptr = output_data + start * 4;
...@@ -143,55 +111,40 @@ void Activation::DoActivation(const OpContext *context, ...@@ -143,55 +111,40 @@ void Activation::DoActivation(const OpContext *context,
0, block_count, 1); 0, block_count, 1);
// remain // remain
for (index_t i = block_count * 4; i < size; ++i) { for (index_t i = block_count * 4; i < input_size; ++i) {
output_data[i] = std::max(input_data[i], 0.f) + output_data[i] = std::max(input_data[i], 0.f) +
std::min(input_data[i], 0.f) * leakyrelu_coefficient_; std::min(input_data[i], 0.f) * leakyrelu_coefficient_;
} }
}
break; template<>
} void Activation<float>::ActivateTanh(utils::ThreadPool *thread_pool,
const float *input_data,
case TANH: { const index_t input_size,
thread_pool.Compute1D( float *output_data) {
thread_pool->Compute1D(
[=](index_t start, index_t end, index_t step) { [=](index_t start, index_t end, index_t step) {
for (index_t i = start; i < end; i += step) { for (index_t i = start; i < end; i += step) {
output_data[i] = std::tanh(input_data[i]); output_data[i] = std::tanh(input_data[i]);
} }
}, },
0, size, 1); 0, input_size, 1);
}
break;
}
case SIGMOID: { template<>
thread_pool.Compute1D( void Activation<float>::ActivateSigmoid(utils::ThreadPool *thread_pool,
const float *input_data,
const index_t input_size,
float *output_data) {
thread_pool->Compute1D(
[=](index_t start, index_t end, index_t step) { [=](index_t start, index_t end, index_t step) {
for (index_t i = start; i < end; i += step) { for (index_t i = start; i < end; i += step) {
output_data[i] = 1 / (1 + std::exp(-(input_data[i]))); output_data[i] = 1 / (1 + std::exp(-(input_data[i])));
} }
}, },
0, size, 1); 0, input_size, 1);
break;
}
case NOOP: {
break;
}
default: {
MACE_NOT_IMPLEMENTED;
}
}
}
void RegisterActivationDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Activation, delegator::ActivationParam,
MACE_DELEGATOR_KEY(Activation, DeviceType::CPU, float, ImplType::NEON));
} }
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
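The scalar tail of the leaky-relu specialization above computes max(x, 0) + min(x, 0) * alpha, which is the usual piecewise definition written branch-free; a reference comparison (illustrative only, not part of the patch):

#include <algorithm>

inline float LeakyReluRef(float x, float alpha) {
  return x > 0.f ? x : alpha * x;  // piecewise definition
}

inline float LeakyReluBranchFree(float x, float alpha) {
  return std::max(x, 0.f) + std::min(x, 0.f) * alpha;  // same value, no branch
}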
...@@ -13,69 +13,21 @@ ...@@ -13,69 +13,21 @@
// limitations under the License. // limitations under the License.
#include <arm_neon.h> #include <arm_neon.h>
#include "mace/ops/delegator/bias_add.h"
#include "mace/ops/arm/base/bias_add.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
class BiasAdd : public delegator::BiasAdd {
public:
explicit BiasAdd(const DelegatorParam &param) : delegator::BiasAdd(param) {}
~BiasAdd() = default;
MaceStatus Compute(const OpContext *context, const Tensor *input,
const Tensor *bias, Tensor *output) override;
private:
void AddBias(const OpContext *context, const Tensor *input,
const Tensor *bias, Tensor *output);
};
MaceStatus BiasAdd::Compute(const OpContext *context,
const Tensor *input,
const Tensor *bias,
Tensor *output) {
Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard bias_guard(bias);
if (input != output) {
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
if (bias == nullptr) {
output->Copy(*input);
} else {
Tensor::MappingGuard output_guard(output);
AddBias(context, input, bias, output);
}
} else {
if (bias != nullptr) {
AddBias(context, input, bias, output);
}
}
return MaceStatus::MACE_SUCCESS;
}
void BiasAdd::AddBias(const OpContext *context, template<>
const Tensor *input, void BiasAdd<float>::Add1DimBias(
const Tensor *bias, utils::ThreadPool *thread_pool, const float *input_data,
mace::Tensor *output) { const float *bias_data, float *output_data, const index_t batch,
auto input_data = input->data<float>(); const index_t channels, const index_t image_size) {
auto bias_data = bias->data<float>();
auto output_data = output->mutable_data<float>();
const index_t batch = input->dim(0);
const index_t channels = input->dim(1);
const index_t height = output->dim(2);
const index_t width = output->dim(3);
const index_t image_size = height * width;
const index_t block_count = image_size / 4; const index_t block_count = image_size / 4;
const index_t remain = image_size % 4; const index_t remain = image_size % 4;
thread_pool->Compute2D([=](index_t start0, index_t end0, index_t step0,
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
if (bias->dim_size() == 1) {
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
const index_t b_offset = b * channels; const index_t b_offset = b * channels;
...@@ -100,8 +52,16 @@ void BiasAdd::AddBias(const OpContext *context, ...@@ -100,8 +52,16 @@ void BiasAdd::AddBias(const OpContext *context,
} }
} }
}, 0, batch, 1, 0, channels, 1); }, 0, batch, 1, 0, channels, 1);
} else { }
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
template<>
void BiasAdd<float>::Add2DimsBias(
utils::ThreadPool *thread_pool, const float *input_data,
const float *bias_data, float *output_data, const index_t batch,
const index_t channels, const index_t image_size) {
const index_t block_count = image_size / 4;
const index_t remain = image_size % 4;
thread_pool->Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
const index_t b_offset = b * channels; const index_t b_offset = b * channels;
...@@ -126,16 +86,8 @@ void BiasAdd::AddBias(const OpContext *context, ...@@ -126,16 +86,8 @@ void BiasAdd::AddBias(const OpContext *context,
} }
} }
}, 0, batch, 1, 0, channels, 1); }, 0, batch, 1, 0, channels, 1);
}
}
void RegisterBiasAddDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, BiasAdd, DelegatorParam,
MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, float, ImplType::NEON));
} }
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
...@@ -21,7 +21,6 @@ ...@@ -21,7 +21,6 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
inline float32x4_t neon_vfma_lane_0(float32x4_t a, inline float32x4_t neon_vfma_lane_0(float32x4_t a,
float32x4_t b, float32x4_t b,
...@@ -63,7 +62,6 @@ inline float32x4_t neon_vfma_lane_3(float32x4_t a, ...@@ -63,7 +62,6 @@ inline float32x4_t neon_vfma_lane_3(float32x4_t a,
#endif #endif
} }
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
......
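Judging from their names, signatures and the compile-time branch closed by the #endif above, the neon_vfma_lane_N helpers accumulate b multiplied by lane N of c into a. A scalar reference of the lane-0 variant, for illustration only (the real helpers are expected to map to vfmaq_laneq_f32 / vmlaq_lane_f32):

// result[i] = a[i] + b[i] * c[0] for each of the four lanes.
inline void NeonVfmaLane0Ref(const float a[4], const float b[4],
                             const float c[4], float result[4]) {
  for (int i = 0; i < 4; ++i) {
    result[i] = a[i] + b[i] * c[0];
  }
}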
// Copyright 2019 The MACE Authors. All Rights Reserved. // Copyright 2020 The MACE Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -12,93 +12,44 @@ ...@@ -12,93 +12,44 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/arm/fp32/conv_2d_1xn.h"
#include <arm_neon.h> #include <arm_neon.h>
#include <memory> #include <memory>
#include "mace/ops/arm/base/conv_2d_1xn.h"
#include "mace/ops/delegator/conv_2d.h" #include "mace/ops/delegator/conv_2d.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
template<>
MaceStatus Conv2dK1x7S1::Compute(const OpContext *context, MaceStatus Conv2dK1x7S1<float>::DoCompute(
const Tensor *input, const ConvComputeParam &p, const float *filter_data,
const Tensor *filter, const float *input_data, float *output_data) {
Tensor *output) { p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
std::unique_ptr<const Tensor> padded_input;
std::unique_ptr<Tensor> padded_output;
ResizeOutAndPadInOut(context,
input,
filter,
output,
1,
4,
&padded_input,
&padded_output);
const Tensor *in_tensor = input;
if (padded_input != nullptr) {
in_tensor = padded_input.get();
}
Tensor *out_tensor = output;
if (padded_output != nullptr) {
out_tensor = padded_output.get();
}
out_tensor->Clear();
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
auto filter_data = filter->data<float>();
auto input_data = in_tensor->data<float>();
auto output_data = out_tensor->mutable_data<float>();
auto &in_shape = in_tensor->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t in_channels = in_shape[1];
const index_t in_height = in_shape[2];
const index_t in_width = in_shape[3];
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t m = start1; m < end1; m += step1) { for (index_t m = start1; m < end1; m += step1) {
if (m + 3 < out_channels) { if (m + 3 < p.out_channels) {
float *out_ptr0_base = float *out_ptr0_base =
output_data + b * out_batch_size + m * out_image_size; output_data + b * p.out_batch_size + m * p.out_image_size;
float *out_ptr1_base = float *out_ptr1_base =
output_data + b * out_batch_size + (m + 1) * out_image_size; output_data + b * p.out_batch_size + (m + 1) * p.out_image_size;
float *out_ptr2_base = float *out_ptr2_base =
output_data + b * out_batch_size + (m + 2) * out_image_size; output_data + b * p.out_batch_size + (m + 2) * p.out_image_size;
float *out_ptr3_base = float *out_ptr3_base =
output_data + b * out_batch_size + (m + 3) * out_image_size; output_data + b * p.out_batch_size + (m + 3) * p.out_image_size;
for (index_t c = 0; c < in_channels; ++c) { for (index_t c = 0; c < p.in_channels; ++c) {
const float *in_ptr_base = const float *in_ptr_base =
input_data + b * in_batch_size + c * in_image_size; input_data + b * p.in_batch_size + c * p.in_image_size;
const float const float *filter_ptr0 =
*filter_ptr0 = filter_data + m * in_channels * 7 + c * 7; filter_data + m * p.in_channels * 7 + c * 7;
const float const float *filter_ptr1 =
*filter_ptr1 = filter_data + (m + 1) * in_channels * 7 + c * 7; filter_data + (m + 1) * p.in_channels * 7 + c * 7;
const float const float *filter_ptr2 =
*filter_ptr2 = filter_data + (m + 2) * in_channels * 7 + c * 7; filter_data + (m + 2) * p.in_channels * 7 + c * 7;
const float const float *filter_ptr3 =
*filter_ptr3 = filter_data + (m + 3) * in_channels * 7 + c * 7; filter_data + (m + 3) * p.in_channels * 7 + c * 7;
/* load filter (4 outch x 1 height x 4 width) */ /* load filter (4 outch x 1 height x 4 width) */
float32x4_t vf00, vf01; float32x4_t vf00, vf01;
float32x4_t vf10, vf11; float32x4_t vf10, vf11;
...@@ -113,12 +64,12 @@ MaceStatus Conv2dK1x7S1::Compute(const OpContext *context, ...@@ -113,12 +64,12 @@ MaceStatus Conv2dK1x7S1::Compute(const OpContext *context,
vf30 = vld1q_f32(filter_ptr3); vf30 = vld1q_f32(filter_ptr3);
vf31 = vld1q_f32(filter_ptr3 + 3); vf31 = vld1q_f32(filter_ptr3 + 3);
for (index_t h = 0; h < out_height; ++h) { for (index_t h = 0; h < p.out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) { for (index_t w = 0; w + 3 < p.out_width; w += 4) {
// output (4 outch x 1 height x 4 width): vo_outch_height // output (4 outch x 1 height x 4 width): vo_outch_height
float32x4_t vo0, vo1, vo2, vo3; float32x4_t vo0, vo1, vo2, vo3;
// load output // load output
index_t out_offset = h * out_width + w; index_t out_offset = h * p.out_width + w;
vo0 = vld1q_f32(out_ptr0_base + out_offset); vo0 = vld1q_f32(out_ptr0_base + out_offset);
vo1 = vld1q_f32(out_ptr1_base + out_offset); vo1 = vld1q_f32(out_ptr1_base + out_offset);
vo2 = vld1q_f32(out_ptr2_base + out_offset); vo2 = vld1q_f32(out_ptr2_base + out_offset);
...@@ -127,7 +78,7 @@ MaceStatus Conv2dK1x7S1::Compute(const OpContext *context, ...@@ -127,7 +78,7 @@ MaceStatus Conv2dK1x7S1::Compute(const OpContext *context,
// input (3 slide) // input (3 slide)
float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi8; float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi8;
// input offset // input offset
index_t in_offset = h * in_width + w; index_t in_offset = h * p.in_width + w;
// load input // load input
vi0 = vld1q_f32(in_ptr_base + in_offset); vi0 = vld1q_f32(in_ptr_base + in_offset);
vi4 = vld1q_f32(in_ptr_base + in_offset + 4); vi4 = vld1q_f32(in_ptr_base + in_offset + 4);
...@@ -214,31 +165,31 @@ MaceStatus Conv2dK1x7S1::Compute(const OpContext *context, ...@@ -214,31 +165,31 @@ MaceStatus Conv2dK1x7S1::Compute(const OpContext *context,
} // h } // h
} // c } // c
} else { } else {
for (index_t mm = m; mm < out_channels; ++mm) { for (index_t mm = m; mm < p.out_channels; ++mm) {
float *out_ptr0_base = float *out_ptr0_base =
output_data + b * out_batch_size + mm * out_image_size; output_data + b * p.out_batch_size + mm * p.out_image_size;
for (index_t c = 0; c < in_channels; ++c) { for (index_t c = 0; c < p.in_channels; ++c) {
const float *in_ptr_base = const float *in_ptr_base =
input_data + b * in_batch_size + c * in_image_size; input_data + b * p.in_batch_size + c * p.in_image_size;
const float const float *filter_ptr0 =
*filter_ptr0 = filter_data + mm * in_channels * 7 + c * 7; filter_data + mm * p.in_channels * 7 + c * 7;
/* load filter (1 outch x 1 height x 4 width) */ /* load filter (1 outch x 1 height x 4 width) */
float32x4_t vf00, vf01; float32x4_t vf00, vf01;
vf00 = vld1q_f32(filter_ptr0); vf00 = vld1q_f32(filter_ptr0);
vf01 = vld1q_f32(filter_ptr0 + 3); vf01 = vld1q_f32(filter_ptr0 + 3);
for (index_t h = 0; h < out_height; ++h) { for (index_t h = 0; h < p.out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) { for (index_t w = 0; w + 3 < p.out_width; w += 4) {
// output (1 outch x 1 height x 4 width): vo_outch_height // output (1 outch x 1 height x 4 width): vo_outch_height
float32x4_t vo0; float32x4_t vo0;
// load output // load output
index_t out_offset = h * out_width + w; index_t out_offset = h * p.out_width + w;
vo0 = vld1q_f32(out_ptr0_base + out_offset); vo0 = vld1q_f32(out_ptr0_base + out_offset);
// input (3 slide) // input (3 slide)
float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi8; float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi8;
// input offset // input offset
index_t in_offset = h * in_width + w; index_t in_offset = h * p.in_width + w;
// load input // load input
vi0 = vld1q_f32(in_ptr_base + in_offset); vi0 = vld1q_f32(in_ptr_base + in_offset);
vi4 = vld1q_f32(in_ptr_base + in_offset + 4); vi4 = vld1q_f32(in_ptr_base + in_offset + 4);
...@@ -275,87 +226,39 @@ MaceStatus Conv2dK1x7S1::Compute(const OpContext *context, ...@@ -275,87 +226,39 @@ MaceStatus Conv2dK1x7S1::Compute(const OpContext *context,
} // if } // if
} // m } // m
} // b } // b
}, 0, batch, 1, 0, out_channels, 4); }, 0, p.batch, 1, 0, p.out_channels, 4);
UnPadOutput(*out_tensor, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MaceStatus Conv2dK7x1S1::Compute(const OpContext *context, template<>
const Tensor *input, MaceStatus Conv2dK7x1S1<float>::DoCompute(
const Tensor *filter, const ConvComputeParam &p, const float *filter_data,
Tensor *output) { const float *input_data, float *output_data) {
std::unique_ptr<const Tensor> padded_input; p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
std::unique_ptr<Tensor> padded_output;
ResizeOutAndPadInOut(context,
input,
filter,
output,
4,
1,
&padded_input,
&padded_output);
const Tensor *in_tensor = input;
if (padded_input != nullptr) {
in_tensor = padded_input.get();
}
Tensor *out_tensor = output;
if (padded_output != nullptr) {
out_tensor = padded_output.get();
}
out_tensor->Clear();
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
auto filter_data = filter->data<float>();
auto input_data = in_tensor->data<float>();
auto output_data = out_tensor->mutable_data<float>();
auto &in_shape = in_tensor->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t in_channels = in_shape[1];
const index_t in_height = in_shape[2];
const index_t in_width = in_shape[3];
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t m = start1; m < end1; m += step1) { for (index_t m = start1; m < end1; m += step1) {
if (m + 3 < out_channels) { if (m + 3 < p.out_channels) {
float *out_ptr0_base = float *out_ptr0_base =
output_data + b * out_batch_size + m * out_image_size; output_data + b * p.out_batch_size + m * p.out_image_size;
float *out_ptr1_base = float *out_ptr1_base =
output_data + b * out_batch_size + (m + 1) * out_image_size; output_data + b * p.out_batch_size + (m + 1) * p.out_image_size;
float *out_ptr2_base = float *out_ptr2_base =
output_data + b * out_batch_size + (m + 2) * out_image_size; output_data + b * p.out_batch_size + (m + 2) * p.out_image_size;
float *out_ptr3_base = float *out_ptr3_base =
output_data + b * out_batch_size + (m + 3) * out_image_size; output_data + b * p.out_batch_size + (m + 3) * p.out_image_size;
for (index_t c = 0; c < in_channels; ++c) { for (index_t c = 0; c < p.in_channels; ++c) {
const float *in_ptr_base = const float *in_ptr_base =
input_data + b * in_batch_size + c * in_image_size; input_data + b * p.in_batch_size + c * p.in_image_size;
const float const float *filter_ptr0 =
*filter_ptr0 = filter_data + m * in_channels * 7 + c * 7; filter_data + m * p.in_channels * 7 + c * 7;
const float const float *filter_ptr1 =
*filter_ptr1 = filter_data + (m + 1) * in_channels * 7 + c * 7; filter_data + (m + 1) * p.in_channels * 7 + c * 7;
const float const float *filter_ptr2 =
*filter_ptr2 = filter_data + (m + 2) * in_channels * 7 + c * 7; filter_data + (m + 2) * p.in_channels * 7 + c * 7;
const float const float *filter_ptr3 =
*filter_ptr3 = filter_data + (m + 3) * in_channels * 7 + c * 7; filter_data + (m + 3) * p.in_channels * 7 + c * 7;
/* load filter (4 outch x 4 height x 1 width) */ /* load filter (4 outch x 4 height x 1 width) */
float32x4_t vf00, vf01; float32x4_t vf00, vf01;
float32x4_t vf10, vf11; float32x4_t vf10, vf11;
...@@ -370,41 +273,41 @@ MaceStatus Conv2dK7x1S1::Compute(const OpContext *context, ...@@ -370,41 +273,41 @@ MaceStatus Conv2dK7x1S1::Compute(const OpContext *context,
vf30 = vld1q_f32(filter_ptr3); vf30 = vld1q_f32(filter_ptr3);
vf31 = vld1q_f32(filter_ptr3 + 3); vf31 = vld1q_f32(filter_ptr3 + 3);
for (index_t h = 0; h + 3 < out_height; h += 4) { for (index_t h = 0; h + 3 < p.out_height; h += 4) {
for (index_t w = 0; w < out_width; ++w) { for (index_t w = 0; w < p.out_width; ++w) {
// load output // load output
index_t out_offset = h * out_width + w; index_t out_offset = h * p.out_width + w;
// output (4 outch x 4 height x 1 width): vo_outch_height // output (4 outch x 4 height x 1 width): vo_outch_height
float32x4_t vo0 = {out_ptr0_base[out_offset], float32x4_t vo0 = {out_ptr0_base[out_offset],
out_ptr0_base[out_offset + out_width], out_ptr0_base[out_offset + p.out_width],
out_ptr0_base[out_offset + 2 * out_width], out_ptr0_base[out_offset + 2 * p.out_width],
out_ptr0_base[out_offset + 3 * out_width]}; out_ptr0_base[out_offset + 3 * p.out_width]};
float32x4_t vo1 = {out_ptr1_base[out_offset], float32x4_t vo1 = {out_ptr1_base[out_offset],
out_ptr1_base[out_offset + out_width], out_ptr1_base[out_offset + p.out_width],
out_ptr1_base[out_offset + 2 * out_width], out_ptr1_base[out_offset + 2 * p.out_width],
out_ptr1_base[out_offset + 3 * out_width]}; out_ptr1_base[out_offset + 3 * p.out_width]};
float32x4_t vo2 = {out_ptr2_base[out_offset], float32x4_t vo2 = {out_ptr2_base[out_offset],
out_ptr2_base[out_offset + out_width], out_ptr2_base[out_offset + p.out_width],
out_ptr2_base[out_offset + 2 * out_width], out_ptr2_base[out_offset + 2 * p.out_width],
out_ptr2_base[out_offset + 3 * out_width]}; out_ptr2_base[out_offset + 3 * p.out_width]};
float32x4_t vo3 = {out_ptr3_base[out_offset], float32x4_t vo3 = {out_ptr3_base[out_offset],
out_ptr3_base[out_offset + out_width], out_ptr3_base[out_offset + p.out_width],
out_ptr3_base[out_offset + 2 * out_width], out_ptr3_base[out_offset + 2 * p.out_width],
out_ptr3_base[out_offset + 3 * out_width]}; out_ptr3_base[out_offset + 3 * p.out_width]};
// input offset // input offset
index_t in_offset = h * in_width + w; index_t in_offset = h * p.in_width + w;
// input (3 slide) // input (3 slide)
float32x4_t vi0 = {in_ptr_base[in_offset], float32x4_t vi0 = {in_ptr_base[in_offset],
in_ptr_base[in_offset + in_width], in_ptr_base[in_offset + p.in_width],
in_ptr_base[in_offset + 2 * in_width], in_ptr_base[in_offset + 2 * p.in_width],
in_ptr_base[in_offset + 3 * in_width]}; in_ptr_base[in_offset + 3 * p.in_width]};
float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width], float32x4_t vi4 = {in_ptr_base[in_offset + 4 * p.in_width],
in_ptr_base[in_offset + 5 * in_width], in_ptr_base[in_offset + 5 * p.in_width],
in_ptr_base[in_offset + 6 * in_width], in_ptr_base[in_offset + 6 * p.in_width],
in_ptr_base[in_offset + 7 * in_width]}; in_ptr_base[in_offset + 7 * p.in_width]};
float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width], float32x4_t vi8 = {in_ptr_base[in_offset + 8 * p.in_width],
in_ptr_base[in_offset + 9 * in_width]}; in_ptr_base[in_offset + 9 * p.in_width]};
float32x4_t vi1 = vextq_f32(vi0, vi4, 1); float32x4_t vi1 = vextq_f32(vi0, vi4, 1);
float32x4_t vi2 = vextq_f32(vi0, vi4, 2); float32x4_t vi2 = vextq_f32(vi0, vi4, 2);
float32x4_t vi3 = vextq_f32(vi0, vi4, 3); float32x4_t vi3 = vextq_f32(vi0, vi4, 3);
...@@ -480,63 +383,65 @@ MaceStatus Conv2dK7x1S1::Compute(const OpContext *context, ...@@ -480,63 +383,65 @@ MaceStatus Conv2dK7x1S1::Compute(const OpContext *context,
#endif #endif
out_ptr0_base[out_offset] = vo0[0]; out_ptr0_base[out_offset] = vo0[0];
out_ptr0_base[out_offset + out_width] = vo0[1]; out_ptr0_base[out_offset + p.out_width] = vo0[1];
out_ptr0_base[out_offset + 2 * out_width] = vo0[2]; out_ptr0_base[out_offset + 2 * p.out_width] = vo0[2];
out_ptr0_base[out_offset + 3 * out_width] = vo0[3]; out_ptr0_base[out_offset + 3 * p.out_width] = vo0[3];
out_ptr1_base[out_offset] = vo1[0]; out_ptr1_base[out_offset] = vo1[0];
out_ptr1_base[out_offset + out_width] = vo1[1]; out_ptr1_base[out_offset + p.out_width] = vo1[1];
out_ptr1_base[out_offset + 2 * out_width] = vo1[2]; out_ptr1_base[out_offset + 2 * p.out_width] = vo1[2];
out_ptr1_base[out_offset + 3 * out_width] = vo1[3]; out_ptr1_base[out_offset + 3 * p.out_width] = vo1[3];
out_ptr2_base[out_offset] = vo2[0]; out_ptr2_base[out_offset] = vo2[0];
out_ptr2_base[out_offset + out_width] = vo2[1]; out_ptr2_base[out_offset + p.out_width] = vo2[1];
out_ptr2_base[out_offset + 2 * out_width] = vo2[2]; out_ptr2_base[out_offset + 2 * p.out_width] = vo2[2];
out_ptr2_base[out_offset + 3 * out_width] = vo2[3]; out_ptr2_base[out_offset + 3 * p.out_width] = vo2[3];
out_ptr3_base[out_offset] = vo3[0]; out_ptr3_base[out_offset] = vo3[0];
out_ptr3_base[out_offset + out_width] = vo3[1]; out_ptr3_base[out_offset + p.out_width] = vo3[1];
out_ptr3_base[out_offset + 2 * out_width] = vo3[2]; out_ptr3_base[out_offset + 2 * p.out_width] = vo3[2];
out_ptr3_base[out_offset + 3 * out_width] = vo3[3]; out_ptr3_base[out_offset + 3 * p.out_width] = vo3[3];
} // w } // w
} // h } // h
} // c } // c
} else { } else {
for (index_t mm = m; mm < out_channels; ++mm) { for (index_t mm = m; mm < p.out_channels; ++mm) {
float *out_ptr0_base = float *out_ptr0_base =
output_data + b * out_batch_size + mm * out_image_size; output_data + b * p.out_batch_size + mm * p.out_image_size;
for (index_t c = 0; c < in_channels; ++c) { for (index_t c = 0; c < p.in_channels; ++c) {
const float *in_ptr_base = const float *in_ptr_base =
input_data + b * in_batch_size + c * in_image_size; input_data + b * p.in_batch_size + c * p.in_image_size;
const float const float *filter_ptr0 =
*filter_ptr0 = filter_data + mm * in_channels * 7 + c * 7; filter_data + mm * p.in_channels * 7 + c * 7;
/* load filter (1 outch x 4 height x 1 width) */ /* load filter (1 outch x 4 height x 1 width) */
float32x4_t vf00, vf01; float32x4_t vf00, vf01;
vf00 = vld1q_f32(filter_ptr0); vf00 = vld1q_f32(filter_ptr0);
vf01 = vld1q_f32(filter_ptr0 + 3); vf01 = vld1q_f32(filter_ptr0 + 3);
for (index_t h = 0; h + 3 < out_height; h += 4) { for (index_t h = 0; h + 3 < p.out_height; h += 4) {
for (index_t w = 0; w < out_width; ++w) { for (index_t w = 0; w < p.out_width; ++w) {
// load output // load output
index_t out_offset = h * out_width + w; index_t out_offset = h * p.out_width + w;
// output (1 outch x 4 height x 1 width): vo_outch_height // output (1 outch x 4 height x 1 width): vo_outch_height
float32x4_t vo0 = {out_ptr0_base[out_offset], float32x4_t vo0 = {out_ptr0_base[out_offset],
out_ptr0_base[out_offset + out_width], out_ptr0_base[out_offset + p.out_width],
out_ptr0_base[out_offset + 2 * out_width], out_ptr0_base[out_offset
out_ptr0_base[out_offset + 3 * out_width]}; + 2 * p.out_width],
out_ptr0_base[out_offset
+ 3 * p.out_width]};
// input offset // input offset
index_t in_offset = h * in_width + w; index_t in_offset = h * p.in_width + w;
// input (3 slide) // input (3 slide)
float32x4_t vi0 = {in_ptr_base[in_offset], float32x4_t vi0 = {in_ptr_base[in_offset],
in_ptr_base[in_offset + in_width], in_ptr_base[in_offset + p.in_width],
in_ptr_base[in_offset + 2 * in_width], in_ptr_base[in_offset + 2 * p.in_width],
in_ptr_base[in_offset + 3 * in_width]}; in_ptr_base[in_offset + 3 * p.in_width]};
float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width], float32x4_t vi4 = {in_ptr_base[in_offset + 4 * p.in_width],
in_ptr_base[in_offset + 5 * in_width], in_ptr_base[in_offset + 5 * p.in_width],
in_ptr_base[in_offset + 6 * in_width], in_ptr_base[in_offset + 6 * p.in_width],
in_ptr_base[in_offset + 7 * in_width]}; in_ptr_base[in_offset + 7 * p.in_width]};
float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width], float32x4_t vi8 = {in_ptr_base[in_offset + 8 * p.in_width],
in_ptr_base[in_offset + 9 * in_width], in_ptr_base[in_offset + 9 * p.in_width],
in_ptr_base[in_offset + 10 * in_width], in_ptr_base[in_offset + 10 * p.in_width],
in_ptr_base[in_offset + 11 * in_width]}; in_ptr_base[in_offset + 11 * p.in_width]};
float32x4_t vi1 = vextq_f32(vi0, vi4, 1); float32x4_t vi1 = vextq_f32(vi0, vi4, 1);
float32x4_t vi2 = vextq_f32(vi0, vi4, 2); float32x4_t vi2 = vextq_f32(vi0, vi4, 2);
float32x4_t vi3 = vextq_f32(vi0, vi4, 3); float32x4_t vi3 = vextq_f32(vi0, vi4, 3);
...@@ -562,9 +467,9 @@ MaceStatus Conv2dK7x1S1::Compute(const OpContext *context, ...@@ -562,9 +467,9 @@ MaceStatus Conv2dK7x1S1::Compute(const OpContext *context,
#endif #endif
out_ptr0_base[out_offset] = vo0[0]; out_ptr0_base[out_offset] = vo0[0];
out_ptr0_base[out_offset + out_width] = vo0[1]; out_ptr0_base[out_offset + p.out_width] = vo0[1];
out_ptr0_base[out_offset + 2 * out_width] = vo0[2]; out_ptr0_base[out_offset + 2 * p.out_width] = vo0[2];
out_ptr0_base[out_offset + 3 * out_width] = vo0[3]; out_ptr0_base[out_offset + 3 * p.out_width] = vo0[3];
} // w } // w
} // h } // h
} // c } // c
...@@ -572,78 +477,30 @@ MaceStatus Conv2dK7x1S1::Compute(const OpContext *context, ...@@ -572,78 +477,30 @@ MaceStatus Conv2dK7x1S1::Compute(const OpContext *context,
} // if } // if
} // m } // m
} // b } // b
}, 0, batch, 1, 0, out_channels, 4); }, 0, p.batch, 1, 0, p.out_channels, 4);
UnPadOutput(*out_tensor, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MaceStatus Conv2dK1x15S1::Compute(const OpContext *context, template<>
const Tensor *input, MaceStatus Conv2dK1x15S1<float>::DoCompute(
const Tensor *filter, const ConvComputeParam &p, const float *filter_data,
Tensor *output) { const float *input_data, float *output_data) {
std::unique_ptr<const Tensor> padded_input;
std::unique_ptr<Tensor> padded_output;
ResizeOutAndPadInOut(context,
input,
filter,
output,
1,
4,
&padded_input,
&padded_output);
const Tensor *in_tensor = input;
if (padded_input.get() != nullptr) {
in_tensor = padded_input.get();
}
Tensor *out_tensor = output;
if (padded_output.get() != nullptr) {
out_tensor = padded_output.get();
}
out_tensor->Clear();
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
auto filter_data = filter->data<float>();
auto input_data = in_tensor->data<float>();
auto output_data = out_tensor->mutable_data<float>();
auto &in_shape = in_tensor->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t in_channels = in_shape[1];
const index_t in_height = in_shape[2];
const index_t in_width = in_shape[3];
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
const index_t tile_height = const index_t tile_height =
out_channels < 4 ? RoundUpDiv4(out_height) : out_height; p.out_channels < 4 ? RoundUpDiv4(p.out_height) : p.out_height;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t m = start1; m < end1; m += step1) { for (index_t m = start1; m < end1; m += step1) {
for (index_t h = 0; h < out_height; h += tile_height) { for (index_t h = 0; h < p.out_height; h += tile_height) {
float *out_ptr_base = float *out_ptr_base =
output_data + b * out_batch_size + m * out_image_size; output_data + b * p.out_batch_size + m * p.out_image_size;
for (index_t c = 0; c < in_channels; ++c) { for (index_t c = 0; c < p.in_channels; ++c) {
const float *in_ptr_base = const float *in_ptr_base =
input_data + b * in_batch_size + c * in_image_size; input_data + b * p.in_batch_size + c * p.in_image_size;
const float const float *filter_ptr =
*filter_ptr = filter_data + m * in_channels * 15 + c * 15; filter_data + m * p.in_channels * 15 + c * 15;
/* load filter (1 outch x 4 height x 1 width) */ /* load filter (1 outch x 4 height x 1 width) */
float32x4_t vf0, vf1, vf2, vf3; float32x4_t vf0, vf1, vf2, vf3;
vf0 = vld1q_f32(filter_ptr); vf0 = vld1q_f32(filter_ptr);
...@@ -651,20 +508,20 @@ MaceStatus Conv2dK1x15S1::Compute(const OpContext *context, ...@@ -651,20 +508,20 @@ MaceStatus Conv2dK1x15S1::Compute(const OpContext *context,
vf2 = vld1q_f32(filter_ptr + 8); vf2 = vld1q_f32(filter_ptr + 8);
vf3 = vld1q_f32(filter_ptr + 11); vf3 = vld1q_f32(filter_ptr + 11);
for (index_t ht = 0; ht < tile_height && h + ht < out_height; for (index_t ht = 0; ht < tile_height && h + ht < p.out_height;
++ht) { ++ht) {
for (index_t w = 0; w + 3 < out_width; w += 4) { for (index_t w = 0; w + 3 < p.out_width; w += 4) {
// output (1 outch x 1 height x 4 width): vo_outch_height // output (1 outch x 1 height x 4 width): vo_outch_height
float32x4_t vo; float32x4_t vo;
// load output // load output
index_t out_offset = (h + ht) * out_width + w; index_t out_offset = (h + ht) * p.out_width + w;
vo = vld1q_f32(out_ptr_base + out_offset); vo = vld1q_f32(out_ptr_base + out_offset);
// input (3 slide) // input (3 slide)
float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi7, vi8, vi9, float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi7, vi8, vi9,
vi10, vi11, vi12, vi13, vi14, vi16; vi10, vi11, vi12, vi13, vi14, vi16;
// input offset // input offset
index_t in_offset = (h + ht) * in_width + w; index_t in_offset = (h + ht) * p.in_width + w;
// load input // load input
vi0 = vld1q_f32(in_ptr_base + in_offset); vi0 = vld1q_f32(in_ptr_base + in_offset);
vi4 = vld1q_f32(in_ptr_base + in_offset + 4); vi4 = vld1q_f32(in_ptr_base + in_offset + 4);
...@@ -706,78 +563,30 @@ MaceStatus Conv2dK1x15S1::Compute(const OpContext *context, ...@@ -706,78 +563,30 @@ MaceStatus Conv2dK1x15S1::Compute(const OpContext *context,
} // h } // h
} // m } // m
} // b } // b
}, 0, batch, 1, 0, out_channels, 1); }, 0, p.batch, 1, 0, p.out_channels, 1);
UnPadOutput(*out_tensor, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MaceStatus Conv2dK15x1S1::Compute(const OpContext *context, template<>
const Tensor *input, MaceStatus Conv2dK15x1S1<float>::DoCompute(
const Tensor *filter, const ConvComputeParam &p, const float *filter_data,
Tensor *output) { const float *input_data, float *output_data) {
std::unique_ptr<const Tensor> padded_input;
std::unique_ptr<Tensor> padded_output;
ResizeOutAndPadInOut(context,
input,
filter,
output,
4,
1,
&padded_input,
&padded_output);
const Tensor *in_tensor = input;
if (padded_input.get() != nullptr) {
in_tensor = padded_input.get();
}
Tensor *out_tensor = output;
if (padded_output.get() != nullptr) {
out_tensor = padded_output.get();
}
out_tensor->Clear();
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
auto filter_data = filter->data<float>();
auto input_data = in_tensor->data<float>();
auto output_data = out_tensor->mutable_data<float>();
auto &in_shape = in_tensor->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t in_channels = in_shape[1];
const index_t in_height = in_shape[2];
const index_t in_width = in_shape[3];
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
const index_t tile_width = const index_t tile_width =
out_channels < 4 ? RoundUpDiv4(out_width) : out_width; p.out_channels < 4 ? RoundUpDiv4(p.out_width) : p.out_width;
p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t m = start1; m < end1; m += step1) { for (index_t m = start1; m < end1; m += step1) {
for (index_t w = 0; w < out_width; w += tile_width) { for (index_t w = 0; w < p.out_width; w += tile_width) {
float *out_ptr_base = float *out_ptr_base =
output_data + b * out_batch_size + m * out_image_size; output_data + b * p.out_batch_size + m * p.out_image_size;
for (index_t c = 0; c < in_channels; ++c) { for (index_t c = 0; c < p.in_channels; ++c) {
const float *in_ptr_base = const float *in_ptr_base =
input_data + b * in_batch_size + c * in_image_size; input_data + b * p.in_batch_size + c * p.in_image_size;
const float const float *filter_ptr =
*filter_ptr = filter_data + m * in_channels * 15 + c * 15; filter_data + m * p.in_channels * 15 + c * 15;
/* load filter (1 outch x 4 height x 1 width) */ /* load filter (1 outch x 4 height x 1 width) */
float32x4_t vf0, vf1, vf2, vf3; float32x4_t vf0, vf1, vf2, vf3;
vf0 = vld1q_f32(filter_ptr); vf0 = vld1q_f32(filter_ptr);
...@@ -785,38 +594,38 @@ MaceStatus Conv2dK15x1S1::Compute(const OpContext *context, ...@@ -785,38 +594,38 @@ MaceStatus Conv2dK15x1S1::Compute(const OpContext *context,
vf2 = vld1q_f32(filter_ptr + 8); vf2 = vld1q_f32(filter_ptr + 8);
vf3 = vld1q_f32(filter_ptr + 11); vf3 = vld1q_f32(filter_ptr + 11);
for (index_t h = 0; h + 3 < out_height; h += 4) { for (index_t h = 0; h + 3 < p.out_height; h += 4) {
for (index_t wt = 0; wt < tile_width && w + wt < out_width; for (index_t wt = 0; wt < tile_width && w + wt < p.out_width;
++wt) { ++wt) {
// load output // load output
index_t out_offset = h * out_width + w + wt; index_t out_offset = h * p.out_width + w + wt;
// output (1 outch x 4 height x 1 width): vo_outch_height // output (1 outch x 4 height x 1 width): vo_outch_height
float32x4_t vo = {out_ptr_base[out_offset], float32x4_t vo = {out_ptr_base[out_offset],
out_ptr_base[out_offset + out_width], out_ptr_base[out_offset + p.out_width],
out_ptr_base[out_offset + 2 * out_width], out_ptr_base[out_offset + 2 * p.out_width],
out_ptr_base[out_offset + 3 * out_width]}; out_ptr_base[out_offset + 3 * p.out_width]};
// input offset // input offset
index_t in_offset = h * in_width + w + wt; index_t in_offset = h * p.in_width + w + wt;
// input (3 slide) // input (3 slide)
float32x4_t vi0 = {in_ptr_base[in_offset], float32x4_t vi0 = {in_ptr_base[in_offset],
in_ptr_base[in_offset + in_width], in_ptr_base[in_offset + p.in_width],
in_ptr_base[in_offset + 2 * in_width], in_ptr_base[in_offset + 2 * p.in_width],
in_ptr_base[in_offset + 3 * in_width]}; in_ptr_base[in_offset + 3 * p.in_width]};
float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width], float32x4_t vi4 = {in_ptr_base[in_offset + 4 * p.in_width],
in_ptr_base[in_offset + 5 * in_width], in_ptr_base[in_offset + 5 * p.in_width],
in_ptr_base[in_offset + 6 * in_width], in_ptr_base[in_offset + 6 * p.in_width],
in_ptr_base[in_offset + 7 * in_width]}; in_ptr_base[in_offset + 7 * p.in_width]};
float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width], float32x4_t vi8 = {in_ptr_base[in_offset + 8 * p.in_width],
in_ptr_base[in_offset + 9 * in_width], in_ptr_base[in_offset + 9 * p.in_width],
in_ptr_base[in_offset + 10 * in_width], in_ptr_base[in_offset + 10 * p.in_width],
in_ptr_base[in_offset + 11 * in_width]}; in_ptr_base[in_offset + 11 * p.in_width]};
float32x4_t vi12 = {in_ptr_base[in_offset + 12 * in_width], float32x4_t vi12 = {in_ptr_base[in_offset + 12 * p.in_width],
in_ptr_base[in_offset + 13 * in_width], in_ptr_base[in_offset + 13 * p.in_width],
in_ptr_base[in_offset + 14 * in_width], in_ptr_base[in_offset + 14 * p.in_width],
in_ptr_base[in_offset + 15 * in_width]}; in_ptr_base[in_offset + 15 * p.in_width]};
float32x4_t vi16 = {in_ptr_base[in_offset + 16 * in_width], float32x4_t vi16 = {in_ptr_base[in_offset + 16 * p.in_width],
in_ptr_base[in_offset + 17 * in_width]}; in_ptr_base[in_offset + 17 * p.in_width]};
float32x4_t vi1 = vextq_f32(vi0, vi4, 1); float32x4_t vi1 = vextq_f32(vi0, vi4, 1);
float32x4_t vi2 = vextq_f32(vi0, vi4, 2); float32x4_t vi2 = vextq_f32(vi0, vi4, 2);
float32x4_t vi3 = vextq_f32(vi0, vi4, 3); float32x4_t vi3 = vextq_f32(vi0, vi4, 3);
...@@ -846,44 +655,20 @@ MaceStatus Conv2dK15x1S1::Compute(const OpContext *context, ...@@ -846,44 +655,20 @@ MaceStatus Conv2dK15x1S1::Compute(const OpContext *context,
vo = vmlaq_lane_f32(vo, vi14, vget_high_f32(vf3), 1); vo = vmlaq_lane_f32(vo, vi14, vget_high_f32(vf3), 1);
out_ptr_base[out_offset] = vo[0]; out_ptr_base[out_offset] = vo[0];
out_ptr_base[out_offset + out_width] = vo[1]; out_ptr_base[out_offset + p.out_width] = vo[1];
out_ptr_base[out_offset + 2 * out_width] = vo[2]; out_ptr_base[out_offset + 2 * p.out_width] = vo[2];
out_ptr_base[out_offset + 3 * out_width] = vo[3]; out_ptr_base[out_offset + 3 * p.out_width] = vo[3];
} // wt } // wt
} // h } // h
} // c } // c
} // w } // w
} // m } // m
} // b } // b
}, 0, batch, 1, 0, out_channels, 1); }, 0, p.batch, 1, 0, p.out_channels, 1);
UnPadOutput(*out_tensor, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
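Note on the sliding-window loads above: vi1 = vextq_f32(vi0, vi4, 1) and its siblings build the shifted windows the 15-tap kernel needs without extra loads, since vextq_f32(a, b, n) concatenates the last 4-n lanes of a with the first n lanes of b. A minimal standalone sketch of that trick, assuming an ARM build (with a scalar fallback so it also compiles elsewhere); it is an illustration only, not MACE code:

#include <cstdio>
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#endif

int main() {
  float row[8] = {0, 1, 2, 3, 4, 5, 6, 7};
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
  float32x4_t vi0 = vld1q_f32(row);          // {0,1,2,3}
  float32x4_t vi4 = vld1q_f32(row + 4);      // {4,5,6,7}
  float32x4_t vi1 = vextq_f32(vi0, vi4, 1);  // window shifted by one: {1,2,3,4}
  float out[4];
  vst1q_f32(out, vi1);
  std::printf("%.0f %.0f %.0f %.0f\n", out[0], out[1], out[2], out[3]);
#else
  // Scalar equivalent of the shifted window, for non-ARM builds.
  std::printf("%.0f %.0f %.0f %.0f\n", row[1], row[2], row[3], row[4]);
#endif
  return 0;
}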
void RegisterConv2dK1xNDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Conv2dK1x7S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K1x7S1));
MACE_REGISTER_DELEGATOR(
registry, Conv2dK7x1S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K7x1S1));
MACE_REGISTER_DELEGATOR(
registry, Conv2dK1x15S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K1x15S1));
MACE_REGISTER_DELEGATOR(
registry, Conv2dK15x1S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K15x1S1));
}
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
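The change repeated across these kernel files follows one shape: the fp32-specific Compute(context, input, filter, output) bodies, together with their per-file padding, tensor mapping and Register* functions, give way to template<> ... DoCompute(const ConvComputeParam &p, ...) specializations driven from shared code. A simplified, self-contained sketch of that shape; apart from the DoCompute/ConvComputeParam names taken from the diff, every name here is a hypothetical stand-in, not MACE's actual classes:

#include <cstdio>
#include <vector>

struct ConvComputeParam {  // hypothetical stand-in for the real parameter bundle
  int out_width;
};

// Type-agnostic entry point: in the real delegator the shared setup
// (padding, mapping, clearing the output) would live here, done once.
template <typename T>
class Conv2dK3x3S1 {
 public:
  void Compute(const ConvComputeParam &p, const std::vector<T> &filter,
               const std::vector<T> &input, std::vector<T> *output) {
    DoCompute(p, filter.data(), input.data(), output->data());
  }

 private:
  void DoCompute(const ConvComputeParam &p, const T *filter, const T *input,
                 T *output);
};

// Per-type inner loop, mirroring the "template<> ... DoCompute" bodies this
// commit introduces (NEON intrinsics replaced by plain C++ for portability).
template <>
void Conv2dK3x3S1<float>::DoCompute(const ConvComputeParam &p,
                                    const float *filter, const float *input,
                                    float *output) {
  for (int w = 0; w < p.out_width; ++w) output[w] = filter[0] * input[w];
}

int main() {
  ConvComputeParam p{4};
  std::vector<float> filter{2.0f}, input{1.0f, 2.0f, 3.0f, 4.0f}, output(4, 0.0f);
  Conv2dK3x3S1<float> conv;
  conv.Compute(p, filter, input, &output);
  std::printf("%.1f %.1f %.1f %.1f\n", output[0], output[1], output[2], output[3]);
  return 0;
}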
// Copyright 2019 The MACE Authors. All Rights Reserved. // Copyright 2020 The MACE Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -12,95 +12,47 @@ ...@@ -12,95 +12,47 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/arm/fp32/conv_2d_3x3.h"
#include <arm_neon.h> #include <arm_neon.h>
#include <memory> #include <memory>
#include "mace/ops/arm/base/conv_2d_3x3.h"
#include "mace/ops/delegator/conv_2d.h" #include "mace/ops/delegator/conv_2d.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
template<>
MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, MaceStatus Conv2dK3x3S1<float>::DoCompute(
const Tensor *input, const ConvComputeParam &p, const float *filter_data,
const Tensor *filter, const float *input_data, float *output_data) {
Tensor *output) { p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
std::unique_ptr<const Tensor> padded_input;
std::unique_ptr<Tensor> padded_output;
ResizeOutAndPadInOut(context,
input,
filter,
output,
2,
4,
&padded_input,
&padded_output);
const Tensor *in_tensor = input;
if (padded_input != nullptr) {
in_tensor = padded_input.get();
}
Tensor *out_tensor = output;
if (padded_output != nullptr) {
out_tensor = padded_output.get();
}
out_tensor->Clear();
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
auto filter_data = filter->data<float>();
auto input_data = in_tensor->data<float>();
auto output_data = out_tensor->mutable_data<float>();
auto &in_shape = in_tensor->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t in_channels = in_shape[1];
const index_t in_height = in_shape[2];
const index_t in_width = in_shape[3];
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t m = start1; m < end1; m += step1) { for (index_t m = start1; m < end1; m += step1) {
if (m + 1 < out_channels) { if (m + 1 < p.out_channels) {
float *out_ptr0_base = float *out_ptr0_base =
output_data + b * out_batch_size + m * out_image_size; output_data + b * p.out_batch_size + m * p.out_image_size;
float *out_ptr1_base = float *out_ptr1_base =
output_data + b * out_batch_size + (m + 1) * out_image_size; output_data + b * p.out_batch_size + (m + 1) * p.out_image_size;
for (index_t c = 0; c < in_channels; ++c) { for (index_t c = 0; c < p.in_channels; ++c) {
const float const float *in_ptr0 =
*in_ptr0 = input_data + b * in_batch_size + c * in_image_size; input_data + b * p.in_batch_size + c * p.in_image_size;
const float const float
*filter_ptr0 = filter_data + m * in_channels * 9 + c * 9; *filter_ptr0 = filter_data + m * p.in_channels * 9 + c * 9;
float *out_ptr1 = out_ptr1_base; float *out_ptr1 = out_ptr1_base;
const float *in_ptr1 = const float *in_ptr1 =
input_data + b * in_batch_size + c * in_image_size input_data + b * p.in_batch_size + c * p.in_image_size
+ 1 * in_width; + 1 * p.in_width;
const float *in_ptr2 = const float *in_ptr2 =
input_data + b * in_batch_size + c * in_image_size input_data + b * p.in_batch_size + c * p.in_image_size
+ 2 * in_width; + 2 * p.in_width;
const float *in_ptr3 = const float *in_ptr3 =
input_data + b * in_batch_size + c * in_image_size input_data + b * p.in_batch_size + c * p.in_image_size
+ 3 * in_width; + 3 * p.in_width;
const float const float *filter_ptr1 =
*filter_ptr1 = filter_data + (m + 1) * in_channels * 9 + c * 9; filter_data + (m + 1) * p.in_channels * 9 + c * 9;
#if defined(__aarch64__) #if defined(__aarch64__)
float *out_ptr0 = out_ptr0_base; float *out_ptr0 = out_ptr0_base;
...@@ -116,8 +68,8 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, ...@@ -116,8 +68,8 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context,
vf11 = vld1q_f32(filter_ptr1 + 3); vf11 = vld1q_f32(filter_ptr1 + 3);
vf12 = vld1q_f32(filter_ptr1 + 6); vf12 = vld1q_f32(filter_ptr1 + 6);
for (index_t h = 0; h + 1 < out_height; h += 2) { for (index_t h = 0; h + 1 < p.out_height; h += 2) {
for (index_t w = 0; w + 3 < out_width; w += 4) { for (index_t w = 0; w + 3 < p.out_width; w += 4) {
// input (4 height x 3 slide): vi_height_slide // input (4 height x 3 slide): vi_height_slide
float32x4_t vi00, vi01, vi02; // reg count: 14 float32x4_t vi00, vi01, vi02; // reg count: 14
float32x4_t vi10, vi11, vi12; float32x4_t vi10, vi11, vi12;
...@@ -150,9 +102,9 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, ...@@ -150,9 +102,9 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context,
// load output // load output
vo00 = vld1q_f32(out_ptr0); vo00 = vld1q_f32(out_ptr0);
vo01 = vld1q_f32(out_ptr0 + out_width); vo01 = vld1q_f32(out_ptr0 + p.out_width);
vo10 = vld1q_f32(out_ptr1); vo10 = vld1q_f32(out_ptr1);
vo11 = vld1q_f32(out_ptr1 + out_width); vo11 = vld1q_f32(out_ptr1 + p.out_width);
// outch 0, height 0 // outch 0, height 0
vo00 = vfmaq_laneq_f32(vo00, vi00, vf00, 0); // reg count: 18 vo00 = vfmaq_laneq_f32(vo00, vi00, vf00, 0); // reg count: 18
...@@ -199,9 +151,9 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, ...@@ -199,9 +151,9 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context,
vo11 = vfmaq_laneq_f32(vo11, vi32, vf12, 2); vo11 = vfmaq_laneq_f32(vo11, vi32, vf12, 2);
vst1q_f32(out_ptr0, vo00); vst1q_f32(out_ptr0, vo00);
vst1q_f32(out_ptr0 + out_width, vo01); vst1q_f32(out_ptr0 + p.out_width, vo01);
vst1q_f32(out_ptr1, vo10); vst1q_f32(out_ptr1, vo10);
vst1q_f32(out_ptr1 + out_width, vo11); vst1q_f32(out_ptr1 + p.out_width, vo11);
in_ptr0 += 4; in_ptr0 += 4;
in_ptr1 += 4; in_ptr1 += 4;
...@@ -212,13 +164,13 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, ...@@ -212,13 +164,13 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context,
out_ptr1 += 4; out_ptr1 += 4;
} // w } // w
in_ptr0 += 2 + in_width; in_ptr0 += 2 + p.in_width;
in_ptr1 += 2 + in_width; in_ptr1 += 2 + p.in_width;
in_ptr2 += 2 + in_width; in_ptr2 += 2 + p.in_width;
in_ptr3 += 2 + in_width; in_ptr3 += 2 + p.in_width;
out_ptr0 += out_width; out_ptr0 += p.out_width;
out_ptr1 += out_width; out_ptr1 += p.out_width;
} // h } // h
#else // arm v7 #else // arm v7
float *out_ptr0 = out_ptr0_base; float *out_ptr0 = out_ptr0_base;
...@@ -238,8 +190,8 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, ...@@ -238,8 +190,8 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context,
vf167 = vld1_f32(filter_ptr1 + 6); vf167 = vld1_f32(filter_ptr1 + 6);
vf189 = vld1_f32(filter_ptr1 + 8); vf189 = vld1_f32(filter_ptr1 + 8);
for (index_t h = 0; h + 1 < out_height; h += 2) { for (index_t h = 0; h + 1 < p.out_height; h += 2) {
for (index_t w = 0; w + 3 < out_width; w += 4) { for (index_t w = 0; w + 3 < p.out_width; w += 4) {
// input (4 height x 3 slide): vi_height_slide // input (4 height x 3 slide): vi_height_slide
float32x4_t vi00, vi01, vi02; // reg count: 14 float32x4_t vi00, vi01, vi02; // reg count: 14
float32x4_t vi10, vi11, vi12; float32x4_t vi10, vi11, vi12;
...@@ -272,9 +224,9 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, ...@@ -272,9 +224,9 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context,
// load output // load output
vo00 = vld1q_f32(out_ptr0); vo00 = vld1q_f32(out_ptr0);
vo01 = vld1q_f32(out_ptr0 + out_width); vo01 = vld1q_f32(out_ptr0 + p.out_width);
vo10 = vld1q_f32(out_ptr1); vo10 = vld1q_f32(out_ptr1);
vo11 = vld1q_f32(out_ptr1 + out_width); vo11 = vld1q_f32(out_ptr1 + p.out_width);
// outch 0, height 0 // outch 0, height 0
vo00 = vmlaq_lane_f32(vo00, vi00, vf001, 0); vo00 = vmlaq_lane_f32(vo00, vi00, vf001, 0);
...@@ -321,9 +273,9 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, ...@@ -321,9 +273,9 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context,
vo11 = vmlaq_lane_f32(vo11, vi32, vf189, 0); vo11 = vmlaq_lane_f32(vo11, vi32, vf189, 0);
vst1q_f32(out_ptr0, vo00); vst1q_f32(out_ptr0, vo00);
vst1q_f32(out_ptr0 + out_width, vo01); vst1q_f32(out_ptr0 + p.out_width, vo01);
vst1q_f32(out_ptr1, vo10); vst1q_f32(out_ptr1, vo10);
vst1q_f32(out_ptr1 + out_width, vo11); vst1q_f32(out_ptr1 + p.out_width, vo11);
in_ptr0 += 4; in_ptr0 += 4;
in_ptr1 += 4; in_ptr1 += 4;
...@@ -334,34 +286,34 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, ...@@ -334,34 +286,34 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context,
out_ptr1 += 4; out_ptr1 += 4;
} // w } // w
in_ptr0 += 2 + in_width; in_ptr0 += 2 + p.in_width;
in_ptr1 += 2 + in_width; in_ptr1 += 2 + p.in_width;
in_ptr2 += 2 + in_width; in_ptr2 += 2 + p.in_width;
in_ptr3 += 2 + in_width; in_ptr3 += 2 + p.in_width;
out_ptr0 += out_width; out_ptr0 += p.out_width;
out_ptr1 += out_width; out_ptr1 += p.out_width;
} // h } // h
#endif #endif
} // c } // c
} else { } else {
for (index_t mm = m; mm < out_channels; ++mm) { for (index_t mm = m; mm < p.out_channels; ++mm) {
float *out_ptr0_base = float *out_ptr0_base =
output_data + b * out_batch_size + mm * out_image_size; output_data + b * p.out_batch_size + mm * p.out_image_size;
for (index_t c = 0; c < in_channels; ++c) { for (index_t c = 0; c < p.in_channels; ++c) {
const float *in_ptr0 = const float *in_ptr0 =
input_data + b * in_batch_size + c * in_image_size; input_data + b * p.in_batch_size + c * p.in_image_size;
const float *in_ptr1 = const float *in_ptr1 =
input_data + b * in_batch_size + c * in_image_size input_data + b * p.in_batch_size + c * p.in_image_size
+ 1 * in_width; + 1 * p.in_width;
const float *in_ptr2 = const float *in_ptr2 =
input_data + b * in_batch_size + c * in_image_size input_data + b * p.in_batch_size + c * p.in_image_size
+ 2 * in_width; + 2 * p.in_width;
const float *in_ptr3 = const float *in_ptr3 =
input_data + b * in_batch_size + c * in_image_size input_data + b * p.in_batch_size + c * p.in_image_size
+ 3 * in_width; + 3 * p.in_width;
const float const float
*filter_ptr0 = filter_data + mm * in_channels * 9 + c * 9; *filter_ptr0 = filter_data + mm * p.in_channels * 9 + c * 9;
#if defined(__aarch64__) #if defined(__aarch64__)
float *out_ptr0 = out_ptr0_base; float *out_ptr0 = out_ptr0_base;
...@@ -372,8 +324,8 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, ...@@ -372,8 +324,8 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context,
vf01 = vld1q_f32(filter_ptr0 + 3); vf01 = vld1q_f32(filter_ptr0 + 3);
vf02 = vld1q_f32(filter_ptr0 + 5); vf02 = vld1q_f32(filter_ptr0 + 5);
for (index_t h = 0; h + 1 < out_height; h += 2) { for (index_t h = 0; h + 1 < p.out_height; h += 2) {
for (index_t w = 0; w + 3 < out_width; w += 4) { for (index_t w = 0; w + 3 < p.out_width; w += 4) {
// input (4 height x 3 slide): vi_height_slide // input (4 height x 3 slide): vi_height_slide
float32x4_t vi00, vi01, vi02, vi0n; float32x4_t vi00, vi01, vi02, vi0n;
float32x4_t vi10, vi11, vi12, vi1n; float32x4_t vi10, vi11, vi12, vi1n;
...@@ -404,7 +356,7 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, ...@@ -404,7 +356,7 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context,
// load output // load output
vo00 = vld1q_f32(out_ptr0); vo00 = vld1q_f32(out_ptr0);
vo01 = vld1q_f32(out_ptr0 + out_width); vo01 = vld1q_f32(out_ptr0 + p.out_width);
// outch 0, height 0 // outch 0, height 0
vo00 = vfmaq_laneq_f32(vo00, vi00, vf00, 0); vo00 = vfmaq_laneq_f32(vo00, vi00, vf00, 0);
...@@ -429,7 +381,7 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, ...@@ -429,7 +381,7 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context,
vo01 = vfmaq_laneq_f32(vo01, vi32, vf02, 3); vo01 = vfmaq_laneq_f32(vo01, vi32, vf02, 3);
vst1q_f32(out_ptr0, vo00); vst1q_f32(out_ptr0, vo00);
vst1q_f32(out_ptr0 + out_width, vo01); vst1q_f32(out_ptr0 + p.out_width, vo01);
in_ptr0 += 4; in_ptr0 += 4;
in_ptr1 += 4; in_ptr1 += 4;
...@@ -439,12 +391,12 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, ...@@ -439,12 +391,12 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context,
out_ptr0 += 4; out_ptr0 += 4;
} // w } // w
in_ptr0 += 2 + in_width; in_ptr0 += 2 + p.in_width;
in_ptr1 += 2 + in_width; in_ptr1 += 2 + p.in_width;
in_ptr2 += 2 + in_width; in_ptr2 += 2 + p.in_width;
in_ptr3 += 2 + in_width; in_ptr3 += 2 + p.in_width;
out_ptr0 += out_width; out_ptr0 += p.out_width;
} // h } // h
#else // arm v7 #else // arm v7
float *out_ptr0 = out_ptr0_base; float *out_ptr0 = out_ptr0_base;
...@@ -457,8 +409,8 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, ...@@ -457,8 +409,8 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context,
vf67 = vld1_f32(filter_ptr0 + 6); vf67 = vld1_f32(filter_ptr0 + 6);
vf78 = vld1_f32(filter_ptr0 + 7); vf78 = vld1_f32(filter_ptr0 + 7);
for (index_t h = 0; h + 1 < out_height; h += 2) { for (index_t h = 0; h + 1 < p.out_height; h += 2) {
for (index_t w = 0; w + 3 < out_width; w += 4) { for (index_t w = 0; w + 3 < p.out_width; w += 4) {
// input (4 height x 3 slide): vi_height_slide // input (4 height x 3 slide): vi_height_slide
float32x4_t vi00, vi01, vi02, vi0n; float32x4_t vi00, vi01, vi02, vi0n;
float32x4_t vi10, vi11, vi12, vi1n; float32x4_t vi10, vi11, vi12, vi1n;
...@@ -489,7 +441,7 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, ...@@ -489,7 +441,7 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context,
// load output // load output
vo00 = vld1q_f32(out_ptr0); vo00 = vld1q_f32(out_ptr0);
vo01 = vld1q_f32(out_ptr0 + out_width); vo01 = vld1q_f32(out_ptr0 + p.out_width);
// outch 0, height 0 // outch 0, height 0
vo00 = vmlaq_lane_f32(vo00, vi00, vf01, 0); vo00 = vmlaq_lane_f32(vo00, vi00, vf01, 0);
...@@ -514,7 +466,7 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, ...@@ -514,7 +466,7 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context,
vo01 = vmlaq_lane_f32(vo01, vi32, vf78, 1); vo01 = vmlaq_lane_f32(vo01, vi32, vf78, 1);
vst1q_f32(out_ptr0, vo00); vst1q_f32(out_ptr0, vo00);
vst1q_f32(out_ptr0 + out_width, vo01); vst1q_f32(out_ptr0 + p.out_width, vo01);
in_ptr0 += 4; in_ptr0 += 4;
in_ptr1 += 4; in_ptr1 += 4;
...@@ -524,12 +476,12 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, ...@@ -524,12 +476,12 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context,
out_ptr0 += 4; out_ptr0 += 4;
} // w } // w
in_ptr0 += 2 + in_width; in_ptr0 += 2 + p.in_width;
in_ptr1 += 2 + in_width; in_ptr1 += 2 + p.in_width;
in_ptr2 += 2 + in_width; in_ptr2 += 2 + p.in_width;
in_ptr3 += 2 + in_width; in_ptr3 += 2 + p.in_width;
out_ptr0 += out_width; out_ptr0 += p.out_width;
} // h } // h
#endif #endif
} // c } // c
...@@ -537,73 +489,25 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, ...@@ -537,73 +489,25 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context,
} // if } // if
} // m } // m
} // b } // b
}, 0, batch, 1, 0, out_channels, 2); }, 0, p.batch, 1, 0, p.out_channels, 2);
UnPadOutput(*out_tensor, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MaceStatus Conv2dK3x3S2::Compute(const OpContext *context, template<>
const Tensor *input, MaceStatus Conv2dK3x3S2<float>::DoCompute(
const Tensor *filter, const ConvComputeParam &p, const float *filter_data,
Tensor *output) { const float *input_data, float *output_data) {
std::unique_ptr<const Tensor> padded_input; p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
std::unique_ptr<Tensor> padded_output;
ResizeOutAndPadInOut(context,
input,
filter,
output,
1,
4,
&padded_input,
&padded_output);
const Tensor *in_tensor = input;
if (padded_input != nullptr) {
in_tensor = padded_input.get();
}
Tensor *out_tensor = output;
if (padded_output != nullptr) {
out_tensor = padded_output.get();
}
out_tensor->Clear();
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
auto filter_data = filter->data<float>();
auto input_data = in_tensor->data<float>();
auto output_data = out_tensor->mutable_data<float>();
auto &in_shape = in_tensor->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t in_channels = in_shape[1];
const index_t in_height = in_shape[2];
const index_t in_width = in_shape[3];
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t m = start1; m < end1; m += step1) { for (index_t m = start1; m < end1; m += step1) {
for (index_t c = 0; c < in_channels; ++c) { for (index_t c = 0; c < p.in_channels; ++c) {
const float const float
*in_base = input_data + b * in_batch_size + c * in_image_size; *in_base = input_data + b * p.in_batch_size + c * p.in_image_size;
const float *filter_ptr = filter_data + m * in_channels * 9 + c * 9; const float *filter_ptr = filter_data + m * p.in_channels * 9 + c * 9;
float float *out_base =
*out_base = output_data + b * out_batch_size + m * out_image_size; output_data + b * p.out_batch_size + m * p.out_image_size;
#if defined(__aarch64__) #if defined(__aarch64__)
// load filter (1 outch x 3 height x 3 width): vf_outch_height // load filter (1 outch x 3 height x 3 width): vf_outch_height
...@@ -612,8 +516,8 @@ MaceStatus Conv2dK3x3S2::Compute(const OpContext *context, ...@@ -612,8 +516,8 @@ MaceStatus Conv2dK3x3S2::Compute(const OpContext *context,
vf01 = vld1q_f32(filter_ptr + 3); vf01 = vld1q_f32(filter_ptr + 3);
vf02 = vld1q_f32(filter_ptr + 5); vf02 = vld1q_f32(filter_ptr + 5);
for (index_t h = 0; h < out_height; ++h) { for (index_t h = 0; h < p.out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) { for (index_t w = 0; w + 3 < p.out_width; w += 4) {
float32x4x2_t vi0, vi1, vi2; float32x4x2_t vi0, vi1, vi2;
float32x4_t vi0n, vi1n, vi2n; float32x4_t vi0n, vi1n, vi2n;
...@@ -628,17 +532,17 @@ MaceStatus Conv2dK3x3S2::Compute(const OpContext *context, ...@@ -628,17 +532,17 @@ MaceStatus Conv2dK3x3S2::Compute(const OpContext *context,
// load input // load input
index_t in_h = h * 2; index_t in_h = h * 2;
index_t in_w = w * 2; index_t in_w = w * 2;
index_t in_offset = in_h * in_width + in_w; index_t in_offset = in_h * p.in_width + in_w;
vi0 = vld2q_f32(in_base + in_offset); // [0.2.4.6, 1.3.5.7] vi0 = vld2q_f32(in_base + in_offset); // [0.2.4.6, 1.3.5.7]
vi1 = vld2q_f32(in_base + in_offset + in_width); vi1 = vld2q_f32(in_base + in_offset + p.in_width);
vi2 = vld2q_f32(in_base + in_offset + 2 * in_width); vi2 = vld2q_f32(in_base + in_offset + 2 * p.in_width);
vi0n = vld1q_f32(in_base + in_offset + 8); // [8.9.10.11] vi0n = vld1q_f32(in_base + in_offset + 8); // [8.9.10.11]
vi1n = vld1q_f32(in_base + in_offset + in_width + 8); vi1n = vld1q_f32(in_base + in_offset + p.in_width + 8);
vi2n = vld1q_f32(in_base + in_offset + 2 * in_width + 8); vi2n = vld1q_f32(in_base + in_offset + 2 * p.in_width + 8);
// load output // load output
index_t out_offset = h * out_width + w; index_t out_offset = h * p.out_width + w;
vo = vld1q_f32(out_base + out_offset); vo = vld1q_f32(out_base + out_offset);
vi00 = vi0.val[0]; // [0.2.4.6] vi00 = vi0.val[0]; // [0.2.4.6]
...@@ -674,8 +578,8 @@ MaceStatus Conv2dK3x3S2::Compute(const OpContext *context, ...@@ -674,8 +578,8 @@ MaceStatus Conv2dK3x3S2::Compute(const OpContext *context,
vf67 = vld1_f32(filter_ptr + 6); vf67 = vld1_f32(filter_ptr + 6);
vf78 = vld1_f32(filter_ptr + 7); vf78 = vld1_f32(filter_ptr + 7);
for (index_t h = 0; h < out_height; ++h) { for (index_t h = 0; h < p.out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) { for (index_t w = 0; w + 3 < p.out_width; w += 4) {
float32x4x2_t vi0, vi1, vi2; float32x4x2_t vi0, vi1, vi2;
float32x4_t vi0n, vi1n, vi2n; float32x4_t vi0n, vi1n, vi2n;
...@@ -690,17 +594,17 @@ MaceStatus Conv2dK3x3S2::Compute(const OpContext *context, ...@@ -690,17 +594,17 @@ MaceStatus Conv2dK3x3S2::Compute(const OpContext *context,
// load input // load input
index_t in_h = h * 2; index_t in_h = h * 2;
index_t in_w = w * 2; index_t in_w = w * 2;
index_t in_offset = in_h * in_width + in_w; index_t in_offset = in_h * p.in_width + in_w;
vi0 = vld2q_f32(in_base + in_offset); // [0.2.4.6, 1.3.5.7] vi0 = vld2q_f32(in_base + in_offset); // [0.2.4.6, 1.3.5.7]
vi1 = vld2q_f32(in_base + in_offset + in_width); vi1 = vld2q_f32(in_base + in_offset + p.in_width);
vi2 = vld2q_f32(in_base + in_offset + 2 * in_width); vi2 = vld2q_f32(in_base + in_offset + 2 * p.in_width);
vi0n = vld1q_f32(in_base + in_offset + 8); // [8.9.10.11] vi0n = vld1q_f32(in_base + in_offset + 8); // [8.9.10.11]
vi1n = vld1q_f32(in_base + in_offset + in_width + 8); vi1n = vld1q_f32(in_base + in_offset + p.in_width + 8);
vi2n = vld1q_f32(in_base + in_offset + 2 * in_width + 8); vi2n = vld1q_f32(in_base + in_offset + 2 * p.in_width + 8);
// load output // load output
index_t out_offset = h * out_width + w; index_t out_offset = h * p.out_width + w;
vo = vld1q_f32(out_base + out_offset); vo = vld1q_f32(out_base + out_offset);
vi00 = vi0.val[0]; // [0.2.4.6] vi00 = vi0.val[0]; // [0.2.4.6]
...@@ -731,24 +635,11 @@ MaceStatus Conv2dK3x3S2::Compute(const OpContext *context, ...@@ -731,24 +635,11 @@ MaceStatus Conv2dK3x3S2::Compute(const OpContext *context,
} // c } // c
} // m } // m
} // b } // b
}, 0, batch, 1, 0, out_channels, 1); }, 0, p.batch, 1, 0, p.out_channels, 1);
UnPadOutput(*out_tensor, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
void RegisterConv2dK3x3Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Conv2dK3x3S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S1));
MACE_REGISTER_DELEGATOR(
registry, Conv2dK3x3S2, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S2));
}
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
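For the stride-2 path above, vld2q_f32 de-interleaves a row as it loads: val[0] receives elements 0,2,4,6 and val[1] receives 1,3,5,7, exactly as the inline comments note, so four stride-2 output positions come from two registers. A standalone illustration of that load (ARM-only, with a scalar fallback; an illustration, not MACE code):

#include <cstdio>
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#endif

int main() {
  float row[12];
  for (int i = 0; i < 12; ++i) row[i] = static_cast<float>(i);
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
  // val[0] holds the stride-2 taps {0,2,4,6}, val[1] holds {1,3,5,7}.
  float32x4x2_t v = vld2q_f32(row);
  float even[4], odd[4];
  vst1q_f32(even, v.val[0]);
  vst1q_f32(odd, v.val[1]);
  std::printf("even: %.0f %.0f %.0f %.0f\n", even[0], even[1], even[2], even[3]);
  std::printf("odd:  %.0f %.0f %.0f %.0f\n", odd[0], odd[1], odd[2], odd[3]);
#else
  // Scalar equivalent of the de-interleave, for non-ARM builds.
  std::printf("even: %.0f %.0f %.0f %.0f\n", row[0], row[2], row[4], row[6]);
  std::printf("odd:  %.0f %.0f %.0f %.0f\n", row[1], row[3], row[5], row[7]);
#endif
  return 0;
}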
...@@ -18,8 +18,8 @@ ...@@ -18,8 +18,8 @@
#include "mace/ops/common/conv_pool_2d_util.h" #include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/delegator/conv_2d.h" #include "mace/ops/delegator/conv_2d.h"
#include "mace/utils/memory.h"
#include "mace/utils/math.h" #include "mace/utils/math.h"
#include "mace/utils/memory.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
......
...@@ -20,8 +20,8 @@ ...@@ -20,8 +20,8 @@
#include "mace/core/ops/op_context.h" #include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/ops/arm/fp32/conv_2d.h" #include "mace/ops/arm/base/conv_2d.h"
#include "mace/ops/arm/fp32/gemm.h" #include "mace/ops/arm/base/gemm.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
namespace mace { namespace mace {
...@@ -32,7 +32,7 @@ namespace fp32 { ...@@ -32,7 +32,7 @@ namespace fp32 {
class Conv2dK3x3Winograd : public Conv2dBase { class Conv2dK3x3Winograd : public Conv2dBase {
public: public:
explicit Conv2dK3x3Winograd(const delegator::Conv2dParam &param) explicit Conv2dK3x3Winograd(const delegator::Conv2dParam &param)
: Conv2dBase(param), : Conv2dBase(param, sizeof(float)),
gemm_(delegator::GemmParam()), gemm_(delegator::GemmParam()),
transformed_filter_(nullptr), transformed_filter_(nullptr),
out_tile_size_(0) {} out_tile_size_(0) {}
...@@ -94,7 +94,7 @@ class Conv2dK3x3Winograd : public Conv2dBase { ...@@ -94,7 +94,7 @@ class Conv2dK3x3Winograd : public Conv2dBase {
index_t tile_count, index_t tile_count,
float *output); float *output);
Gemm gemm_; Gemm<float> gemm_;
std::unique_ptr<Tensor> transformed_filter_; std::unique_ptr<Tensor> transformed_filter_;
index_t out_tile_size_; index_t out_tile_size_;
}; };
......
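A plausible reading of the Conv2dBase(param, sizeof(float)) change above is that the now type-agnostic base only needs the element size to size padding and scratch buffers, without knowing T. The sketch below captures just that idea and is an assumption about intent, not MACE's Conv2dBase:

#include <cstddef>
#include <cstdio>
#include <vector>

// Hypothetical type-erased base: the element size alone is enough to work out
// how many bytes a padded buffer requires, whatever T the kernel uses.
class Conv2dBase {
 public:
  explicit Conv2dBase(std::size_t type_size) : type_size_(type_size) {}

  std::size_t PaddedBufferBytes(std::size_t width, std::size_t height,
                                std::size_t pad) const {
    return (width + 2 * pad) * (height + 2 * pad) * type_size_;
  }

 private:
  std::size_t type_size_;
};

int main() {
  Conv2dBase base(sizeof(float));
  // 32x32 image with a 1-element border -> 34 * 34 * 4 bytes of scratch.
  std::vector<unsigned char> scratch(base.PaddedBufferBytes(32, 32, 1));
  std::printf("%zu\n", scratch.size());  // 4624
  return 0;
}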
...@@ -15,26 +15,12 @@ ...@@ -15,26 +15,12 @@
#include <arm_neon.h> #include <arm_neon.h>
#include <memory> #include <memory>
#include "mace/ops/arm/fp32/conv_2d.h" #include "mace/ops/arm/base/conv_2d_5x5.h"
#include "mace/ops/delegator/conv_2d.h" #include "mace/ops/delegator/conv_2d.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
class Conv2dK5x5S1 : public Conv2dBase {
public:
explicit Conv2dK5x5S1(const delegator::Conv2dParam &param)
: Conv2dBase(param) {}
virtual ~Conv2dK5x5S1() {}
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) override;
};
#define MACE_Conv2dNeonK5x5SnLoadCalc4 \ #define MACE_Conv2dNeonK5x5SnLoadCalc4 \
/* load filter (4 outch x 1 height x 4 width) */ \ /* load filter (4 outch x 1 height x 4 width) */ \
...@@ -91,89 +77,43 @@ class Conv2dK5x5S1 : public Conv2dBase { ...@@ -91,89 +77,43 @@ class Conv2dK5x5S1 : public Conv2dBase {
vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); \ vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); \
vo0 = vmlaq_lane_f32(vo0, vi4, vf01, 1); vo0 = vmlaq_lane_f32(vo0, vi4, vf01, 1);
MaceStatus Conv2dK5x5S1::Compute(const OpContext *context, template<>
const Tensor *input, MaceStatus Conv2dK5x5S1<float>::DoCompute(
const Tensor *filter, const ConvComputeParam &p, const float *filter_data,
Tensor *output) { const float *input_data, float *output_data) {
std::unique_ptr<const Tensor> padded_input; p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
std::unique_ptr<Tensor> padded_output;
ResizeOutAndPadInOut(context,
input,
filter,
output,
1,
4,
&padded_input,
&padded_output);
const Tensor *in_tensor = input;
if (padded_input != nullptr) {
in_tensor = padded_input.get();
}
Tensor *out_tensor = output;
if (padded_output != nullptr) {
out_tensor = padded_output.get();
}
out_tensor->Clear();
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
auto filter_data = filter->data<float>();
auto input_data = in_tensor->data<float>();
auto output_data = out_tensor->mutable_data<float>();
auto &in_shape = in_tensor->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t in_channels = in_shape[1];
const index_t in_height = in_shape[2];
const index_t in_width = in_shape[3];
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t m = start1; m < end1; m += step1) { for (index_t m = start1; m < end1; m += step1) {
if (m + 3 < out_channels) { if (m + 3 < p.out_channels) {
float *out_ptr0_base = float *out_ptr0_base =
output_data + b * out_batch_size + m * out_image_size; output_data + b * p.out_batch_size + m * p.out_image_size;
float *out_ptr1_base = float *out_ptr1_base =
output_data + b * out_batch_size + (m + 1) * out_image_size; output_data + b * p.out_batch_size + (m + 1) * p.out_image_size;
float *out_ptr2_base = float *out_ptr2_base =
output_data + b * out_batch_size + (m + 2) * out_image_size; output_data + b * p.out_batch_size + (m + 2) * p.out_image_size;
float *out_ptr3_base = float *out_ptr3_base =
output_data + b * out_batch_size + (m + 3) * out_image_size; output_data + b * p.out_batch_size + (m + 3) * p.out_image_size;
for (index_t c = 0; c < in_channels; ++c) { for (index_t c = 0; c < p.in_channels; ++c) {
const float *in_ptr_base = const float *in_ptr_base =
input_data + b * in_batch_size + c * in_image_size; input_data + b * p.in_batch_size + c * p.in_image_size;
const float const float
*filter_ptr0 = filter_data + m * in_channels * 25 + c * 25; *filter_ptr0 = filter_data + m * p.in_channels * 25 + c * 25;
const float *filter_ptr1 = const float *filter_ptr1 =
filter_data + (m + 1) * in_channels * 25 + c * 25; filter_data + (m + 1) * p.in_channels * 25 + c * 25;
const float *filter_ptr2 = const float *filter_ptr2 =
filter_data + (m + 2) * in_channels * 25 + c * 25; filter_data + (m + 2) * p.in_channels * 25 + c * 25;
const float *filter_ptr3 = const float *filter_ptr3 =
filter_data + (m + 3) * in_channels * 25 + c * 25; filter_data + (m + 3) * p.in_channels * 25 + c * 25;
for (index_t h = 0; h < out_height; ++h) { for (index_t h = 0; h < p.out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) { for (index_t w = 0; w + 3 < p.out_width; w += 4) {
// input offset // input offset
index_t in_offset = h * in_width + w; index_t in_offset = h * p.in_width + w;
// output (4 outch x 1 height x 4 width): vo_outch_height // output (4 outch x 1 height x 4 width): vo_outch_height
float32x4_t vo0, vo1, vo2, vo3; float32x4_t vo0, vo1, vo2, vo3;
// load output // load output
index_t out_offset = h * out_width + w; index_t out_offset = h * p.out_width + w;
vo0 = vld1q_f32(out_ptr0_base + out_offset); vo0 = vld1q_f32(out_ptr0_base + out_offset);
vo1 = vld1q_f32(out_ptr1_base + out_offset); vo1 = vld1q_f32(out_ptr1_base + out_offset);
vo2 = vld1q_f32(out_ptr2_base + out_offset); vo2 = vld1q_f32(out_ptr2_base + out_offset);
...@@ -190,7 +130,7 @@ MaceStatus Conv2dK5x5S1::Compute(const OpContext *context, ...@@ -190,7 +130,7 @@ MaceStatus Conv2dK5x5S1::Compute(const OpContext *context,
MACE_Conv2dNeonK5x5SnLoadCalc4; MACE_Conv2dNeonK5x5SnLoadCalc4;
in_offset += in_width; in_offset += p.in_width;
filter_ptr0 += 5; filter_ptr0 += 5;
filter_ptr1 += 5; filter_ptr1 += 5;
filter_ptr2 += 5; filter_ptr2 += 5;
...@@ -210,22 +150,22 @@ MaceStatus Conv2dK5x5S1::Compute(const OpContext *context, ...@@ -210,22 +150,22 @@ MaceStatus Conv2dK5x5S1::Compute(const OpContext *context,
} // h } // h
} // c } // c
} else { } else {
for (index_t mm = m; mm < out_channels; ++mm) { for (index_t mm = m; mm < p.out_channels; ++mm) {
float *out_ptr0_base = float *out_ptr0_base =
output_data + b * out_batch_size + mm * out_image_size; output_data + b * p.out_batch_size + mm * p.out_image_size;
for (index_t c = 0; c < in_channels; ++c) { for (index_t c = 0; c < p.in_channels; ++c) {
const float *in_ptr_base = const float *in_ptr_base =
input_data + b * in_batch_size + c * in_image_size; input_data + b * p.in_batch_size + c * p.in_image_size;
const float const float
*filter_ptr0 = filter_data + mm * in_channels * 25 + c * 25; *filter_ptr0 = filter_data + mm * p.in_channels * 25 + c * 25;
for (index_t h = 0; h < out_height; ++h) { for (index_t h = 0; h < p.out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) { for (index_t w = 0; w + 3 < p.out_width; w += 4) {
// input offset // input offset
index_t in_offset = h * in_width + w; index_t in_offset = h * p.in_width + w;
// output (1 outch x 1 height x 4 width): vo_outch_height // output (1 outch x 1 height x 4 width): vo_outch_height
float32x4_t vo0; float32x4_t vo0;
// load output // load output
index_t out_offset = h * out_width + w; index_t out_offset = h * p.out_width + w;
vo0 = vld1q_f32(out_ptr0_base + out_offset); vo0 = vld1q_f32(out_ptr0_base + out_offset);
for (index_t r = 0; r < 5; ++r) { for (index_t r = 0; r < 5; ++r) {
// input (3 slide) // input (3 slide)
...@@ -239,7 +179,7 @@ MaceStatus Conv2dK5x5S1::Compute(const OpContext *context, ...@@ -239,7 +179,7 @@ MaceStatus Conv2dK5x5S1::Compute(const OpContext *context,
MACE_Conv2dNeonK5x5SnLoadCalc1; MACE_Conv2dNeonK5x5SnLoadCalc1;
in_offset += in_width; in_offset += p.in_width;
filter_ptr0 += 5; filter_ptr0 += 5;
} // r } // r
...@@ -252,20 +192,11 @@ MaceStatus Conv2dK5x5S1::Compute(const OpContext *context, ...@@ -252,20 +192,11 @@ MaceStatus Conv2dK5x5S1::Compute(const OpContext *context,
} // if } // if
} // m } // m
} // b } // b
}, 0, batch, 1, 0, out_channels, 4); }, 0, p.batch, 1, 0, p.out_channels, 4);
UnPadOutput(*out_tensor, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
void RegisterConv2dK5x5Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Conv2dK5x5S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K5x5S1));
}
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
...@@ -12,17 +12,15 @@ ...@@ -12,17 +12,15 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/arm/fp32/conv_2d_7x7.h"
#include <arm_neon.h> #include <arm_neon.h>
#include <memory> #include <memory>
#include "mace/ops/arm/base/conv_2d_7x7.h"
#include "mace/ops/delegator/conv_2d.h" #include "mace/ops/delegator/conv_2d.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
#define MACE_Conv2dArmv8NeonK7x7SnLoadCalc4 \ #define MACE_Conv2dArmv8NeonK7x7SnLoadCalc4 \
/* load filter (4 outch x 1 height x 4 width) */ \ /* load filter (4 outch x 1 height x 4 width) */ \
...@@ -156,88 +154,43 @@ namespace fp32 { ...@@ -156,88 +154,43 @@ namespace fp32 {
vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); \ vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); \
vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1);
MaceStatus Conv2dK7x7S1::Compute(const OpContext *context, template<>
const Tensor *input, MaceStatus Conv2dK7x7S1<float>::DoCompute(
const Tensor *filter, const ConvComputeParam &p, const float *filter_data,
Tensor *output) { const float *input_data, float *output_data) {
std::unique_ptr<const Tensor> padded_input;
std::unique_ptr<Tensor> padded_output; p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
ResizeOutAndPadInOut(context,
input,
filter,
output,
1,
4,
&padded_input,
&padded_output);
const Tensor *in_tensor = input;
if (padded_input != nullptr) {
in_tensor = padded_input.get();
}
Tensor *out_tensor = output;
if (padded_output != nullptr) {
out_tensor = padded_output.get();
}
out_tensor->Clear();
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
auto filter_data = filter->data<float>();
auto input_data = in_tensor->data<float>();
auto output_data = out_tensor->mutable_data<float>();
auto &in_shape = in_tensor->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t in_channels = in_shape[1];
const index_t in_height = in_shape[2];
const index_t in_width = in_shape[3];
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t m = start1; m < end1; m += step1) { for (index_t m = start1; m < end1; m += step1) {
if (m + 3 < out_channels) { if (m + 3 < p.out_channels) {
float *out_ptr0_base = float *out_ptr0_base =
output_data + b * out_batch_size + m * out_image_size; output_data + b * p.out_batch_size + m * p.out_image_size;
float *out_ptr1_base = float *out_ptr1_base =
output_data + b * out_batch_size + (m + 1) * out_image_size; output_data + b * p.out_batch_size + (m + 1) * p.out_image_size;
float *out_ptr2_base = float *out_ptr2_base =
output_data + b * out_batch_size + (m + 2) * out_image_size; output_data + b * p.out_batch_size + (m + 2) * p.out_image_size;
float *out_ptr3_base = float *out_ptr3_base =
output_data + b * out_batch_size + (m + 3) * out_image_size; output_data + b * p.out_batch_size + (m + 3) * p.out_image_size;
for (index_t c = 0; c < in_channels; ++c) { for (index_t c = 0; c < p.in_channels; ++c) {
const float *in_ptr_base = const float *in_ptr_base =
input_data + b * in_batch_size + c * in_image_size; input_data + b * p.in_batch_size + c * p.in_image_size;
const float const float
*filter_ptr0 = filter_data + m * in_channels * 49 + c * 49; *filter_ptr0 = filter_data + m * p.in_channels * 49 + c * 49;
const float *filter_ptr1 = const float *filter_ptr1 =
filter_data + (m + 1) * in_channels * 49 + c * 49; filter_data + (m + 1) * p.in_channels * 49 + c * 49;
const float *filter_ptr2 = const float *filter_ptr2 =
filter_data + (m + 2) * in_channels * 49 + c * 49; filter_data + (m + 2) * p.in_channels * 49 + c * 49;
const float *filter_ptr3 = const float *filter_ptr3 =
filter_data + (m + 3) * in_channels * 49 + c * 49; filter_data + (m + 3) * p.in_channels * 49 + c * 49;
for (index_t h = 0; h < out_height; ++h) { for (index_t h = 0; h < p.out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) { for (index_t w = 0; w + 3 < p.out_width; w += 4) {
// input offset // input offset
index_t in_offset = h * in_width + w; index_t in_offset = h * p.in_width + w;
// output (4 outch x 1 height x 4 width): vo_outch_height // output (4 outch x 1 height x 4 width): vo_outch_height
float32x4_t vo0, vo1, vo2, vo3; float32x4_t vo0, vo1, vo2, vo3;
// load output // load output
index_t out_offset = h * out_width + w; index_t out_offset = h * p.out_width + w;
vo0 = vld1q_f32(out_ptr0_base + out_offset); vo0 = vld1q_f32(out_ptr0_base + out_offset);
vo1 = vld1q_f32(out_ptr1_base + out_offset); vo1 = vld1q_f32(out_ptr1_base + out_offset);
vo2 = vld1q_f32(out_ptr2_base + out_offset); vo2 = vld1q_f32(out_ptr2_base + out_offset);
...@@ -262,7 +215,7 @@ MaceStatus Conv2dK7x7S1::Compute(const OpContext *context, ...@@ -262,7 +215,7 @@ MaceStatus Conv2dK7x7S1::Compute(const OpContext *context,
MACE_Conv2dArmv7NeonK7x7SnLoadCalc4; MACE_Conv2dArmv7NeonK7x7SnLoadCalc4;
#endif #endif
in_offset += in_width; in_offset += p.in_width;
filter_ptr0 += 7; filter_ptr0 += 7;
filter_ptr1 += 7; filter_ptr1 += 7;
filter_ptr2 += 7; filter_ptr2 += 7;
...@@ -282,22 +235,22 @@ MaceStatus Conv2dK7x7S1::Compute(const OpContext *context, ...@@ -282,22 +235,22 @@ MaceStatus Conv2dK7x7S1::Compute(const OpContext *context,
} // h } // h
} // c } // c
} else { } else {
for (index_t mm = m; mm < out_channels; ++mm) { for (index_t mm = m; mm < p.out_channels; ++mm) {
float *out_ptr0_base = float *out_ptr0_base =
output_data + b * out_batch_size + mm * out_image_size; output_data + b * p.out_batch_size + mm * p.out_image_size;
for (index_t c = 0; c < in_channels; ++c) { for (index_t c = 0; c < p.in_channels; ++c) {
const float *in_ptr_base = const float *in_ptr_base =
input_data + b * in_batch_size + c * in_image_size; input_data + b * p.in_batch_size + c * p.in_image_size;
const float const float
*filter_ptr0 = filter_data + mm * in_channels * 49 + c * 49; *filter_ptr0 = filter_data + mm * p.in_channels * 49 + c * 49;
for (index_t h = 0; h < out_height; ++h) { for (index_t h = 0; h < p.out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) { for (index_t w = 0; w + 3 < p.out_width; w += 4) {
// input offset // input offset
index_t in_offset = h * in_width + w; index_t in_offset = h * p.in_width + w;
// output (1 outch x 1 height x 4 width): vo_outch_height // output (1 outch x 1 height x 4 width): vo_outch_height
float32x4_t vo0; float32x4_t vo0;
// load output // load output
index_t out_offset = h * out_width + w; index_t out_offset = h * p.out_width + w;
vo0 = vld1q_f32(out_ptr0_base + out_offset); vo0 = vld1q_f32(out_ptr0_base + out_offset);
for (index_t r = 0; r < 7; ++r) { for (index_t r = 0; r < 7; ++r) {
// input (3 slide) // input (3 slide)
...@@ -319,7 +272,7 @@ MaceStatus Conv2dK7x7S1::Compute(const OpContext *context, ...@@ -319,7 +272,7 @@ MaceStatus Conv2dK7x7S1::Compute(const OpContext *context,
MACE_Conv2dArmv7NeonK7x7SnLoadCalc1; MACE_Conv2dArmv7NeonK7x7SnLoadCalc1;
#endif #endif
in_offset += in_width; in_offset += p.in_width;
filter_ptr0 += 7; filter_ptr0 += 7;
} // r } // r
...@@ -332,96 +285,49 @@ MaceStatus Conv2dK7x7S1::Compute(const OpContext *context, ...@@ -332,96 +285,49 @@ MaceStatus Conv2dK7x7S1::Compute(const OpContext *context,
} // if } // if
} // m } // m
} // b } // b
}, 0, batch, 1, 0, out_channels, 4); }, 0, p.batch, 1, 0, p.out_channels, 4);
UnPadOutput(*out_tensor, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MaceStatus Conv2dK7x7S2::Compute(const OpContext *context, template<>
const Tensor *input, MaceStatus Conv2dK7x7S2<float>::DoCompute(
const Tensor *filter, const ConvComputeParam &p, const float *filter_data,
Tensor *output) { const float *input_data, float *output_data) {
std::unique_ptr<const Tensor> padded_input; p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
std::unique_ptr<Tensor> padded_output;
ResizeOutAndPadInOut(context,
input,
filter,
output,
1,
4,
&padded_input,
&padded_output);
const Tensor *in_tensor = input;
if (padded_input != nullptr) {
in_tensor = padded_input.get();
}
Tensor *out_tensor = output;
if (padded_output != nullptr) {
out_tensor = padded_output.get();
}
out_tensor->Clear();
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
auto filter_data = filter->data<float>();
auto input_data = in_tensor->data<float>();
auto output_data = out_tensor->mutable_data<float>();
auto &in_shape = in_tensor->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t in_channels = in_shape[1];
const index_t in_height = in_shape[2];
const index_t in_width = in_shape[3];
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t m = start1; m < end1; m += step1) { for (index_t m = start1; m < end1; m += step1) {
if (m + 3 < out_channels) { if (m + 3 < p.out_channels) {
float *out_ptr0_base = float *out_ptr0_base =
output_data + b * out_batch_size + m * out_image_size; output_data + b * p.out_batch_size + m * p.out_image_size;
float *out_ptr1_base = float *out_ptr1_base =
output_data + b * out_batch_size + (m + 1) * out_image_size; output_data + b * p.out_batch_size + (m + 1) * p.out_image_size;
float *out_ptr2_base = float *out_ptr2_base =
output_data + b * out_batch_size + (m + 2) * out_image_size; output_data + b * p.out_batch_size + (m + 2) * p.out_image_size;
float *out_ptr3_base = float *out_ptr3_base =
output_data + b * out_batch_size + (m + 3) * out_image_size; output_data + b * p.out_batch_size + (m + 3) * p.out_image_size;
for (index_t c = 0; c < in_channels; ++c) { for (index_t c = 0; c < p.in_channels; ++c) {
const float *in_ptr_base = const float *in_ptr_base =
input_data + b * in_batch_size + c * in_image_size; input_data + b * p.in_batch_size + c * p.in_image_size;
const float const float
*filter_ptr0 = filter_data + m * in_channels * 49 + c * 49; *filter_ptr0 = filter_data + m * p.in_channels * 49 + c * 49;
const float *filter_ptr1 = const float *filter_ptr1 =
filter_data + (m + 1) * in_channels * 49 + c * 49; filter_data + (m + 1) * p.in_channels * 49 + c * 49;
const float *filter_ptr2 = const float *filter_ptr2 =
filter_data + (m + 2) * in_channels * 49 + c * 49; filter_data + (m + 2) * p.in_channels * 49 + c * 49;
const float *filter_ptr3 = const float *filter_ptr3 =
filter_data + (m + 3) * in_channels * 49 + c * 49; filter_data + (m + 3) * p.in_channels * 49 + c * 49;
for (index_t h = 0; h < out_height; ++h) { for (index_t h = 0; h < p.out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) { for (index_t w = 0; w + 3 < p.out_width; w += 4) {
// input offset // input offset
index_t in_h = h * 2; index_t in_h = h * 2;
index_t in_w = w * 2; index_t in_w = w * 2;
index_t in_offset = in_h * in_width + in_w; index_t in_offset = in_h * p.in_width + in_w;
// output (4 outch x 1 height x 4 width): vo_outch_height // output (4 outch x 1 height x 4 width): vo_outch_height
float32x4_t vo0, vo1, vo2, vo3; float32x4_t vo0, vo1, vo2, vo3;
// load output // load output
index_t out_offset = h * out_width + w; index_t out_offset = h * p.out_width + w;
vo0 = vld1q_f32(out_ptr0_base + out_offset); vo0 = vld1q_f32(out_ptr0_base + out_offset);
vo1 = vld1q_f32(out_ptr1_base + out_offset); vo1 = vld1q_f32(out_ptr1_base + out_offset);
vo2 = vld1q_f32(out_ptr2_base + out_offset); vo2 = vld1q_f32(out_ptr2_base + out_offset);
...@@ -449,7 +355,7 @@ MaceStatus Conv2dK7x7S2::Compute(const OpContext *context, ...@@ -449,7 +355,7 @@ MaceStatus Conv2dK7x7S2::Compute(const OpContext *context,
MACE_Conv2dArmv7NeonK7x7SnLoadCalc4; MACE_Conv2dArmv7NeonK7x7SnLoadCalc4;
#endif #endif
in_offset += in_width; in_offset += p.in_width;
filter_ptr0 += 7; filter_ptr0 += 7;
filter_ptr1 += 7; filter_ptr1 += 7;
filter_ptr2 += 7; filter_ptr2 += 7;
...@@ -469,24 +375,24 @@ MaceStatus Conv2dK7x7S2::Compute(const OpContext *context, ...@@ -469,24 +375,24 @@ MaceStatus Conv2dK7x7S2::Compute(const OpContext *context,
} // h } // h
} // c } // c
} else { } else {
for (index_t mm = m; mm < out_channels; ++mm) { for (index_t mm = m; mm < p.out_channels; ++mm) {
float *out_ptr0_base = float *out_ptr0_base =
output_data + b * out_batch_size + mm * out_image_size; output_data + b * p.out_batch_size + mm * p.out_image_size;
for (index_t c = 0; c < in_channels; ++c) { for (index_t c = 0; c < p.in_channels; ++c) {
const float *in_ptr_base = const float *in_ptr_base =
input_data + b * in_batch_size + c * in_image_size; input_data + b * p.in_batch_size + c * p.in_image_size;
const float const float
*filter_ptr0 = filter_data + mm * in_channels * 49 + c * 49; *filter_ptr0 = filter_data + mm * p.in_channels * 49 + c * 49;
for (index_t h = 0; h < out_height; ++h) { for (index_t h = 0; h < p.out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) { for (index_t w = 0; w + 3 < p.out_width; w += 4) {
// input offset // input offset
index_t in_h = h * 2; index_t in_h = h * 2;
index_t in_w = w * 2; index_t in_w = w * 2;
index_t in_offset = in_h * in_width + in_w; index_t in_offset = in_h * p.in_width + in_w;
// output (1 outch x 1 height x 4 width): vo_outch_height // output (1 outch x 1 height x 4 width): vo_outch_height
float32x4_t vo0; float32x4_t vo0;
// load output // load output
index_t out_offset = h * out_width + w; index_t out_offset = h * p.out_width + w;
vo0 = vld1q_f32(out_ptr0_base + out_offset); vo0 = vld1q_f32(out_ptr0_base + out_offset);
for (index_t r = 0; r < 7; ++r) { for (index_t r = 0; r < 7; ++r) {
// input (3 slide) // input (3 slide)
...@@ -511,7 +417,7 @@ MaceStatus Conv2dK7x7S2::Compute(const OpContext *context, ...@@ -511,7 +417,7 @@ MaceStatus Conv2dK7x7S2::Compute(const OpContext *context,
MACE_Conv2dArmv7NeonK7x7SnLoadCalc1; MACE_Conv2dArmv7NeonK7x7SnLoadCalc1;
#endif #endif
in_offset += in_width; in_offset += p.in_width;
filter_ptr0 += 7; filter_ptr0 += 7;
} // r } // r
...@@ -524,96 +430,49 @@ MaceStatus Conv2dK7x7S2::Compute(const OpContext *context, ...@@ -524,96 +430,49 @@ MaceStatus Conv2dK7x7S2::Compute(const OpContext *context,
} // if } // if
} // m } // m
} // b } // b
}, 0, batch, 1, 0, out_channels, 4); }, 0, p.batch, 1, 0, p.out_channels, 4);
UnPadOutput(*out_tensor, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MaceStatus Conv2dK7x7S3::Compute(const OpContext *context, template<>
const Tensor *input, MaceStatus Conv2dK7x7S3<float>::DoCompute(
const Tensor *filter, const ConvComputeParam &p, const float *filter_data,
Tensor *output) { const float *input_data, float *output_data) {
std::unique_ptr<const Tensor> padded_input; p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
std::unique_ptr<Tensor> padded_output;
ResizeOutAndPadInOut(context,
input,
filter,
output,
1,
4,
&padded_input,
&padded_output);
const Tensor *in_tensor = input;
if (padded_input != nullptr) {
in_tensor = padded_input.get();
}
Tensor *out_tensor = output;
if (padded_output != nullptr) {
out_tensor = padded_output.get();
}
out_tensor->Clear();
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
auto filter_data = filter->data<float>();
auto input_data = in_tensor->data<float>();
auto output_data = out_tensor->mutable_data<float>();
auto &in_shape = in_tensor->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t in_channels = in_shape[1];
const index_t in_height = in_shape[2];
const index_t in_width = in_shape[3];
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t m = start1; m < end1; m += step1) { for (index_t m = start1; m < end1; m += step1) {
if (m + 3 < out_channels) { if (m + 3 < p.out_channels) {
float *out_ptr0_base = float *out_ptr0_base =
output_data + b * out_batch_size + m * out_image_size; output_data + b * p.out_batch_size + m * p.out_image_size;
float *out_ptr1_base = float *out_ptr1_base =
output_data + b * out_batch_size + (m + 1) * out_image_size; output_data + b * p.out_batch_size + (m + 1) * p.out_image_size;
float *out_ptr2_base = float *out_ptr2_base =
output_data + b * out_batch_size + (m + 2) * out_image_size; output_data + b * p.out_batch_size + (m + 2) * p.out_image_size;
float *out_ptr3_base = float *out_ptr3_base =
output_data + b * out_batch_size + (m + 3) * out_image_size; output_data + b * p.out_batch_size + (m + 3) * p.out_image_size;
for (index_t c = 0; c < in_channels; ++c) { for (index_t c = 0; c < p.in_channels; ++c) {
const float *in_ptr_base = const float *in_ptr_base =
input_data + b * in_batch_size + c * in_image_size; input_data + b * p.in_batch_size + c * p.in_image_size;
const float const float
*filter_ptr0 = filter_data + m * in_channels * 49 + c * 49; *filter_ptr0 = filter_data + m * p.in_channels * 49 + c * 49;
const float *filter_ptr1 = const float *filter_ptr1 =
filter_data + (m + 1) * in_channels * 49 + c * 49; filter_data + (m + 1) * p.in_channels * 49 + c * 49;
const float *filter_ptr2 = const float *filter_ptr2 =
filter_data + (m + 2) * in_channels * 49 + c * 49; filter_data + (m + 2) * p.in_channels * 49 + c * 49;
const float *filter_ptr3 = const float *filter_ptr3 =
filter_data + (m + 3) * in_channels * 49 + c * 49; filter_data + (m + 3) * p.in_channels * 49 + c * 49;
for (index_t h = 0; h < out_height; ++h) { for (index_t h = 0; h < p.out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) { for (index_t w = 0; w + 3 < p.out_width; w += 4) {
// input offset // input offset
index_t in_h = h * 3; index_t in_h = h * 3;
index_t in_w = w * 3; index_t in_w = w * 3;
index_t in_offset = in_h * in_width + in_w; index_t in_offset = in_h * p.in_width + in_w;
// output (4 outch x 1 height x 4 width): vo_outch_height // output (4 outch x 1 height x 4 width): vo_outch_height
float32x4_t vo0, vo1, vo2, vo3; float32x4_t vo0, vo1, vo2, vo3;
// load output // load output
index_t out_offset = h * out_width + w; index_t out_offset = h * p.out_width + w;
vo0 = vld1q_f32(out_ptr0_base + out_offset); vo0 = vld1q_f32(out_ptr0_base + out_offset);
vo1 = vld1q_f32(out_ptr1_base + out_offset); vo1 = vld1q_f32(out_ptr1_base + out_offset);
vo2 = vld1q_f32(out_ptr2_base + out_offset); vo2 = vld1q_f32(out_ptr2_base + out_offset);
...@@ -641,7 +500,7 @@ MaceStatus Conv2dK7x7S3::Compute(const OpContext *context, ...@@ -641,7 +500,7 @@ MaceStatus Conv2dK7x7S3::Compute(const OpContext *context,
MACE_Conv2dArmv7NeonK7x7SnLoadCalc4; MACE_Conv2dArmv7NeonK7x7SnLoadCalc4;
#endif #endif
in_offset += in_width; in_offset += p.in_width;
filter_ptr0 += 7; filter_ptr0 += 7;
filter_ptr1 += 7; filter_ptr1 += 7;
filter_ptr2 += 7; filter_ptr2 += 7;
...@@ -661,24 +520,24 @@ MaceStatus Conv2dK7x7S3::Compute(const OpContext *context, ...@@ -661,24 +520,24 @@ MaceStatus Conv2dK7x7S3::Compute(const OpContext *context,
} // h } // h
} // c } // c
} else { } else {
for (index_t mm = m; mm < out_channels; ++mm) { for (index_t mm = m; mm < p.out_channels; ++mm) {
float *out_ptr0_base = float *out_ptr0_base =
output_data + b * out_batch_size + mm * out_image_size; output_data + b * p.out_batch_size + mm * p.out_image_size;
for (index_t c = 0; c < in_channels; ++c) { for (index_t c = 0; c < p.in_channels; ++c) {
const float *in_ptr_base = const float *in_ptr_base =
input_data + b * in_batch_size + c * in_image_size; input_data + b * p.in_batch_size + c * p.in_image_size;
const float const float
*filter_ptr0 = filter_data + mm * in_channels * 49 + c * 49; *filter_ptr0 = filter_data + mm * p.in_channels * 49 + c * 49;
for (index_t h = 0; h < out_height; ++h) { for (index_t h = 0; h < p.out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) { for (index_t w = 0; w + 3 < p.out_width; w += 4) {
// input offset // input offset
index_t in_h = h * 3; index_t in_h = h * 3;
index_t in_w = w * 3; index_t in_w = w * 3;
index_t in_offset = in_h * in_width + in_w; index_t in_offset = in_h * p.in_width + in_w;
// output (1 outch x 1 height x 4 width): vo_outch_height // output (1 outch x 1 height x 4 width): vo_outch_height
float32x4_t vo0; float32x4_t vo0;
// load output // load output
index_t out_offset = h * out_width + w; index_t out_offset = h * p.out_width + w;
vo0 = vld1q_f32(out_ptr0_base + out_offset); vo0 = vld1q_f32(out_ptr0_base + out_offset);
for (index_t r = 0; r < 7; ++r) { for (index_t r = 0; r < 7; ++r) {
// input (3 slide) // input (3 slide)
...@@ -703,7 +562,7 @@ MaceStatus Conv2dK7x7S3::Compute(const OpContext *context, ...@@ -703,7 +562,7 @@ MaceStatus Conv2dK7x7S3::Compute(const OpContext *context,
MACE_Conv2dArmv7NeonK7x7SnLoadCalc1; MACE_Conv2dArmv7NeonK7x7SnLoadCalc1;
#endif #endif
in_offset += in_width; in_offset += p.in_width;
filter_ptr0 += 7; filter_ptr0 += 7;
} // r } // r
...@@ -716,28 +575,11 @@ MaceStatus Conv2dK7x7S3::Compute(const OpContext *context, ...@@ -716,28 +575,11 @@ MaceStatus Conv2dK7x7S3::Compute(const OpContext *context,
} // if } // if
} // m } // m
} // b } // b
}, 0, batch, 1, 0, out_channels, 4); }, 0, p.batch, 1, 0, p.out_channels, 4);
UnPadOutput(*out_tensor, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
void RegisterConv2dK7x7Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Conv2dK7x7S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K7x7S1));
MACE_REGISTER_DELEGATOR(
registry, Conv2dK7x7S2, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K7x7S2));
MACE_REGISTER_DELEGATOR(
registry, Conv2dK7x7S3, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K7x7S3));
}
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
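For orientation, the 7x7 stride-3 NEON body above computes the same result as the following scalar reference. This is a sketch written for this note, not code from the tree; it assumes the padded input/output buffers that the delegator sets up, so no edge checks are needed.

#include <cstdint>
typedef int64_t index_t;  // assumption: matches mace::index_t

// Scalar reference for one (batch, output channel m) pair of Conv2dK7x7S3.
// `filter` points at output channel m's taps (in_channels x 7 x 7) and
// `out` at that channel's pre-cleared, padded output plane.
void Conv2dK7x7S3Reference(const float *in, const float *filter, float *out,
                           index_t in_channels, index_t in_height,
                           index_t in_width, index_t out_height,
                           index_t out_width) {
  for (index_t c = 0; c < in_channels; ++c) {
    const float *in_c = in + c * in_height * in_width;
    const float *filter_c = filter + c * 49;  // 49 = 7 * 7 taps per channel
    for (index_t h = 0; h < out_height; ++h) {
      for (index_t w = 0; w < out_width; ++w) {
        float acc = out[h * out_width + w];
        for (int kh = 0; kh < 7; ++kh) {
          for (int kw = 0; kw < 7; ++kw) {
            acc += in_c[(h * 3 + kh) * in_width + (w * 3 + kw)] *
                   filter_c[kh * 7 + kw];
          }
        }
        out[h * out_width + w] = acc;
      }
    }
  }
}

The vector path evaluates four consecutive w values of this accumulation at once, which is why the kernel advances in_offset by p.in_width and filter_ptr0..3 by 7 per tap row.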
...@@ -12,87 +12,25 @@ ...@@ -12,87 +12,25 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/arm/fp32/conv_2d.h"
#include <memory> #include <memory>
#include "mace/ops/arm/base/conv_2d_general.h"
#include "mace/ops/delegator/conv_2d.h" #include "mace/ops/delegator/conv_2d.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
class Conv2dGeneral : public Conv2dBase {
public:
explicit Conv2dGeneral(const delegator::Conv2dParam &param)
: Conv2dBase(param) {}
virtual ~Conv2dGeneral() {}
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) override;
};
MaceStatus Conv2dGeneral::Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) {
std::unique_ptr<const Tensor> padded_input;
std::unique_ptr<Tensor> padded_output;
ResizeOutAndPadInOut(context,
input,
filter,
output,
1,
4,
&padded_input,
&padded_output);
const Tensor *in_tensor = input;
if (padded_input != nullptr) {
in_tensor = padded_input.get();
}
Tensor *out_tensor = output;
if (padded_output != nullptr) {
out_tensor = padded_output.get();
}
out_tensor->Clear();
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
auto filter_data = filter->data<float>();
auto input_data = in_tensor->data<float>();
auto output_data = out_tensor->mutable_data<float>();
auto &in_shape = in_tensor->shape(); template<>
auto &out_shape = out_tensor->shape(); MaceStatus Conv2dGeneral<float>::DoCompute(
auto &filter_shape = filter->shape(); const ConvComputeParam &p, const float *filter_data,
const float *input_data, float *output_data,
const index_t batch = in_shape[0]; const std::vector<index_t> &filter_shape) {
const index_t in_channels = in_shape[1];
const index_t in_height = in_shape[2];
const index_t in_width = in_shape[3];
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t filter_height = filter_shape[2]; const index_t filter_height = filter_shape[2];
const index_t filter_width = filter_shape[3]; const index_t filter_width = filter_shape[3];
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
const index_t filter_size = filter_height * filter_width; const index_t filter_size = filter_height * filter_width;
utils::ThreadPool p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t m = start1; m < end1; m += step1) { for (index_t m = start1; m < end1; m += step1) {
...@@ -100,30 +38,33 @@ MaceStatus Conv2dGeneral::Compute(const OpContext *context, ...@@ -100,30 +38,33 @@ MaceStatus Conv2dGeneral::Compute(const OpContext *context,
const int stride_w = strides_[1]; const int stride_w = strides_[1];
const int dilation_h = dilations_[0]; const int dilation_h = dilations_[0];
const int dilation_w = dilations_[1]; const int dilation_w = dilations_[1];
if (m + 3 < out_channels) { if (m + 3 < p.out_channels) {
float *out_ptr0_base = float *out_ptr0_base =
output_data + b * out_batch_size + m * out_image_size; output_data + b * p.out_batch_size + m * p.out_image_size;
float *out_ptr1_base = out_ptr0_base + out_image_size; float *out_ptr1_base = out_ptr0_base + p.out_image_size;
float *out_ptr2_base = out_ptr1_base + out_image_size; float *out_ptr2_base = out_ptr1_base + p.out_image_size;
float *out_ptr3_base = out_ptr2_base + out_image_size; float *out_ptr3_base = out_ptr2_base + p.out_image_size;
for (index_t c = 0; c < in_channels; ++c) { for (index_t c = 0; c < p.in_channels; ++c) {
const float *in_ptr_base = const float *in_ptr_base =
input_data + b * in_batch_size + c * in_image_size; input_data + b * p.in_batch_size + c * p.in_image_size;
const float *filter_ptr0 = const float *filter_ptr0 =
filter_data + m * in_channels * filter_size + c * filter_size; filter_data + m * p.in_channels * filter_size + c * filter_size;
const float *filter_ptr1 = filter_ptr0 + in_channels * filter_size; const float *filter_ptr1 =
const float *filter_ptr2 = filter_ptr1 + in_channels * filter_size; filter_ptr0 + p.in_channels * filter_size;
const float *filter_ptr3 = filter_ptr2 + in_channels * filter_size; const float *filter_ptr2 =
for (index_t h = 0; h < out_height; ++h) { filter_ptr1 + p.in_channels * filter_size;
for (index_t w = 0; w + 3 < out_width; w += 4) { const float *filter_ptr3 =
filter_ptr2 + p.in_channels * filter_size;
for (index_t h = 0; h < p.out_height; ++h) {
for (index_t w = 0; w + 3 < p.out_width; w += 4) {
// input offset // input offset
index_t ih = h * stride_h; index_t ih = h * stride_h;
index_t iw = w * stride_w; index_t iw = w * stride_w;
index_t in_offset = ih * in_width + iw; index_t in_offset = ih * p.in_width + iw;
// output (4 outch x 1 height x 4 width): vo_outch_height // output (4 outch x 1 height x 4 width): vo_outch_height
float vo0[4], vo1[4], vo2[4], vo3[4]; float vo0[4], vo1[4], vo2[4], vo3[4];
// load output // load output
index_t out_offset = h * out_width + w; index_t out_offset = h * p.out_width + w;
for (index_t ow = 0; ow < 4; ++ow) { for (index_t ow = 0; ow < 4; ++ow) {
vo0[ow] = out_ptr0_base[out_offset + ow]; vo0[ow] = out_ptr0_base[out_offset + ow];
vo1[ow] = out_ptr1_base[out_offset + ow]; vo1[ow] = out_ptr1_base[out_offset + ow];
...@@ -171,7 +112,7 @@ MaceStatus Conv2dGeneral::Compute(const OpContext *context, ...@@ -171,7 +112,7 @@ MaceStatus Conv2dGeneral::Compute(const OpContext *context,
+ kw * dilation_w] * filter_ptr3[kw]; + kw * dilation_w] * filter_ptr3[kw];
} // kw } // kw
in_offset += dilation_h * in_width; in_offset += dilation_h * p.in_width;
filter_ptr0 += filter_width; filter_ptr0 += filter_width;
filter_ptr1 += filter_width; filter_ptr1 += filter_width;
filter_ptr2 += filter_width; filter_ptr2 += filter_width;
...@@ -193,26 +134,26 @@ MaceStatus Conv2dGeneral::Compute(const OpContext *context, ...@@ -193,26 +134,26 @@ MaceStatus Conv2dGeneral::Compute(const OpContext *context,
} // h } // h
} // c } // c
} else { } else {
for (index_t mm = m; mm < out_channels; ++mm) { for (index_t mm = m; mm < p.out_channels; ++mm) {
float *out_ptr0_base = float *out_ptr0_base =
output_data + b * out_batch_size + mm * out_image_size; output_data + b * p.out_batch_size + mm * p.out_image_size;
for (index_t c = 0; c < in_channels; ++c) { for (index_t c = 0; c < p.in_channels; ++c) {
const float *in_ptr_base = const float *in_ptr_base =
input_data + b * in_batch_size + c * in_image_size; input_data + b * p.in_batch_size + c * p.in_image_size;
const float *filter_ptr0 = const float *filter_ptr0 =
filter_data + mm * in_channels * filter_size filter_data + mm * p.in_channels * filter_size
+ c * filter_size; + c * filter_size;
for (index_t h = 0; h < out_height; ++h) { for (index_t h = 0; h < p.out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) { for (index_t w = 0; w + 3 < p.out_width; w += 4) {
// input offset // input offset
index_t ih = h * stride_h; index_t ih = h * stride_h;
index_t iw = w * stride_w; index_t iw = w * stride_w;
index_t in_offset = ih * in_width + iw; index_t in_offset = ih * p.in_width + iw;
// output (1 outch x 1 height x 4 width): vo_outch_height // output (1 outch x 1 height x 4 width): vo_outch_height
float vo0[4]; float vo0[4];
// load output // load output
index_t out_offset = h * out_width + w; index_t out_offset = h * p.out_width + w;
for (index_t ow = 0; ow < 4; ++ow) { for (index_t ow = 0; ow < 4; ++ow) {
vo0[ow] = out_ptr0_base[out_offset + ow]; vo0[ow] = out_ptr0_base[out_offset + ow];
} }
...@@ -231,7 +172,7 @@ MaceStatus Conv2dGeneral::Compute(const OpContext *context, ...@@ -231,7 +172,7 @@ MaceStatus Conv2dGeneral::Compute(const OpContext *context,
+ kw * dilation_w] * filter_ptr0[kw]; + kw * dilation_w] * filter_ptr0[kw];
} // kw } // kw
in_offset += dilation_h * in_width; in_offset += dilation_h * p.in_width;
filter_ptr0 += filter_width; filter_ptr0 += filter_width;
} // kh } // kh
...@@ -246,19 +187,11 @@ MaceStatus Conv2dGeneral::Compute(const OpContext *context, ...@@ -246,19 +187,11 @@ MaceStatus Conv2dGeneral::Compute(const OpContext *context,
} // if } // if
} // m } // m
} // b } // b
}, 0, batch, 1, 0, out_channels, 4); }, 0, p.batch, 1, 0, p.out_channels, 4);
UnPadOutput(*out_tensor, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
void RegisterConv2dGeneralDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Conv2dGeneral, delegator::Conv2dParam,
MACE_DELEGATOR_KEY(Conv2d, DeviceType::CPU, float, ImplType::NEON));
}
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
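The shape of the refactor is the same in every file of this commit: the type-independent shell of Compute() (output resize, padding, tensor mapping, shape bookkeeping) moves into a templated class under arm/base, and the fp32 file keeps only an explicit DoCompute specialization for float. A rough sketch of what that base declaration presumably looks like, reconstructed from the specialization signature above rather than copied from the new header:

// Hypothetical reconstruction; everything not visible in this diff
// (member names, exact split of responsibilities) is an assumption.
template<typename T>
class Conv2dGeneral : public Conv2dBase {
 public:
  explicit Conv2dGeneral(const delegator::Conv2dParam &param)
      : Conv2dBase(param) {}

  // Resizes/pads input and output, maps the tensors, fills a
  // ConvComputeParam p (batch, channel counts, image and batch strides,
  // thread pool), then dispatches to the typed inner loops below.
  MaceStatus Compute(const OpContext *context, const Tensor *input,
                     const Tensor *filter, Tensor *output) override;

 protected:
  MaceStatus DoCompute(const ConvComputeParam &p, const T *filter_data,
                       const T *input_data, T *output_data,
                       const std::vector<index_t> &filter_shape);
};

Registration of the concrete kernels likewise leaves these fp32 translation units; the removed Register*Delegator functions in this diff are the visible half of that move.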
...@@ -12,74 +12,33 @@ ...@@ -12,74 +12,33 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/arm/fp32/deconv_2d_2x2.h"
#include <arm_neon.h> #include <arm_neon.h>
#include "mace/ops/arm/base/deconv_2d_2x2.h"
#include "mace/ops/arm/fp32/common_neon.h" #include "mace/ops/arm/fp32/common_neon.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
MaceStatus Deconv2dK2x2S1::Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) {
std::unique_ptr<Tensor> padded_out;
std::vector<int> out_pad_size;
ResizeOutAndPadOut(context,
input,
filter,
output_shape,
output,
&out_pad_size,
&padded_out);
Tensor *out_tensor = output;
if (padded_out != nullptr) {
out_tensor = padded_out.get();
}
out_tensor->Clear();
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto padded_out_data = out_tensor->mutable_data<float>();
auto &in_shape = input->shape(); template<>
auto &out_shape = out_tensor->shape(); MaceStatus Deconv2dK2x2S1<float>::DoCompute(
const DeconvComputeParam &p, const float *filter_data,
const index_t batch = in_shape[0]; const float *input_data, float *padded_out_data) {
const index_t inch = in_shape[1]; p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
const index_t h = in_shape[2];
const index_t w = in_shape[3];
const index_t outch = out_shape[1];
const index_t outh = out_shape[2];
const index_t outw = out_shape[3];
const index_t out_img_size = outh * outw;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t oc = start1; oc < end1; oc += step1) { for (index_t oc = start1; oc < end1; oc += step1) {
if (oc + 1 < outch) { if (oc + 1 < p.out_channels) {
float *out_base0 = padded_out_data + (b * outch + oc) * out_img_size; float *out_base0 =
float *out_base1 = out_base0 + out_img_size; padded_out_data + (b * p.out_channels + oc) * p.out_img_size;
for (index_t ic = 0; ic < inch; ++ic) { float *out_base1 = out_base0 + p.out_img_size;
const float *input_base = input_data + (b * inch + ic) * h * w; for (index_t ic = 0; ic < p.in_channels; ++ic) {
const float *kernel_base0 = filter_data + (oc * inch + ic) * 4; const float *input_base = input_data +
const float *kernel_base1 = kernel_base0 + inch * 4; (b * p.in_channels + ic) * p.in_height * p.in_width;
const float *kernel_base0 =
filter_data + (oc * p.in_channels + ic) * 4;
const float *kernel_base1 = kernel_base0 + p.in_channels * 4;
const float *in = input_base; const float *in = input_base;
// output channel 0 // output channel 0
const float *k0 = kernel_base0; const float *k0 = kernel_base0;
...@@ -89,18 +48,18 @@ MaceStatus Deconv2dK2x2S1::Compute(const OpContext *context, ...@@ -89,18 +48,18 @@ MaceStatus Deconv2dK2x2S1::Compute(const OpContext *context,
float32x4_t k0_vec = vld1q_f32(k0); float32x4_t k0_vec = vld1q_f32(k0);
float32x4_t k1_vec = vld1q_f32(k1); float32x4_t k1_vec = vld1q_f32(k1);
for (index_t i = 0; i < h; ++i) { for (index_t i = 0; i < p.in_height; ++i) {
float *out_row_base0 = out_base0 + i * outw; float *out_row_base0 = out_base0 + i * p.out_width;
float *out_row0_0 = out_row_base0; float *out_row0_0 = out_row_base0;
float *out_row0_1 = out_row_base0 + outw; float *out_row0_1 = out_row_base0 + p.out_width;
float *out_row_base1 = out_base1 + i * outw; float *out_row_base1 = out_base1 + i * p.out_width;
float *out_row1_0 = out_row_base1; float *out_row1_0 = out_row_base1;
float *out_row1_1 = out_row_base1 + outw; float *out_row1_1 = out_row_base1 + p.out_width;
index_t j = 0; index_t j = 0;
for (; j + 3 < w; j += 4) { for (; j + 3 < p.in_width; j += 4) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
float32x4_t out00, out01, out02, out03; float32x4_t out00, out01, out02, out03;
...@@ -145,7 +104,7 @@ MaceStatus Deconv2dK2x2S1::Compute(const OpContext *context, ...@@ -145,7 +104,7 @@ MaceStatus Deconv2dK2x2S1::Compute(const OpContext *context,
out_row1_1 += 4; out_row1_1 += 4;
} }
for (; j < w; ++j) { for (; j < p.in_width; ++j) {
float val = in[0]; float val = in[0];
for (int k = 0; k < 2; ++k) { for (int k = 0; k < 2; ++k) {
out_row0_0[k] += val * k0[k]; out_row0_0[k] += val * k0[k];
...@@ -162,23 +121,26 @@ MaceStatus Deconv2dK2x2S1::Compute(const OpContext *context, ...@@ -162,23 +121,26 @@ MaceStatus Deconv2dK2x2S1::Compute(const OpContext *context,
} }
} }
} else { } else {
float *out_base0 = padded_out_data + (b * outch + oc) * outh * outw; float *out_base0 = padded_out_data +
for (index_t ic = 0; ic < inch; ++ic) { (b * p.out_channels + oc) * p.out_height * p.out_width;
const float *input_base = input_data + (b * inch + ic) * h * w; for (index_t ic = 0; ic < p.in_channels; ++ic) {
const float *kernel_base0 = filter_data + (oc * inch + ic) * 4; const float *input_base = input_data +
(b * p.in_channels + ic) * p.in_height * p.in_width;
const float *kernel_base0 =
filter_data + (oc * p.in_channels + ic) * 4;
const float *in = input_base; const float *in = input_base;
const float *k0 = kernel_base0; const float *k0 = kernel_base0;
// load filter // load filter
float32x4_t k0_vec = vld1q_f32(k0); float32x4_t k0_vec = vld1q_f32(k0);
for (index_t i = 0; i < h; ++i) { for (index_t i = 0; i < p.in_height; ++i) {
float *out_row_base0 = out_base0 + i * outw; float *out_row_base0 = out_base0 + i * p.out_width;
float *out_row0_0 = out_row_base0; float *out_row0_0 = out_row_base0;
float *out_row0_1 = out_row_base0 + outw; float *out_row0_1 = out_row_base0 + p.out_width;
index_t j = 0; index_t j = 0;
for (; j + 3 < w; j += 4) { for (; j + 3 < p.in_width; j += 4) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
float32x4_t out00, out01, out02, out03; float32x4_t out00, out01, out02, out03;
...@@ -203,7 +165,7 @@ MaceStatus Deconv2dK2x2S1::Compute(const OpContext *context, ...@@ -203,7 +165,7 @@ MaceStatus Deconv2dK2x2S1::Compute(const OpContext *context,
out_row0_1 += 4; out_row0_1 += 4;
} }
for (; j < w; ++j) { for (; j < p.in_width; ++j) {
float val = in[0]; float val = in[0];
for (int k = 0; k < 2; ++k) { for (int k = 0; k < 2; ++k) {
out_row0_0[k] += val * k0[k]; out_row0_0[k] += val * k0[k];
...@@ -218,79 +180,39 @@ MaceStatus Deconv2dK2x2S1::Compute(const OpContext *context, ...@@ -218,79 +180,39 @@ MaceStatus Deconv2dK2x2S1::Compute(const OpContext *context,
} }
} }
} }
}, 0, batch, 1, 0, outch, 2); }, 0, p.batch, 1, 0, p.out_channels, 2);
UnPadOutput(*out_tensor, out_pad_size, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MaceStatus Deconv2dK2x2S2::Compute(const OpContext *context, template<>
const Tensor *input, MaceStatus Deconv2dK2x2S2<float>::DoCompute(
const Tensor *filter, const DeconvComputeParam &p, const float *filter_data,
const Tensor *output_shape, const float *input_data, float *padded_out_data) {
Tensor *output) {
std::unique_ptr<Tensor> padded_out;
std::vector<int> out_pad_size;
ResizeOutAndPadOut(context,
input,
filter,
output_shape,
output,
&out_pad_size,
&padded_out);
Tensor *out_tensor = output;
if (padded_out != nullptr) {
out_tensor = padded_out.get();
}
out_tensor->Clear();
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto padded_out_data = out_tensor->mutable_data<float>();
auto &in_shape = input->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t inch = in_shape[1];
const index_t h = in_shape[2];
const index_t w = in_shape[3];
const index_t outch = out_shape[1]; p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
const index_t outh = out_shape[2];
const index_t outw = out_shape[3];
const index_t out_img_size = outh * outw;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t oc = start1; oc < end1; oc += step1) { for (index_t oc = start1; oc < end1; oc += step1) {
float *out_base = padded_out_data + (b * outch + oc) * out_img_size; float *out_base =
for (index_t ic = 0; ic < inch; ++ic) { padded_out_data + (b * p.out_channels + oc) * p.out_img_size;
const float *input_base = input_data + (b * inch + ic) * h * w; for (index_t ic = 0; ic < p.in_channels; ++ic) {
const float *kernel_base = filter_data + (oc * inch + ic) * 4; const float *input_base = input_data +
(b * p.in_channels + ic) * p.in_height * p.in_width;
const float *kernel_base =
filter_data + (oc * p.in_channels + ic) * 4;
const float *in = input_base; const float *in = input_base;
const float *k0 = kernel_base; const float *k0 = kernel_base;
float32x4_t k0_vec = vld1q_f32(k0); float32x4_t k0_vec = vld1q_f32(k0);
for (index_t i = 0; i < h; ++i) { for (index_t i = 0; i < p.in_height; ++i) {
float *out_row_base = out_base + i * 2 * outw; float *out_row_base = out_base + i * 2 * p.out_width;
float *out_row_0 = out_row_base; float *out_row_0 = out_row_base;
float *out_row_1 = out_row_0 + outw; float *out_row_1 = out_row_0 + p.out_width;
index_t j = 0; index_t j = 0;
for (; j + 3 < w; j += 4) { for (; j + 3 < p.in_width; j += 4) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
// out row 0 // out row 0
...@@ -314,7 +236,7 @@ MaceStatus Deconv2dK2x2S2::Compute(const OpContext *context, ...@@ -314,7 +236,7 @@ MaceStatus Deconv2dK2x2S2::Compute(const OpContext *context,
out_row_1 += 8; out_row_1 += 8;
} }
for (; j < w; ++j) { for (; j < p.in_width; ++j) {
float val = in[0]; float val = in[0];
for (int k = 0; k < 2; ++k) { for (int k = 0; k < 2; ++k) {
out_row_0[k] += val * k0[k]; out_row_0[k] += val * k0[k];
...@@ -328,25 +250,11 @@ MaceStatus Deconv2dK2x2S2::Compute(const OpContext *context, ...@@ -328,25 +250,11 @@ MaceStatus Deconv2dK2x2S2::Compute(const OpContext *context,
} }
} }
} }
}, 0, batch, 1, 0, outch, 1); }, 0, p.batch, 1, 0, p.out_channels, 1);
UnPadOutput(*out_tensor, out_pad_size, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
void RegisterDeconv2dK2x2Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Deconv2dK2x2S1, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
float, ImplType::NEON, K2x2S1));
MACE_REGISTER_DELEGATOR(
registry, Deconv2dK2x2S2, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
float, ImplType::NEON, K2x2S2));
}
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
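The p.* members these deconv kernels read suggest roughly the following parameter bundle. This is a guess at DeconvComputeParam based only on the fields referenced in this diff; the real struct in arm/base may carry more.

// Hypothetical reconstruction, fields limited to what the kernels above use.
struct DeconvComputeParam {
  index_t batch;
  index_t in_channels;
  index_t in_height;
  index_t in_width;
  index_t out_channels;
  index_t out_height;
  index_t out_width;
  index_t out_img_size;            // out_height * out_width
  utils::ThreadPool &thread_pool;  // from the device's CPU runtime
};

Bundling these into one struct is what lets each DoCompute drop its ten or so tensor-shape locals and take a single parameter instead.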
...@@ -12,73 +12,33 @@ ...@@ -12,73 +12,33 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/arm/fp32/deconv_2d_3x3.h"
#include <arm_neon.h> #include <arm_neon.h>
#include "mace/ops/arm/base/deconv_2d_3x3.h"
#include "mace/ops/arm/fp32/common_neon.h" #include "mace/ops/arm/fp32/common_neon.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
MaceStatus Deconv2dK3x3S1::Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) {
std::unique_ptr<Tensor> padded_out;
std::vector<int> out_pad_size;
ResizeOutAndPadOut(context,
input,
filter,
output_shape,
output,
&out_pad_size,
&padded_out);
Tensor *out_tensor = output;
if (padded_out != nullptr) {
out_tensor = padded_out.get();
}
out_tensor->Clear();
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto padded_out_data = out_tensor->mutable_data<float>();
auto &in_shape = input->shape(); template<>
auto &out_shape = out_tensor->shape(); MaceStatus Deconv2dK3x3S1<float>::DoCompute(
const DeconvComputeParam &p, const float *filter_data,
const index_t batch = out_shape[0]; const float *input_data, float *padded_out_data) {
const index_t inch = in_shape[1]; p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
const index_t h = in_shape[2];
const index_t w = in_shape[3];
const index_t outch = out_shape[1];
const index_t outh = out_shape[2];
const index_t outw = out_shape[3];
const index_t out_img_size = outh * outw;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t oc = start1; oc < end1; oc += step1) { for (index_t oc = start1; oc < end1; oc += step1) {
if (oc + 1 < outch) { if (oc + 1 < p.out_channels) {
float *out_base0 = padded_out_data + (b * outch + oc) * out_img_size; float *out_base0 =
float *out_base1 = out_base0 + out_img_size; padded_out_data + (b * p.out_channels + oc) * p.out_img_size;
for (index_t ic = 0; ic < inch; ++ic) { float *out_base1 = out_base0 + p.out_img_size;
const float *input_base = input_data + (b * inch + ic) * h * w; for (index_t ic = 0; ic < p.in_channels; ++ic) {
const float *kernel_base0 = filter_data + (oc * inch + ic) * 9; const float *input_base = input_data +
const float *kernel_base1 = kernel_base0 + inch * 9; (b * p.in_channels + ic) * p.in_height * p.in_width;
const float *kernel_base0 =
filter_data + (oc * p.in_channels + ic) * 9;
const float *kernel_base1 = kernel_base0 + p.in_channels * 9;
const float *in = input_base; const float *in = input_base;
// output channel 0 // output channel 0
...@@ -102,20 +62,20 @@ MaceStatus Deconv2dK3x3S1::Compute(const OpContext *context, ...@@ -102,20 +62,20 @@ MaceStatus Deconv2dK3x3S1::Compute(const OpContext *context,
k11_vec = vld1q_f32(k1_1); k11_vec = vld1q_f32(k1_1);
k12_vec = vld1q_f32(k1_2); k12_vec = vld1q_f32(k1_2);
for (index_t i = 0; i < h; ++i) { for (index_t i = 0; i < p.in_height; ++i) {
float *out_row_base0 = out_base0 + i * outw; float *out_row_base0 = out_base0 + i * p.out_width;
float *out_row0_0 = out_row_base0; float *out_row0_0 = out_row_base0;
float *out_row0_1 = out_row_base0 + outw; float *out_row0_1 = out_row_base0 + p.out_width;
float *out_row0_2 = out_row_base0 + 2 * outw; float *out_row0_2 = out_row_base0 + 2 * p.out_width;
float *out_row_base1 = out_base1 + i * outw; float *out_row_base1 = out_base1 + i * p.out_width;
float *out_row1_0 = out_row_base1; float *out_row1_0 = out_row_base1;
float *out_row1_1 = out_row_base1 + outw; float *out_row1_1 = out_row_base1 + p.out_width;
float *out_row1_2 = out_row_base1 + 2 * outw; float *out_row1_2 = out_row_base1 + 2 * p.out_width;
index_t j = 0; index_t j = 0;
for (; j + 3 < w; j += 4) { for (; j + 3 < p.in_width; j += 4) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
float32x4_t out00, out01, out02; float32x4_t out00, out01, out02;
...@@ -203,7 +163,7 @@ MaceStatus Deconv2dK3x3S1::Compute(const OpContext *context, ...@@ -203,7 +163,7 @@ MaceStatus Deconv2dK3x3S1::Compute(const OpContext *context,
out_row1_2 += 4; out_row1_2 += 4;
} }
for (; j < w; ++j) { for (; j < p.in_width; ++j) {
float val = in[0]; float val = in[0];
for (int k = 0; k < 3; ++k) { for (int k = 0; k < 3; ++k) {
out_row0_0[k] += val * k0_0[k]; out_row0_0[k] += val * k0_0[k];
...@@ -224,10 +184,13 @@ MaceStatus Deconv2dK3x3S1::Compute(const OpContext *context, ...@@ -224,10 +184,13 @@ MaceStatus Deconv2dK3x3S1::Compute(const OpContext *context,
} }
} }
} else { } else {
float *out_base0 = padded_out_data + (b * outch + oc) * outh * outw; float *out_base0 = padded_out_data +
for (index_t ic = 0; ic < inch; ++ic) { (b * p.out_channels + oc) * p.out_height * p.out_width;
const float *input_base = input_data + (b * inch + ic) * h * w; for (index_t ic = 0; ic < p.in_channels; ++ic) {
const float *kernel_base0 = filter_data + (oc * inch + ic) * 9; const float *input_base = input_data +
(b * p.in_channels + ic) * p.in_height * p.in_width;
const float *kernel_base0 =
filter_data + (oc * p.in_channels + ic) * 9;
const float *in = input_base; const float *in = input_base;
const float *k0_0 = kernel_base0; const float *k0_0 = kernel_base0;
const float *k0_1 = kernel_base0 + 3; const float *k0_1 = kernel_base0 + 3;
...@@ -238,14 +201,14 @@ MaceStatus Deconv2dK3x3S1::Compute(const OpContext *context, ...@@ -238,14 +201,14 @@ MaceStatus Deconv2dK3x3S1::Compute(const OpContext *context,
float32x4_t k01_vec = vld1q_f32(k0_1); float32x4_t k01_vec = vld1q_f32(k0_1);
float32x4_t k02_vec = vld1q_f32(k0_2); float32x4_t k02_vec = vld1q_f32(k0_2);
for (index_t i = 0; i < h; ++i) { for (index_t i = 0; i < p.in_height; ++i) {
float *out_row_base0 = out_base0 + i * outw; float *out_row_base0 = out_base0 + i * p.out_width;
float *out_row0_0 = out_row_base0; float *out_row0_0 = out_row_base0;
float *out_row0_1 = out_row_base0 + outw; float *out_row0_1 = out_row_base0 + p.out_width;
float *out_row0_2 = out_row_base0 + 2 * outw; float *out_row0_2 = out_row_base0 + 2 * p.out_width;
index_t j = 0; index_t j = 0;
for (; j + 3 < w; j += 4) { for (; j + 3 < p.in_width; j += 4) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
float32x4_t out00, out01, out02; float32x4_t out00, out01, out02;
...@@ -294,7 +257,7 @@ MaceStatus Deconv2dK3x3S1::Compute(const OpContext *context, ...@@ -294,7 +257,7 @@ MaceStatus Deconv2dK3x3S1::Compute(const OpContext *context,
out_row0_2 += 4; out_row0_2 += 4;
} }
for (; j < w; ++j) { for (; j < p.in_width; ++j) {
float val = in[0]; float val = in[0];
for (int k = 0; k < 3; ++k) { for (int k = 0; k < 3; ++k) {
out_row0_0[k] += val * k0_0[k]; out_row0_0[k] += val * k0_0[k];
...@@ -311,67 +274,26 @@ MaceStatus Deconv2dK3x3S1::Compute(const OpContext *context, ...@@ -311,67 +274,26 @@ MaceStatus Deconv2dK3x3S1::Compute(const OpContext *context,
} }
} }
} }
}, 0, batch, 1, 0, outch, 2); }, 0, p.batch, 1, 0, p.out_channels, 2);
UnPadOutput(*out_tensor, out_pad_size, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MaceStatus Deconv2dK3x3S2::Compute(const OpContext *context, template<>
const Tensor *input, MaceStatus Deconv2dK3x3S2<float>::DoCompute(
const Tensor *filter, const DeconvComputeParam &p, const float *filter_data,
const Tensor *output_shape, const float *input_data, float *padded_out_data) {
Tensor *output) { p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
std::unique_ptr<Tensor> padded_out;
std::vector<int> out_pad_size;
ResizeOutAndPadOut(context,
input,
filter,
output_shape,
output,
&out_pad_size,
&padded_out);
Tensor *out_tensor = output;
if (padded_out != nullptr) {
out_tensor = padded_out.get();
}
out_tensor->Clear();
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto padded_out_data = out_tensor->mutable_data<float>();
auto &in_shape = input->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t inch = in_shape[1];
const index_t h = in_shape[2];
const index_t w = in_shape[3];
const index_t outch = out_shape[1];
const index_t outh = out_shape[2];
const index_t outw = out_shape[3];
const index_t out_img_size = outh * outw;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t oc = start1; oc < end1; oc += step1) { for (index_t oc = start1; oc < end1; oc += step1) {
float *out_base = padded_out_data + (b * outch + oc) * out_img_size; float *out_base =
for (index_t ic = 0; ic < inch; ++ic) { padded_out_data + (b * p.out_channels + oc) * p.out_img_size;
const float *input_base = input_data + (b * inch + ic) * h * w; for (index_t ic = 0; ic < p.in_channels; ++ic) {
const float *kernel_base = filter_data + (oc * inch + ic) * 9; const float *input_base =
input_data + (b * p.in_channels + ic) * p.in_height * p.in_width;
const float *kernel_base =
filter_data + (oc * p.in_channels + ic) * 9;
const float *in = input_base; const float *in = input_base;
const float *k0 = kernel_base; const float *k0 = kernel_base;
...@@ -382,15 +304,15 @@ MaceStatus Deconv2dK3x3S2::Compute(const OpContext *context, ...@@ -382,15 +304,15 @@ MaceStatus Deconv2dK3x3S2::Compute(const OpContext *context,
float32x4_t k1_vec = vld1q_f32(k1); float32x4_t k1_vec = vld1q_f32(k1);
float32x4_t k2_vec = vld1q_f32(k2); float32x4_t k2_vec = vld1q_f32(k2);
for (index_t i = 0; i < h; ++i) { for (index_t i = 0; i < p.in_height; ++i) {
float *out_row_base = out_base + i * 2 * outw; float *out_row_base = out_base + i * 2 * p.out_width;
float *out_row_0 = out_row_base; float *out_row_0 = out_row_base;
float *out_row_1 = out_row_0 + outw; float *out_row_1 = out_row_0 + p.out_width;
float *out_row_2 = out_row_1 + outw; float *out_row_2 = out_row_1 + p.out_width;
index_t j = 0; index_t j = 0;
for (index_t n = 0; n + 9 < outw; n += 8) { for (index_t n = 0; n + 9 < p.out_width; n += 8) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
// out row 0 // out row 0
...@@ -439,7 +361,7 @@ MaceStatus Deconv2dK3x3S2::Compute(const OpContext *context, ...@@ -439,7 +361,7 @@ MaceStatus Deconv2dK3x3S2::Compute(const OpContext *context,
j += 4; j += 4;
} }
for (; j < w; ++j) { for (; j < p.in_width; ++j) {
float val = in[0]; float val = in[0];
for (int k = 0; k < 3; ++k) { for (int k = 0; k < 3; ++k) {
...@@ -457,25 +379,11 @@ MaceStatus Deconv2dK3x3S2::Compute(const OpContext *context, ...@@ -457,25 +379,11 @@ MaceStatus Deconv2dK3x3S2::Compute(const OpContext *context,
} }
} }
} }
}, 0, batch, 1, 0, outch, 1); }, 0, p.batch, 1, 0, p.out_channels, 1);
UnPadOutput(*out_tensor, out_pad_size, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
void RegisterDeconv2dK3x3Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Deconv2dK3x3S1, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S1));
MACE_REGISTER_DELEGATOR(
registry, Deconv2dK3x3S2, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S2));
}
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
...@@ -12,78 +12,39 @@ ...@@ -12,78 +12,39 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/arm/fp32/deconv_2d_4x4.h"
#include <arm_neon.h> #include <arm_neon.h>
#include "mace/ops/arm/base/deconv_2d_4x4.h"
#include "mace/ops/arm/fp32/common_neon.h" #include "mace/ops/arm/fp32/common_neon.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
MaceStatus Deconv2dK4x4S1::Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) {
std::unique_ptr<Tensor> padded_out;
std::vector<int> out_pad_size;
ResizeOutAndPadOut(context,
input,
filter,
output_shape,
output,
&out_pad_size,
&padded_out);
Tensor *out_tensor = output;
if (padded_out != nullptr) {
out_tensor = padded_out.get();
}
out_tensor->Clear();
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto padded_out_data = out_tensor->mutable_data<float>();
auto &in_shape = input->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0]; template<>
const index_t inch = in_shape[1]; MaceStatus Deconv2dK4x4S1<float>::DoCompute(
const index_t h = in_shape[2]; const DeconvComputeParam &p, const float *filter_data,
const index_t w = in_shape[3]; const float *input_data, float *padded_out_data) {
p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
const index_t outch = out_shape[1];
const index_t outh = out_shape[2];
const index_t outw = out_shape[3];
const index_t out_img_size = outh * outw;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t oc = start1; oc < end1; oc += step1) { for (index_t oc = start1; oc < end1; oc += step1) {
if (oc + 1 < outch) { if (oc + 1 < p.out_channels) {
float *out_base = padded_out_data + (b * outch + oc) * out_img_size; float *out_base =
float *out_base1 = out_base + out_img_size; padded_out_data + (b * p.out_channels + oc) * p.out_img_size;
for (index_t q = 0; q < inch; q++) { float *out_base1 = out_base + p.out_img_size;
const float *input_base = input_data + (b * inch + q) * h * w; for (index_t q = 0; q < p.in_channels; q++) {
const float *input_base = input_data +
(b * p.in_channels + q) * p.in_height * p.in_width;
const float *in = input_base; const float *in = input_base;
const float *kernel_base = filter_data + (oc * inch + q) * 16; const float *kernel_base =
filter_data + (oc * p.in_channels + q) * 16;
const float *k0 = kernel_base; const float *k0 = kernel_base;
const float *k1 = kernel_base + 4; const float *k1 = kernel_base + 4;
const float *k2 = kernel_base + 8; const float *k2 = kernel_base + 8;
const float *k3 = kernel_base + 12; const float *k3 = kernel_base + 12;
const float *kernel_base1 = kernel_base + inch * 16; const float *kernel_base1 = kernel_base + p.in_channels * 16;
const float *k10 = kernel_base1; const float *k10 = kernel_base1;
const float *k11 = kernel_base1 + 4; const float *k11 = kernel_base1 + 4;
const float *k12 = kernel_base1 + 8; const float *k12 = kernel_base1 + 8;
...@@ -99,24 +60,24 @@ MaceStatus Deconv2dK4x4S1::Compute(const OpContext *context, ...@@ -99,24 +60,24 @@ MaceStatus Deconv2dK4x4S1::Compute(const OpContext *context,
float32x4_t k12_vec = vld1q_f32(k12); float32x4_t k12_vec = vld1q_f32(k12);
float32x4_t k13_vec = vld1q_f32(k13); float32x4_t k13_vec = vld1q_f32(k13);
for (index_t i = 0; i < h; i++) { for (index_t i = 0; i < p.in_height; i++) {
float *out_row = out_base + i * outw; float *out_row = out_base + i * p.out_width;
float *out_row_0 = out_row; float *out_row_0 = out_row;
float *out_row_1 = out_row_0 + outw; float *out_row_1 = out_row_0 + p.out_width;
float *out_row_2 = out_row_1 + outw; float *out_row_2 = out_row_1 + p.out_width;
float *out_row_3 = out_row_2 + outw; float *out_row_3 = out_row_2 + p.out_width;
float *out_row1 = out_base1 + i * outw; float *out_row1 = out_base1 + i * p.out_width;
float *out_row1_0 = out_row1; float *out_row1_0 = out_row1;
float *out_row1_1 = out_row1_0 + outw; float *out_row1_1 = out_row1_0 + p.out_width;
float *out_row1_2 = out_row1_1 + outw; float *out_row1_2 = out_row1_1 + p.out_width;
float *out_row1_3 = out_row1_2 + outw; float *out_row1_3 = out_row1_2 + p.out_width;
index_t j = 0; index_t j = 0;
for (; j + 3 < w; j += 4) { for (; j + 3 < p.in_width; j += 4) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
float32x4_t out00, out01, out02, out03; float32x4_t out00, out01, out02, out03;
float32x4_t out10, out11, out12, out13; float32x4_t out10, out11, out12, out13;
...@@ -260,7 +221,7 @@ MaceStatus Deconv2dK4x4S1::Compute(const OpContext *context, ...@@ -260,7 +221,7 @@ MaceStatus Deconv2dK4x4S1::Compute(const OpContext *context,
out_row1_3 += 4; out_row1_3 += 4;
} }
for (; j < w; j++) { for (; j < p.in_width; j++) {
float val = in[0]; float val = in[0];
for (int k = 0; k < 4; ++k) { for (int k = 0; k < 4; ++k) {
out_row_0[k] += val * k0[k]; out_row_0[k] += val * k0[k];
...@@ -285,10 +246,13 @@ MaceStatus Deconv2dK4x4S1::Compute(const OpContext *context, ...@@ -285,10 +246,13 @@ MaceStatus Deconv2dK4x4S1::Compute(const OpContext *context,
} }
} }
} else { } else {
float *out_base = padded_out_data + (b * outch + oc) * out_img_size; float *out_base =
for (index_t q = 0; q < inch; q++) { padded_out_data + (b * p.out_channels + oc) * p.out_img_size;
const float *input_base = input_data + (b * inch + q) * h * w; for (index_t q = 0; q < p.in_channels; q++) {
const float *kernel_base = filter_data + (oc * inch + q) * 16; const float *input_base = input_data +
(b * p.in_channels + q) * p.in_height * p.in_width;
const float *kernel_base =
filter_data + (oc * p.in_channels + q) * 16;
const float *in = input_base; const float *in = input_base;
const float *k0 = kernel_base; const float *k0 = kernel_base;
const float *k1 = kernel_base + 4; const float *k1 = kernel_base + 4;
...@@ -300,15 +264,15 @@ MaceStatus Deconv2dK4x4S1::Compute(const OpContext *context, ...@@ -300,15 +264,15 @@ MaceStatus Deconv2dK4x4S1::Compute(const OpContext *context,
float32x4_t k2_vec = vld1q_f32(k2); float32x4_t k2_vec = vld1q_f32(k2);
float32x4_t k3_vec = vld1q_f32(k3); float32x4_t k3_vec = vld1q_f32(k3);
for (index_t i = 0; i < h; i++) { for (index_t i = 0; i < p.in_height; i++) {
float *out_row = out_base + i * outw; float *out_row = out_base + i * p.out_width;
float *out_row_0 = out_row; float *out_row_0 = out_row;
float *out_row_1 = out_row_0 + outw; float *out_row_1 = out_row_0 + p.out_width;
float *out_row_2 = out_row_1 + outw; float *out_row_2 = out_row_1 + p.out_width;
float *out_row_3 = out_row_2 + outw; float *out_row_3 = out_row_2 + p.out_width;
int j = 0; int j = 0;
for (; j + 3 < w; j += 4) { for (; j + 3 < p.in_width; j += 4) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
float32x4_t out00 = vld1q_f32(out_row_0); float32x4_t out00 = vld1q_f32(out_row_0);
...@@ -382,7 +346,7 @@ MaceStatus Deconv2dK4x4S1::Compute(const OpContext *context, ...@@ -382,7 +346,7 @@ MaceStatus Deconv2dK4x4S1::Compute(const OpContext *context,
out_row_3 += 4; out_row_3 += 4;
} }
for (; j < w; j++) { for (; j < p.in_width; j++) {
float val = in[0]; float val = in[0];
for (int k = 0; k < 4; ++k) { for (int k = 0; k < 4; ++k) {
out_row_0[k] += val * k0[k]; out_row_0[k] += val * k0[k];
...@@ -401,65 +365,25 @@ MaceStatus Deconv2dK4x4S1::Compute(const OpContext *context, ...@@ -401,65 +365,25 @@ MaceStatus Deconv2dK4x4S1::Compute(const OpContext *context,
} }
} }
} }
}, 0, batch, 1, 0, outch, 2); }, 0, p.batch, 1, 0, p.out_channels, 2);
UnPadOutput(*out_tensor, out_pad_size, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MaceStatus Deconv2dK4x4S2::Compute(const OpContext *context, template<>
const Tensor *input, MaceStatus Deconv2dK4x4S2<float>::DoCompute(
const Tensor *filter, const DeconvComputeParam &p, const float *filter_data,
const Tensor *output_shape, const float *input_data, float *padded_out_data) {
Tensor *output) { p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
std::unique_ptr<Tensor> padded_out;
std::vector<int> out_pad_size;
ResizeOutAndPadOut(context,
input,
filter,
output_shape,
output,
&out_pad_size,
&padded_out);
Tensor *out_tensor = output;
if (padded_out != nullptr) {
out_tensor = padded_out.get();
}
out_tensor->Clear();
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto padded_out_data = out_tensor->mutable_data<float>();
auto &in_shape = input->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t inch = in_shape[1];
const index_t h = in_shape[2];
const index_t w = in_shape[3];
const index_t outch = out_shape[1];
const index_t outh = out_shape[2];
const index_t outw = out_shape[3];
const index_t out_img_size = outh * outw;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t p = start1; p < end1; p += step1) { for (index_t k = start1; k < end1; k += step1) {
float *out_base = padded_out_data + (b * outch + p) * out_img_size; float *out_base =
for (index_t q = 0; q < inch; q++) { padded_out_data + (b * p.out_channels + k) * p.out_img_size;
const float *input_base = input_data + (b * inch + q) * h * w; for (index_t q = 0; q < p.in_channels; q++) {
const float *kernel_base = filter_data + (p * inch + q) * 16; const float *input_base = input_data +
(b * p.in_channels + q) * p.in_height * p.in_width;
const float *kernel_base = filter_data + (k * p.in_channels + q) * 16;
const float *in = input_base; const float *in = input_base;
const float *k0 = kernel_base; const float *k0 = kernel_base;
...@@ -472,17 +396,17 @@ MaceStatus Deconv2dK4x4S2::Compute(const OpContext *context, ...@@ -472,17 +396,17 @@ MaceStatus Deconv2dK4x4S2::Compute(const OpContext *context,
float32x4_t k2_vec = vld1q_f32(k2); float32x4_t k2_vec = vld1q_f32(k2);
float32x4_t k3_vec = vld1q_f32(k3); float32x4_t k3_vec = vld1q_f32(k3);
for (index_t i = 0; i < h; i++) { for (index_t i = 0; i < p.in_height; i++) {
float *out_row = out_base + 2 * i * outw; float *out_row = out_base + 2 * i * p.out_width;
float *out_row_0 = out_row; float *out_row_0 = out_row;
float *out_row_1 = out_row_0 + outw; float *out_row_1 = out_row_0 + p.out_width;
float *out_row_2 = out_row_1 + outw; float *out_row_2 = out_row_1 + p.out_width;
float *out_row_3 = out_row_2 + outw; float *out_row_3 = out_row_2 + p.out_width;
index_t j = 0; index_t j = 0;
for (index_t n = 0; n + 9 < outw; n += 8) { for (index_t n = 0; n + 9 < p.out_width; n += 8) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
// row 0 // row 0
...@@ -549,7 +473,7 @@ MaceStatus Deconv2dK4x4S2::Compute(const OpContext *context, ...@@ -549,7 +473,7 @@ MaceStatus Deconv2dK4x4S2::Compute(const OpContext *context,
j += 4; j += 4;
} }
for (; j < w; j++) { for (; j < p.in_width; j++) {
float val = in[0]; float val = in[0];
for (int k = 0; k < 4; ++k) { for (int k = 0; k < 4; ++k) {
out_row_0[k] += val * k0[k]; out_row_0[k] += val * k0[k];
...@@ -567,25 +491,11 @@ MaceStatus Deconv2dK4x4S2::Compute(const OpContext *context, ...@@ -567,25 +491,11 @@ MaceStatus Deconv2dK4x4S2::Compute(const OpContext *context,
} }
} }
} }
}, 0, batch, 1, 0, outch, 1); }, 0, p.batch, 1, 0, p.out_channels, 1);
UnPadOutput(*out_tensor, out_pad_size, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
void RegisterDeconv2dK4x4Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Deconv2dK4x4S1, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
float, ImplType::NEON, K4x4S1));
MACE_REGISTER_DELEGATOR(
registry, Deconv2dK4x4S2, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
float, ImplType::NEON, K4x4S2));
}
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
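As with the direct convolutions, the stride-2 deconv bodies are easiest to audit against a scalar reference: each input pixel scatters a kernel-sized patch into the padded output at (2*i, 2*j). A sketch written for this note, not code from the tree:

#include <cstdint>
typedef int64_t index_t;  // assumption: matches mace::index_t

// Scalar reference for one (output channel, input channel) pair of the
// KxK stride-2 deconvolution kernels above, accumulating into the padded
// output plane (the padding removes the need for bounds checks).
void Deconv2dS2Reference(const float *in, const float *filter, float *out,
                         int kernel_size, index_t in_height, index_t in_width,
                         index_t out_width) {
  for (index_t i = 0; i < in_height; ++i) {
    for (index_t j = 0; j < in_width; ++j) {
      const float val = in[i * in_width + j];
      for (int kh = 0; kh < kernel_size; ++kh) {
        for (int kw = 0; kw < kernel_size; ++kw) {
          out[(2 * i + kh) * out_width + (2 * j + kw)] +=
              val * filter[kh * kernel_size + kw];
        }
      }
    }
  }
}

The K2x2S2, K3x3S2 and K4x4S2 kernels above are this loop nest with kernel_size fixed at 2, 3 and 4, and with the j loop processed four input columns (eight output columns) per NEON iteration.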
...@@ -12,14 +12,13 @@ ...@@ -12,14 +12,13 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/arm/fp32/depthwise_conv_2d_3x3.h"
#include <arm_neon.h> #include <arm_neon.h>
#include "mace/ops/arm/base/depthwise_conv_2d_3x3.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
namespace { namespace {
void DepthwiseConv2dPixel(const float *in_base, void DepthwiseConv2dPixel(const float *in_base,
...@@ -48,79 +47,36 @@ void DepthwiseConv2dPixel(const float *in_base, ...@@ -48,79 +47,36 @@ void DepthwiseConv2dPixel(const float *in_base,
} }
} // namespace } // namespace
MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context, template<>
const mace::Tensor *input, MaceStatus DepthwiseConv2dK3x3S1<float>::DoCompute(
const mace::Tensor *filter, const DepthwiseConvComputeParam &p, const float *filter_data,
mace::Tensor *output) { const float *input_data, float *output_data) {
MACE_UNUSED(context); p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
std::vector<index_t> out_shape(4);
std::vector<int> paddings(2);
auto &in_shape = input->shape();
auto &filter_shape = filter->shape();
CalOutputShapeAndInputPadSize(in_shape, filter_shape, &out_shape, &paddings);
out_shape[1] *= filter_shape[1];
MACE_RETURN_IF_ERROR(output->Resize(out_shape));
output->Clear();
const int pad_top = paddings[0] / 2;
const int pad_left = paddings[1] / 2;
const index_t batch = in_shape[0];
const index_t in_channels = in_shape[1];
const index_t in_height = in_shape[2];
const index_t in_width = in_shape[3];
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
const index_t multiplier = out_channels / in_channels;
std::vector<index_t> out_bounds;
CalOutputBoundaryWithoutUsingInputPad(out_shape, paddings, &out_bounds);
const index_t valid_h_start = out_bounds[0];
const index_t valid_h_stop = out_bounds[1];
const index_t valid_w_start = out_bounds[2];
const index_t valid_w_stop = out_bounds[3];
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
auto filter_data = filter->data<float>();
auto input_data = input->data<float>();
auto output_data = output->mutable_data<float>();
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t m = start1; m < end1; m += step1) { for (index_t m = start1; m < end1; m += step1) {
const index_t c = m / multiplier; const index_t c = m / p.multiplier;
const index_t multi_index = m % multiplier; const index_t multi_index = m % p.multiplier;
const float const float
*in_base = input_data + b * in_batch_size + c * in_image_size; *in_base = input_data + b * p.in_batch_size + c * p.in_image_size;
const float const float
*filter_ptr = filter_data + multi_index * in_channels * 9 + c * 9; *filter_ptr = filter_data + multi_index * p.in_channels * 9 + c * 9;
float *out_base = output_data + b * out_batch_size + m * out_image_size; float *out_base =
output_data + b * p.out_batch_size + m * p.out_image_size;
index_t h, w; index_t h, w;
// top // top
for (h = 0; h < valid_h_start; ++h) { for (h = 0; h < p.valid_h_start; ++h) {
for (w = 0; w < out_width; ++w) { for (w = 0; w < p.out_width; ++w) {
DepthwiseConv2dPixel(in_base, DepthwiseConv2dPixel(in_base,
filter_ptr, filter_ptr,
h, h,
w, w,
h - pad_top, h - p.pad_top,
w - pad_left, w - p.pad_left,
out_width, p.out_width,
in_height, p.in_height,
in_width, p.in_width,
3, 3,
3, 3,
out_base); out_base);
...@@ -133,18 +89,18 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context, ...@@ -133,18 +89,18 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context,
vf01 = vld1q_f32(filter_ptr + 3); vf01 = vld1q_f32(filter_ptr + 3);
vf02 = vld1q_f32(filter_ptr + 5); vf02 = vld1q_f32(filter_ptr + 5);
for (h = valid_h_start; h + 1 < valid_h_stop; h += 2) { for (h = p.valid_h_start; h + 1 < p.valid_h_stop; h += 2) {
// left // left
for (w = 0; w < valid_w_start; ++w) { for (w = 0; w < p.valid_w_start; ++w) {
DepthwiseConv2dPixel(in_base, DepthwiseConv2dPixel(in_base,
filter_ptr, filter_ptr,
h, h,
w, w,
h - pad_top, h - p.pad_top,
w - pad_left, w - p.pad_left,
out_width, p.out_width,
in_height, p.in_height,
in_width, p.in_width,
3, 3,
3, 3,
out_base); out_base);
...@@ -152,17 +108,17 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context, ...@@ -152,17 +108,17 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context,
filter_ptr, filter_ptr,
h + 1, h + 1,
w, w,
h + 1 - pad_top, h + 1 - p.pad_top,
w - pad_left, w - p.pad_left,
out_width, p.out_width,
in_height, p.in_height,
in_width, p.in_width,
3, 3,
3, 3,
out_base); out_base);
} }
for (w = valid_w_start; w + 3 < valid_w_stop; w += 4) { for (w = p.valid_w_start; w + 3 < p.valid_w_stop; w += 4) {
// input (4 height x 3 slide): vi_height_slide // input (4 height x 3 slide): vi_height_slide
float32x4_t vi00, vi01, vi02, vi0n; float32x4_t vi00, vi01, vi02, vi0n;
float32x4_t vi10, vi11, vi12, vi1n; float32x4_t vi10, vi11, vi12, vi1n;
...@@ -173,17 +129,17 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context, ...@@ -173,17 +129,17 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context,
float32x4_t vo00, vo01; float32x4_t vo00, vo01;
// load input // load input
index_t in_h = h - pad_top; index_t in_h = h - p.pad_top;
index_t in_w = w - pad_left; index_t in_w = w - p.pad_left;
index_t in_offset = in_h * in_width + in_w; index_t in_offset = in_h * p.in_width + in_w;
vi00 = vld1q_f32(in_base + in_offset); vi00 = vld1q_f32(in_base + in_offset);
vi0n = vld1q_f32(in_base + in_offset + 4); vi0n = vld1q_f32(in_base + in_offset + 4);
vi10 = vld1q_f32(in_base + in_offset + in_width); vi10 = vld1q_f32(in_base + in_offset + p.in_width);
vi1n = vld1q_f32(in_base + in_offset + in_width + 4); vi1n = vld1q_f32(in_base + in_offset + p.in_width + 4);
vi20 = vld1q_f32(in_base + in_offset + 2 * in_width); vi20 = vld1q_f32(in_base + in_offset + 2 * p.in_width);
vi2n = vld1q_f32(in_base + in_offset + 2 * in_width + 4); vi2n = vld1q_f32(in_base + in_offset + 2 * p.in_width + 4);
vi30 = vld1q_f32(in_base + in_offset + 3 * in_width); vi30 = vld1q_f32(in_base + in_offset + 3 * p.in_width);
vi3n = vld1q_f32(in_base + in_offset + 3 * in_width + 4); vi3n = vld1q_f32(in_base + in_offset + 3 * p.in_width + 4);
vi01 = vextq_f32(vi00, vi0n, 1); vi01 = vextq_f32(vi00, vi0n, 1);
vi02 = vextq_f32(vi00, vi0n, 2); vi02 = vextq_f32(vi00, vi0n, 2);
...@@ -195,9 +151,9 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context, ...@@ -195,9 +151,9 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context,
vi32 = vextq_f32(vi30, vi3n, 2); vi32 = vextq_f32(vi30, vi3n, 2);
 // load output // load output
index_t out_offset = h * out_width + w; index_t out_offset = h * p.out_width + w;
vo00 = vld1q_f32(out_base + out_offset); vo00 = vld1q_f32(out_base + out_offset);
vo01 = vld1q_f32(out_base + out_offset + out_width); vo01 = vld1q_f32(out_base + out_offset + p.out_width);
#if defined(__aarch64__) #if defined(__aarch64__)
// outch 0, height 0 // outch 0, height 0
...@@ -245,20 +201,20 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context, ...@@ -245,20 +201,20 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context,
vo01 = vmlaq_lane_f32(vo01, vi32, vget_high_f32(vf02), 1); vo01 = vmlaq_lane_f32(vo01, vi32, vget_high_f32(vf02), 1);
#endif #endif
vst1q_f32(out_base + out_offset, vo00); vst1q_f32(out_base + out_offset, vo00);
vst1q_f32(out_base + out_offset + out_width, vo01); vst1q_f32(out_base + out_offset + p.out_width, vo01);
} // w } // w
// right // right
for (; w < out_width; ++w) { for (; w < p.out_width; ++w) {
DepthwiseConv2dPixel(in_base, DepthwiseConv2dPixel(in_base,
filter_ptr, filter_ptr,
h, h,
w, w,
h - pad_top, h - p.pad_top,
w - pad_left, w - p.pad_left,
out_width, p.out_width,
in_height, p.in_height,
in_width, p.in_width,
3, 3,
3, 3,
out_base); out_base);
...@@ -266,11 +222,11 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context, ...@@ -266,11 +222,11 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context,
filter_ptr, filter_ptr,
h + 1, h + 1,
w, w,
h + 1 - pad_top, h + 1 - p.pad_top,
w - pad_left, w - p.pad_left,
out_width, p.out_width,
in_height, p.in_height,
in_width, p.in_width,
3, 3,
3, 3,
out_base); out_base);
...@@ -279,17 +235,17 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context, ...@@ -279,17 +235,17 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context,
// bottom // bottom
for (; h < out_height; ++h) { for (; h < p.out_height; ++h) {
for (w = 0; w < out_width; ++w) { for (w = 0; w < p.out_width; ++w) {
DepthwiseConv2dPixel(in_base, DepthwiseConv2dPixel(in_base,
filter_ptr, filter_ptr,
h, h,
w, w,
h - pad_top, h - p.pad_top,
w - pad_left, w - p.pad_left,
out_width, p.out_width,
in_height, p.in_height,
in_width, p.in_width,
3, 3,
3, 3,
out_base); out_base);
...@@ -297,86 +253,41 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context, ...@@ -297,86 +253,41 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context,
} }
} // m } // m
} // b } // b
}, 0, batch, 1, 0, out_channels, 1); // threadpool }, 0, p.batch, 1, 0, p.out_channels, 1); // threadpool
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context, template<>
const mace::Tensor *input, MaceStatus DepthwiseConv2dK3x3S2<float>::DoCompute(
const mace::Tensor *filter, const DepthwiseConvComputeParam &p, const float *filter_data,
mace::Tensor *output) { const float *input_data, float *output_data) {
MACE_UNUSED(context); p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
std::vector<index_t> out_shape(4);
std::vector<int> paddings(2);
auto &in_shape = input->shape();
auto &filter_shape = filter->shape();
CalOutputShapeAndInputPadSize(in_shape, filter_shape, &out_shape, &paddings);
out_shape[1] *= in_shape[1];
MACE_RETURN_IF_ERROR(output->Resize(out_shape));
output->Clear();
const int pad_top = paddings[0] / 2;
const int pad_left = paddings[1] / 2;
const index_t batch = in_shape[0];
const index_t in_channels = in_shape[1];
const index_t in_height = in_shape[2];
const index_t in_width = in_shape[3];
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
const index_t multiplier = out_channels / in_channels;
std::vector<index_t> out_bounds;
CalOutputBoundaryWithoutUsingInputPad(out_shape, paddings, &out_bounds);
const index_t valid_h_start = out_bounds[0];
const index_t valid_h_stop = out_bounds[1];
const index_t valid_w_start = out_bounds[2];
const index_t valid_w_stop = out_bounds[3];
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
auto filter_data = filter->data<float>();
auto input_data = input->data<float>();
auto output_data = output->mutable_data<float>();
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t m = start1; m < end1; m += step1) { for (index_t m = start1; m < end1; m += step1) {
index_t c = m / multiplier; index_t c = m / p.multiplier;
index_t multi_index = m % multiplier; index_t multi_index = m % p.multiplier;
const float const float
*in_base = input_data + b * in_batch_size + c * in_image_size; *in_base = input_data + b * p.in_batch_size + c * p.in_image_size;
const float const float
*filter_ptr = filter_data + multi_index * in_channels * 9 + c * 9; *filter_ptr = filter_data + multi_index * p.in_channels * 9 + c * 9;
float *out_base = output_data + b * out_batch_size + m * out_image_size; float *out_base =
output_data + b * p.out_batch_size + m * p.out_image_size;
index_t h, w; index_t h, w;
// top // top
for (h = 0; h < valid_h_start; ++h) { for (h = 0; h < p.valid_h_start; ++h) {
for (w = 0; w < out_width; ++w) { for (w = 0; w < p.out_width; ++w) {
DepthwiseConv2dPixel(in_base, DepthwiseConv2dPixel(in_base,
filter_ptr, filter_ptr,
h, h,
w, w,
h * 2 - pad_top, h * 2 - p.pad_top,
w * 2 - pad_left, w * 2 - p.pad_left,
out_width, p.out_width,
in_height, p.in_height,
in_width, p.in_width,
3, 3,
3, 3,
out_base); out_base);
...@@ -389,24 +300,24 @@ MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context, ...@@ -389,24 +300,24 @@ MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context,
vf01 = vld1q_f32(filter_ptr + 3); vf01 = vld1q_f32(filter_ptr + 3);
vf02 = vld1q_f32(filter_ptr + 5); vf02 = vld1q_f32(filter_ptr + 5);
for (h = valid_h_start; h < valid_h_stop; ++h) { for (h = p.valid_h_start; h < p.valid_h_stop; ++h) {
// left // left
for (w = 0; w < valid_w_start; ++w) { for (w = 0; w < p.valid_w_start; ++w) {
DepthwiseConv2dPixel(in_base, DepthwiseConv2dPixel(in_base,
filter_ptr, filter_ptr,
h, h,
w, w,
h * 2 - pad_top, h * 2 - p.pad_top,
w * 2 - pad_left, w * 2 - p.pad_left,
out_width, p.out_width,
in_height, p.in_height,
in_width, p.in_width,
3, 3,
3, 3,
out_base); out_base);
} }
for (w = valid_w_start; w + 3 < valid_w_stop; w += 4) { for (w = p.valid_w_start; w + 3 < p.valid_w_stop; w += 4) {
float32x4x2_t vi0, vi1, vi2; float32x4x2_t vi0, vi1, vi2;
float32x4_t vi0n, vi1n, vi2n; float32x4_t vi0n, vi1n, vi2n;
...@@ -419,19 +330,19 @@ MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context, ...@@ -419,19 +330,19 @@ MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context,
float32x4_t vo; float32x4_t vo;
// load input // load input
index_t in_h = h * 2 - pad_top; index_t in_h = h * 2 - p.pad_top;
index_t in_w = w * 2 - pad_left; index_t in_w = w * 2 - p.pad_left;
index_t in_offset = in_h * in_width + in_w; index_t in_offset = in_h * p.in_width + in_w;
vi0 = vld2q_f32(in_base + in_offset); // [0.2.4.6, 1.3.5.7] vi0 = vld2q_f32(in_base + in_offset); // [0.2.4.6, 1.3.5.7]
vi1 = vld2q_f32(in_base + in_offset + in_width); vi1 = vld2q_f32(in_base + in_offset + p.in_width);
vi2 = vld2q_f32(in_base + in_offset + 2 * in_width); vi2 = vld2q_f32(in_base + in_offset + 2 * p.in_width);
vi0n = vld1q_f32(in_base + in_offset + 8); // [8.9.10.11] vi0n = vld1q_f32(in_base + in_offset + 8); // [8.9.10.11]
vi1n = vld1q_f32(in_base + in_offset + in_width + 8); vi1n = vld1q_f32(in_base + in_offset + p.in_width + 8);
vi2n = vld1q_f32(in_base + in_offset + 2 * in_width + 8); vi2n = vld1q_f32(in_base + in_offset + 2 * p.in_width + 8);
                    // load output                                        // load output
index_t out_offset = h * out_width + w; index_t out_offset = h * p.out_width + w;
vo = vld1q_f32(out_base + out_offset); vo = vld1q_f32(out_base + out_offset);
vi00 = vi0.val[0]; // [0.2.4.6] vi00 = vi0.val[0]; // [0.2.4.6]
...@@ -471,16 +382,16 @@ MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context, ...@@ -471,16 +382,16 @@ MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context,
} // w } // w
// right // right
for (; w < out_width; ++w) { for (; w < p.out_width; ++w) {
DepthwiseConv2dPixel(in_base, DepthwiseConv2dPixel(in_base,
filter_ptr, filter_ptr,
h, h,
w, w,
h * 2 - pad_top, h * 2 - p.pad_top,
w * 2 - pad_left, w * 2 - p.pad_left,
out_width, p.out_width,
in_height, p.in_height,
in_width, p.in_width,
3, 3,
3, 3,
out_base); out_base);
...@@ -489,17 +400,17 @@ MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context, ...@@ -489,17 +400,17 @@ MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context,
// bottom // bottom
for (; h < out_height; ++h) { for (; h < p.out_height; ++h) {
for (w = 0; w < out_width; ++w) { for (w = 0; w < p.out_width; ++w) {
DepthwiseConv2dPixel(in_base, DepthwiseConv2dPixel(in_base,
filter_ptr, filter_ptr,
h, h,
w, w,
h * 2 - pad_top, h * 2 - p.pad_top,
w * 2 - pad_left, w * 2 - p.pad_left,
out_width, p.out_width,
in_height, p.in_height,
in_width, p.in_width,
3, 3,
3, 3,
out_base); out_base);
...@@ -507,23 +418,11 @@ MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context, ...@@ -507,23 +418,11 @@ MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context,
} }
} // m } // m
} // b } // b
}, 0, batch, 1, 0, out_channels, 1); }, 0, p.batch, 1, 0, p.out_channels, 1);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
void RegisterDepthwiseConv2dK3x3Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, DepthwiseConv2dK3x3S1, delegator::DepthwiseConv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S1));
MACE_REGISTER_DELEGATOR(
registry, DepthwiseConv2dK3x3S2, delegator::DepthwiseConv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S2));
}
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
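The pattern in this file repeats across the whole commit: the old fp32::DepthwiseConv2dK3x3S1/S2::Compute(context, input, filter, output) methods, which each re-derived the output shape, padding and valid-region bounds before launching the NEON loops, become float specializations of a templated DoCompute(p, filter_data, input_data, output_data) that contains only the compute loops; all bookkeeping now arrives pre-filled in the DepthwiseConvComputeParam p. The shared code lives in the new arm/base sources, which this hunk does not show, so the sketch below is only an illustration of that split (the field list is taken from the p.* accesses in the diff; DepthwiseConv2dK3x3Base and CalComputeParam are made-up names, not MACE API):

struct DepthwiseConvComputeParam {   // illustrative only
  index_t batch, in_channels, in_height, in_width;
  index_t out_channels, out_height, out_width;
  index_t in_image_size, out_image_size;
  index_t in_batch_size, out_batch_size;
  index_t multiplier;
  index_t pad_top, pad_left;
  index_t valid_h_start, valid_h_stop, valid_w_start, valid_w_stop;
  utils::ThreadPool &thread_pool;
};

template <typename T>
MaceStatus DepthwiseConv2dK3x3Base<T>::Compute(const OpContext *context,
                                               const Tensor *input,
                                               const Tensor *filter,
                                               Tensor *output) {
  // Resize the output, map the tensors and derive pad/valid ranges once,
  // the way the removed fp32 Compute() bodies used to do per kernel ...
  DepthwiseConvComputeParam p =
      CalComputeParam(context, input, filter, output);  // hypothetical helper
  // ... then hand everything to the stride-specific NEON kernel.
  return DoCompute(p, filter->data<T>(), input->data<T>(),
                   output->mutable_data<T>());
}

Keeping DoCompute() free of Tensor handling is what allows the same NEON tile code to be instantiated for other element types later without touching the loops.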
...@@ -12,69 +12,26 @@ ...@@ -12,69 +12,26 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/arm/fp32/depthwise_deconv_2d_3x3.h"
#include <arm_neon.h> #include <arm_neon.h>
#include "mace/ops/arm/base/depthwise_deconv_2d_3x3.h"
#include "mace/ops/arm/fp32/common_neon.h" #include "mace/ops/arm/fp32/common_neon.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
template<>
MaceStatus DepthwiseDeconv2dK3x3S1::Compute(const OpContext *context, MaceStatus DepthwiseDeconv2dK3x3S1<float>::DoCompute(
const Tensor *input, const DepthwiseDeconvComputeParam &p, const float *filter_data,
const Tensor *filter, const float *input_data, float *padded_out_data) {
const Tensor *output_shape, p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
Tensor *output) {
std::unique_ptr<Tensor> padded_out;
std::vector<int> out_pad_size;
group_ = input->dim(1);
ResizeOutAndPadOut(context,
input,
filter,
output_shape,
output,
&out_pad_size,
&padded_out);
Tensor *out_tensor = output;
if (padded_out != nullptr) {
out_tensor = padded_out.get();
}
out_tensor->Clear();
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto padded_out_data = out_tensor->mutable_data<float>();
auto &in_shape = input->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t channels = in_shape[1];
const index_t h = in_shape[2];
const index_t w = in_shape[3];
const index_t in_img_size = h * w;
const index_t outh = out_shape[2];
const index_t outw = out_shape[3];
const index_t out_img_size = outh * outw;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t c = start1; c < end1; c += step1) { for (index_t c = start1; c < end1; c += step1) {
const index_t offset = b * channels + c; const index_t offset = b * p.in_channels + c;
float *out_base = padded_out_data + offset * out_img_size; float *out_base = padded_out_data + offset * p.out_img_size;
const float *input_base = input_data + offset * in_img_size; const float *input_base = input_data + offset * p.in_img_size;
const float *kernel_base = filter_data + c * 9; const float *kernel_base = filter_data + c * 9;
const float *in = input_base; const float *in = input_base;
const float *k0 = kernel_base; const float *k0 = kernel_base;
...@@ -86,14 +43,14 @@ MaceStatus DepthwiseDeconv2dK3x3S1::Compute(const OpContext *context, ...@@ -86,14 +43,14 @@ MaceStatus DepthwiseDeconv2dK3x3S1::Compute(const OpContext *context,
float32x4_t k1_vec = vld1q_f32(k1); float32x4_t k1_vec = vld1q_f32(k1);
float32x4_t k2_vec = vld1q_f32(k2); float32x4_t k2_vec = vld1q_f32(k2);
for (index_t i = 0; i < h; ++i) { for (index_t i = 0; i < p.in_height; ++i) {
float *out_row_base = out_base + i * outw; float *out_row_base = out_base + i * p.out_width;
float *out_row0 = out_row_base; float *out_row0 = out_row_base;
float *out_row1 = out_row_base + outw; float *out_row1 = out_row_base + p.out_width;
float *out_row2 = out_row_base + 2 * outw; float *out_row2 = out_row_base + 2 * p.out_width;
index_t j = 0; index_t j = 0;
for (; j + 3 < w; j += 4) { for (; j + 3 < p.in_width; j += 4) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
float32x4_t out00, out01, out02; float32x4_t out00, out01, out02;
...@@ -142,7 +99,7 @@ MaceStatus DepthwiseDeconv2dK3x3S1::Compute(const OpContext *context, ...@@ -142,7 +99,7 @@ MaceStatus DepthwiseDeconv2dK3x3S1::Compute(const OpContext *context,
out_row2 += 4; out_row2 += 4;
} }
for (; j < w; ++j) { for (; j < p.in_width; ++j) {
float val = in[0]; float val = in[0];
for (int k = 0; k < 3; ++k) { for (int k = 0; k < 3; ++k) {
out_row0[k] += val * k0[k]; out_row0[k] += val * k0[k];
...@@ -157,66 +114,22 @@ MaceStatus DepthwiseDeconv2dK3x3S1::Compute(const OpContext *context, ...@@ -157,66 +114,22 @@ MaceStatus DepthwiseDeconv2dK3x3S1::Compute(const OpContext *context,
} }
} }
} }
}, 0, batch, 1, 0, channels, 1); }, 0, p.batch, 1, 0, p.in_channels, 1);
UnPadOutput(*out_tensor, out_pad_size, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MaceStatus DepthwiseDeconv2dK3x3S2::Compute(const OpContext *context, template<>
const Tensor *input, MaceStatus DepthwiseDeconv2dK3x3S2<float>::DoCompute(
const Tensor *filter, const DepthwiseDeconvComputeParam &p, const float *filter_data,
const Tensor *output_shape, const float *input_data, float *padded_out_data) {
Tensor *output) { p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
std::unique_ptr<Tensor> padded_out;
std::vector<int> out_pad_size;
group_ = input->dim(1);
ResizeOutAndPadOut(context,
input,
filter,
output_shape,
output,
&out_pad_size,
&padded_out);
Tensor *out_tensor = output;
if (padded_out != nullptr) {
out_tensor = padded_out.get();
}
out_tensor->Clear();
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto padded_out_data = out_tensor->mutable_data<float>();
auto &in_shape = input->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t channels = in_shape[1];
const index_t h = in_shape[2];
const index_t w = in_shape[3];
const index_t in_img_size = h * w;
const index_t outh = out_shape[2];
const index_t outw = out_shape[3];
const index_t out_img_size = outh * outw;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t c = start1; c < end1; c += step1) { for (index_t c = start1; c < end1; c += step1) {
const index_t offset = b * channels + c; const index_t offset = b * p.in_channels + c;
float *out_base = padded_out_data + offset * out_img_size; float *out_base = padded_out_data + offset * p.out_img_size;
const float *input_base = input_data + offset * in_img_size; const float *input_base = input_data + offset * p.in_img_size;
const float *kernel_base = filter_data + c * 9; const float *kernel_base = filter_data + c * 9;
const float *in = input_base; const float *in = input_base;
...@@ -228,15 +141,15 @@ MaceStatus DepthwiseDeconv2dK3x3S2::Compute(const OpContext *context, ...@@ -228,15 +141,15 @@ MaceStatus DepthwiseDeconv2dK3x3S2::Compute(const OpContext *context,
float32x4_t k1_vec = vld1q_f32(k1); float32x4_t k1_vec = vld1q_f32(k1);
float32x4_t k2_vec = vld1q_f32(k2); float32x4_t k2_vec = vld1q_f32(k2);
for (index_t i = 0; i < h; ++i) { for (index_t i = 0; i < p.in_height; ++i) {
float *out_row_base = out_base + i * 2 * outw; float *out_row_base = out_base + i * 2 * p.out_width;
float *out_row_0 = out_row_base; float *out_row_0 = out_row_base;
float *out_row_1 = out_row_0 + outw; float *out_row_1 = out_row_0 + p.out_width;
float *out_row_2 = out_row_1 + outw; float *out_row_2 = out_row_1 + p.out_width;
index_t j = 0; index_t j = 0;
for (index_t n = 0; n + 9 < outw; n += 8) { for (index_t n = 0; n + 9 < p.out_width; n += 8) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
// out row 0 // out row 0
...@@ -285,7 +198,7 @@ MaceStatus DepthwiseDeconv2dK3x3S2::Compute(const OpContext *context, ...@@ -285,7 +198,7 @@ MaceStatus DepthwiseDeconv2dK3x3S2::Compute(const OpContext *context,
j += 4; j += 4;
} }
for (; j < w; ++j) { for (; j < p.in_width; ++j) {
float val = in[0]; float val = in[0];
for (int k = 0; k < 3; ++k) { for (int k = 0; k < 3; ++k) {
...@@ -302,80 +215,31 @@ MaceStatus DepthwiseDeconv2dK3x3S2::Compute(const OpContext *context, ...@@ -302,80 +215,31 @@ MaceStatus DepthwiseDeconv2dK3x3S2::Compute(const OpContext *context,
} }
} }
} }
}, 0, batch, 1, 0, channels, 1); }, 0, p.batch, 1, 0, p.in_channels, 1);
UnPadOutput(*out_tensor, out_pad_size, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MaceStatus GroupDeconv2dK3x3S1::Compute(const OpContext *context, template<>
const Tensor *input, MaceStatus GroupDeconv2dK3x3S1<float>::DoCompute(
const Tensor *filter, const GroupDeconvComputeParam &p, const float *filter_data,
const Tensor *output_shape, const float *input_data, float *padded_out_data) {
Tensor *output) { p.thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0,
std::unique_ptr<Tensor> padded_out;
std::vector<int> out_pad_size;
ResizeOutAndPadOut(context,
input,
filter,
output_shape,
output,
&out_pad_size,
&padded_out);
Tensor *out_tensor = output;
if (padded_out != nullptr) {
out_tensor = padded_out.get();
}
out_tensor->Clear();
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto padded_out_data = out_tensor->mutable_data<float>();
auto &in_shape = input->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t inch = in_shape[1];
const index_t h = in_shape[2];
const index_t w = in_shape[3];
const index_t outch = out_shape[1];
const index_t outh = out_shape[2];
const index_t outw = out_shape[3];
const index_t in_img_size = h * w;
const index_t out_img_size = outh * outw;
const index_t inch_g = inch / group_;
const index_t outch_g = outch / group_;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1, index_t start1, index_t end1, index_t step1,
index_t start2, index_t end2, index_t step2) { index_t start2, index_t end2, index_t step2) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t g = start1; g < end1; g += step1) { for (index_t g = start1; g < end1; g += step1) {
for (index_t oc = start2; oc < end2; oc += step2) { for (index_t oc = start2; oc < end2; oc += step2) {
if (oc + 1 < outch_g) { if (oc + 1 < p.outch_g) {
const index_t out_offset = b * outch + outch_g * g + oc; const index_t out_offset = b * p.out_channels + p.outch_g * g + oc;
float *out_base0 = padded_out_data + out_offset * out_img_size; float *out_base0 = padded_out_data + out_offset * p.out_img_size;
float *out_base1 = out_base0 + out_img_size; float *out_base1 = out_base0 + p.out_img_size;
for (index_t ic = 0; ic < inch_g; ++ic) { for (index_t ic = 0; ic < p.inch_g; ++ic) {
const index_t in_offset = b * inch + inch_g * g + ic; const index_t in_offset = b * p.in_channels + p.inch_g * g + ic;
const float *input_base = input_data + in_offset * in_img_size; const float *input_base = input_data + in_offset * p.in_img_size;
const index_t kernel_offset = (oc * group_ + g) * inch_g + ic; const index_t kernel_offset = (oc * group_ + g) * p.inch_g + ic;
const float *kernel_base0 = filter_data + kernel_offset * 9; const float *kernel_base0 = filter_data + kernel_offset * 9;
const float *kernel_base1 = kernel_base0 + inch * 9; const float *kernel_base1 = kernel_base0 + p.in_channels * 9;
const float *in = input_base; const float *in = input_base;
// output channel 0 // output channel 0
...@@ -399,20 +263,20 @@ MaceStatus GroupDeconv2dK3x3S1::Compute(const OpContext *context, ...@@ -399,20 +263,20 @@ MaceStatus GroupDeconv2dK3x3S1::Compute(const OpContext *context,
k11_vec = vld1q_f32(k1_1); k11_vec = vld1q_f32(k1_1);
k12_vec = vld1q_f32(k1_2); k12_vec = vld1q_f32(k1_2);
for (index_t i = 0; i < h; ++i) { for (index_t i = 0; i < p.in_height; ++i) {
float *out_row_base0 = out_base0 + i * outw; float *out_row_base0 = out_base0 + i * p.out_width;
float *out_row0_0 = out_row_base0; float *out_row0_0 = out_row_base0;
float *out_row0_1 = out_row_base0 + outw; float *out_row0_1 = out_row_base0 + p.out_width;
float *out_row0_2 = out_row_base0 + 2 * outw; float *out_row0_2 = out_row_base0 + 2 * p.out_width;
float *out_row_base1 = out_base1 + i * outw; float *out_row_base1 = out_base1 + i * p.out_width;
float *out_row1_0 = out_row_base1; float *out_row1_0 = out_row_base1;
float *out_row1_1 = out_row_base1 + outw; float *out_row1_1 = out_row_base1 + p.out_width;
float *out_row1_2 = out_row_base1 + 2 * outw; float *out_row1_2 = out_row_base1 + 2 * p.out_width;
index_t j = 0; index_t j = 0;
for (; j + 3 < w; j += 4) { for (; j + 3 < p.in_width; j += 4) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
float32x4_t out00, out01, out02; float32x4_t out00, out01, out02;
...@@ -500,7 +364,7 @@ MaceStatus GroupDeconv2dK3x3S1::Compute(const OpContext *context, ...@@ -500,7 +364,7 @@ MaceStatus GroupDeconv2dK3x3S1::Compute(const OpContext *context,
out_row1_2 += 4; out_row1_2 += 4;
} }
for (; j < w; ++j) { for (; j < p.in_width; ++j) {
float val = in[0]; float val = in[0];
for (int k = 0; k < 3; ++k) { for (int k = 0; k < 3; ++k) {
out_row0_0[k] += val * k0_0[k]; out_row0_0[k] += val * k0_0[k];
...@@ -521,12 +385,12 @@ MaceStatus GroupDeconv2dK3x3S1::Compute(const OpContext *context, ...@@ -521,12 +385,12 @@ MaceStatus GroupDeconv2dK3x3S1::Compute(const OpContext *context,
} }
} }
} else { } else {
const index_t out_offset = b * outch + outch_g * g + oc; const index_t out_offset = b * p.out_channels + p.outch_g * g + oc;
float *out_base0 = padded_out_data + out_offset * out_img_size; float *out_base0 = padded_out_data + out_offset * p.out_img_size;
for (index_t ic = 0; ic < inch_g; ++ic) { for (index_t ic = 0; ic < p.inch_g; ++ic) {
const index_t in_offset = (b * group_ + g) * inch_g + ic; const index_t in_offset = (b * group_ + g) * p.inch_g + ic;
const float *input_base = input_data + in_offset * in_img_size; const float *input_base = input_data + in_offset * p.in_img_size;
const index_t kernel_offset = (oc * group_ + g) * inch_g + ic; const index_t kernel_offset = (oc * group_ + g) * p.inch_g + ic;
const float *kernel_base0 = filter_data + kernel_offset * 9; const float *kernel_base0 = filter_data + kernel_offset * 9;
const float *in = input_base; const float *in = input_base;
const float *k0_0 = kernel_base0; const float *k0_0 = kernel_base0;
...@@ -538,14 +402,14 @@ MaceStatus GroupDeconv2dK3x3S1::Compute(const OpContext *context, ...@@ -538,14 +402,14 @@ MaceStatus GroupDeconv2dK3x3S1::Compute(const OpContext *context,
float32x4_t k01_vec = vld1q_f32(k0_1); float32x4_t k01_vec = vld1q_f32(k0_1);
float32x4_t k02_vec = vld1q_f32(k0_2); float32x4_t k02_vec = vld1q_f32(k0_2);
for (index_t i = 0; i < h; ++i) { for (index_t i = 0; i < p.in_height; ++i) {
float *out_row_base0 = out_base0 + i * outw; float *out_row_base0 = out_base0 + i * p.out_width;
float *out_row0_0 = out_row_base0; float *out_row0_0 = out_row_base0;
float *out_row0_1 = out_row_base0 + outw; float *out_row0_1 = out_row_base0 + p.out_width;
float *out_row0_2 = out_row_base0 + 2 * outw; float *out_row0_2 = out_row_base0 + 2 * p.out_width;
index_t j = 0; index_t j = 0;
for (; j + 3 < w; j += 4) { for (; j + 3 < p.in_width; j += 4) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
float32x4_t out00, out01, out02; float32x4_t out00, out01, out02;
...@@ -594,7 +458,7 @@ MaceStatus GroupDeconv2dK3x3S1::Compute(const OpContext *context, ...@@ -594,7 +458,7 @@ MaceStatus GroupDeconv2dK3x3S1::Compute(const OpContext *context,
out_row0_2 += 4; out_row0_2 += 4;
} }
for (; j < w; ++j) { for (; j < p.in_width; ++j) {
float val = in[0]; float val = in[0];
for (int k = 0; k < 3; ++k) { for (int k = 0; k < 3; ++k) {
out_row0_0[k] += val * k0_0[k]; out_row0_0[k] += val * k0_0[k];
...@@ -612,76 +476,27 @@ MaceStatus GroupDeconv2dK3x3S1::Compute(const OpContext *context, ...@@ -612,76 +476,27 @@ MaceStatus GroupDeconv2dK3x3S1::Compute(const OpContext *context,
} }
} }
} }
}, 0, batch, 1, 0, group_, 1, 0, outch_g, 2); }, 0, p.batch, 1, 0, group_, 1, 0, p.outch_g, 2);
UnPadOutput(*out_tensor, out_pad_size, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MaceStatus GroupDeconv2dK3x3S2::Compute(const OpContext *context, template<>
const Tensor *input, MaceStatus GroupDeconv2dK3x3S2<float>::DoCompute(
const Tensor *filter, const GroupDeconvComputeParam &p, const float *filter_data,
const Tensor *output_shape, const float *input_data, float *padded_out_data) {
Tensor *output) { p.thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0,
std::unique_ptr<Tensor> padded_out;
std::vector<int> out_pad_size;
ResizeOutAndPadOut(context,
input,
filter,
output_shape,
output,
&out_pad_size,
&padded_out);
Tensor *out_tensor = output;
if (padded_out != nullptr) {
out_tensor = padded_out.get();
}
out_tensor->Clear();
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto padded_out_data = out_tensor->mutable_data<float>();
auto &in_shape = input->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t inch = in_shape[1];
const index_t h = in_shape[2];
const index_t w = in_shape[3];
const index_t outch = out_shape[1];
const index_t outh = out_shape[2];
const index_t outw = out_shape[3];
const index_t in_img_size = h * w;
const index_t out_img_size = outh * outw;
const index_t inch_g = inch / group_;
const index_t outch_g = outch / group_;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1, index_t start1, index_t end1, index_t step1,
index_t start2, index_t end2, index_t step2) { index_t start2, index_t end2, index_t step2) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t g = start1; g < end1; g += step1) { for (index_t g = start1; g < end1; g += step1) {
for (index_t oc = start2; oc < end2; oc += step2) { for (index_t oc = start2; oc < end2; oc += step2) {
const index_t out_offset = b * outch + outch_g * g + oc; const index_t out_offset = b * p.out_channels + p.outch_g * g + oc;
float *out_base = padded_out_data + out_offset * out_img_size; float *out_base = padded_out_data + out_offset * p.out_img_size;
for (index_t ic = 0; ic < inch_g; ++ic) { for (index_t ic = 0; ic < p.inch_g; ++ic) {
const index_t in_offset = b * inch + inch_g * g + ic; const index_t in_offset = b * p.in_channels + p.inch_g * g + ic;
const float *input_base = input_data + in_offset * in_img_size; const float *input_base = input_data + in_offset * p.in_img_size;
const index_t kernel_offset = (oc * group_ + g) * inch_g + ic; const index_t kernel_offset = (oc * group_ + g) * p.inch_g + ic;
const float *kernel_base = filter_data + kernel_offset * 9; const float *kernel_base = filter_data + kernel_offset * 9;
const float *in = input_base; const float *in = input_base;
...@@ -693,15 +508,15 @@ MaceStatus GroupDeconv2dK3x3S2::Compute(const OpContext *context, ...@@ -693,15 +508,15 @@ MaceStatus GroupDeconv2dK3x3S2::Compute(const OpContext *context,
float32x4_t k1_vec = vld1q_f32(k1); float32x4_t k1_vec = vld1q_f32(k1);
float32x4_t k2_vec = vld1q_f32(k2); float32x4_t k2_vec = vld1q_f32(k2);
for (index_t i = 0; i < h; ++i) { for (index_t i = 0; i < p.in_height; ++i) {
float *out_row_base = out_base + i * 2 * outw; float *out_row_base = out_base + i * 2 * p.out_width;
float *out_row_0 = out_row_base; float *out_row_0 = out_row_base;
float *out_row_1 = out_row_0 + outw; float *out_row_1 = out_row_0 + p.out_width;
float *out_row_2 = out_row_1 + outw; float *out_row_2 = out_row_1 + p.out_width;
index_t j = 0; index_t j = 0;
for (index_t n = 0; n + 9 < outw; n += 8) { for (index_t n = 0; n + 9 < p.out_width; n += 8) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
// out row 0 // out row 0
...@@ -750,7 +565,7 @@ MaceStatus GroupDeconv2dK3x3S2::Compute(const OpContext *context, ...@@ -750,7 +565,7 @@ MaceStatus GroupDeconv2dK3x3S2::Compute(const OpContext *context,
j += 4; j += 4;
} }
for (; j < w; ++j) { for (; j < p.in_width; ++j) {
float val = in[0]; float val = in[0];
for (int k = 0; k < 3; ++k) { for (int k = 0; k < 3; ++k) {
...@@ -769,36 +584,11 @@ MaceStatus GroupDeconv2dK3x3S2::Compute(const OpContext *context, ...@@ -769,36 +584,11 @@ MaceStatus GroupDeconv2dK3x3S2::Compute(const OpContext *context,
} }
} }
} }
}, 0, batch, 1, 0, group_, 1, 0, outch_g, 1); }, 0, p.batch, 1, 0, group_, 1, 0, p.outch_g, 1);
UnPadOutput(*out_tensor, out_pad_size, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
void RegisterDepthwiseDeconv2dK3x3Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, DepthwiseDeconv2dK3x3S1, delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S1));
MACE_REGISTER_DELEGATOR(
registry, DepthwiseDeconv2dK3x3S2, delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S2));
}
void RegisterGroupDeconv2dK3x3Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, GroupDeconv2dK3x3S1, delegator::GroupDeconv2dParam,
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S1));
MACE_REGISTER_DELEGATOR(
registry, GroupDeconv2dK3x3S2, delegator::GroupDeconv2dParam,
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S2));
}
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
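The deconvolution kernels follow the same split, with one extra step: the removed Compute() bodies also created a padded output tensor (ResizeOutAndPadOut), cleared it, ran the loops into it and finally copied the valid region back with UnPadOutput. After the refactor, DoCompute() receives the already padded buffer, so the pad/unpad plumbing only has to exist once. A rough sketch of how that wrapper could look, assembled from the code deleted here (DepthwiseDeconv2dK3x3Base and CalComputeParam are illustrative names; the real base implementation in arm/base is not part of this diff):

template <typename T>
MaceStatus DepthwiseDeconv2dK3x3Base<T>::Compute(const OpContext *context,
                                                 const Tensor *input,
                                                 const Tensor *filter,
                                                 const Tensor *output_shape,
                                                 Tensor *output) {
  std::unique_ptr<Tensor> padded_out;
  std::vector<int> out_pad_size;
  group_ = input->dim(1);
  ResizeOutAndPadOut(context, input, filter, output_shape, output,
                     &out_pad_size, &padded_out);
  Tensor *out_tensor = (padded_out != nullptr) ? padded_out.get() : output;
  out_tensor->Clear();

  Tensor::MappingGuard input_mapper(input);
  Tensor::MappingGuard filter_mapper(filter);
  Tensor::MappingGuard output_mapper(output);

  DepthwiseDeconvComputeParam p =
      CalComputeParam(context, input, out_tensor);  // hypothetical helper
  MACE_RETURN_IF_ERROR(DoCompute(p, filter->data<T>(), input->data<T>(),
                                 out_tensor->mutable_data<T>()));
  UnPadOutput(*out_tensor, out_pad_size, output);
  return MaceStatus::MACE_SUCCESS;
}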
...@@ -12,69 +12,26 @@ ...@@ -12,69 +12,26 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/arm/fp32/depthwise_deconv_2d_4x4.h"
#include <arm_neon.h> #include <arm_neon.h>
#include "mace/ops/arm/base/depthwise_deconv_2d_4x4.h"
#include "mace/ops/arm/fp32/common_neon.h" #include "mace/ops/arm/fp32/common_neon.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
template<>
MaceStatus DepthwiseDeconv2dK4x4S1::Compute(const OpContext *context, MaceStatus DepthwiseDeconv2dK4x4S1<float>::DoCompute(
const Tensor *input, const DepthwiseDeconvComputeParam &p, const float *filter_data,
const Tensor *filter, const float *input_data, float *padded_out_data) {
const Tensor *output_shape, p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
Tensor *output) {
std::unique_ptr<Tensor> padded_out;
std::vector<int> out_pad_size;
group_ = input->dim(1);
ResizeOutAndPadOut(context,
input,
filter,
output_shape,
output,
&out_pad_size,
&padded_out);
Tensor *out_tensor = output;
if (padded_out != nullptr) {
out_tensor = padded_out.get();
}
out_tensor->Clear();
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto padded_out_data = out_tensor->mutable_data<float>();
auto &in_shape = input->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t channels = in_shape[1];
const index_t h = in_shape[2];
const index_t w = in_shape[3];
const index_t in_img_size = h * w;
const index_t outh = out_shape[2];
const index_t outw = out_shape[3];
const index_t out_img_size = outh * outw;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t c = start1; c < end1; c += step1) { for (index_t c = start1; c < end1; c += step1) {
const index_t offset = b * channels + c; const index_t offset = b * p.in_channels + c;
float *out_base = padded_out_data + offset * out_img_size; float *out_base = padded_out_data + offset * p.out_img_size;
const float *input_base = input_data + offset * in_img_size; const float *input_base = input_data + offset * p.in_img_size;
const float *kernel_base = filter_data + c * 16; const float *kernel_base = filter_data + c * 16;
const float *in = input_base; const float *in = input_base;
const float *k0 = kernel_base; const float *k0 = kernel_base;
...@@ -87,15 +44,15 @@ MaceStatus DepthwiseDeconv2dK4x4S1::Compute(const OpContext *context, ...@@ -87,15 +44,15 @@ MaceStatus DepthwiseDeconv2dK4x4S1::Compute(const OpContext *context,
float32x4_t k2_vec = vld1q_f32(k2); float32x4_t k2_vec = vld1q_f32(k2);
float32x4_t k3_vec = vld1q_f32(k3); float32x4_t k3_vec = vld1q_f32(k3);
for (index_t i = 0; i < h; i++) { for (index_t i = 0; i < p.in_height; i++) {
float *out_row = out_base + i * outw; float *out_row = out_base + i * p.out_width;
float *out_row_0 = out_row; float *out_row_0 = out_row;
float *out_row_1 = out_row_0 + outw; float *out_row_1 = out_row_0 + p.out_width;
float *out_row_2 = out_row_1 + outw; float *out_row_2 = out_row_1 + p.out_width;
float *out_row_3 = out_row_2 + outw; float *out_row_3 = out_row_2 + p.out_width;
index_t j = 0; index_t j = 0;
for (; j + 3 < w; j += 4) { for (; j + 3 < p.in_width; j += 4) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
float32x4_t out00 = vld1q_f32(out_row_0); float32x4_t out00 = vld1q_f32(out_row_0);
...@@ -172,7 +129,7 @@ MaceStatus DepthwiseDeconv2dK4x4S1::Compute(const OpContext *context, ...@@ -172,7 +129,7 @@ MaceStatus DepthwiseDeconv2dK4x4S1::Compute(const OpContext *context,
out_row_3 += 4; out_row_3 += 4;
} }
for (; j < w; j++) { for (; j < p.in_width; j++) {
float val = in[0]; float val = in[0];
for (int k = 0; k < 4; ++k) { for (int k = 0; k < 4; ++k) {
out_row_0[k] += val * k0[k]; out_row_0[k] += val * k0[k];
...@@ -189,66 +146,22 @@ MaceStatus DepthwiseDeconv2dK4x4S1::Compute(const OpContext *context, ...@@ -189,66 +146,22 @@ MaceStatus DepthwiseDeconv2dK4x4S1::Compute(const OpContext *context,
} }
} }
} }
}, 0, batch, 1, 0, channels, 1); }, 0, p.batch, 1, 0, p.in_channels, 1);
UnPadOutput(*out_tensor, out_pad_size, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MaceStatus DepthwiseDeconv2dK4x4S2::Compute(const OpContext *context, template<>
const Tensor *input, MaceStatus DepthwiseDeconv2dK4x4S2<float>::DoCompute(
const Tensor *filter, const DepthwiseDeconvComputeParam &p, const float *filter_data,
const Tensor *output_shape, const float *input_data, float *padded_out_data) {
Tensor *output) { p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
std::unique_ptr<Tensor> padded_out;
std::vector<int> out_pad_size;
group_ = input->dim(1);
ResizeOutAndPadOut(context,
input,
filter,
output_shape,
output,
&out_pad_size,
&padded_out);
Tensor *out_tensor = output;
if (padded_out != nullptr) {
out_tensor = padded_out.get();
}
out_tensor->Clear();
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto padded_out_data = out_tensor->mutable_data<float>();
auto &in_shape = input->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t channels = in_shape[1];
const index_t h = in_shape[2];
const index_t w = in_shape[3];
const index_t in_img_size = h * w;
const index_t outh = out_shape[2];
const index_t outw = out_shape[3];
const index_t out_img_size = outh * outw;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t c = start1; c < end1; c += step1) { for (index_t c = start1; c < end1; c += step1) {
const index_t offset = b * channels + c; const index_t offset = b * p.in_channels + c;
float *out_base = padded_out_data + offset * out_img_size; float *out_base = padded_out_data + offset * p.out_img_size;
const float *input_base = input_data + offset * in_img_size; const float *input_base = input_data + offset * p.in_img_size;
const float *kernel_base = filter_data + c * 16; const float *kernel_base = filter_data + c * 16;
const float *in = input_base; const float *in = input_base;
...@@ -262,17 +175,17 @@ MaceStatus DepthwiseDeconv2dK4x4S2::Compute(const OpContext *context, ...@@ -262,17 +175,17 @@ MaceStatus DepthwiseDeconv2dK4x4S2::Compute(const OpContext *context,
float32x4_t k2_vec = vld1q_f32(k2); float32x4_t k2_vec = vld1q_f32(k2);
float32x4_t k3_vec = vld1q_f32(k3); float32x4_t k3_vec = vld1q_f32(k3);
for (index_t i = 0; i < h; i++) { for (index_t i = 0; i < p.in_height; i++) {
float *out_row = out_base + 2 * i * outw; float *out_row = out_base + 2 * i * p.out_width;
float *out_row_0 = out_row; float *out_row_0 = out_row;
float *out_row_1 = out_row_0 + outw; float *out_row_1 = out_row_0 + p.out_width;
float *out_row_2 = out_row_1 + outw; float *out_row_2 = out_row_1 + p.out_width;
float *out_row_3 = out_row_2 + outw; float *out_row_3 = out_row_2 + p.out_width;
index_t j = 0; index_t j = 0;
for (index_t n = 0; n + 9 < outw; n += 8) { for (index_t n = 0; n + 9 < p.out_width; n += 8) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
// row 0 // row 0
...@@ -339,7 +252,7 @@ MaceStatus DepthwiseDeconv2dK4x4S2::Compute(const OpContext *context, ...@@ -339,7 +252,7 @@ MaceStatus DepthwiseDeconv2dK4x4S2::Compute(const OpContext *context,
j += 4; j += 4;
} }
for (; j < w; j++) { for (; j < p.in_width; j++) {
float val = in[0]; float val = in[0];
for (int k = 0; k < 4; ++k) { for (int k = 0; k < 4; ++k) {
out_row_0[k] += val * k0[k]; out_row_0[k] += val * k0[k];
...@@ -356,89 +269,40 @@ MaceStatus DepthwiseDeconv2dK4x4S2::Compute(const OpContext *context, ...@@ -356,89 +269,40 @@ MaceStatus DepthwiseDeconv2dK4x4S2::Compute(const OpContext *context,
} }
} }
} }
}, 0, batch, 1, 0, channels, 1); }, 0, p.batch, 1, 0, p.in_channels, 1);
UnPadOutput(*out_tensor, out_pad_size, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MaceStatus GroupDeconv2dK4x4S1::Compute(const OpContext *context, template<>
const Tensor *input, MaceStatus GroupDeconv2dK4x4S1<float>::DoCompute(
const Tensor *filter, const GroupDeconvComputeParam &p, const float *filter_data,
const Tensor *output_shape, const float *input_data, float *padded_out_data) {
Tensor *output) { p.thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0,
std::unique_ptr<Tensor> padded_out;
std::vector<int> out_pad_size;
ResizeOutAndPadOut(context,
input,
filter,
output_shape,
output,
&out_pad_size,
&padded_out);
Tensor *out_tensor = output;
if (padded_out != nullptr) {
out_tensor = padded_out.get();
}
out_tensor->Clear();
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto padded_out_data = out_tensor->mutable_data<float>();
auto &in_shape = input->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t inch = in_shape[1];
const index_t h = in_shape[2];
const index_t w = in_shape[3];
const index_t outch = out_shape[1];
const index_t outh = out_shape[2];
const index_t outw = out_shape[3];
const index_t in_img_size = h * w;
const index_t out_img_size = outh * outw;
const index_t inch_g = inch / group_;
const index_t outch_g = outch / group_;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1, index_t start1, index_t end1, index_t step1,
index_t start2, index_t end2, index_t step2) { index_t start2, index_t end2, index_t step2) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t g = start1; g < end1; g += step1) { for (index_t g = start1; g < end1; g += step1) {
for (index_t oc = start2; oc < end2; oc += step2) { for (index_t oc = start2; oc < end2; oc += step2) {
if (oc + 1 < outch_g) { if (oc + 1 < p.outch_g) {
const index_t out_offset = const index_t out_offset =
(b * outch + outch_g * g + oc) * out_img_size; (b * p.out_channels + p.outch_g * g + oc) * p.out_img_size;
float *out_base = padded_out_data + out_offset; float *out_base = padded_out_data + out_offset;
float *out_base1 = out_base + out_img_size; float *out_base1 = out_base + p.out_img_size;
for (index_t ic = 0; ic < inch_g; ic++) { for (index_t ic = 0; ic < p.inch_g; ic++) {
const index_t in_offset = const index_t in_offset =
(b * inch + inch_g * g + ic) * in_img_size; (b * p.in_channels + p.inch_g * g + ic) * p.in_img_size;
const float *input_base = input_data + in_offset; const float *input_base = input_data + in_offset;
const float *in = input_base; const float *in = input_base;
const index_t kernel_offset = const index_t kernel_offset =
((oc * group_ + g) * inch_g + ic) * 16; ((oc * group_ + g) * p.inch_g + ic) * 16;
const float *kernel_base = filter_data + kernel_offset; const float *kernel_base = filter_data + kernel_offset;
const float *k0 = kernel_base; const float *k0 = kernel_base;
const float *k1 = kernel_base + 4; const float *k1 = kernel_base + 4;
const float *k2 = kernel_base + 8; const float *k2 = kernel_base + 8;
const float *k3 = kernel_base + 12; const float *k3 = kernel_base + 12;
const float *kernel_base1 = kernel_base + inch * 16; const float *kernel_base1 = kernel_base + p.in_channels * 16;
const float *k10 = kernel_base1; const float *k10 = kernel_base1;
const float *k11 = kernel_base1 + 4; const float *k11 = kernel_base1 + 4;
const float *k12 = kernel_base1 + 8; const float *k12 = kernel_base1 + 8;
...@@ -454,24 +318,24 @@ MaceStatus GroupDeconv2dK4x4S1::Compute(const OpContext *context, ...@@ -454,24 +318,24 @@ MaceStatus GroupDeconv2dK4x4S1::Compute(const OpContext *context,
float32x4_t k12_vec = vld1q_f32(k12); float32x4_t k12_vec = vld1q_f32(k12);
float32x4_t k13_vec = vld1q_f32(k13); float32x4_t k13_vec = vld1q_f32(k13);
for (index_t i = 0; i < h; i++) { for (index_t i = 0; i < p.in_height; i++) {
float *out_row = out_base + i * outw; float *out_row = out_base + i * p.out_width;
float *out_row_0 = out_row; float *out_row_0 = out_row;
float *out_row_1 = out_row_0 + outw; float *out_row_1 = out_row_0 + p.out_width;
float *out_row_2 = out_row_1 + outw; float *out_row_2 = out_row_1 + p.out_width;
float *out_row_3 = out_row_2 + outw; float *out_row_3 = out_row_2 + p.out_width;
float *out_row1 = out_base1 + i * outw; float *out_row1 = out_base1 + i * p.out_width;
float *out_row1_0 = out_row1; float *out_row1_0 = out_row1;
float *out_row1_1 = out_row1_0 + outw; float *out_row1_1 = out_row1_0 + p.out_width;
float *out_row1_2 = out_row1_1 + outw; float *out_row1_2 = out_row1_1 + p.out_width;
float *out_row1_3 = out_row1_2 + outw; float *out_row1_3 = out_row1_2 + p.out_width;
index_t j = 0; index_t j = 0;
for (; j + 3 < w; j += 4) { for (; j + 3 < p.in_width; j += 4) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
float32x4_t out00, out01, out02, out03; float32x4_t out00, out01, out02, out03;
float32x4_t out10, out11, out12, out13; float32x4_t out10, out11, out12, out13;
...@@ -618,7 +482,7 @@ MaceStatus GroupDeconv2dK4x4S1::Compute(const OpContext *context, ...@@ -618,7 +482,7 @@ MaceStatus GroupDeconv2dK4x4S1::Compute(const OpContext *context,
out_row1_3 += 4; out_row1_3 += 4;
} }
for (; j < w; j++) { for (; j < p.in_width; j++) {
float val = in[0]; float val = in[0];
for (int k = 0; k < 4; ++k) { for (int k = 0; k < 4; ++k) {
out_row_0[k] += val * k0[k]; out_row_0[k] += val * k0[k];
...@@ -644,13 +508,13 @@ MaceStatus GroupDeconv2dK4x4S1::Compute(const OpContext *context, ...@@ -644,13 +508,13 @@ MaceStatus GroupDeconv2dK4x4S1::Compute(const OpContext *context,
} }
} else { } else {
const index_t out_offset = const index_t out_offset =
(b * outch + outch_g * g + oc) * out_img_size; (b * p.out_channels + p.outch_g * g + oc) * p.out_img_size;
float *out_base = padded_out_data + out_offset; float *out_base = padded_out_data + out_offset;
for (index_t ic = 0; ic < inch_g; ++ic) { for (index_t ic = 0; ic < p.inch_g; ++ic) {
const index_t in_offset = const index_t in_offset =
(b * inch + inch_g * g + ic) * in_img_size; (b * p.in_channels + p.inch_g * g + ic) * p.in_img_size;
const index_t kernel_offset = const index_t kernel_offset =
((oc * group_ + g) * inch_g + ic) * 16; ((oc * group_ + g) * p.inch_g + ic) * 16;
const float *input_base = input_data + in_offset; const float *input_base = input_data + in_offset;
const float *kernel_base = filter_data + kernel_offset; const float *kernel_base = filter_data + kernel_offset;
...@@ -665,15 +529,15 @@ MaceStatus GroupDeconv2dK4x4S1::Compute(const OpContext *context, ...@@ -665,15 +529,15 @@ MaceStatus GroupDeconv2dK4x4S1::Compute(const OpContext *context,
float32x4_t k2_vec = vld1q_f32(k2); float32x4_t k2_vec = vld1q_f32(k2);
float32x4_t k3_vec = vld1q_f32(k3); float32x4_t k3_vec = vld1q_f32(k3);
for (index_t i = 0; i < h; i++) { for (index_t i = 0; i < p.in_height; i++) {
float *out_row = out_base + i * outw; float *out_row = out_base + i * p.out_width;
float *out_row_0 = out_row; float *out_row_0 = out_row;
float *out_row_1 = out_row_0 + outw; float *out_row_1 = out_row_0 + p.out_width;
float *out_row_2 = out_row_1 + outw; float *out_row_2 = out_row_1 + p.out_width;
float *out_row_3 = out_row_2 + outw; float *out_row_3 = out_row_2 + p.out_width;
index_t j = 0; index_t j = 0;
for (; j + 3 < w; j += 4) { for (; j + 3 < p.in_width; j += 4) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
float32x4_t out00 = vld1q_f32(out_row_0); float32x4_t out00 = vld1q_f32(out_row_0);
...@@ -750,7 +614,7 @@ MaceStatus GroupDeconv2dK4x4S1::Compute(const OpContext *context, ...@@ -750,7 +614,7 @@ MaceStatus GroupDeconv2dK4x4S1::Compute(const OpContext *context,
out_row_3 += 4; out_row_3 += 4;
} }
for (; j < w; j++) { for (; j < p.in_width; j++) {
float val = in[0]; float val = in[0];
for (int k = 0; k < 4; ++k) { for (int k = 0; k < 4; ++k) {
out_row_0[k] += val * k0[k]; out_row_0[k] += val * k0[k];
...@@ -770,78 +634,29 @@ MaceStatus GroupDeconv2dK4x4S1::Compute(const OpContext *context, ...@@ -770,78 +634,29 @@ MaceStatus GroupDeconv2dK4x4S1::Compute(const OpContext *context,
} }
} }
} }
}, 0, batch, 1, 0, group_, 1, 0, outch_g, 2); }, 0, p.batch, 1, 0, group_, 1, 0, p.outch_g, 2);
UnPadOutput(*out_tensor, out_pad_size, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MaceStatus GroupDeconv2dK4x4S2::Compute(const OpContext *context, template<>
const Tensor *input, MaceStatus GroupDeconv2dK4x4S2<float>::DoCompute(
const Tensor *filter, const GroupDeconvComputeParam &p, const float *filter_data,
const Tensor *output_shape, const float *input_data, float *padded_out_data) {
Tensor *output) { p.thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0,
std::unique_ptr<Tensor> padded_out;
std::vector<int> out_pad_size;
ResizeOutAndPadOut(context,
input,
filter,
output_shape,
output,
&out_pad_size,
&padded_out);
Tensor *out_tensor = output;
if (padded_out != nullptr) {
out_tensor = padded_out.get();
}
out_tensor->Clear();
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto padded_out_data = out_tensor->mutable_data<float>();
auto &in_shape = input->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t inch = in_shape[1];
const index_t h = in_shape[2];
const index_t w = in_shape[3];
const index_t outch = out_shape[1];
const index_t outh = out_shape[2];
const index_t outw = out_shape[3];
const index_t in_img_size = h * w;
const index_t out_img_size = outh * outw;
const index_t inch_g = inch / group_;
const index_t outch_g = outch / group_;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1, index_t start1, index_t end1, index_t step1,
index_t start2, index_t end2, index_t step2) { index_t start2, index_t end2, index_t step2) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t g = start1; g < end1; g += step1) { for (index_t g = start1; g < end1; g += step1) {
for (index_t oc = start2; oc < end2; oc += step2) { for (index_t oc = start2; oc < end2; oc += step2) {
const index_t out_offset = const index_t out_offset =
(b * outch + outch_g * g + oc) * out_img_size; (b * p.out_channels + p.outch_g * g + oc) * p.out_img_size;
float *out_base = padded_out_data + out_offset; float *out_base = padded_out_data + out_offset;
for (index_t ic = 0; ic < inch_g; ic++) { for (index_t ic = 0; ic < p.inch_g; ic++) {
const index_t in_offset = const index_t in_offset =
(b * inch + inch_g * g + ic) * in_img_size; (b * p.in_channels + p.inch_g * g + ic) * p.in_img_size;
const index_t kernel_offset = const index_t kernel_offset =
((oc * group_ + g) * inch_g + ic) * 16; ((oc * group_ + g) * p.inch_g + ic) * 16;
const float *input_base = input_data + in_offset; const float *input_base = input_data + in_offset;
const float *kernel_base = filter_data + kernel_offset; const float *kernel_base = filter_data + kernel_offset;
const float *in = input_base; const float *in = input_base;
...@@ -856,17 +671,17 @@ MaceStatus GroupDeconv2dK4x4S2::Compute(const OpContext *context, ...@@ -856,17 +671,17 @@ MaceStatus GroupDeconv2dK4x4S2::Compute(const OpContext *context,
float32x4_t k2_vec = vld1q_f32(k2); float32x4_t k2_vec = vld1q_f32(k2);
float32x4_t k3_vec = vld1q_f32(k3); float32x4_t k3_vec = vld1q_f32(k3);
for (index_t i = 0; i < h; i++) { for (index_t i = 0; i < p.in_height; i++) {
float *out_row = out_base + 2 * i * outw; float *out_row = out_base + 2 * i * p.out_width;
float *out_row_0 = out_row; float *out_row_0 = out_row;
float *out_row_1 = out_row_0 + outw; float *out_row_1 = out_row_0 + p.out_width;
float *out_row_2 = out_row_1 + outw; float *out_row_2 = out_row_1 + p.out_width;
float *out_row_3 = out_row_2 + outw; float *out_row_3 = out_row_2 + p.out_width;
index_t j = 0; index_t j = 0;
for (index_t n = 0; n + 9 < outw; n += 8) { for (index_t n = 0; n + 9 < p.out_width; n += 8) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
// row 0 // row 0
...@@ -933,7 +748,7 @@ MaceStatus GroupDeconv2dK4x4S2::Compute(const OpContext *context, ...@@ -933,7 +748,7 @@ MaceStatus GroupDeconv2dK4x4S2::Compute(const OpContext *context,
j += 4; j += 4;
} }
for (; j < w; j++) { for (; j < p.in_width; j++) {
float val = in[0]; float val = in[0];
for (int k = 0; k < 4; ++k) { for (int k = 0; k < 4; ++k) {
out_row_0[k] += val * k0[k]; out_row_0[k] += val * k0[k];
...@@ -952,36 +767,11 @@ MaceStatus GroupDeconv2dK4x4S2::Compute(const OpContext *context, ...@@ -952,36 +767,11 @@ MaceStatus GroupDeconv2dK4x4S2::Compute(const OpContext *context,
} }
} }
} }
}, 0, batch, 1, 0, group_, 1, 0, outch_g, 1); }, 0, p.batch, 1, 0, group_, 1, 0, p.outch_g, 1);
UnPadOutput(*out_tensor, out_pad_size, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
void RegisterDepthwiseDeconv2dK4x4Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, DepthwiseDeconv2dK4x4S1, delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K4x4S1));
MACE_REGISTER_DELEGATOR(
registry, DepthwiseDeconv2dK4x4S2, delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K4x4S2));
}
void RegisterGroupDeconv2dK4x4Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, GroupDeconv2dK4x4S1, delegator::GroupDeconv2dParam,
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K4x4S1));
MACE_REGISTER_DELEGATOR(
registry, GroupDeconv2dK4x4S2, delegator::GroupDeconv2dParam,
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K4x4S2));
}
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
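Finally, note that the per-file Register*Delegator functions disappear from the fp32 sources. The delegator keys themselves (MACE_DELEGATOR_KEY_EX with DeviceType::CPU, float, ImplType::NEON and the kernel/stride tag) are unchanged, so registration presumably moves next to the new templated classes under arm/base. As an illustration only -- the new registration site is not visible in this hunk and the <float> template argument is an assumption -- it would look roughly like the removed code with the typed class names:

void RegisterDepthwiseDeconv2dK4x4Delegator(OpDelegatorRegistry *registry) {
  MACE_REGISTER_DELEGATOR(
      registry, DepthwiseDeconv2dK4x4S1<float>,
      delegator::DepthwiseDeconv2dParam,
      MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU,
                            float, ImplType::NEON, K4x4S1));
  MACE_REGISTER_DELEGATOR(
      registry, DepthwiseDeconv2dK4x4S2<float>,
      delegator::DepthwiseDeconv2dParam,
      MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU,
                            float, ImplType::NEON, K4x4S2));
}

Because the keys are stable, operators that look delegators up by kernel size and stride are unaffected by where the registration code lives.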
......
...@@ -12,527 +12,570 @@ ...@@ -12,527 +12,570 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/arm/fp32/gemm.h"
#include <arm_neon.h> #include <arm_neon.h>
#include <algorithm> #include <algorithm>
#include <utility> #include <utility>
#include "mace/ops/arm/base/gemm.h"
#include "mace/port/env.h" #include "mace/port/env.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
enum { kNoCache, kCacheLhs, kCacheRhs }; template<>
template<>
MaceStatus Gemm::Compute(const OpContext *context, void Gemm<float>::Pack<4, 4>(const MatrixMap<const float> &matrix,
const Tensor *lhs, MatrixMajor dst_major,
const Tensor *rhs, float *packed_matrix) {
const index_t batch, const index_t rows = matrix.rows();
const index_t rows, const index_t cols = matrix.cols();
const index_t cols,
const index_t depth,
const MatrixMajor lhs_major,
const MatrixMajor rhs_major,
const MatrixMajor output_major,
const bool lhs_batched,
const bool rhs_batched,
Tensor *output) {
MACE_CHECK(output->size() == batch * rows * cols,
"Need resize output tensor before call gemm.");
Tensor::MappingGuard lhs_guard(lhs);
Tensor::MappingGuard rhs_guard(rhs);
Tensor::MappingGuard output_guard(output);
const float *lhs_data = lhs->data<float>();
const float *rhs_data = rhs->data<float>();
float *output_data = output->mutable_data<float>();
#ifdef __aarch64__ // use the same terminology as GemmLowp:
const index_t row_block_size = 8; // depth is depth, width is the opposite dim other than depth
#else // lhs
const index_t row_block_size = 4; index_t width = rows;
#endif index_t depth = cols;
const index_t col_block_size = 8; index_t width_stride = matrix.rows_stride();
const index_t depth_block_size = 4; index_t depth_stride = matrix.cols_stride();
const index_t row_block_count = RoundUpDiv(rows, row_block_size); if (dst_major == RowMajor) {
const index_t col_block_count = RoundUpDiv(cols, col_block_size); // rhs
const index_t rows_padded = RoundUp(rows, row_block_size); std::swap(width, depth);
const index_t cols_padded = RoundUp(cols, col_block_size); std::swap(width_stride, depth_stride);
const index_t depth_padded = RoundUp(depth, depth_block_size); }
const float *data = matrix.data();
float *packed_ptr = packed_matrix;
ScratchBuffer *scratch = context->device()->scratch_buffer(); const index_t block_size = 4;
const index_t depth_padded = RoundUp(depth, static_cast<index_t>(4));
index_t packed_lhs_size = if (depth_padded > depth) {
PadAlignSize(sizeof(float) * rows_padded * depth_padded); memset(packed_ptr + depth * block_size,
index_t packed_rhs_size = 0,
PadAlignSize(sizeof(float) * depth_padded * cols_padded); sizeof(float) * (depth_padded - depth) * block_size);
index_t packed_output_size = }
PadAlignSize(sizeof(float) * rows_padded * cols_padded);
// resize to the total size of lhs & rhs & output anyway,
// in case we do not cache const tensor for saving memory
MACE_RETURN_IF_ERROR(scratch->GrowSize(
packed_lhs_size + packed_rhs_size + packed_output_size));
float *packed_lhs_data =
scratch->Scratch(packed_lhs_size).mutable_data<float>();
float *packed_rhs_data =
scratch->Scratch(packed_rhs_size).mutable_data<float>();
float *packed_output_data =
scratch->Scratch(packed_output_size).mutable_data<float>();
int cache_side = kNoCache; if (dst_major == matrix.matrix_major()) {
if (cached_ == kCacheLhs) { if (width < block_size) {
packed_lhs_data = pack_cache_.mutable_data<float>(); const index_t width_remain = block_size - width;
} else if (cached_ == kCacheRhs) { for (index_t d = 0; d < depth; ++d) {
packed_rhs_data = pack_cache_.mutable_data<float>(); memcpy(packed_ptr, data, sizeof(float) * width);
} else if (should_cache_pack_) { memset(packed_ptr + width, 0, sizeof(float) * width_remain);
if (lhs->is_weight() && (!lhs_batched || batch == 1)) { data += depth_stride;
cache_side = kCacheLhs; packed_ptr += block_size;
pack_cache_.Resize(packed_lhs_size); }
packed_lhs_data = pack_cache_.mutable_data<float>(); } else {
} else if (rhs->is_weight() && (!rhs_batched || batch == 1)) { for (index_t d = 0; d < depth; ++d) {
cache_side = kCacheRhs; float32x4_t vi = vld1q_f32(data);
pack_cache_.Resize(packed_rhs_size); vst1q_f32(packed_ptr, vi);
packed_rhs_data = pack_cache_.mutable_data<float>(); data += depth_stride;
packed_ptr += block_size;
} }
} }
} else {
if (width < block_size) {
const index_t width_remain = block_size - width;
for (index_t d = 0; d < depth; ++d) {
for (index_t w = 0; w < width; ++w) {
packed_ptr[w] = data[w * width_stride + d];
} // w
memset(packed_ptr + width, 0, sizeof(float) * width_remain);
packed_ptr += block_size;
} // d
} else {
const float *data0 = data;
const float *data1 = data + width_stride;
const float *data2 = data1 + width_stride;
const float *data3 = data2 + width_stride;
utils::ThreadPool const index_t depth_block = depth / 4;
&thread_pool = context->device()->cpu_runtime()->thread_pool(); const index_t depth_remain = depth - depth_block * 4;
for (index_t depth_block_idx = 0; depth_block_idx < depth_block;
++depth_block_idx) {
float32x4_t v0 = vld1q_f32(data0);
float32x4_t v1 = vld1q_f32(data1);
float32x4_t v2 = vld1q_f32(data2);
float32x4_t v3 = vld1q_f32(data3);
float32x4x2_t v02_intertwined = vzipq_f32(v0, v2);
float32x4x2_t v13_intertwined = vzipq_f32(v1, v3);
float32x4x2_t v0123_intertwined =
vzipq_f32(v02_intertwined.val[0], v13_intertwined.val[0]);
float32x4x2_t v0123n_intertwined =
vzipq_f32(v02_intertwined.val[1], v13_intertwined.val[1]);
for (index_t b = 0; b < batch; ++b) { vst1q_f32(packed_ptr, v0123_intertwined.val[0]);
MatrixMap<const float> packed_ptr += 4;
lhs_matrix
(lhs_data + static_cast<index_t>(lhs_batched) * b * rows * depth,
lhs_major,
rows,
depth);
MatrixMap<const float>
rhs_matrix
(rhs_data + static_cast<index_t>(rhs_batched) * b * depth * cols,
rhs_major,
depth,
cols);
MatrixMap<float> output_matrix
(output_data + b * rows * cols, output_major, rows, cols);
// pack lhs vst1q_f32(packed_ptr, v0123_intertwined.val[1]);
if (cached_ != kCacheLhs) { packed_ptr += 4;
thread_pool.Compute1D([=, &lhs_matrix](index_t start,
index_t end,
index_t step) {
for (index_t row_block_idx = start; row_block_idx < end;
row_block_idx += step) {
const index_t start_row = row_block_idx * row_block_size;
const index_t
row_block_len = std::min(row_block_size, rows - start_row);
float *packed_lhs_data_block =
packed_lhs_data + row_block_idx * row_block_size * depth_padded;
PackLhs(lhs_matrix.block(start_row, 0, row_block_len, depth),
packed_lhs_data_block);
}
}, 0, row_block_count, 1);
if (cache_side == kCacheLhs) { vst1q_f32(packed_ptr, v0123n_intertwined.val[0]);
cached_ = kCacheLhs; packed_ptr += 4;
if (lhs->UnderlyingBuffer()->OnHost()) {
AdviseFree(reinterpret_cast<void *>(const_cast<float *>(lhs->data< vst1q_f32(packed_ptr, v0123n_intertwined.val[1]);
float>())), packed_ptr += 4;
lhs->raw_size());
data0 += 4;
data1 += 4;
data2 += 4;
data3 += 4;
} }
for (index_t d = 0; d < depth_remain; ++d) {
float32x4_t vi = {*data0, *data1, *data2, *data3};
vst1q_f32(packed_ptr, vi);
packed_ptr += 4;
++data0;
++data1;
++data2;
++data3;
} // d
} }
} }
}
// pack rhs template<>
if (cached_ != kCacheRhs) { template<>
thread_pool.Compute1D([=, &rhs_matrix](index_t start, void Gemm<float>::Pack<8, 4>(const MatrixMap<const float> &matrix,
index_t end, MatrixMajor dst_major,
index_t step) { float *packed_matrix) {
for (index_t col_block_idx = start; col_block_idx < end; const index_t rows = matrix.rows();
col_block_idx += step) { const index_t cols = matrix.cols();
const index_t start_col = col_block_idx * col_block_size;
const index_t
col_block_len = std::min(col_block_size, cols - start_col);
float *packed_rhs_data_block =
packed_rhs_data + col_block_idx * col_block_size * depth_padded;
PackRhs(rhs_matrix.block(0, start_col, depth, col_block_len),
packed_rhs_data_block);
}
}, 0, col_block_count, 1);
if (cache_side == kCacheRhs) { // use the same terminology as GemmLowp:
cached_ = kCacheRhs; // depth is depth, width is the opposite dim other than depth
if (rhs->UnderlyingBuffer()->OnHost()) { // lhs
AdviseFree(reinterpret_cast<void *>(const_cast<float *>(rhs->data< index_t width = rows;
float>())), index_t depth = cols;
rhs->raw_size()); index_t width_stride = matrix.rows_stride();
} index_t depth_stride = matrix.cols_stride();
if (dst_major == RowMajor) {
// rhs
std::swap(width, depth);
std::swap(width_stride, depth_stride);
} }
const float *data = matrix.data();
float *packed_ptr = packed_matrix;
const index_t block_size = 8;
const index_t depth_padded = RoundUp(depth, static_cast<index_t>(4));
if (depth_padded > depth) {
memset(packed_ptr + depth * block_size,
0,
sizeof(float) * (depth_padded - depth) * block_size);
} }
// multiply lhs and rhs if (dst_major == matrix.matrix_major()) {
thread_pool.Compute1D([=, &output_matrix](index_t start, if (width < block_size) {
index_t end, const index_t width_remain = block_size - width;
index_t step) {
for (index_t row_block_idx = start; row_block_idx < end;
row_block_idx += step) {
const index_t start_row = row_block_idx * row_block_size;
const index_t
row_block_len = std::min(row_block_size, rows - start_row);
const float *packed_lhs_data_block =
packed_lhs_data + row_block_idx * row_block_size * depth_padded;
for (index_t col_block_idx = 0; col_block_idx < col_block_count;
++col_block_idx) {
const index_t start_col = col_block_idx * col_block_size;
const index_t
col_block_len = std::min(col_block_size, cols - start_col);
const float *packed_rhs_data_block =
packed_rhs_data + col_block_idx * col_block_size * depth_padded;
float *packed_output_data_block =
packed_output_data + row_block_idx * row_block_size * cols_padded
+ col_block_idx * col_block_size;
ComputeBlock(packed_lhs_data_block,
packed_rhs_data_block,
depth_padded,
packed_output_data_block);
MatrixMap<float> output_block = output_matrix.block(start_row,
start_col,
row_block_len,
col_block_len);
UnpackOutput(packed_output_data_block, &output_block);
} // col_block_idx
} // row_block_idx
}, 0, row_block_count, 1);
} // b
return MaceStatus::MACE_SUCCESS;
}
void Gemm::ComputeBlock(const float *packed_lhs_data,
const float *packed_rhs_data,
const index_t depth_padded,
float *packed_output_data) {
/* Ref:
for (index_t r = 0; r < block_size; ++r) {
for (index_t c = 0; c < block_size; ++c) {
float sum = 0;
for (index_t d = 0; d < depth; ++d) { for (index_t d = 0; d < depth; ++d) {
// (r, d) * (d, c) memcpy(packed_ptr, data, sizeof(float) * width);
sum += packed_lhs_data[d * r_block_size + r] memset(packed_ptr + width, 0, sizeof(float) * width_remain);
* packed_rhs_data[d * c_block_size + c]; data += depth_stride;
packed_ptr += block_size;
} }
packed_output_data[r * c_block_size + c] = sum; } else {
for (index_t d = 0; d < depth; ++d) {
float32x4_t vi = vld1q_f32(data);
vst1q_f32(packed_ptr, vi);
float32x4_t vin = vld1q_f32(data + 4);
vst1q_f32(packed_ptr + 4, vin);
data += depth_stride;
packed_ptr += block_size;
} }
} }
*/ } else {
const float *lhs_ptr = packed_lhs_data; if (width < block_size) {
const float *rhs_ptr = packed_rhs_data; const index_t width_remain = block_size - width;
for (index_t d = 0; d < depth; ++d) {
for (index_t w = 0; w < width; ++w) {
packed_ptr[w] = data[w * width_stride + d];
} // w
memset(packed_ptr + width, 0, sizeof(float) * width_remain);
packed_ptr += block_size;
} // d
} else {
const float *data0 = data;
const float *data1 = data + width_stride;
const float *data2 = data1 + width_stride;
const float *data3 = data2 + width_stride;
const float *data4 = data3 + width_stride;
const float *data5 = data4 + width_stride;
const float *data6 = data5 + width_stride;
const float *data7 = data6 + width_stride;
const index_t depth_block_count = depth_padded / 4; const index_t depth_block = depth / 4;
const index_t depth_remain = depth - depth_block * 4;
for (index_t depth_block_idx = 0; depth_block_idx < depth_block;
++depth_block_idx) {
float32x4_t v0 = vld1q_f32(data0);
float32x4_t v1 = vld1q_f32(data1);
float32x4_t v2 = vld1q_f32(data2);
float32x4_t v3 = vld1q_f32(data3);
float32x4x2_t v02_intertwined = vzipq_f32(v0, v2);
float32x4x2_t v13_intertwined = vzipq_f32(v1, v3);
float32x4x2_t v0123_intertwined =
vzipq_f32(v02_intertwined.val[0], v13_intertwined.val[0]);
float32x4x2_t v0123n_intertwined =
vzipq_f32(v02_intertwined.val[1], v13_intertwined.val[1]);
#ifdef __aarch64__ float32x4_t v4 = vld1q_f32(data4);
// Register layout: (8x4) x (4,8) float32x4_t v5 = vld1q_f32(data5);
// float32x4_t v6 = vld1q_f32(data6);
// +--------+--------+ float32x4_t v7 = vld1q_f32(data7);
// | v8 ... | v9 ... | float32x4x2_t v46_intertwined = vzipq_f32(v4, v6);
// Rhs +--------+--------+ float32x4x2_t v57_intertwined = vzipq_f32(v5, v7);
// | v10... | v11... | float32x4x2_t v4567_intertwined =
// +--------+--------+ vzipq_f32(v46_intertwined.val[0], v57_intertwined.val[0]);
// | v12... | v13... | float32x4x2_t v4567n_intertwined =
// +--------+--------+ vzipq_f32(v46_intertwined.val[1], v57_intertwined.val[1]);
// | v14... | v15... |
// +--------+--------+
//
// Lhs
//
// +----+----+----+----+ - - +--------+--------+
// | v0 | v2 | v4 | v6 | | v16... | v17... |
// | . | | | | | v18... | v19... |
// | . | | | | | v20... | v21... |
// | . | | | | | v22... | v23... |
// +----+----|----+----+ +--------+--------+
// | v1 | v3 | v5 | v7 | | v24... | v25... |
// | . | | | | | v26... | v27... |
// | . | | | | | v28... | v29... |
// | . | | | | | v30... | v31... |
// +----+----|----+----+ +--------+--------+
//
// Accumulator
//
if (depth_block_count > 0) { vst1q_f32(packed_ptr, v0123_intertwined.val[0]);
index_t r_depth_block_count = depth_block_count; packed_ptr += 4;
// just make compiler happy
MACE_UNUSED(r_depth_block_count);
asm volatile( vst1q_f32(packed_ptr, v4567_intertwined.val[0]);
"dup v16.4s, wzr \n" packed_ptr += 4;
"dup v17.4s, wzr \n"
"dup v18.4s, wzr \n"
"dup v19.4s, wzr \n"
"dup v20.4s, wzr \n"
"dup v21.4s, wzr \n"
"dup v22.4s, wzr \n"
"dup v23.4s, wzr \n"
"dup v24.4s, wzr \n"
"dup v25.4s, wzr \n"
"dup v26.4s, wzr \n"
"dup v27.4s, wzr \n"
"dup v28.4s, wzr \n"
"dup v29.4s, wzr \n"
"dup v30.4s, wzr \n"
"dup v31.4s, wzr \n"
// prelogue vst1q_f32(packed_ptr, v0123_intertwined.val[1]);
"ld1 {v0.4s}, [%[lhs_ptr]], #16 \n" packed_ptr += 4;
"ld1 {v1.4s}, [%[lhs_ptr]], #16 \n"
"ld1 {v2.4s}, [%[lhs_ptr]], #16 \n"
"ld1 {v3.4s}, [%[lhs_ptr]], #16 \n"
"ld1 {v4.4s}, [%[lhs_ptr]], #16 \n"
"ld1 {v5.4s}, [%[lhs_ptr]], #16 \n"
"ld1 {v6.4s}, [%[lhs_ptr]], #16 \n"
"ld1 {v7.4s}, [%[lhs_ptr]], #16 \n"
"ld1 {v8.4s}, [%[rhs_ptr]], #16 \n" vst1q_f32(packed_ptr, v4567_intertwined.val[1]);
"ld1 {v9.4s}, [%[rhs_ptr]], #16 \n" packed_ptr += 4;
"ld1 {v10.4s}, [%[rhs_ptr]], #16 \n"
"ld1 {v11.4s}, [%[rhs_ptr]], #16 \n"
"ld1 {v12.4s}, [%[rhs_ptr]], #16 \n"
"ld1 {v13.4s}, [%[rhs_ptr]], #16 \n"
"ld1 {v14.4s}, [%[rhs_ptr]], #16 \n"
"ld1 {v15.4s}, [%[rhs_ptr]], #16 \n"
"subs %[r_depth_block_count], %[r_depth_block_count], #1 \n" vst1q_f32(packed_ptr, v0123n_intertwined.val[0]);
"beq 1f\n" packed_ptr += 4;
"0: \n" vst1q_f32(packed_ptr, v4567n_intertwined.val[0]);
"fmla v16.4s, v8.4s, v0.s[0] \n" packed_ptr += 4;
"fmla v17.4s, v9.4s, v0.s[0] \n"
"fmla v18.4s, v8.4s, v0.s[1] \n"
"fmla v19.4s, v9.4s, v0.s[1] \n"
"fmla v20.4s, v8.4s, v0.s[2] \n"
"fmla v21.4s, v9.4s, v0.s[2] \n"
"fmla v22.4s, v8.4s, v0.s[3] \n"
"fmla v23.4s, v9.4s, v0.s[3] \n"
"ld1 {v0.4s}, [%[lhs_ptr]], #16 \n" vst1q_f32(packed_ptr, v0123n_intertwined.val[1]);
packed_ptr += 4;
"fmla v24.4s, v8.4s, v1.s[0] \n" vst1q_f32(packed_ptr, v4567n_intertwined.val[1]);
"fmla v25.4s, v9.4s, v1.s[0] \n" packed_ptr += 4;
"fmla v26.4s, v8.4s, v1.s[1] \n"
"fmla v27.4s, v9.4s, v1.s[1] \n"
"fmla v28.4s, v8.4s, v1.s[2] \n"
"fmla v29.4s, v9.4s, v1.s[2] \n"
"fmla v30.4s, v8.4s, v1.s[3] \n"
"fmla v31.4s, v9.4s, v1.s[3] \n"
"ld1 {v1.4s}, [%[lhs_ptr]], #16 \n" data0 += 4;
"ld1 {v8.4s}, [%[rhs_ptr]], #16 \n" data1 += 4;
"ld1 {v9.4s}, [%[rhs_ptr]], #16 \n" data2 += 4;
data3 += 4;
data4 += 4;
data5 += 4;
data6 += 4;
data7 += 4;
}
for (index_t d = 0; d < depth_remain; ++d) {
float32x4_t vi = {*data0, *data1, *data2, *data3};
vst1q_f32(packed_ptr, vi);
packed_ptr += 4;
"fmla v16.4s, v10.4s, v2.s[0] \n" float32x4_t vin = {*data4, *data5, *data6, *data7};
"fmla v17.4s, v11.4s, v2.s[0] \n" vst1q_f32(packed_ptr, vin);
"fmla v18.4s, v10.4s, v2.s[1] \n" packed_ptr += 4;
"fmla v19.4s, v11.4s, v2.s[1] \n"
"fmla v20.4s, v10.4s, v2.s[2] \n"
"fmla v21.4s, v11.4s, v2.s[2] \n"
"fmla v22.4s, v10.4s, v2.s[3] \n"
"fmla v23.4s, v11.4s, v2.s[3] \n"
"ld1 {v2.4s}, [%[lhs_ptr]], #16 \n" ++data0;
++data1;
++data2;
++data3;
++data4;
++data5;
++data6;
++data7;
} // d
}
}
}
"fmla v24.4s, v10.4s, v3.s[0] \n" template<>
"fmla v25.4s, v11.4s, v3.s[0] \n" template<>
"fmla v26.4s, v10.4s, v3.s[1] \n" void Gemm<float>::Unpack<4, 8>(const float *packed_output,
"fmla v27.4s, v11.4s, v3.s[1] \n" MatrixMap<float> *output) {
"fmla v28.4s, v10.4s, v3.s[2] \n" const index_t rows = output->rows();
"fmla v29.4s, v11.4s, v3.s[2] \n" const index_t cols = output->cols();
"fmla v30.4s, v10.4s, v3.s[3] \n" index_t row_stride = output->rows_stride();
"fmla v31.4s, v11.4s, v3.s[3] \n" index_t col_stride = output->cols_stride();
"ld1 {v3.4s}, [%[lhs_ptr]], #16 \n" float *output_ptr = output->data();
"ld1 {v10.4s}, [%[rhs_ptr]], #16 \n" const float *packed_ptr = packed_output;
"ld1 {v11.4s}, [%[rhs_ptr]], #16 \n"
"fmla v16.4s, v12.4s, v4.s[0] \n" const index_t block_size = 8;
"fmla v17.4s, v13.4s, v4.s[0] \n"
"fmla v18.4s, v12.4s, v4.s[1] \n"
"fmla v19.4s, v13.4s, v4.s[1] \n"
"fmla v20.4s, v12.4s, v4.s[2] \n"
"fmla v21.4s, v13.4s, v4.s[2] \n"
"fmla v22.4s, v12.4s, v4.s[3] \n"
"fmla v23.4s, v13.4s, v4.s[3] \n"
"ld1 {v4.4s}, [%[lhs_ptr]], #16 \n" // packed_output always has row-major
if (output->matrix_major() == RowMajor) {
if (cols < block_size) {
for (index_t r = 0; r < rows; ++r) {
memcpy(output_ptr, packed_ptr, sizeof(float) * cols);
output_ptr += row_stride;
packed_ptr += block_size;
}
} else {
for (index_t r = 0; r < rows; ++r) {
float32x4_t vi = vld1q_f32(packed_ptr);
vst1q_f32(output_ptr, vi);
float32x4_t vin = vld1q_f32(packed_ptr + 4);
vst1q_f32(output_ptr + 4, vin);
"fmla v24.4s, v12.4s, v5.s[0] \n" output_ptr += row_stride;
"fmla v25.4s, v13.4s, v5.s[0] \n" packed_ptr += block_size;
"fmla v26.4s, v12.4s, v5.s[1] \n" }
"fmla v27.4s, v13.4s, v5.s[1] \n" }
"fmla v28.4s, v12.4s, v5.s[2] \n" } else {
"fmla v29.4s, v13.4s, v5.s[2] \n" // ColMajor
"fmla v30.4s, v12.4s, v5.s[3] \n" if (rows < block_size) {
"fmla v31.4s, v13.4s, v5.s[3] \n" for (index_t c = 0; c < cols; ++c) {
for (index_t r = 0; r < rows; ++r) {
output_ptr[c * col_stride + r] = packed_ptr[r * block_size + c];
} // r
} // c
} else {
const float *data0 = packed_ptr;
const float *data1 = data0 + block_size;
const float *data2 = data1 + block_size;
const float *data3 = data2 + block_size;
"ld1 {v5.4s}, [%[lhs_ptr]], #16 \n" index_t col_block = cols / 4;
"ld1 {v12.4s}, [%[rhs_ptr]], #16 \n" index_t col_remain = cols - col_block * 4;
"ld1 {v13.4s}, [%[rhs_ptr]], #16 \n" for (index_t col_block_idx = 0; col_block_idx < col_block;
++col_block_idx) {
float32x4_t v0 = vld1q_f32(data0);
float32x4_t v1 = vld1q_f32(data1);
float32x4_t v2 = vld1q_f32(data2);
float32x4_t v3 = vld1q_f32(data3);
float32x4x2_t v02_intertwined = vzipq_f32(v0, v2);
float32x4x2_t v13_intertwined = vzipq_f32(v1, v3);
float32x4x2_t v0123_intertwined =
vzipq_f32(v02_intertwined.val[0], v13_intertwined.val[0]);
float32x4x2_t v0123n_intertwined =
vzipq_f32(v02_intertwined.val[1], v13_intertwined.val[1]);
"fmla v16.4s, v14.4s, v6.s[0] \n" vst1q_f32(output_ptr, v0123_intertwined.val[0]);
"fmla v17.4s, v15.4s, v6.s[0] \n" output_ptr += col_stride;
"fmla v18.4s, v14.4s, v6.s[1] \n"
"fmla v19.4s, v15.4s, v6.s[1] \n"
"fmla v20.4s, v14.4s, v6.s[2] \n"
"fmla v21.4s, v15.4s, v6.s[2] \n"
"fmla v22.4s, v14.4s, v6.s[3] \n"
"fmla v23.4s, v15.4s, v6.s[3] \n"
"ld1 {v6.4s}, [%[lhs_ptr]], #16 \n" vst1q_f32(output_ptr, v0123_intertwined.val[1]);
output_ptr += col_stride;
"subs %[r_depth_block_count], %[r_depth_block_count], #1 \n" vst1q_f32(output_ptr, v0123n_intertwined.val[0]);
output_ptr += col_stride;
"fmla v24.4s, v14.4s, v7.s[0] \n" vst1q_f32(output_ptr, v0123n_intertwined.val[1]);
"fmla v25.4s, v15.4s, v7.s[0] \n" output_ptr += col_stride;
"fmla v26.4s, v14.4s, v7.s[1] \n"
"fmla v27.4s, v15.4s, v7.s[1] \n"
"fmla v28.4s, v14.4s, v7.s[2] \n"
"fmla v29.4s, v15.4s, v7.s[2] \n"
"fmla v30.4s, v14.4s, v7.s[3] \n"
"fmla v31.4s, v15.4s, v7.s[3] \n"
"ld1 {v7.4s}, [%[lhs_ptr]], #16 \n" data0 += 4;
"ld1 {v14.4s}, [%[rhs_ptr]], #16 \n" data1 += 4;
"ld1 {v15.4s}, [%[rhs_ptr]], #16 \n" data2 += 4;
data3 += 4;
}
for (index_t c = 0; c < col_remain; ++c) {
float32x4_t vi = {*data0, *data1, *data2, *data3};
vst1q_f32(output_ptr, vi);
output_ptr += col_stride;
"bne 0b \n" ++data0;
++data1;
++data2;
++data3;
} // d
}
}
}
// prologue template<>
"1:\n" template<>
"fmla v16.4s, v8.4s, v0.s[0] \n" void Gemm<float>::Unpack<8, 8>(const float *packed_output,
"fmla v17.4s, v9.4s, v0.s[0] \n" MatrixMap<float> *output) {
"fmla v18.4s, v8.4s, v0.s[1] \n" const index_t rows = output->rows();
"fmla v19.4s, v9.4s, v0.s[1] \n" const index_t cols = output->cols();
"fmla v20.4s, v8.4s, v0.s[2] \n" index_t row_stride = output->rows_stride();
"fmla v21.4s, v9.4s, v0.s[2] \n" index_t col_stride = output->cols_stride();
"fmla v22.4s, v8.4s, v0.s[3] \n"
"fmla v23.4s, v9.4s, v0.s[3] \n"
"fmla v24.4s, v8.4s, v1.s[0] \n" float *output_ptr = output->data();
"fmla v25.4s, v9.4s, v1.s[0] \n" const float *packed_ptr = packed_output;
"fmla v26.4s, v8.4s, v1.s[1] \n"
"fmla v27.4s, v9.4s, v1.s[1] \n"
"fmla v28.4s, v8.4s, v1.s[2] \n"
"fmla v29.4s, v9.4s, v1.s[2] \n"
"fmla v30.4s, v8.4s, v1.s[3] \n"
"fmla v31.4s, v9.4s, v1.s[3] \n"
"fmla v16.4s, v10.4s, v2.s[0] \n" const index_t block_size = 8;
"fmla v17.4s, v11.4s, v2.s[0] \n"
"fmla v18.4s, v10.4s, v2.s[1] \n"
"fmla v19.4s, v11.4s, v2.s[1] \n"
"fmla v20.4s, v10.4s, v2.s[2] \n"
"fmla v21.4s, v11.4s, v2.s[2] \n"
"fmla v22.4s, v10.4s, v2.s[3] \n"
"fmla v23.4s, v11.4s, v2.s[3] \n"
"fmla v24.4s, v10.4s, v3.s[0] \n" // packed_output always has row-major
"fmla v25.4s, v11.4s, v3.s[0] \n" if (output->matrix_major() == RowMajor) {
"fmla v26.4s, v10.4s, v3.s[1] \n" if (cols < block_size) {
"fmla v27.4s, v11.4s, v3.s[1] \n" for (index_t r = 0; r < rows; ++r) {
"fmla v28.4s, v10.4s, v3.s[2] \n" memcpy(output_ptr, packed_ptr, sizeof(float) * cols);
"fmla v29.4s, v11.4s, v3.s[2] \n" output_ptr += row_stride;
"fmla v30.4s, v10.4s, v3.s[3] \n" packed_ptr += block_size;
"fmla v31.4s, v11.4s, v3.s[3] \n" }
} else {
for (index_t r = 0; r < rows; ++r) {
float32x4_t vi = vld1q_f32(packed_ptr);
vst1q_f32(output_ptr, vi);
float32x4_t vin = vld1q_f32(packed_ptr + 4);
vst1q_f32(output_ptr + 4, vin);
"fmla v16.4s, v12.4s, v4.s[0] \n" output_ptr += row_stride;
"fmla v17.4s, v13.4s, v4.s[0] \n" packed_ptr += block_size;
"fmla v18.4s, v12.4s, v4.s[1] \n" }
"fmla v19.4s, v13.4s, v4.s[1] \n" }
"fmla v20.4s, v12.4s, v4.s[2] \n" } else {
"fmla v21.4s, v13.4s, v4.s[2] \n" // ColMajor
"fmla v22.4s, v12.4s, v4.s[3] \n" if (rows < block_size) {
"fmla v23.4s, v13.4s, v4.s[3] \n" for (index_t c = 0; c < cols; ++c) {
for (index_t r = 0; r < rows; ++r) {
output_ptr[c * col_stride + r] = packed_ptr[r * block_size + c];
} // r
} // c
} else {
const float *data0 = packed_ptr;
const float *data1 = data0 + block_size;
const float *data2 = data1 + block_size;
const float *data3 = data2 + block_size;
const float *data4 = data3 + block_size;
const float *data5 = data4 + block_size;
const float *data6 = data5 + block_size;
const float *data7 = data6 + block_size;
"fmla v24.4s, v12.4s, v5.s[0] \n" index_t col_block = cols / 4;
"fmla v25.4s, v13.4s, v5.s[0] \n" index_t col_remain = cols - col_block * 4;
"fmla v26.4s, v12.4s, v5.s[1] \n" for (index_t col_block_idx = 0; col_block_idx < col_block;
"fmla v27.4s, v13.4s, v5.s[1] \n" ++col_block_idx) {
"fmla v28.4s, v12.4s, v5.s[2] \n" float32x4_t v0 = vld1q_f32(data0);
"fmla v29.4s, v13.4s, v5.s[2] \n" float32x4_t v1 = vld1q_f32(data1);
"fmla v30.4s, v12.4s, v5.s[3] \n" float32x4_t v2 = vld1q_f32(data2);
"fmla v31.4s, v13.4s, v5.s[3] \n" float32x4_t v3 = vld1q_f32(data3);
float32x4x2_t v02_intertwined = vzipq_f32(v0, v2);
float32x4x2_t v13_intertwined = vzipq_f32(v1, v3);
float32x4x2_t v0123_intertwined =
vzipq_f32(v02_intertwined.val[0], v13_intertwined.val[0]);
float32x4x2_t v0123n_intertwined =
vzipq_f32(v02_intertwined.val[1], v13_intertwined.val[1]);
"fmla v16.4s, v14.4s, v6.s[0] \n" float32x4_t v4 = vld1q_f32(data4);
"fmla v17.4s, v15.4s, v6.s[0] \n" float32x4_t v5 = vld1q_f32(data5);
"fmla v18.4s, v14.4s, v6.s[1] \n" float32x4_t v6 = vld1q_f32(data6);
"fmla v19.4s, v15.4s, v6.s[1] \n" float32x4_t v7 = vld1q_f32(data7);
"fmla v20.4s, v14.4s, v6.s[2] \n" float32x4x2_t v46_intertwined = vzipq_f32(v4, v6);
"fmla v21.4s, v15.4s, v6.s[2] \n" float32x4x2_t v57_intertwined = vzipq_f32(v5, v7);
"fmla v22.4s, v14.4s, v6.s[3] \n" float32x4x2_t v4567_intertwined =
"fmla v23.4s, v15.4s, v6.s[3] \n" vzipq_f32(v46_intertwined.val[0], v57_intertwined.val[0]);
float32x4x2_t v4567n_intertwined =
vzipq_f32(v46_intertwined.val[1], v57_intertwined.val[1]);
"fmla v24.4s, v14.4s, v7.s[0] \n" vst1q_f32(output_ptr, v0123_intertwined.val[0]);
"fmla v25.4s, v15.4s, v7.s[0] \n" vst1q_f32(output_ptr + 4, v4567_intertwined.val[0]);
"fmla v26.4s, v14.4s, v7.s[1] \n" output_ptr += col_stride;
"fmla v27.4s, v15.4s, v7.s[1] \n"
"fmla v28.4s, v14.4s, v7.s[2] \n"
"fmla v29.4s, v15.4s, v7.s[2] \n"
"fmla v30.4s, v14.4s, v7.s[3] \n"
"fmla v31.4s, v15.4s, v7.s[3] \n"
"st1 {v16.4s}, [%[packed_output_data]], #16 \n" vst1q_f32(output_ptr, v0123_intertwined.val[1]);
"st1 {v17.4s}, [%[packed_output_data]], #16 \n" vst1q_f32(output_ptr + 4, v4567_intertwined.val[1]);
"st1 {v18.4s}, [%[packed_output_data]], #16 \n" output_ptr += col_stride;
"st1 {v19.4s}, [%[packed_output_data]], #16 \n"
"st1 {v20.4s}, [%[packed_output_data]], #16 \n" vst1q_f32(output_ptr, v0123n_intertwined.val[0]);
"st1 {v21.4s}, [%[packed_output_data]], #16 \n" vst1q_f32(output_ptr + 4, v4567n_intertwined.val[0]);
"st1 {v22.4s}, [%[packed_output_data]], #16 \n" output_ptr += col_stride;
"st1 {v23.4s}, [%[packed_output_data]], #16 \n"
"st1 {v24.4s}, [%[packed_output_data]], #16 \n" vst1q_f32(output_ptr, v0123n_intertwined.val[1]);
"st1 {v25.4s}, [%[packed_output_data]], #16 \n" vst1q_f32(output_ptr + 4, v4567n_intertwined.val[1]);
"st1 {v26.4s}, [%[packed_output_data]], #16 \n" output_ptr += col_stride;
"st1 {v27.4s}, [%[packed_output_data]], #16 \n"
"st1 {v28.4s}, [%[packed_output_data]], #16 \n" data0 += 4;
"st1 {v29.4s}, [%[packed_output_data]], #16 \n" data1 += 4;
"st1 {v30.4s}, [%[packed_output_data]], #16 \n" data2 += 4;
"st1 {v31.4s}, [%[packed_output_data]], #16 \n" data3 += 4;
: // outputs data4 += 4;
[lhs_ptr] "+r"(lhs_ptr), data5 += 4;
[rhs_ptr] "+r"(rhs_ptr), data6 += 4;
[packed_output_data] "+r"(packed_output_data), data7 += 4;
[r_depth_block_count] "+r"(r_depth_block_count)
: // inputs
: // clabbers
"cc", "memory",
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
} }
#else // armeabi-v7a for (index_t c = 0; c < col_remain; ++c) {
// Register layout: (4x4) x (4,8) float32x4_t vi = {*data0, *data1, *data2, *data3};
vst1q_f32(output_ptr, vi);
float32x4_t vin = {*data4, *data5, *data6, *data7};
vst1q_f32(output_ptr + 4, vin);
output_ptr += col_stride;
++data0;
++data1;
++data2;
++data3;
++data4;
++data5;
++data6;
++data7;
} // d
}
}
}
template<>
void Gemm<float>::PackLhs(const MatrixMap<const float> &lhs,
float *packed_lhs) {
#ifdef __aarch64__
Pack<8, 4>(lhs, ColMajor, packed_lhs);
#else
Pack<4, 4>(lhs, ColMajor, packed_lhs);
#endif
}
template<>
void Gemm<float>::PackRhs(const MatrixMap<const float> &rhs,
float *packed_rhs) {
Pack<8, 4>(rhs, RowMajor, packed_rhs);
}
template<>
void Gemm<float>::UnpackOutput(const float *packed_output,
MatrixMap<float> *output) {
#ifdef __aarch64__
Unpack<8, 8>(packed_output, output);
#else
Unpack<4, 8>(packed_output, output);
#endif
}
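// The three dispatchers above fix the tile geometry consumed by
// ComputeBlock below. Spelled out as constants (an illustrative sketch
// only; these names are not defined by MACE): on aarch64 one micro-kernel
// call turns an 8-row lhs tile and an 8-column rhs tile into an 8x8
// output tile, while on armeabi-v7a the lhs tile is 4 rows, giving a 4x8
// output tile. Depth is always padded to a multiple of 4.
#ifdef __aarch64__
constexpr index_t kRowBlockSize = 8;    // lhs rows per packed tile
#else
constexpr index_t kRowBlockSize = 4;
#endif
constexpr index_t kColBlockSize = 8;    // rhs cols per packed tile
constexpr index_t kDepthBlockSize = 4;  // depth padding granularity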
template<>
void Gemm<float>::ComputeBlock(const float *packed_lhs_data,
const float *packed_rhs_data,
const index_t depth_padded,
float *packed_output_data) {
/* Ref:
for (index_t r = 0; r < block_size; ++r) {
for (index_t c = 0; c < block_size; ++c) {
float sum = 0;
for (index_t d = 0; d < depth; ++d) {
// (r, d) * (d, c)
sum += packed_lhs_data[d * r_block_size + r]
* packed_rhs_data[d * c_block_size + c];
}
packed_output_data[r * c_block_size + c] = sum;
}
}
*/
const float *lhs_ptr = packed_lhs_data;
const float *rhs_ptr = packed_rhs_data;
const index_t depth_block_count = depth_padded / 4;
#ifdef __aarch64__
  // Register layout: (8x4) x (4,8)
  //
  //                               +--------+--------+
  //                               | v8 ... | v9 ... |
  //                       Rhs     +--------+--------+
  //                               | v10... | v11... |
  //                               +--------+--------+
  //                               | v12... | v13... |
  //                               +--------+--------+
  //                               | v14... | v15... |
  //                               +--------+--------+
  //
  //      Lhs
  //
  //  +----+----+----+----+ - - +--------+--------+
  //  | v0 | v2 | v4 | v6 |     | v16... | v17... |
  //  | .  |    |    |    |     | v18... | v19... |
  //  | .  |    |    |    |     | v20... | v21... |
  //  | .  |    |    |    |     | v22... | v23... |
  //  +----+----|----+----+     +--------+--------+
  //  | v1 | v3 | v5 | v7 |     | v24... | v25... |
  //  | .  |    |    |    |     | v26... | v27... |
  //  | .  |    |    |    |     | v28... | v29... |
  //  | .  |    |    |    |     | v30... | v31... |
  //  +----+----|----+----+     +--------+--------+
  //
  //                             Accumulator
  //
...@@ -543,90 +586,306 @@ void Gemm::ComputeBlock(const float *packed_lhs_data,
    MACE_UNUSED(r_depth_block_count);
asm volatile( asm volatile(
"mov r0, #0\n" "dup v16.4s, wzr \n"
"vdup.f32 q8, r0 \n" "dup v17.4s, wzr \n"
"vdup.f32 q9, r0 \n" "dup v18.4s, wzr \n"
"vdup.f32 q10, r0 \n" "dup v19.4s, wzr \n"
"vdup.f32 q11, r0 \n" "dup v20.4s, wzr \n"
"vdup.f32 q12, r0 \n" "dup v21.4s, wzr \n"
"vdup.f32 q13, r0 \n" "dup v22.4s, wzr \n"
"vdup.f32 q14, r0 \n" "dup v23.4s, wzr \n"
"vdup.f32 q15, r0 \n" "dup v24.4s, wzr \n"
"dup v25.4s, wzr \n"
"dup v26.4s, wzr \n"
"dup v27.4s, wzr \n"
"dup v28.4s, wzr \n"
"dup v29.4s, wzr \n"
"dup v30.4s, wzr \n"
"dup v31.4s, wzr \n"
// prelogue // prelogue
"vld1.f32 {d0-d1}, [%[lhs_ptr]]! \n" "ld1 {v0.4s}, [%[lhs_ptr]], #16 \n"
"vld1.f32 {d2-d3}, [%[lhs_ptr]]! \n" "ld1 {v1.4s}, [%[lhs_ptr]], #16 \n"
"vld1.f32 {d4-d5}, [%[lhs_ptr]]! \n" "ld1 {v2.4s}, [%[lhs_ptr]], #16 \n"
"vld1.f32 {d6-d7}, [%[lhs_ptr]]! \n" "ld1 {v3.4s}, [%[lhs_ptr]], #16 \n"
"ld1 {v4.4s}, [%[lhs_ptr]], #16 \n"
"ld1 {v5.4s}, [%[lhs_ptr]], #16 \n"
"ld1 {v6.4s}, [%[lhs_ptr]], #16 \n"
"ld1 {v7.4s}, [%[lhs_ptr]], #16 \n"
"vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n" "ld1 {v8.4s}, [%[rhs_ptr]], #16 \n"
"vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n" "ld1 {v9.4s}, [%[rhs_ptr]], #16 \n"
"vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n" "ld1 {v10.4s}, [%[rhs_ptr]], #16 \n"
"vld1.f32 {d14-d15}, [%[rhs_ptr]]! \n" "ld1 {v11.4s}, [%[rhs_ptr]], #16 \n"
"ld1 {v12.4s}, [%[rhs_ptr]], #16 \n"
"ld1 {v13.4s}, [%[rhs_ptr]], #16 \n"
"ld1 {v14.4s}, [%[rhs_ptr]], #16 \n"
"ld1 {v15.4s}, [%[rhs_ptr]], #16 \n"
"subs %[r_depth_block_count], %[r_depth_block_count], #1 \n" "subs %[r_depth_block_count], %[r_depth_block_count], #1 \n"
"beq 1f\n" "beq 1f\n"
"0: \n" "0: \n"
"fmla v16.4s, v8.4s, v0.s[0] \n"
"fmla v17.4s, v9.4s, v0.s[0] \n"
"fmla v18.4s, v8.4s, v0.s[1] \n"
"fmla v19.4s, v9.4s, v0.s[1] \n"
"fmla v20.4s, v8.4s, v0.s[2] \n"
"fmla v21.4s, v9.4s, v0.s[2] \n"
"fmla v22.4s, v8.4s, v0.s[3] \n"
"fmla v23.4s, v9.4s, v0.s[3] \n"
"vmla.f32 q8, q4, d0[0] \n" "ld1 {v0.4s}, [%[lhs_ptr]], #16 \n"
"vmla.f32 q9, q5, d0[0] \n"
"vmla.f32 q10, q4, d0[1] \n"
"vmla.f32 q11, q5, d0[1] \n"
"vmla.f32 q12, q4, d1[0] \n"
"vmla.f32 q13, q5, d1[0] \n"
"vmla.f32 q14, q4, d1[1] \n"
"vmla.f32 q15, q5, d1[1] \n"
"vld1.f32 {d0-d1}, [%[lhs_ptr]]! \n" "fmla v24.4s, v8.4s, v1.s[0] \n"
"vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n" "fmla v25.4s, v9.4s, v1.s[0] \n"
"vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n" "fmla v26.4s, v8.4s, v1.s[1] \n"
"fmla v27.4s, v9.4s, v1.s[1] \n"
"fmla v28.4s, v8.4s, v1.s[2] \n"
"fmla v29.4s, v9.4s, v1.s[2] \n"
"fmla v30.4s, v8.4s, v1.s[3] \n"
"fmla v31.4s, v9.4s, v1.s[3] \n"
"vmla.f32 q8, q6, d2[0] \n" "ld1 {v1.4s}, [%[lhs_ptr]], #16 \n"
"vmla.f32 q9, q7, d2[0] \n" "ld1 {v8.4s}, [%[rhs_ptr]], #16 \n"
"vmla.f32 q10, q6, d2[1] \n" "ld1 {v9.4s}, [%[rhs_ptr]], #16 \n"
"vmla.f32 q11, q7, d2[1] \n"
"vmla.f32 q12, q6, d3[0] \n"
"vmla.f32 q13, q7, d3[0] \n"
"vmla.f32 q14, q6, d3[1] \n"
"vmla.f32 q15, q7, d3[1] \n"
"vld1.f32 {d2-d3}, [%[lhs_ptr]]! \n" "fmla v16.4s, v10.4s, v2.s[0] \n"
"vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n" "fmla v17.4s, v11.4s, v2.s[0] \n"
"vld1.f32 {d14-d15}, [%[rhs_ptr]]! \n" "fmla v18.4s, v10.4s, v2.s[1] \n"
"fmla v19.4s, v11.4s, v2.s[1] \n"
"fmla v20.4s, v10.4s, v2.s[2] \n"
"fmla v21.4s, v11.4s, v2.s[2] \n"
"fmla v22.4s, v10.4s, v2.s[3] \n"
"fmla v23.4s, v11.4s, v2.s[3] \n"
"vmla.f32 q8, q4, d4[0] \n" "ld1 {v2.4s}, [%[lhs_ptr]], #16 \n"
"vmla.f32 q9, q5, d4[0] \n"
"vmla.f32 q10, q4, d4[1] \n"
"vmla.f32 q11, q5, d4[1] \n"
"vmla.f32 q12, q4, d5[0] \n"
"vmla.f32 q13, q5, d5[0] \n"
"vmla.f32 q14, q4, d5[1] \n"
"vmla.f32 q15, q5, d5[1] \n"
"vld1.f32 {d4-d5}, [%[lhs_ptr]]! \n" "fmla v24.4s, v10.4s, v3.s[0] \n"
"vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n" "fmla v25.4s, v11.4s, v3.s[0] \n"
"vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n" "fmla v26.4s, v10.4s, v3.s[1] \n"
"fmla v27.4s, v11.4s, v3.s[1] \n"
"fmla v28.4s, v10.4s, v3.s[2] \n"
"fmla v29.4s, v11.4s, v3.s[2] \n"
"fmla v30.4s, v10.4s, v3.s[3] \n"
"fmla v31.4s, v11.4s, v3.s[3] \n"
"ld1 {v3.4s}, [%[lhs_ptr]], #16 \n"
"ld1 {v10.4s}, [%[rhs_ptr]], #16 \n"
"ld1 {v11.4s}, [%[rhs_ptr]], #16 \n"
"fmla v16.4s, v12.4s, v4.s[0] \n"
"fmla v17.4s, v13.4s, v4.s[0] \n"
"fmla v18.4s, v12.4s, v4.s[1] \n"
"fmla v19.4s, v13.4s, v4.s[1] \n"
"fmla v20.4s, v12.4s, v4.s[2] \n"
"fmla v21.4s, v13.4s, v4.s[2] \n"
"fmla v22.4s, v12.4s, v4.s[3] \n"
"fmla v23.4s, v13.4s, v4.s[3] \n"
"ld1 {v4.4s}, [%[lhs_ptr]], #16 \n"
"fmla v24.4s, v12.4s, v5.s[0] \n"
"fmla v25.4s, v13.4s, v5.s[0] \n"
"fmla v26.4s, v12.4s, v5.s[1] \n"
"fmla v27.4s, v13.4s, v5.s[1] \n"
"fmla v28.4s, v12.4s, v5.s[2] \n"
"fmla v29.4s, v13.4s, v5.s[2] \n"
"fmla v30.4s, v12.4s, v5.s[3] \n"
"fmla v31.4s, v13.4s, v5.s[3] \n"
"ld1 {v5.4s}, [%[lhs_ptr]], #16 \n"
"ld1 {v12.4s}, [%[rhs_ptr]], #16 \n"
"ld1 {v13.4s}, [%[rhs_ptr]], #16 \n"
"fmla v16.4s, v14.4s, v6.s[0] \n"
"fmla v17.4s, v15.4s, v6.s[0] \n"
"fmla v18.4s, v14.4s, v6.s[1] \n"
"fmla v19.4s, v15.4s, v6.s[1] \n"
"fmla v20.4s, v14.4s, v6.s[2] \n"
"fmla v21.4s, v15.4s, v6.s[2] \n"
"fmla v22.4s, v14.4s, v6.s[3] \n"
"fmla v23.4s, v15.4s, v6.s[3] \n"
"ld1 {v6.4s}, [%[lhs_ptr]], #16 \n"
"subs %[r_depth_block_count], %[r_depth_block_count], #1 \n" "subs %[r_depth_block_count], %[r_depth_block_count], #1 \n"
"vmla.f32 q8, q6, d6[0] \n" "fmla v24.4s, v14.4s, v7.s[0] \n"
"vmla.f32 q9, q7, d6[0] \n" "fmla v25.4s, v15.4s, v7.s[0] \n"
"vmla.f32 q10, q6, d6[1] \n" "fmla v26.4s, v14.4s, v7.s[1] \n"
"vmla.f32 q11, q7, d6[1] \n" "fmla v27.4s, v15.4s, v7.s[1] \n"
"vmla.f32 q12, q6, d7[0] \n" "fmla v28.4s, v14.4s, v7.s[2] \n"
"vmla.f32 q13, q7, d7[0] \n" "fmla v29.4s, v15.4s, v7.s[2] \n"
"vmla.f32 q14, q6, d7[1] \n" "fmla v30.4s, v14.4s, v7.s[3] \n"
"vmla.f32 q15, q7, d7[1] \n" "fmla v31.4s, v15.4s, v7.s[3] \n"
"vld1.f32 {d6-d7}, [%[lhs_ptr]]! \n" "ld1 {v7.4s}, [%[lhs_ptr]], #16 \n"
"vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n" "ld1 {v14.4s}, [%[rhs_ptr]], #16 \n"
"vld1.f32 {d14-d15}, [%[rhs_ptr]]! \n" "ld1 {v15.4s}, [%[rhs_ptr]], #16 \n"
"bne 0b \n" "bne 0b \n"
// prologue // prologue
"1:\n" "1:\n"
"fmla v16.4s, v8.4s, v0.s[0] \n"
"fmla v17.4s, v9.4s, v0.s[0] \n"
"fmla v18.4s, v8.4s, v0.s[1] \n"
"fmla v19.4s, v9.4s, v0.s[1] \n"
"fmla v20.4s, v8.4s, v0.s[2] \n"
"fmla v21.4s, v9.4s, v0.s[2] \n"
"fmla v22.4s, v8.4s, v0.s[3] \n"
"fmla v23.4s, v9.4s, v0.s[3] \n"
"fmla v24.4s, v8.4s, v1.s[0] \n"
"fmla v25.4s, v9.4s, v1.s[0] \n"
"fmla v26.4s, v8.4s, v1.s[1] \n"
"fmla v27.4s, v9.4s, v1.s[1] \n"
"fmla v28.4s, v8.4s, v1.s[2] \n"
"fmla v29.4s, v9.4s, v1.s[2] \n"
"fmla v30.4s, v8.4s, v1.s[3] \n"
"fmla v31.4s, v9.4s, v1.s[3] \n"
"fmla v16.4s, v10.4s, v2.s[0] \n"
"fmla v17.4s, v11.4s, v2.s[0] \n"
"fmla v18.4s, v10.4s, v2.s[1] \n"
"fmla v19.4s, v11.4s, v2.s[1] \n"
"fmla v20.4s, v10.4s, v2.s[2] \n"
"fmla v21.4s, v11.4s, v2.s[2] \n"
"fmla v22.4s, v10.4s, v2.s[3] \n"
"fmla v23.4s, v11.4s, v2.s[3] \n"
"fmla v24.4s, v10.4s, v3.s[0] \n"
"fmla v25.4s, v11.4s, v3.s[0] \n"
"fmla v26.4s, v10.4s, v3.s[1] \n"
"fmla v27.4s, v11.4s, v3.s[1] \n"
"fmla v28.4s, v10.4s, v3.s[2] \n"
"fmla v29.4s, v11.4s, v3.s[2] \n"
"fmla v30.4s, v10.4s, v3.s[3] \n"
"fmla v31.4s, v11.4s, v3.s[3] \n"
"fmla v16.4s, v12.4s, v4.s[0] \n"
"fmla v17.4s, v13.4s, v4.s[0] \n"
"fmla v18.4s, v12.4s, v4.s[1] \n"
"fmla v19.4s, v13.4s, v4.s[1] \n"
"fmla v20.4s, v12.4s, v4.s[2] \n"
"fmla v21.4s, v13.4s, v4.s[2] \n"
"fmla v22.4s, v12.4s, v4.s[3] \n"
"fmla v23.4s, v13.4s, v4.s[3] \n"
"fmla v24.4s, v12.4s, v5.s[0] \n"
"fmla v25.4s, v13.4s, v5.s[0] \n"
"fmla v26.4s, v12.4s, v5.s[1] \n"
"fmla v27.4s, v13.4s, v5.s[1] \n"
"fmla v28.4s, v12.4s, v5.s[2] \n"
"fmla v29.4s, v13.4s, v5.s[2] \n"
"fmla v30.4s, v12.4s, v5.s[3] \n"
"fmla v31.4s, v13.4s, v5.s[3] \n"
"fmla v16.4s, v14.4s, v6.s[0] \n"
"fmla v17.4s, v15.4s, v6.s[0] \n"
"fmla v18.4s, v14.4s, v6.s[1] \n"
"fmla v19.4s, v15.4s, v6.s[1] \n"
"fmla v20.4s, v14.4s, v6.s[2] \n"
"fmla v21.4s, v15.4s, v6.s[2] \n"
"fmla v22.4s, v14.4s, v6.s[3] \n"
"fmla v23.4s, v15.4s, v6.s[3] \n"
"fmla v24.4s, v14.4s, v7.s[0] \n"
"fmla v25.4s, v15.4s, v7.s[0] \n"
"fmla v26.4s, v14.4s, v7.s[1] \n"
"fmla v27.4s, v15.4s, v7.s[1] \n"
"fmla v28.4s, v14.4s, v7.s[2] \n"
"fmla v29.4s, v15.4s, v7.s[2] \n"
"fmla v30.4s, v14.4s, v7.s[3] \n"
"fmla v31.4s, v15.4s, v7.s[3] \n"
"st1 {v16.4s}, [%[packed_output_data]], #16 \n"
"st1 {v17.4s}, [%[packed_output_data]], #16 \n"
"st1 {v18.4s}, [%[packed_output_data]], #16 \n"
"st1 {v19.4s}, [%[packed_output_data]], #16 \n"
"st1 {v20.4s}, [%[packed_output_data]], #16 \n"
"st1 {v21.4s}, [%[packed_output_data]], #16 \n"
"st1 {v22.4s}, [%[packed_output_data]], #16 \n"
"st1 {v23.4s}, [%[packed_output_data]], #16 \n"
"st1 {v24.4s}, [%[packed_output_data]], #16 \n"
"st1 {v25.4s}, [%[packed_output_data]], #16 \n"
"st1 {v26.4s}, [%[packed_output_data]], #16 \n"
"st1 {v27.4s}, [%[packed_output_data]], #16 \n"
"st1 {v28.4s}, [%[packed_output_data]], #16 \n"
"st1 {v29.4s}, [%[packed_output_data]], #16 \n"
"st1 {v30.4s}, [%[packed_output_data]], #16 \n"
"st1 {v31.4s}, [%[packed_output_data]], #16 \n"
: // outputs
[lhs_ptr] "+r"(lhs_ptr),
[rhs_ptr] "+r"(rhs_ptr),
[packed_output_data] "+r"(packed_output_data),
[r_depth_block_count] "+r"(r_depth_block_count)
: // inputs
: // clabbers
"cc", "memory",
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
}
#else // armeabi-v7a
// Register layout: (4x4) x (4,8)
//
// +--------+--------+
// | q4 ... | q5 ... |
// Rhs +--------+--------+
// | q6 ... | q7 ... |
// +--------+--------+
// | q4 ... | q5 ... |
// +--------+--------+
// | q6 ... | q7 ... |
// +--------+--------+
//
// Lhs
//
// +----+----+----+----+ - - +--------+--------+
// | q0 | q1 | q2 | q3 | | q8... | q9... |
// | . | | | | | q10... | q11... |
// | . | | | | | q12... | q13... |
// | . | | | | | q14... | q15... |
// +----+----+----+----+ +--------+--------+
//
// Accumulator
//
if (depth_block_count > 0) {
index_t r_depth_block_count = depth_block_count;
// just make compiler happy
MACE_UNUSED(r_depth_block_count);
asm volatile(
"mov r0, #0\n"
"vdup.f32 q8, r0 \n"
"vdup.f32 q9, r0 \n"
"vdup.f32 q10, r0 \n"
"vdup.f32 q11, r0 \n"
"vdup.f32 q12, r0 \n"
"vdup.f32 q13, r0 \n"
"vdup.f32 q14, r0 \n"
"vdup.f32 q15, r0 \n"
// prelogue
"vld1.f32 {d0-d1}, [%[lhs_ptr]]! \n"
"vld1.f32 {d2-d3}, [%[lhs_ptr]]! \n"
"vld1.f32 {d4-d5}, [%[lhs_ptr]]! \n"
"vld1.f32 {d6-d7}, [%[lhs_ptr]]! \n"
"vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n"
"vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n"
"vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n"
"vld1.f32 {d14-d15}, [%[rhs_ptr]]! \n"
"subs %[r_depth_block_count], %[r_depth_block_count], #1 \n"
"beq 1f\n"
"0: \n"
"vmla.f32 q8, q4, d0[0] \n" "vmla.f32 q8, q4, d0[0] \n"
"vmla.f32 q9, q5, d0[0] \n" "vmla.f32 q9, q5, d0[0] \n"
"vmla.f32 q10, q4, d0[1] \n" "vmla.f32 q10, q4, d0[1] \n"
...@@ -636,6 +895,7 @@ void Gemm::ComputeBlock(const float *packed_lhs_data, ...@@ -636,6 +895,7 @@ void Gemm::ComputeBlock(const float *packed_lhs_data,
"vmla.f32 q14, q4, d1[1] \n" "vmla.f32 q14, q4, d1[1] \n"
"vmla.f32 q15, q5, d1[1] \n" "vmla.f32 q15, q5, d1[1] \n"
"vld1.f32 {d0-d1}, [%[lhs_ptr]]! \n"
"vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n" "vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n"
"vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n" "vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n"
...@@ -648,6 +908,7 @@ void Gemm::ComputeBlock(const float *packed_lhs_data, ...@@ -648,6 +908,7 @@ void Gemm::ComputeBlock(const float *packed_lhs_data,
"vmla.f32 q14, q6, d3[1] \n" "vmla.f32 q14, q6, d3[1] \n"
"vmla.f32 q15, q7, d3[1] \n" "vmla.f32 q15, q7, d3[1] \n"
"vld1.f32 {d2-d3}, [%[lhs_ptr]]! \n"
"vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n" "vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n"
"vld1.f32 {d14-d15}, [%[rhs_ptr]]! \n" "vld1.f32 {d14-d15}, [%[rhs_ptr]]! \n"
...@@ -660,6 +921,12 @@ void Gemm::ComputeBlock(const float *packed_lhs_data, ...@@ -660,6 +921,12 @@ void Gemm::ComputeBlock(const float *packed_lhs_data,
"vmla.f32 q14, q4, d5[1] \n" "vmla.f32 q14, q4, d5[1] \n"
"vmla.f32 q15, q5, d5[1] \n" "vmla.f32 q15, q5, d5[1] \n"
"vld1.f32 {d4-d5}, [%[lhs_ptr]]! \n"
"vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n"
"vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n"
"subs %[r_depth_block_count], %[r_depth_block_count], #1 \n"
"vmla.f32 q8, q6, d6[0] \n" "vmla.f32 q8, q6, d6[0] \n"
"vmla.f32 q9, q7, d6[0] \n" "vmla.f32 q9, q7, d6[0] \n"
"vmla.f32 q10, q6, d6[1] \n" "vmla.f32 q10, q6, d6[1] \n"
...@@ -669,568 +936,262 @@ void Gemm::ComputeBlock(const float *packed_lhs_data, ...@@ -669,568 +936,262 @@ void Gemm::ComputeBlock(const float *packed_lhs_data,
"vmla.f32 q14, q6, d7[1] \n" "vmla.f32 q14, q6, d7[1] \n"
"vmla.f32 q15, q7, d7[1] \n" "vmla.f32 q15, q7, d7[1] \n"
"vst1.f32 {d16-d17}, [%[packed_output_data]]! \n" "vld1.f32 {d6-d7}, [%[lhs_ptr]]! \n"
"vst1.f32 {d18-d19}, [%[packed_output_data]]! \n" "vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n"
"vst1.f32 {d20-d21}, [%[packed_output_data]]! \n" "vld1.f32 {d14-d15}, [%[rhs_ptr]]! \n"
"vst1.f32 {d22-d23}, [%[packed_output_data]]! \n"
"vst1.f32 {d24-d25}, [%[packed_output_data]]! \n"
"vst1.f32 {d26-d27}, [%[packed_output_data]]! \n"
"vst1.f32 {d28-d29}, [%[packed_output_data]]! \n"
"vst1.f32 {d30-d31}, [%[packed_output_data]]! \n"
: // outputs
[lhs_ptr] "+r"(lhs_ptr),
[rhs_ptr] "+r"(rhs_ptr),
[packed_output_data] "+r"(packed_output_data),
[r_depth_block_count] "+r"(r_depth_block_count)
: // inputs
: // clabbers
"cc", "memory", "r0",
"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
}
#endif
}
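// For reference, a standalone scalar version of the loop sketched in the
// "Ref:" comment of ComputeBlock above (an illustrative sketch only; the
// helper below is not part of MACE, and the block sizes shown are the
// armeabi-v7a configuration of 4 output rows by 8 output columns):
inline void ComputeBlockRef(const float *packed_lhs,
                            const float *packed_rhs,
                            const index_t depth_padded,
                            float *packed_output) {
  const index_t r_block_size = 4;  // 8 on aarch64
  const index_t c_block_size = 8;
  for (index_t r = 0; r < r_block_size; ++r) {
    for (index_t c = 0; c < c_block_size; ++c) {
      float sum = 0.f;
      for (index_t d = 0; d < depth_padded; ++d) {
        // lhs tile: depth-major, r_block_size lanes per depth step;
        // rhs tile: depth-major, c_block_size lanes per depth step.
        sum += packed_lhs[d * r_block_size + r] *
               packed_rhs[d * c_block_size + c];
      }
      packed_output[r * c_block_size + c] = sum;
    }
  }
}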
void Gemm::PackLhs(const MatrixMap<const float> &lhs,
float *packed_lhs) {
#ifdef __aarch64__
Pack<8, 4>(lhs, ColMajor, packed_lhs);
#else
Pack<4, 4>(lhs, ColMajor, packed_lhs);
#endif
}
void Gemm::PackRhs(const MatrixMap<const float> &rhs,
float *packed_rhs) {
Pack<8, 4>(rhs, RowMajor, packed_rhs);
}
void Gemm::UnpackOutput(const float *packed_output, MatrixMap<float> *output) {
#ifdef __aarch64__
Unpack<8, 8>(packed_output, output);
#else
Unpack<4, 8>(packed_output, output);
#endif
}
template<>
void Gemm::Pack<4, 4>(const MatrixMap<const float> &matrix,
MatrixMajor dst_major,
float *packed_matrix) {
const index_t rows = matrix.rows();
const index_t cols = matrix.cols();
// use the same terminology as GemmLowp:
// depth is depth, width is the opposite dim other than depth
// lhs
index_t width = rows;
index_t depth = cols;
index_t width_stride = matrix.rows_stride();
index_t depth_stride = matrix.cols_stride();
if (dst_major == RowMajor) {
// rhs
std::swap(width, depth);
std::swap(width_stride, depth_stride);
}
const float *data = matrix.data();
float *packed_ptr = packed_matrix;
const index_t block_size = 4;
const index_t depth_padded = RoundUp(depth, static_cast<index_t>(4));
if (depth_padded > depth) {
memset(packed_ptr + depth * block_size,
0,
sizeof(float) * (depth_padded - depth) * block_size);
}
if (dst_major == matrix.matrix_major()) {
if (width < block_size) {
const index_t width_remain = block_size - width;
for (index_t d = 0; d < depth; ++d) {
memcpy(packed_ptr, data, sizeof(float) * width);
memset(packed_ptr + width, 0, sizeof(float) * width_remain);
data += depth_stride;
packed_ptr += block_size;
}
} else {
for (index_t d = 0; d < depth; ++d) {
float32x4_t vi = vld1q_f32(data);
vst1q_f32(packed_ptr, vi);
data += depth_stride;
packed_ptr += block_size;
}
}
} else {
if (width < block_size) {
const index_t width_remain = block_size - width;
for (index_t d = 0; d < depth; ++d) {
for (index_t w = 0; w < width; ++w) {
packed_ptr[w] = data[w * width_stride + d];
} // w
memset(packed_ptr + width, 0, sizeof(float) * width_remain);
packed_ptr += block_size;
} // d
} else {
const float *data0 = data;
const float *data1 = data + width_stride;
const float *data2 = data1 + width_stride;
const float *data3 = data2 + width_stride;
const index_t depth_block = depth / 4;
const index_t depth_remain = depth - depth_block * 4;
for (index_t depth_block_idx = 0; depth_block_idx < depth_block;
++depth_block_idx) {
float32x4_t v0 = vld1q_f32(data0);
float32x4_t v1 = vld1q_f32(data1);
float32x4_t v2 = vld1q_f32(data2);
float32x4_t v3 = vld1q_f32(data3);
float32x4x2_t v02_intertwined = vzipq_f32(v0, v2);
float32x4x2_t v13_intertwined = vzipq_f32(v1, v3);
float32x4x2_t v0123_intertwined =
vzipq_f32(v02_intertwined.val[0], v13_intertwined.val[0]);
float32x4x2_t v0123n_intertwined =
vzipq_f32(v02_intertwined.val[1], v13_intertwined.val[1]);
vst1q_f32(packed_ptr, v0123_intertwined.val[0]);
packed_ptr += 4;
vst1q_f32(packed_ptr, v0123_intertwined.val[1]);
packed_ptr += 4;
vst1q_f32(packed_ptr, v0123n_intertwined.val[0]);
packed_ptr += 4;
vst1q_f32(packed_ptr, v0123n_intertwined.val[1]);
packed_ptr += 4;
data0 += 4;
data1 += 4;
data2 += 4;
data3 += 4;
}
for (index_t d = 0; d < depth_remain; ++d) {
float32x4_t vi = {*data0, *data1, *data2, *data3};
vst1q_f32(packed_ptr, vi);
packed_ptr += 4;
++data0;
++data1;
++data2;
++data3;
} // d
}
}
}
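// What Pack<4, 4> produces for a row-major lhs tile, written out as a
// scalar sketch (illustrative only; this helper is not part of MACE): for
// every depth index d the four width (row) elements are stored
// contiguously, and both width and depth are zero-padded to the block size.
inline void PackLhsTileRef(const float *lhs,          // row-major tile
                           const index_t rows,        // width, at most 4
                           const index_t cols,        // depth
                           const index_t row_stride,
                           float *packed) {
  const index_t block_size = 4;
  const index_t depth_padded = RoundUp(cols, static_cast<index_t>(4));
  for (index_t d = 0; d < depth_padded; ++d) {
    for (index_t w = 0; w < block_size; ++w) {
      packed[d * block_size + w] =
          (d < cols && w < rows) ? lhs[w * row_stride + d] : 0.f;
    }
  }
}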
template<>
void Gemm::Pack<8, 4>(const MatrixMap<const float> &matrix,
MatrixMajor dst_major,
float *packed_matrix) {
const index_t rows = matrix.rows();
const index_t cols = matrix.cols();
// use the same terminology as GemmLowp:
// depth is depth, width is the opposite dim other than depth
// lhs
index_t width = rows;
index_t depth = cols;
index_t width_stride = matrix.rows_stride();
index_t depth_stride = matrix.cols_stride();
if (dst_major == RowMajor) {
// rhs
std::swap(width, depth);
std::swap(width_stride, depth_stride);
}
const float *data = matrix.data();
float *packed_ptr = packed_matrix;
const index_t block_size = 8;
const index_t depth_padded = RoundUp(depth, static_cast<index_t>(4));
if (depth_padded > depth) {
memset(packed_ptr + depth * block_size,
0,
sizeof(float) * (depth_padded - depth) * block_size);
}
if (dst_major == matrix.matrix_major()) {
if (width < block_size) {
const index_t width_remain = block_size - width;
for (index_t d = 0; d < depth; ++d) {
memcpy(packed_ptr, data, sizeof(float) * width);
memset(packed_ptr + width, 0, sizeof(float) * width_remain);
data += depth_stride;
packed_ptr += block_size;
}
} else {
for (index_t d = 0; d < depth; ++d) {
float32x4_t vi = vld1q_f32(data);
vst1q_f32(packed_ptr, vi);
float32x4_t vin = vld1q_f32(data + 4);
vst1q_f32(packed_ptr + 4, vin);
data += depth_stride;
packed_ptr += block_size;
}
}
} else {
if (width < block_size) {
const index_t width_remain = block_size - width;
for (index_t d = 0; d < depth; ++d) {
for (index_t w = 0; w < width; ++w) {
packed_ptr[w] = data[w * width_stride + d];
} // w
memset(packed_ptr + width, 0, sizeof(float) * width_remain);
packed_ptr += block_size;
} // d
} else {
const float *data0 = data;
const float *data1 = data + width_stride;
const float *data2 = data1 + width_stride;
const float *data3 = data2 + width_stride;
const float *data4 = data3 + width_stride;
const float *data5 = data4 + width_stride;
const float *data6 = data5 + width_stride;
const float *data7 = data6 + width_stride;
const index_t depth_block = depth / 4;
const index_t depth_remain = depth - depth_block * 4;
for (index_t depth_block_idx = 0; depth_block_idx < depth_block;
++depth_block_idx) {
float32x4_t v0 = vld1q_f32(data0);
float32x4_t v1 = vld1q_f32(data1);
float32x4_t v2 = vld1q_f32(data2);
float32x4_t v3 = vld1q_f32(data3);
float32x4x2_t v02_intertwined = vzipq_f32(v0, v2);
float32x4x2_t v13_intertwined = vzipq_f32(v1, v3);
float32x4x2_t v0123_intertwined =
vzipq_f32(v02_intertwined.val[0], v13_intertwined.val[0]);
float32x4x2_t v0123n_intertwined =
vzipq_f32(v02_intertwined.val[1], v13_intertwined.val[1]);
float32x4_t v4 = vld1q_f32(data4);
float32x4_t v5 = vld1q_f32(data5);
float32x4_t v6 = vld1q_f32(data6);
float32x4_t v7 = vld1q_f32(data7);
float32x4x2_t v46_intertwined = vzipq_f32(v4, v6);
float32x4x2_t v57_intertwined = vzipq_f32(v5, v7);
float32x4x2_t v4567_intertwined =
vzipq_f32(v46_intertwined.val[0], v57_intertwined.val[0]);
float32x4x2_t v4567n_intertwined =
vzipq_f32(v46_intertwined.val[1], v57_intertwined.val[1]);
vst1q_f32(packed_ptr, v0123_intertwined.val[0]);
packed_ptr += 4;
vst1q_f32(packed_ptr, v4567_intertwined.val[0]);
packed_ptr += 4;
vst1q_f32(packed_ptr, v0123_intertwined.val[1]);
packed_ptr += 4;
vst1q_f32(packed_ptr, v4567_intertwined.val[1]);
packed_ptr += 4;
vst1q_f32(packed_ptr, v0123n_intertwined.val[0]);
packed_ptr += 4;
vst1q_f32(packed_ptr, v4567n_intertwined.val[0]);
packed_ptr += 4;
vst1q_f32(packed_ptr, v0123n_intertwined.val[1]);
packed_ptr += 4;
vst1q_f32(packed_ptr, v4567n_intertwined.val[1]);
packed_ptr += 4;
data0 += 4;
data1 += 4;
data2 += 4;
data3 += 4;
data4 += 4;
data5 += 4;
data6 += 4;
data7 += 4;
}
for (index_t d = 0; d < depth_remain; ++d) {
float32x4_t vi = {*data0, *data1, *data2, *data3};
vst1q_f32(packed_ptr, vi);
packed_ptr += 4;
float32x4_t vin = {*data4, *data5, *data6, *data7};
vst1q_f32(packed_ptr, vin);
packed_ptr += 4;
++data0;
++data1;
++data2;
++data3;
++data4;
++data5;
++data6;
++data7;
} // d
}
}
}
template<>
void Gemm::Unpack<4, 8>(const float *packed_output, MatrixMap<float> *output) {
const index_t rows = output->rows();
const index_t cols = output->cols();
index_t row_stride = output->rows_stride();
index_t col_stride = output->cols_stride();
float *output_ptr = output->data();
const float *packed_ptr = packed_output;
const index_t block_size = 8;
// packed_output always has row-major
if (output->matrix_major() == RowMajor) {
if (cols < block_size) {
for (index_t r = 0; r < rows; ++r) {
memcpy(output_ptr, packed_ptr, sizeof(float) * cols);
output_ptr += row_stride;
packed_ptr += block_size;
}
} else {
for (index_t r = 0; r < rows; ++r) {
float32x4_t vi = vld1q_f32(packed_ptr);
vst1q_f32(output_ptr, vi);
float32x4_t vin = vld1q_f32(packed_ptr + 4);
vst1q_f32(output_ptr + 4, vin);
output_ptr += row_stride;
packed_ptr += block_size;
}
}
} else {
// ColMajor
if (rows < block_size) {
for (index_t c = 0; c < cols; ++c) {
for (index_t r = 0; r < rows; ++r) {
output_ptr[c * col_stride + r] = packed_ptr[r * block_size + c];
} // r
} // c
} else {
const float *data0 = packed_ptr;
const float *data1 = data0 + block_size;
const float *data2 = data1 + block_size;
const float *data3 = data2 + block_size;
index_t col_block = cols / 4;
index_t col_remain = cols - col_block * 4;
for (index_t col_block_idx = 0; col_block_idx < col_block;
++col_block_idx) {
float32x4_t v0 = vld1q_f32(data0);
float32x4_t v1 = vld1q_f32(data1);
float32x4_t v2 = vld1q_f32(data2);
float32x4_t v3 = vld1q_f32(data3);
float32x4x2_t v02_intertwined = vzipq_f32(v0, v2);
float32x4x2_t v13_intertwined = vzipq_f32(v1, v3);
float32x4x2_t v0123_intertwined =
vzipq_f32(v02_intertwined.val[0], v13_intertwined.val[0]);
float32x4x2_t v0123n_intertwined =
vzipq_f32(v02_intertwined.val[1], v13_intertwined.val[1]);
vst1q_f32(output_ptr, v0123_intertwined.val[0]);
output_ptr += col_stride;
vst1q_f32(output_ptr, v0123_intertwined.val[1]);
output_ptr += col_stride;
vst1q_f32(output_ptr, v0123n_intertwined.val[0]);
output_ptr += col_stride;
vst1q_f32(output_ptr, v0123n_intertwined.val[1]);
output_ptr += col_stride;
data0 += 4;
data1 += 4;
data2 += 4;
data3 += 4;
}
for (index_t c = 0; c < col_remain; ++c) {
float32x4_t vi = {*data0, *data1, *data2, *data3};
vst1q_f32(output_ptr, vi);
output_ptr += col_stride;
++data0;
++data1;
++data2;
++data3;
} // d
}
}
}
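// The same unpacking in scalar form (a sketch, not MACE code): the packed
// output tile is always row-major with an 8-element row stride, so tile
// element (r, c) lives at packed[r * 8 + c] and only has to be scattered
// into the real output according to the output's own strides (here the
// strides are assumed to satisfy: element (r, c) sits at
// r * row_stride + c * col_stride, with stride 1 for the contiguous dim).
inline void UnpackTileRef(const float *packed,
                          const index_t rows, const index_t cols,
                          const index_t row_stride,
                          const index_t col_stride,
                          float *output) {
  const index_t block_size = 8;
  for (index_t r = 0; r < rows; ++r) {
    for (index_t c = 0; c < cols; ++c) {
      output[r * row_stride + c * col_stride] = packed[r * block_size + c];
    }
  }
}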
template<>
void Gemm::Unpack<8, 8>(const float *packed_output, MatrixMap<float> *output) {
const index_t rows = output->rows();
const index_t cols = output->cols();
index_t row_stride = output->rows_stride();
index_t col_stride = output->cols_stride();
float *output_ptr = output->data();
const float *packed_ptr = packed_output;
const index_t block_size = 8;
// packed_output always has row-major
if (output->matrix_major() == RowMajor) {
if (cols < block_size) {
for (index_t r = 0; r < rows; ++r) {
memcpy(output_ptr, packed_ptr, sizeof(float) * cols);
output_ptr += row_stride;
packed_ptr += block_size;
}
} else {
for (index_t r = 0; r < rows; ++r) {
float32x4_t vi = vld1q_f32(packed_ptr);
vst1q_f32(output_ptr, vi);
float32x4_t vin = vld1q_f32(packed_ptr + 4);
vst1q_f32(output_ptr + 4, vin);
output_ptr += row_stride;
packed_ptr += block_size;
}
}
} else {
// ColMajor
if (rows < block_size) {
for (index_t c = 0; c < cols; ++c) {
for (index_t r = 0; r < rows; ++r) {
output_ptr[c * col_stride + r] = packed_ptr[r * block_size + c];
} // r
} // c
} else {
const float *data0 = packed_ptr;
const float *data1 = data0 + block_size;
const float *data2 = data1 + block_size;
const float *data3 = data2 + block_size;
const float *data4 = data3 + block_size;
const float *data5 = data4 + block_size;
const float *data6 = data5 + block_size;
const float *data7 = data6 + block_size;
index_t col_block = cols / 4;
index_t col_remain = cols - col_block * 4;
for (index_t col_block_idx = 0; col_block_idx < col_block;
++col_block_idx) {
float32x4_t v0 = vld1q_f32(data0);
float32x4_t v1 = vld1q_f32(data1);
float32x4_t v2 = vld1q_f32(data2);
float32x4_t v3 = vld1q_f32(data3);
float32x4x2_t v02_intertwined = vzipq_f32(v0, v2);
float32x4x2_t v13_intertwined = vzipq_f32(v1, v3);
float32x4x2_t v0123_intertwined =
vzipq_f32(v02_intertwined.val[0], v13_intertwined.val[0]);
float32x4x2_t v0123n_intertwined =
vzipq_f32(v02_intertwined.val[1], v13_intertwined.val[1]);
float32x4_t v4 = vld1q_f32(data4); "bne 0b \n"
float32x4_t v5 = vld1q_f32(data5);
float32x4_t v6 = vld1q_f32(data6);
float32x4_t v7 = vld1q_f32(data7);
float32x4x2_t v46_intertwined = vzipq_f32(v4, v6);
float32x4x2_t v57_intertwined = vzipq_f32(v5, v7);
float32x4x2_t v4567_intertwined =
vzipq_f32(v46_intertwined.val[0], v57_intertwined.val[0]);
float32x4x2_t v4567n_intertwined =
vzipq_f32(v46_intertwined.val[1], v57_intertwined.val[1]);
vst1q_f32(output_ptr, v0123_intertwined.val[0]); // prologue
vst1q_f32(output_ptr + 4, v4567_intertwined.val[0]); "1:\n"
output_ptr += col_stride; "vmla.f32 q8, q4, d0[0] \n"
"vmla.f32 q9, q5, d0[0] \n"
"vmla.f32 q10, q4, d0[1] \n"
"vmla.f32 q11, q5, d0[1] \n"
"vmla.f32 q12, q4, d1[0] \n"
"vmla.f32 q13, q5, d1[0] \n"
"vmla.f32 q14, q4, d1[1] \n"
"vmla.f32 q15, q5, d1[1] \n"
vst1q_f32(output_ptr, v0123_intertwined.val[1]); "vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n"
vst1q_f32(output_ptr + 4, v4567_intertwined.val[1]); "vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n"
output_ptr += col_stride;
vst1q_f32(output_ptr, v0123n_intertwined.val[0]); "vmla.f32 q8, q6, d2[0] \n"
vst1q_f32(output_ptr + 4, v4567n_intertwined.val[0]); "vmla.f32 q9, q7, d2[0] \n"
output_ptr += col_stride; "vmla.f32 q10, q6, d2[1] \n"
"vmla.f32 q11, q7, d2[1] \n"
"vmla.f32 q12, q6, d3[0] \n"
"vmla.f32 q13, q7, d3[0] \n"
"vmla.f32 q14, q6, d3[1] \n"
"vmla.f32 q15, q7, d3[1] \n"
vst1q_f32(output_ptr, v0123n_intertwined.val[1]); "vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n"
vst1q_f32(output_ptr + 4, v4567n_intertwined.val[1]); "vld1.f32 {d14-d15}, [%[rhs_ptr]]! \n"
output_ptr += col_stride;
data0 += 4; "vmla.f32 q8, q4, d4[0] \n"
data1 += 4; "vmla.f32 q9, q5, d4[0] \n"
data2 += 4; "vmla.f32 q10, q4, d4[1] \n"
data3 += 4; "vmla.f32 q11, q5, d4[1] \n"
data4 += 4; "vmla.f32 q12, q4, d5[0] \n"
data5 += 4; "vmla.f32 q13, q5, d5[0] \n"
data6 += 4; "vmla.f32 q14, q4, d5[1] \n"
data7 += 4; "vmla.f32 q15, q5, d5[1] \n"
}
for (index_t c = 0; c < col_remain; ++c) {
float32x4_t vi = {*data0, *data1, *data2, *data3};
vst1q_f32(output_ptr, vi);
float32x4_t vin = {*data4, *data5, *data6, *data7};
vst1q_f32(output_ptr + 4, vin);
output_ptr += col_stride;
++data0; "vmla.f32 q8, q6, d6[0] \n"
++data1; "vmla.f32 q9, q7, d6[0] \n"
++data2; "vmla.f32 q10, q6, d6[1] \n"
++data3; "vmla.f32 q11, q7, d6[1] \n"
++data4; "vmla.f32 q12, q6, d7[0] \n"
++data5; "vmla.f32 q13, q7, d7[0] \n"
++data6; "vmla.f32 q14, q6, d7[1] \n"
++data7; "vmla.f32 q15, q7, d7[1] \n"
} // d
} "vst1.f32 {d16-d17}, [%[packed_output_data]]! \n"
"vst1.f32 {d18-d19}, [%[packed_output_data]]! \n"
"vst1.f32 {d20-d21}, [%[packed_output_data]]! \n"
"vst1.f32 {d22-d23}, [%[packed_output_data]]! \n"
"vst1.f32 {d24-d25}, [%[packed_output_data]]! \n"
"vst1.f32 {d26-d27}, [%[packed_output_data]]! \n"
"vst1.f32 {d28-d29}, [%[packed_output_data]]! \n"
"vst1.f32 {d30-d31}, [%[packed_output_data]]! \n"
: // outputs
[lhs_ptr] "+r"(lhs_ptr),
[rhs_ptr] "+r"(rhs_ptr),
[packed_output_data] "+r"(packed_output_data),
[r_depth_block_count] "+r"(r_depth_block_count)
: // inputs
: // clabbers
"cc", "memory", "r0",
"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
} }
#endif
} }
MaceStatus Gemm::Compute(const OpContext *context,
                         const Tensor *lhs,
                         const Tensor *rhs,
                         const index_t batch,
                         const index_t lhs_rows,
                         const index_t lhs_cols,
                         const index_t rhs_rows,
                         const index_t rhs_cols,
                         const bool transpose_lhs,
                         const bool transpose_rhs,
                         const bool transpose_out,
                         const bool lhs_batched,
                         const bool rhs_batched,
                         Tensor *output) {
  index_t rows = transpose_lhs ? lhs_cols : lhs_rows;
  index_t depth = transpose_lhs ? lhs_rows : lhs_cols;
  index_t cols = transpose_rhs ? rhs_rows : rhs_cols;
  index_t depth2 = transpose_rhs ? rhs_cols : rhs_rows;
  MACE_CHECK(depth == depth2,
             "Matrices that multiply have inconsistent depth dim: ",
             depth,
             " vs. ",
             depth2);

  return Compute(context,
                 lhs,
                 rhs,
                 batch,
                 rows,
                 cols,
                 depth,
                 transpose_lhs ? ColMajor : RowMajor,
                 transpose_rhs ? ColMajor : RowMajor,
                 transpose_out ? ColMajor : RowMajor,
                 lhs_batched,
                 rhs_batched,
                 output);
}

void RegisterGemmDelegator(OpDelegatorRegistry *registry) {
  MACE_REGISTER_DELEGATOR(
      registry, Gemm, delegator::GemmParam,
      MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, float, ImplType::NEON));
}

}  // namespace fp32

template<>
MaceStatus Gemm<float>::Compute(const OpContext *context,
                                const Tensor *lhs,
                                const Tensor *rhs,
                                const index_t batch,
                                const index_t rows,
                                const index_t cols,
                                const index_t depth,
                                const MatrixMajor lhs_major,
                                const MatrixMajor rhs_major,
                                const MatrixMajor output_major,
                                const bool lhs_batched,
                                const bool rhs_batched,
                                Tensor *output) {
  MACE_CHECK(output->size() == batch * rows * cols,
             "Need resize output tensor before call gemm.");
  Tensor::MappingGuard lhs_guard(lhs);
  Tensor::MappingGuard rhs_guard(rhs);
  Tensor::MappingGuard output_guard(output);
  const float *lhs_data = lhs->data<float>();
  const float *rhs_data = rhs->data<float>();
  float *output_data = output->mutable_data<float>();

#ifdef __aarch64__
  const index_t row_block_size = 8;
#else
  const index_t row_block_size = 4;
#endif
  const index_t col_block_size = 8;
  const index_t depth_block_size = 4;
  const index_t row_block_count = RoundUpDiv(rows, row_block_size);
  const index_t col_block_count = RoundUpDiv(cols, col_block_size);
  const index_t rows_padded = RoundUp(rows, row_block_size);
  const index_t cols_padded = RoundUp(cols, col_block_size);
  const index_t depth_padded = RoundUp(depth, depth_block_size);

  ScratchBuffer *scratch = context->device()->scratch_buffer();

  index_t packed_lhs_size =
      PadAlignSize(sizeof(float) * rows_padded * depth_padded);
  index_t packed_rhs_size =
      PadAlignSize(sizeof(float) * depth_padded * cols_padded);
  index_t packed_output_size =
      PadAlignSize(sizeof(float) * rows_padded * cols_padded);
  // resize to the total size of lhs & rhs & output anyway,
  // in case we do not cache const tensor for saving memory
  MACE_RETURN_IF_ERROR(scratch->GrowSize(
      packed_lhs_size + packed_rhs_size + packed_output_size));
  float *packed_lhs_data =
      scratch->Scratch(packed_lhs_size).mutable_data<float>();
  float *packed_rhs_data =
      scratch->Scratch(packed_rhs_size).mutable_data<float>();
  float *packed_output_data =
      scratch->Scratch(packed_output_size).mutable_data<float>();

  int cache_side = kNoCache;
  if (cached_ == kCacheLhs) {
    packed_lhs_data = pack_cache_.mutable_data<float>();
  } else if (cached_ == kCacheRhs) {
    packed_rhs_data = pack_cache_.mutable_data<float>();
  } else if (should_cache_pack_) {
    if (lhs->is_weight() && (!lhs_batched || batch == 1)) {
      cache_side = kCacheLhs;
      pack_cache_.Resize(packed_lhs_size);
      packed_lhs_data = pack_cache_.mutable_data<float>();
    } else if (rhs->is_weight() && (!rhs_batched || batch == 1)) {
      cache_side = kCacheRhs;
      pack_cache_.Resize(packed_rhs_size);
      packed_rhs_data = pack_cache_.mutable_data<float>();
    }
  }

  utils::ThreadPool
      &thread_pool = context->device()->cpu_runtime()->thread_pool();

  for (index_t b = 0; b < batch; ++b) {
    MatrixMap<const float>
        lhs_matrix
        (lhs_data + static_cast<index_t>(lhs_batched) * b * rows * depth,
         lhs_major,
         rows,
         depth);
    MatrixMap<const float>
        rhs_matrix
        (rhs_data + static_cast<index_t>(rhs_batched) * b * depth * cols,
         rhs_major,
         depth,
         cols);
    MatrixMap<float> output_matrix
        (output_data + b * rows * cols, output_major, rows, cols);

    // pack lhs
    if (cached_ != kCacheLhs) {
      thread_pool.Compute1D([=, &lhs_matrix](index_t start,
                                             index_t end,
                                             index_t step) {
        for (index_t row_block_idx = start; row_block_idx < end;
             row_block_idx += step) {
          const index_t start_row = row_block_idx * row_block_size;
          const index_t
              row_block_len = std::min(row_block_size, rows - start_row);
          float *packed_lhs_data_block =
              packed_lhs_data + row_block_idx * row_block_size * depth_padded;
          PackLhs(lhs_matrix.block(start_row, 0, row_block_len, depth),
                  packed_lhs_data_block);
        }
      }, 0, row_block_count, 1);

      if (cache_side == kCacheLhs) {
        cached_ = kCacheLhs;
        if (lhs->UnderlyingBuffer()->OnHost()) {
          AdviseFree(reinterpret_cast<void *>(const_cast<float *>(lhs->data<
              float>())),
                     lhs->raw_size());
        }
      }
    }

    // pack rhs
    if (cached_ != kCacheRhs) {
      thread_pool.Compute1D([=, &rhs_matrix](index_t start,
                                             index_t end,
                                             index_t step) {
        for (index_t col_block_idx = start; col_block_idx < end;
             col_block_idx += step) {
          const index_t start_col = col_block_idx * col_block_size;
          const index_t
              col_block_len = std::min(col_block_size, cols - start_col);
          float *packed_rhs_data_block =
              packed_rhs_data + col_block_idx * col_block_size * depth_padded;
          PackRhs(rhs_matrix.block(0, start_col, depth, col_block_len),
                  packed_rhs_data_block);
        }
      }, 0, col_block_count, 1);

      if (cache_side == kCacheRhs) {
        cached_ = kCacheRhs;
        if (rhs->UnderlyingBuffer()->OnHost()) {
          AdviseFree(reinterpret_cast<void *>(const_cast<float *>(rhs->data<
              float>())),
                     rhs->raw_size());
        }
      }
    }

    // multiply lhs and rhs
    thread_pool.Compute1D([=, &output_matrix](index_t start,
                                              index_t end,
                                              index_t step) {
      for (index_t row_block_idx = start; row_block_idx < end;
           row_block_idx += step) {
        const index_t start_row = row_block_idx * row_block_size;
        const index_t
            row_block_len = std::min(row_block_size, rows - start_row);
        const float *packed_lhs_data_block =
            packed_lhs_data + row_block_idx * row_block_size * depth_padded;
        for (index_t col_block_idx = 0; col_block_idx < col_block_count;
             ++col_block_idx) {
          const index_t start_col = col_block_idx * col_block_size;
          const index_t
              col_block_len = std::min(col_block_size, cols - start_col);
          const float *packed_rhs_data_block =
              packed_rhs_data + col_block_idx * col_block_size * depth_padded;
          float *packed_output_data_block =
              packed_output_data + row_block_idx * row_block_size * cols_padded
                  + col_block_idx * col_block_size;
          ComputeBlock(packed_lhs_data_block,
                       packed_rhs_data_block,
                       depth_padded,
                       packed_output_data_block);
          MatrixMap<float> output_block = output_matrix.block(start_row,
                                                              start_col,
                                                              row_block_len,
                                                              col_block_len);
          UnpackOutput(packed_output_data_block, &output_block);
        }  // col_block_idx
      }  // row_block_idx
    }, 0, row_block_count, 1);
  }  // b

  return MaceStatus::MACE_SUCCESS;
}

}  // namespace arm
}  // namespace ops
}  // namespace mace
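The hunk above is the heart of the refactor: the float-only fp32::Gemm (with its transpose-flag convenience overload and its own RegisterGemmDelegator) is replaced by an explicit specialization Gemm&lt;float&gt;::Compute of a class template shared across data types. A minimal sketch of that pattern follows; the names and bodies are hypothetical simplifications, not the actual MACE classes.

#include <iostream>

// Primary template: each supported element type provides its own
// explicit specialization of Compute().
template <typename T>
class Gemm {
 public:
  void Compute();
};

// Explicit specialization, playing the role of the NEON fp32 path above.
template <>
void Gemm<float>::Compute() {
  std::cout << "fp32 NEON gemm path\n";
}

int main() {
  Gemm<float> gemm;   // the delegator is instantiated per data type
  gemm.Compute();
  return 0;
}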
...@@ -12,12 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "mace/ops/arm/fp32/gemv.h"

#include <arm_neon.h>
#include <algorithm>

#include "mace/ops/arm/base/gemv.h"
#include "mace/utils/math.h"

#if !defined(__aarch64__)
...@@ -34,9 +32,9 @@ float vaddvq_f32(float32x4_t v) {
namespace mace {
namespace ops {
namespace arm {
namespace fp32 {

MaceStatus Gemv::Compute(const OpContext *context,
template<>
MaceStatus Gemv<float>::Compute(const OpContext *context,
                                const Tensor *lhs,
                                const Tensor *rhs,
                                const Tensor *bias,
...@@ -378,13 +376,6 @@ MaceStatus Gemv::Compute(const OpContext *context,
#undef vaddvq_f32
#endif

void RegisterGemvDelegator(OpDelegatorRegistry *registry) {
  MACE_REGISTER_DELEGATOR(
      registry, Gemv, DelegatorParam,
      MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, float, ImplType::NEON));
}

}  // namespace fp32
}  // namespace arm
}  // namespace ops
}  // namespace mace
...@@ -38,13 +38,15 @@ extern void RegisterGemvDelegator(OpDelegatorRegistry *registry);
#ifdef MACE_ENABLE_NEON
namespace arm {
namespace fp32 {
extern void RegisterConv2dK3x3WinogradDelegator(OpDelegatorRegistry *registry);
}  // namespace fp32

extern void RegisterActivationDelegator(OpDelegatorRegistry *registry);
extern void RegisterBiasAddDelegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK1x1Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK1xNDelegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK3x3Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK3x3WinogradDelegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK5x5Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK7x7Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dGeneralDelegator(OpDelegatorRegistry *registry);
...@@ -69,7 +71,6 @@ extern void RegisterGroupDeconv2dGeneralDelegator(
extern void RegisterGemmDelegator(OpDelegatorRegistry *registry);
extern void RegisterGemvDelegator(OpDelegatorRegistry *registry);

}  // namespace fp32

#ifdef MACE_ENABLE_QUANTIZE
namespace q8 {
...@@ -97,32 +98,33 @@ void RegisterAllOpDelegators(OpDelegatorRegistry *registry) {
#endif  // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_NEON
  arm::fp32::RegisterActivationDelegator(registry);
  arm::fp32::RegisterBiasAddDelegator(registry);
  arm::fp32::RegisterConv2dK1x1Delegator(registry);
  arm::fp32::RegisterConv2dK1xNDelegator(registry);
  arm::fp32::RegisterConv2dK3x3Delegator(registry);
  arm::fp32::RegisterConv2dK3x3WinogradDelegator(registry);
  arm::fp32::RegisterConv2dK5x5Delegator(registry);
  arm::fp32::RegisterConv2dK7x7Delegator(registry);
  arm::fp32::RegisterConv2dGeneralDelegator(registry);
  arm::fp32::RegisterDeconv2dK2x2Delegator(registry);
  arm::fp32::RegisterDeconv2dK3x3Delegator(registry);
  arm::fp32::RegisterDeconv2dK4x4Delegator(registry);
  arm::fp32::RegisterDeconv2dGeneralDelegator(registry);
  arm::fp32::RegisterDepthwiseConv2dK3x3Delegator(registry);
  arm::fp32::RegisterDepthwiseDeconv2dK3x3Delegator(registry);
  arm::fp32::RegisterGroupDeconv2dK3x3Delegator(registry);
  arm::fp32::RegisterDepthwiseDeconv2dK4x4Delegator(registry);
  arm::fp32::RegisterGroupDeconv2dK4x4Delegator(registry);
  arm::fp32::RegisterDepthwiseDeconv2dGeneralDelegator(registry);
  arm::fp32::RegisterGroupDeconv2dGeneralDelegator(registry);
  arm::fp32::RegisterGemmDelegator(registry);
  arm::fp32::RegisterGemvDelegator(registry);

  arm::fp32::RegisterConv2dK3x3WinogradDelegator(registry);

  arm::RegisterActivationDelegator(registry);
  arm::RegisterBiasAddDelegator(registry);
  arm::RegisterConv2dK1x1Delegator(registry);
  arm::RegisterConv2dK1xNDelegator(registry);
  arm::RegisterConv2dK3x3Delegator(registry);
  arm::RegisterConv2dK5x5Delegator(registry);
  arm::RegisterConv2dK7x7Delegator(registry);
  arm::RegisterConv2dGeneralDelegator(registry);

  arm::RegisterDeconv2dK2x2Delegator(registry);
  arm::RegisterDeconv2dK3x3Delegator(registry);
  arm::RegisterDeconv2dK4x4Delegator(registry);
  arm::RegisterDeconv2dGeneralDelegator(registry);

  arm::RegisterDepthwiseConv2dK3x3Delegator(registry);
  arm::RegisterDepthwiseDeconv2dK3x3Delegator(registry);
  arm::RegisterGroupDeconv2dK3x3Delegator(registry);
  arm::RegisterDepthwiseDeconv2dK4x4Delegator(registry);
  arm::RegisterGroupDeconv2dK4x4Delegator(registry);
  arm::RegisterDepthwiseDeconv2dGeneralDelegator(registry);
  arm::RegisterGroupDeconv2dGeneralDelegator(registry);

  arm::RegisterGemmDelegator(registry);
  arm::RegisterGemvDelegator(registry);

#ifdef MACE_ENABLE_QUANTIZE
  arm::q8::RegisterEltwiseDelegator(registry);
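The registration calls above now live directly under arm:: instead of arm::fp32::, with only the Winograd conv delegator left in the fp32 namespace. As a rough illustration of the registry pattern these Register* functions feed, here is a simplified, hypothetical sketch (not the actual OpDelegatorRegistry API): creator functions keyed by op/device/dtype/impl, looked up when an operator needs a CPU kernel.

#include <functional>
#include <map>
#include <memory>
#include <string>

struct Delegator {
  virtual ~Delegator() = default;
};
using DelegatorCreator = std::function<std::unique_ptr<Delegator>()>;

// Simplified registry: a map from a composite key to a creator.
class Registry {
 public:
  void Register(const std::string &key, DelegatorCreator creator) {
    creators_[key] = std::move(creator);
  }
  std::unique_ptr<Delegator> Create(const std::string &key) const {
    auto it = creators_.find(key);
    return it == creators_.end() ? nullptr : it->second();
  }

 private:
  std::map<std::string, DelegatorCreator> creators_;
};

// A RegisterGemmDelegator-style function would then roughly do (hypothetical):
//   registry->Register("Gemm/CPU/float/NEON",
//                      [] { return std::unique_ptr<Delegator>(new Delegator); });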
......