Commit 6f57bd19 authored by Liangliang He

Merge branch 'cpu' into 'master'

Reimplemented NEON kernels

See merge request !339
......@@ -186,7 +186,7 @@ bool Run(MaceEngine *engine,
return true;
}
DEFINE_string(device, "CPU", "Device [CPU|OPENCL]");
DEFINE_string(device, "CPU", "Device [CPU|NEON|OPENCL]");
DEFINE_string(input_node, "input_node0,input_node1",
"input nodes, separated by comma");
DEFINE_string(output_node, "output_node0,output_node1",
......@@ -264,6 +264,8 @@ int Main(int argc, char **argv) {
DeviceType device_type = CPU;
if (FLAGS_device == "OPENCL") {
device_type = OPENCL;
} else if (FLAGS_device == "NEON") {
device_type = NEON;
}
// config runtime
......@@ -271,7 +273,7 @@ int Main(int argc, char **argv) {
mace::ConfigOpenCLRuntime(
static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
} else if (device_type == CPU) {
} else if (device_type == CPU || device_type == NEON) {
mace::ConfigOmpThreadsAndAffinity(
FLAGS_omp_num_threads,
static_cast<CPUPowerOption>(FLAGS_cpu_power_option));
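// A hypothetical invocation exercising the new NEON path (the binary name is
// an assumption; the flags are the ones defined above):
//   mace_run --device=NEON --input_node=input_node0 \
//     --output_node=output_node0 --omp_num_threads=4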
......
......@@ -41,6 +41,8 @@ class BufferBase {
virtual bool OnHost() const = 0;
virtual void Clear() = 0;
virtual index_t offset() const { return 0; }
template <typename T>
......@@ -158,6 +160,12 @@ class Buffer : public BufferBase {
bool OnHost() const { return allocator_->OnHost(); }
void Clear() {
if (buf_ != nullptr) {
memset(buf_, 0, size_);
}
}
private:
Allocator *allocator_;
void *buf_;
......@@ -242,6 +250,10 @@ class Image : public BufferBase {
bool OnHost() const { return allocator_->OnHost(); }
void Clear() {
MACE_NOT_IMPLEMENTED;
}
private:
Allocator *allocator_;
std::vector<size_t> shape_;
......@@ -322,6 +334,10 @@ class BufferSlice : public BufferBase {
bool OnHost() const { return buffer_->OnHost(); }
void Clear() {
MACE_NOT_IMPLEMENTED;
}
private:
BufferBase *buffer_;
void *mapped_buf_;
......
......@@ -93,10 +93,9 @@ extern void Register_Slice(OperatorRegistry *op_registry);
extern void Register_Softmax(OperatorRegistry *op_registry);
extern void Register_SpaceToBatchND(OperatorRegistry *op_registry);
extern void Register_SpaceToDepth(OperatorRegistry *op_registry);
extern void Register_Transpose(OperatorRegistry *op_registry);
extern void Register_WinogradInverseTransform(OperatorRegistry *op_registry);
extern void Register_WinogradTransform(OperatorRegistry *op_registry);
} // namespace ops
OperatorRegistry::OperatorRegistry() {
......@@ -130,6 +129,7 @@ OperatorRegistry::OperatorRegistry() {
ops::Register_Softmax(this);
ops::Register_SpaceToBatchND(this);
ops::Register_SpaceToDepth(this);
ops::Register_Transpose(this);
ops::Register_WinogradInverseTransform(this);
ops::Register_WinogradTransform(this);
}
......
......@@ -146,21 +146,26 @@ class Tensor {
template <typename T>
inline const T *data() const {
MACE_CHECK(buffer_ != nullptr, "buffer is null");
MACE_CHECK_NOTNULL(buffer_);
return buffer_->data<T>();
}
inline void *raw_mutable_data() {
MACE_CHECK(buffer_ != nullptr, "buffer is null");
MACE_CHECK_NOTNULL(buffer_);
return buffer_->raw_mutable_data();
}
template <typename T>
inline T *mutable_data() {
MACE_CHECK(buffer_ != nullptr, "buffer is null");
MACE_CHECK_NOTNULL(buffer_);
return static_cast<T *>(buffer_->raw_mutable_data());
}
inline void Clear() {
MACE_CHECK_NOTNULL(buffer_);
buffer_->Clear();
}
inline void Reshape(const std::vector<index_t> &shape) {
shape_ = shape;
MACE_CHECK(raw_size() <= buffer_->size());
......@@ -258,22 +263,19 @@ class Tensor {
inline void DebugPrint() const {
using namespace numerical_chars; // NOLINT(build/namespaces)
std::stringstream os;
os << "Tensor " << name_ << " size: [";
for (index_t i : shape_) {
os << i << ", ";
}
os << "], content:\n";
os.str("");
os.clear();
MappingGuard guard(this);
for (int i = 0; i < size(); ++i) {
if (i != 0 && i % shape_[3] == 0) {
if (i != 0 && i % shape_.back() == 0) {
os << "\n";
}
CASES(dtype_, (os << (this->data<T>()[i]) << ", "));
}
LOG(INFO) << "Tensor size: [" << dim(0) << ", " << dim(1) << ", " << dim(2)
<< ", " << dim(3) << "], content:\n"
<< os.str();
LOG(INFO) << os.str();
}
class MappingGuard {
......
......@@ -21,6 +21,7 @@ Tensor *Workspace::CreateTensor(const std::string &name,
VLOG(3) << "Creating Tensor " << name;
tensor_map_[name] =
std::move(std::unique_ptr<Tensor>(new Tensor(alloc, type)));
tensor_map_[name]->SetSourceOpName(name);
}
return GetTensor(name);
}
......
......@@ -11,13 +11,21 @@ load("//mace:mace.bzl", "if_android", "if_neon_enabled", "if_openmp_enabled")
cc_library(
name = "kernels",
srcs = glob([
"*.cc",
"opencl/*.cc",
]),
srcs = glob(
[
"*.cc",
"opencl/*.cc",
"arm/*.cc",
],
exclude = [
"*_test.cc",
"arm/*_test.cc",
],
),
hdrs = glob([
"*.h",
"opencl/*.h",
"arm/*.h",
]),
copts = if_openmp_enabled(["-fopenmp"]) + if_neon_enabled(["-DMACE_ENABLE_NEON"]),
linkopts = if_android(["-lm"]),
......@@ -28,14 +36,20 @@ cc_library(
)
cc_test(
name = "kernel_test",
name = "kernels_test",
testonly = 1,
srcs = glob(["test/*.cc"]),
linkopts = if_android(["-pie"]),
srcs = glob(
[
"*_test.cc",
"arm/*_test.cc",
],
),
copts = if_openmp_enabled(["-fopenmp"]) + if_neon_enabled(["-DMACE_ENABLE_NEON"]),
linkopts = ["-fopenmp"],
linkstatic = 1,
deps = [
":kernels",
"//mace/core",
"@gtest//:gtest",
"@gtest//:gtest_main",
],
)
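# A hypothetical command for the renamed test target (the --define names are
# assumptions inferred from the if_neon_enabled/if_openmp_enabled macros):
#   bazel test //mace/kernels:kernels_test --define neon=true --define openmp=true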
......
......@@ -134,11 +134,20 @@ class ActivationFunctor {
};
template <>
void ActivationFunctor<DeviceType::NEON, float>::operator()(
const Tensor *input,
const Tensor *alpha,
Tensor *output,
StatsFuture *future);
class ActivationFunctor<DeviceType::NEON, float> {
public:
ActivationFunctor(ActivationType type, float relux_max_limit)
: activation_(type), relux_max_limit_(relux_max_limit) {}
void operator()(const Tensor *input,
const Tensor *alpha,
Tensor *output,
StatsFuture *future);
private:
ActivationType activation_;
float relux_max_limit_;
};
template <typename T>
class ActivationFunctor<DeviceType::OPENCL, T> {
......
//
// Copyright (c) 2018 XiaoMi All rights reserved.
//
#include "mace/kernels/activation.h"
namespace mace {
namespace kernels {
void ActivationFunctor<DeviceType::NEON, float>::operator()(
const Tensor *input,
const Tensor *alpha,
Tensor *output,
StatsFuture *future) {
const float *input_ptr = input->data<float>();
float *output_ptr = output->mutable_data<float>();
if (activation_ == PRELU) {
MACE_CHECK_NOTNULL(alpha);
const float *alpha_ptr = alpha->data<float>();
PReLUActivation(input_ptr, output->size(), input->dim(1), alpha_ptr,
output_ptr);
} else {
DoActivation(input_ptr, output_ptr, output->size(), activation_,
relux_max_limit_);
}
}
} // namespace kernels
} // namespace mace
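// For reference, a minimal scalar sketch of the channel-wise PReLU that
// PReLUActivation vectorizes, assuming a single NCHW image. The helper name
// and signature are illustrative only, not the kernel's actual API:
inline void PReLURefNCHW(const float *in, index_t channels,
                         index_t image_size, const float *alpha, float *out) {
  for (index_t c = 0; c < channels; ++c) {
    for (index_t i = 0; i < image_size; ++i) {
      const float x = in[c * image_size + i];
      // Negative inputs are scaled by the per-channel slope alpha[c].
      out[c * image_size + i] = x >= 0.f ? x : alpha[c] * x;
    }
  }
}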
//
// Copyright (c) 2018 XiaoMi All rights reserved.
//
#include "mace/kernels/batch_norm.h"
namespace mace {
namespace kernels {
void BatchNormFunctor<DeviceType::NEON, float>::operator()(
const Tensor *input,
const Tensor *scale,
const Tensor *offset,
const Tensor *mean,
const Tensor *var,
const float epsilon,
Tensor *output,
StatsFuture *future) {
// Batch normalization from the paper https://arxiv.org/abs/1502.03167 .
// The inference-time formula is
//   Y = scale / sqrt(var + epsilon) * X
//       + (offset - scale * mean / sqrt(var + epsilon))
// which we fold into
//   new_scale = scale / sqrt(var + epsilon)
//   new_offset = offset - mean * new_scale
//   Y = new_scale * X + new_offset
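// Quick numeric check of the folding with illustrative values
// (scale = 2, var = 3, epsilon = 1, mean = 4, offset = 1):
//   new_scale  = 2 / sqrt(3 + 1) = 1
//   new_offset = 1 - 4 * 1       = -3
// so Y = X - 3, identical to evaluating the unfolded formula.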
const index_t batch = input->dim(0);
const index_t channels = input->dim(1);
const index_t height = input->dim(2);
const index_t width = input->dim(3);
const float *input_ptr = input->data<float>();
const float *scale_ptr = scale->data<float>();
const float *offset_ptr = offset->data<float>();
float *output_ptr = output->mutable_data<float>();
std::vector<float> new_scale;
std::vector<float> new_offset;
if (!folded_constant_) {
new_scale.resize(channels);
new_offset.resize(channels);
const float *mean_ptr = mean->data<float>();
const float *var_ptr = var->data<float>();
#pragma omp parallel for
for (index_t c = 0; c < channels; ++c) {
new_scale[c] = scale_ptr[c] / std::sqrt(var_ptr[c] + epsilon);
new_offset[c] = offset_ptr[c] - mean_ptr[c] * new_scale[c];
}
}
const float *scale_data = folded_constant_ ? scale_ptr : new_scale.data();
const float *offset_data = folded_constant_ ? offset_ptr : new_offset.data();
index_t channel_size = height * width;
index_t batch_size = channels * channel_size;
// NEON is slower here, so stick to the trivial implementation
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
index_t offset = b * batch_size + c * channel_size;
for (index_t hw = 0; hw < height * width; ++hw) {
output_ptr[offset + hw] =
scale_data[c] * input_ptr[offset + hw] + offset_data[c];
}
}
}
DoActivation(output_ptr, output_ptr, output->size(), activation_,
relux_max_limit_);
}
} // namespace kernels
} // namespace mace
//
// Copyright (c) 2018 XiaoMi All rights reserved.
//
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/arm/conv_winograd.h"
// Winograd consistently outperformed the direct NEON implementation in benchmarks
#define USE_WINOGRAD 1
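// F(2x2, 3x3) produces each 2x2 output tile with 16 multiplies in the
// transform domain versus 36 for direct 3x3 convolution, a 2.25x
// reduction in multiplications (Lavin & Gray, 2016).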
namespace mace {
namespace kernels {
namespace {
void Conv2dNCHW(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const int filter_height,
const int filter_width,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
float *output) {
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t m = 0; m < out_channels; ++m) {
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w < out_width; ++w) {
index_t out_offset =
((b * out_channels + m) * out_height + h) * out_width + w;
for (index_t c = 0; c < in_channels; ++c) {
for (index_t kh = 0; kh < filter_height; ++kh) {
for (index_t kw = 0; kw < filter_width; ++kw) {
index_t ih = h * stride_h + kh * dilation_h;
index_t iw = w * stride_w + kw * dilation_w;
index_t in_offset =
((b * in_channels + c) * in_height + ih) * in_width + iw;
index_t filter_offset =
(((m * in_channels) + c) * filter_height + kh) * filter_width
+ kw;
output[out_offset] += input[in_offset] * filter[filter_offset];
}
}
}
}
}
}
}
}
} // namespace
extern void Conv2dNeonK1x1S1(const float *input,
const float *filter,
const index_t batch,
const index_t height,
const index_t width,
const index_t in_channels,
const index_t out_channels,
float *output);
extern void Conv2dNeonK3x3S1(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
float *output);
extern void Conv2dNeonK3x3S2(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
float *output);
void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
MACE_CHECK_NOTNULL(input);
MACE_CHECK_NOTNULL(filter);
MACE_CHECK_NOTNULL(output);
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
if (paddings_.empty()) {
CalcNCHWPaddingAndOutputSize(input->shape().data(),
filter->shape().data(),
dilations_,
strides_,
padding_type_,
output_shape.data(),
paddings.data());
} else {
paddings = paddings_;
CalcNCHWOutputSize(input->shape().data(), filter->shape().data(),
paddings_.data(), dilations_, strides_, RoundType::FLOOR,
output_shape.data());
}
output->Resize(output_shape);
output->Clear();
index_t batch = output->dim(0);
index_t channels = output->dim(1);
index_t height = output->dim(2);
index_t width = output->dim(3);
index_t input_batch = input->dim(0);
index_t input_channels = input->dim(1);
index_t input_height = input->dim(2);
index_t input_width = input->dim(3);
index_t filter_h = filter->dim(2);
index_t filter_w = filter->dim(3);
MACE_CHECK(filter->dim(0) == channels, filter->dim(0), " != ", channels);
MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ",
input_channels);
index_t stride_h = strides_[0];
index_t stride_w = strides_[1];
index_t dilation_h = dilations_[0];
index_t dilation_w = dilations_[1];
MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch");
index_t padded_input_height = input_height + paddings[0];
index_t padded_input_width = input_width + paddings[1];
index_t extra_input_height = padded_input_height;
index_t extra_input_width = padded_input_width;
index_t extra_output_height = height;
index_t extra_output_width = width;
int pad_top = paddings[0] >> 1;
int pad_bottom = paddings[0] - pad_top;
int pad_left = paddings[1] >> 1;
int pad_right = paddings[1] - pad_left;
std::function<void(const float *input, float *output)> conv_func;
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto bias_data = bias == nullptr ? nullptr : bias->data<float>();
auto output_data = output->mutable_data<float>();
memset(output_data, 0, sizeof(float) * batch * channels * height * width);
if (USE_WINOGRAD && filter_h == 3 && filter_w == 3 && stride_h == 1
&& stride_w == 1
&& dilation_h == 1 && dilation_w == 1) {
extra_output_height = RoundUp<index_t>(height, 2);
extra_input_height = std::max(padded_input_height, extra_output_height + 2);
extra_output_width = RoundUp<index_t>(width, 2);
extra_input_width = std::max(padded_input_width, extra_output_width + 2);
if (extra_input_height != padded_input_height) {
pad_bottom += (extra_input_height - padded_input_height);
}
if (extra_input_width != padded_input_width) {
pad_right += (extra_input_width - padded_input_width);
}
index_t tile_height_count = (extra_output_height + 1) / 2;
index_t tile_width_count = (extra_output_width + 1) / 2;
index_t tile_count = tile_height_count * tile_width_count;
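// e.g., height = width = 7: extra outputs round up to 8x8, so
// tile_height_count = tile_width_count = 4 and tile_count = 16
// tiles of 2x2 outputs per (batch, channel) pair.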
transformed_input_.Resize({16, batch, input_channels, tile_count});
transformed_filter_.Resize({16, channels, input_channels});
transformed_output_.Resize({16, batch, channels, tile_count});
conv_func = [=](const float *pad_input, float *pad_output) {
WinoGradConv3x3s1(pad_input,
filter_data,
batch,
extra_input_height,
extra_input_width,
input_channels,
channels,
transformed_input_.mutable_data<float>(),
transformed_filter_.mutable_data<float>(),
transformed_output_.mutable_data<float>(),
is_filter_transformed_,
pad_output);
is_filter_transformed_ = true;
};
} else if (filter_h == 3 && filter_w == 3 && stride_h == 1 && stride_w == 1
&& dilation_h == 1 && dilation_w == 1) {
extra_output_height = RoundUp<index_t>(height, 2);
extra_input_height = std::max(padded_input_height, extra_output_height + 2);
extra_output_width = RoundUp<index_t>(width, 4);
extra_input_width = std::max(padded_input_width, extra_output_width + 2);
if (extra_input_height != padded_input_height) {
pad_bottom += (extra_input_height - padded_input_height);
}
if (extra_input_width != padded_input_width) {
pad_right += (extra_input_width - padded_input_width);
}
conv_func = [=](const float *pad_input, float *pad_output) {
Conv2dNeonK3x3S1(pad_input,
filter_data,
batch,
extra_input_height,
extra_input_width,
input_channels,
extra_output_height,
extra_output_width,
channels,
pad_output);
};
} else if (filter_h == 3 && filter_w == 3 && stride_h == 2 && stride_w == 2
&& dilation_h == 1 && dilation_w == 1) {
extra_output_height = height;
extra_input_height =
std::max(padded_input_height, (extra_output_height - 1) * 2 + 3);
extra_output_width = RoundUp<index_t>(width, 4);
extra_input_width =
std::max(padded_input_width, (extra_output_width - 1) * 2 + 3);
if (extra_input_height != padded_input_height) {
pad_bottom += (extra_input_height - padded_input_height);
}
if (extra_input_width != padded_input_width) {
pad_right += (extra_input_width - padded_input_width);
}
conv_func = [=](const float *pad_input, float *pad_output) {
Conv2dNeonK3x3S2(pad_input,
filter_data,
batch,
extra_input_height,
extra_input_width,
input_channels,
extra_output_height,
extra_output_width,
channels,
pad_output);
};
} else if (filter_h == 1 && filter_w == 1 && stride_h == 1 && stride_w == 1
&& dilation_h == 1 && dilation_w == 1) {
conv_func = [=](const float *pad_input, float *pad_output) {
Conv2dNeonK1x1S1(input_data,
filter_data,
batch,
height,
width,
input_channels,
channels,
output_data);
};
} else {
conv_func = [=](const float *pad_input, float *pad_output) {
Conv2dNCHW(pad_input,
filter_data,
batch,
extra_input_height,
extra_input_width,
input_channels,
extra_output_height,
extra_output_width,
channels,
filter_h,
filter_w,
stride_h,
stride_w,
dilation_h,
dilation_w,
pad_output);
};
}
const Tensor *pad_input_ptr = input;
// Keep this alive during kernel execution
if (extra_input_height != input_height || extra_input_width != input_width) {
ConstructNCHWInputWithSpecificPadding(input,
pad_top,
pad_bottom,
pad_left,
pad_right,
&padded_input_);
pad_input_ptr = &padded_input_;
}
const float *pad_input_data = pad_input_ptr->data<float>();
Tensor *pad_output_ptr = output;
// Keep this alive during kernel execution
if (extra_output_height != height || extra_output_width != width) {
std::vector<index_t> extra_output_shape
{batch, channels, extra_output_height, extra_output_width};
padded_output_.Resize(extra_output_shape);
pad_output_ptr = &padded_output_;
}
float *pad_output_data = pad_output_ptr->mutable_data<float>();
conv_func(pad_input_data, pad_output_data);
// unpack output
if (extra_output_height != height || extra_output_width != width) {
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
for (index_t h = 0; h < height; ++h) {
memcpy(
output_data + b * channels * height * width + c * height * width
+ h * width,
pad_output_data
+ b * channels * extra_output_height * extra_output_width
+ c * extra_output_height * extra_output_width
+ h * extra_output_width,
sizeof(float) * width);
}
}
}
}
if (bias_data != nullptr) {
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
for (index_t i = 0; i < height * width; ++i) {
output_data[(b * channels + c) * height * width + i] += bias_data[c];
}
}
}
}
DoActivation(output_data, output_data, output->size(), activation_,
relux_max_limit_);
}
} // namespace kernels
} // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
#include <arm_neon.h>
#endif
#include "mace/core/types.h"
#include "mace/kernels/gemm.h"
namespace mace {
namespace kernels {
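// A 1x1 stride-1 convolution over NCHW is exactly one GEMM per batch image:
// the (out_channels x in_channels) filter matrix times the input viewed as
// an (in_channels x height*width) matrix gives the
// (out_channels x height*width) output, which is what the Gemm call below
// computes.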
void Conv2dNeonK1x1S1(const float *input,
const float *filter,
const index_t batch,
const index_t height,
const index_t width,
const index_t in_channels,
const index_t out_channels,
float *output) {
for (index_t b = 0; b < batch; ++b) {
Gemm(filter,
input + b * in_channels * height * width,
1,
out_channels,
in_channels,
height * width,
output + b * out_channels * height * width);
}
}
} // namespace kernels
} // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
#include <arm_neon.h>
#endif
#include "mace/core/types.h"
namespace mace {
namespace kernels {
// Ho = 2, Wo = 4, Co = 2
void Conv2dNeonK3x3S1(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
float *output) {
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t m = 0; m < out_channels; m += 2) {
if (m + 1 < out_channels) {
float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
float *out_ptr1_base =
output + b * out_batch_size + (m + 1) * out_image_size;
for (index_t c = 0; c < in_channels; ++c) {
float *out_ptr0 = out_ptr0_base;
float *out_ptr1 = out_ptr1_base;
const float *in_ptr0 = input + b * in_batch_size + c * in_image_size;
const float *in_ptr1 =
input + b * in_batch_size + c * in_image_size + 1 * in_width;
const float *in_ptr2 =
input + b * in_batch_size + c * in_image_size + 2 * in_width;
const float *in_ptr3 =
input + b * in_batch_size + c * in_image_size + 3 * in_width;
const float *filter_ptr0 = filter + m * in_channels * 9 + c * 9;
const float *filter_ptr1 = filter + (m + 1) * in_channels * 9 + c * 9;
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
// load filter (2 outch x 3 height x 3 width): vf_outch_height
float32x4_t vf00, vf01, vf02;
float32x4_t vf10, vf11, vf12;
vf00 = vld1q_f32(filter_ptr0);
vf01 = vld1q_f32(filter_ptr0 + 3);
vf02 = vld1q_f32(filter_ptr0 + 6);
vf10 = vld1q_f32(filter_ptr1);
vf11 = vld1q_f32(filter_ptr1 + 3);
vf12 = vld1q_f32(filter_ptr1 + 6);
for (index_t h = 0; h + 1 < out_height; h += 2) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
// input (4 height x 3 slide): vi_height_slide
float32x4_t vi00, vi01, vi02; // reg count: 14
float32x4_t vi10, vi11, vi12;
float32x4_t vi20, vi21, vi22;
float32x4_t vi30, vi31, vi32;
float32x4_t vo20, vo30; // tmp use
// output (2 outch x 2 height x 4 width): vo_outch_height
float32x4_t vo00, vo01;
float32x4_t vo10, vo11;
// load input
vi00 = vld1q_f32(in_ptr0);
vo00 = vld1q_f32(in_ptr0 + 4); // reuse vo00: vi0n
vi10 = vld1q_f32(in_ptr1);
vo10 = vld1q_f32(in_ptr1 + 4);
vi20 = vld1q_f32(in_ptr2);
vo20 = vld1q_f32(in_ptr2 + 4);
vi30 = vld1q_f32(in_ptr3);
vo30 = vld1q_f32(in_ptr3 + 4);
vi01 = vextq_f32(vi00, vo00, 1);
vi02 = vextq_f32(vi00, vo00, 2);
vi11 = vextq_f32(vi10, vo10, 1);
vi12 = vextq_f32(vi10, vo10, 2);
vi21 = vextq_f32(vi20, vo20, 1);
vi22 = vextq_f32(vi20, vo20, 2);
vi31 = vextq_f32(vi30, vo30, 1);
vi32 = vextq_f32(vi30, vo30, 2);
// load output
vo00 = vld1q_f32(out_ptr0);
vo01 = vld1q_f32(out_ptr0 + out_width);
vo10 = vld1q_f32(out_ptr1);
vo11 = vld1q_f32(out_ptr1 + out_width);
// outch 0, height 0
vo00 = vfmaq_laneq_f32(vo00, vi00, vf00, 0); // reg count: 18
vo00 = vfmaq_laneq_f32(vo00, vi01, vf00, 1);
vo00 = vfmaq_laneq_f32(vo00, vi02, vf00, 2);
vo00 = vfmaq_laneq_f32(vo00, vi10, vf01, 0);
vo00 = vfmaq_laneq_f32(vo00, vi11, vf01, 1);
vo00 = vfmaq_laneq_f32(vo00, vi12, vf01, 2);
vo00 = vfmaq_laneq_f32(vo00, vi20, vf02, 0);
vo00 = vfmaq_laneq_f32(vo00, vi21, vf02, 1);
vo00 = vfmaq_laneq_f32(vo00, vi22, vf02, 2);
// outch 0, height 1
vo01 = vfmaq_laneq_f32(vo01, vi10, vf00, 0);
vo01 = vfmaq_laneq_f32(vo01, vi11, vf00, 1);
vo01 = vfmaq_laneq_f32(vo01, vi12, vf00, 2);
vo01 = vfmaq_laneq_f32(vo01, vi20, vf01, 0);
vo01 = vfmaq_laneq_f32(vo01, vi21, vf01, 1);
vo01 = vfmaq_laneq_f32(vo01, vi22, vf01, 2);
vo01 = vfmaq_laneq_f32(vo01, vi30, vf02, 0);
vo01 = vfmaq_laneq_f32(vo01, vi31, vf02, 1);
vo01 = vfmaq_laneq_f32(vo01, vi32, vf02, 2);
// outch 1, height 0
vo10 = vfmaq_laneq_f32(vo10, vi00, vf10, 0);
vo10 = vfmaq_laneq_f32(vo10, vi01, vf10, 1);
vo10 = vfmaq_laneq_f32(vo10, vi02, vf10, 2);
vo10 = vfmaq_laneq_f32(vo10, vi10, vf11, 0);
vo10 = vfmaq_laneq_f32(vo10, vi11, vf11, 1);
vo10 = vfmaq_laneq_f32(vo10, vi12, vf11, 2);
vo10 = vfmaq_laneq_f32(vo10, vi20, vf12, 0);
vo10 = vfmaq_laneq_f32(vo10, vi21, vf12, 1);
vo10 = vfmaq_laneq_f32(vo10, vi22, vf12, 2);
// outch 1, height 1
vo11 = vfmaq_laneq_f32(vo11, vi10, vf10, 0);
vo11 = vfmaq_laneq_f32(vo11, vi11, vf10, 1);
vo11 = vfmaq_laneq_f32(vo11, vi12, vf10, 2);
vo11 = vfmaq_laneq_f32(vo11, vi20, vf11, 0);
vo11 = vfmaq_laneq_f32(vo11, vi21, vf11, 1);
vo11 = vfmaq_laneq_f32(vo11, vi22, vf11, 2);
vo11 = vfmaq_laneq_f32(vo11, vi30, vf12, 0);
vo11 = vfmaq_laneq_f32(vo11, vi31, vf12, 1);
vo11 = vfmaq_laneq_f32(vo11, vi32, vf12, 2);
vst1q_f32(out_ptr0, vo00);
vst1q_f32(out_ptr0 + out_width, vo01);
vst1q_f32(out_ptr1, vo10);
vst1q_f32(out_ptr1 + out_width, vo11);
in_ptr0 += 4;
in_ptr1 += 4;
in_ptr2 += 4;
in_ptr3 += 4;
out_ptr0 += 4;
out_ptr1 += 4;
} // w
in_ptr0 += 2 + in_width;
in_ptr1 += 2 + in_width;
in_ptr2 += 2 + in_width;
in_ptr3 += 2 + in_width;
out_ptr0 += out_width;
out_ptr1 += out_width;
} // h
#else
for (index_t io = 0; io < 2; ++io) {
for (index_t ih = 0; ih < out_height; ++ih) {
for (index_t iw = 0; iw < out_width; ++iw) {
for (int i = 0; i < 3; ++i) {
for (int j = 0; j < 3; ++j) {
out_ptr0[io * out_image_size + ih * out_width + iw] +=
in_ptr0[(ih + i) * in_width + (iw + j)]
* filter_ptr0[io * in_channels * 9 + i * 3 + j];
}
}
}
}
} // for
#endif
} // c
} else {
for (index_t mm = m; mm < out_channels; ++mm) {
float
*out_ptr0_base = output + b * out_batch_size + mm * out_image_size;
for (index_t c = 0; c < in_channels; ++c) {
float *out_ptr0 = out_ptr0_base;
const float
*in_ptr0 = input + b * in_batch_size + c * in_image_size;
const float *in_ptr1 =
input + b * in_batch_size + c * in_image_size + 1 * in_width;
const float *in_ptr2 =
input + b * in_batch_size + c * in_image_size + 2 * in_width;
const float *in_ptr3 =
input + b * in_batch_size + c * in_image_size + 3 * in_width;
const float *filter_ptr0 = filter + mm * in_channels * 9 + c * 9;
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
// load filter (1 outch x 3 height x 3 width): vf_outch_height
float32x4_t vf00, vf01, vf02;
vf00 = vld1q_f32(filter_ptr0);
vf01 = vld1q_f32(filter_ptr0 + 3);
vf02 = vld1q_f32(filter_ptr0 + 6);
for (index_t h = 0; h + 1 < out_height; h += 2) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
// input (4 height x 3 slide): vi_height_slide
float32x4_t vi00, vi01, vi02, vi0n;
float32x4_t vi10, vi11, vi12, vi1n;
float32x4_t vi20, vi21, vi22, vi2n;
float32x4_t vi30, vi31, vi32, vi3n;
// output (1 outch x 2 height x 4 width): vo_outch_height
float32x4_t vo00, vo01;
// load input
vi00 = vld1q_f32(in_ptr0);
vi0n = vld1q_f32(in_ptr0 + 4);
vi10 = vld1q_f32(in_ptr1);
vi1n = vld1q_f32(in_ptr1 + 4);
vi20 = vld1q_f32(in_ptr2);
vi2n = vld1q_f32(in_ptr2 + 4);
vi30 = vld1q_f32(in_ptr3);
vi3n = vld1q_f32(in_ptr3 + 4);
vi01 = vextq_f32(vi00, vi0n, 1);
vi02 = vextq_f32(vi00, vi0n, 2);
vi11 = vextq_f32(vi10, vi1n, 1);
vi12 = vextq_f32(vi10, vi1n, 2);
vi21 = vextq_f32(vi20, vi2n, 1);
vi22 = vextq_f32(vi20, vi2n, 2);
vi31 = vextq_f32(vi30, vi3n, 1);
vi32 = vextq_f32(vi30, vi3n, 2);
// load output
vo00 = vld1q_f32(out_ptr0);
vo01 = vld1q_f32(out_ptr0 + out_width);
// outch 0, height 0
vo00 = vfmaq_laneq_f32(vo00, vi00, vf00, 0);
vo00 = vfmaq_laneq_f32(vo00, vi01, vf00, 1);
vo00 = vfmaq_laneq_f32(vo00, vi02, vf00, 2);
vo00 = vfmaq_laneq_f32(vo00, vi10, vf01, 0);
vo00 = vfmaq_laneq_f32(vo00, vi11, vf01, 1);
vo00 = vfmaq_laneq_f32(vo00, vi12, vf01, 2);
vo00 = vfmaq_laneq_f32(vo00, vi20, vf02, 0);
vo00 = vfmaq_laneq_f32(vo00, vi21, vf02, 1);
vo00 = vfmaq_laneq_f32(vo00, vi22, vf02, 2);
// outch 0, height 1
vo01 = vfmaq_laneq_f32(vo01, vi10, vf00, 0);
vo01 = vfmaq_laneq_f32(vo01, vi11, vf00, 1);
vo01 = vfmaq_laneq_f32(vo01, vi12, vf00, 2);
vo01 = vfmaq_laneq_f32(vo01, vi20, vf01, 0);
vo01 = vfmaq_laneq_f32(vo01, vi21, vf01, 1);
vo01 = vfmaq_laneq_f32(vo01, vi22, vf01, 2);
vo01 = vfmaq_laneq_f32(vo01, vi30, vf02, 0);
vo01 = vfmaq_laneq_f32(vo01, vi31, vf02, 1);
vo01 = vfmaq_laneq_f32(vo01, vi32, vf02, 2);
vst1q_f32(out_ptr0, vo00);
vst1q_f32(out_ptr0 + out_width, vo01);
in_ptr0 += 4;
in_ptr1 += 4;
in_ptr2 += 4;
in_ptr3 += 4;
out_ptr0 += 4;
} // w
in_ptr0 += 2 + in_width;
in_ptr1 += 2 + in_width;
in_ptr2 += 2 + in_width;
in_ptr3 += 2 + in_width;
out_ptr0 += out_width;
} // h
#else
for (index_t ih = 0; ih < out_height; ++ih) {
for (index_t iw = 0; iw < out_width; ++iw) {
for (int i = 0; i < 3; ++i) {
for (int j = 0; j < 3; ++j) {
out_ptr0[ih * out_width + iw] +=
in_ptr0[(ih + i) * in_width + (iw + j)]
* filter_ptr0[i * 3 + j];
}
}
}
}
#endif
} // c
} // mm
} // if
} // m
} // b
}
void Conv2dNeonK3x3S2(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
float *output) {
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t m = 0; m < out_channels; ++m) {
for (index_t c = 0; c < in_channels; ++c) {
const float *in_base = input + b * in_batch_size + c * in_image_size;
const float
*filter_ptr = filter + m * in_channels * 9 + c * 9;
float *out_base = output + b * out_batch_size + m * out_image_size;
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
// load filter (1 outch x 3 height x 3 width): vf_outch_height
float32x4_t vf00, vf01, vf02;
vf00 = vld1q_f32(filter_ptr);
vf01 = vld1q_f32(filter_ptr + 3);
vf02 = vld1q_f32(filter_ptr + 6);
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
float32x4x2_t vi0, vi1, vi2;
float32x4_t vi0n, vi1n, vi2n;
// input (3 height x 3 slide): vi_height_slide
float32x4_t vi00, vi01, vi02;
float32x4_t vi10, vi11, vi12;
float32x4_t vi20, vi21, vi22;
// output (1 outch x 1 height x 4 width): vo
float32x4_t vo;
// load input
index_t in_h = h * 2;
index_t in_w = w * 2;
index_t in_offset = in_h * in_width + in_w;
vi0 = vld2q_f32(in_base + in_offset); // [0.2.4.6, 1.3.5.7]
vi1 = vld2q_f32(in_base + in_offset + in_width);
vi2 = vld2q_f32(in_base + in_offset + 2 * in_width);
vi0n = vld1q_f32(in_base + in_offset + 8); // [8.9.10.11]
vi1n = vld1q_f32(in_base + in_offset + in_width + 8);
vi2n = vld1q_f32(in_base + in_offset + 2 * in_width + 8);
// load output
index_t out_offset = h * out_width + w;
vo = vld1q_f32(out_base + out_offset);
vi00 = vi0.val[0]; // [0.2.4.6]
vi01 = vi0.val[1]; // [1.3.5.7]
vi02 = vextq_f32(vi00, vi0n, 1); // [2.4.6.8]
vi10 = vi1.val[0];
vi11 = vi1.val[1];
vi12 = vextq_f32(vi10, vi1n, 1);
vi20 = vi2.val[0];
vi21 = vi2.val[1];
vi22 = vextq_f32(vi20, vi2n, 1);
// outch 0, height 0
vo = vfmaq_laneq_f32(vo, vi00, vf00, 0);
vo = vfmaq_laneq_f32(vo, vi01, vf00, 1);
vo = vfmaq_laneq_f32(vo, vi02, vf00, 2);
vo = vfmaq_laneq_f32(vo, vi10, vf01, 0);
vo = vfmaq_laneq_f32(vo, vi11, vf01, 1);
vo = vfmaq_laneq_f32(vo, vi12, vf01, 2);
vo = vfmaq_laneq_f32(vo, vi20, vf02, 0);
vo = vfmaq_laneq_f32(vo, vi21, vf02, 1);
vo = vfmaq_laneq_f32(vo, vi22, vf02, 2);
vst1q_f32(out_base + out_offset, vo);
} // w
} // h
#else
for (index_t ih = 0; ih < out_height; ++ih) {
for (index_t iw = 0; iw < out_width; ++iw) {
for (int i = 0; i < 3; ++i) {
for (int j = 0; j < 3; ++j) {
out_base[ih * out_width + iw] +=
in_base[(ih * 2 + i) * in_width + (iw * 2 + j)]
* filter_ptr[i * 3 + j];
}
}
}
}
#endif
} // c
} // m
} // b
}
} // namespace kernels
} // namespace mace
//
// Copyright (c) 2018 XiaoMi All rights reserved.
//
#include <math.h>
#include <algorithm>
#include "mace/kernels/arm/conv_winograd.h"
#include "mace/kernels/gemm.h"
#include "mace/utils/utils.h"
namespace mace {
namespace kernels {
namespace {
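// These transforms implement Winograd F(2x2, 3x3) minimal filtering
// (Lavin & Gray, 2016). The scalar expressions below expand the matrix forms
//   input:  s = B^T d B, with B^T = [[1,  0, -1,  0],
//                                    [0,  1,  1,  0],
//                                    [0, -1,  1,  0],
//                                    [0,  1,  0, -1]]
//   filter: s = G g G^T, with G   = [[  1,    0,    0],
//                                    [1/2,  1/2,  1/2],
//                                    [1/2, -1/2,  1/2],
//                                    [  0,    0,    1]]
//   output: v = A^T t A, with A^T = [[1, 1,  1,  0],
//                                    [0, 1, -1, -1]]
// where d is a 4x4 input tile, g the 3x3 filter, and t the 4x4 element-wise
// product of the transformed input and filter.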
// NCHW => TNCB (T: the 16 transform-domain tile pixels, B: tile indices)
void TransformInput(const float *input,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t tile_count,
float *output) {
const index_t stride = batch * in_channels * tile_count;
const index_t in_height_width = in_height * in_width;
#pragma omp parallel for
for (index_t nc = 0; nc < batch * in_channels; ++nc) {
index_t tile_index = nc * tile_count;
for (index_t h = 0; h < in_height - 2; h += 2) {
for (index_t w = 0; w < in_width - 2; w += 2) {
float d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14,
d15;
float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
s15;
// load tile data
const index_t tile_offset = nc * in_height_width + h * in_width + w;
d0 = input[tile_offset];
d1 = input[tile_offset + 1];
d2 = input[tile_offset + 2];
d3 = input[tile_offset + 3];
d4 = input[tile_offset + in_width];
d5 = input[tile_offset + in_width + 1];
d6 = input[tile_offset + in_width + 2];
d7 = input[tile_offset + in_width + 3];
d8 = input[tile_offset + 2 * in_width];
d9 = input[tile_offset + 2 * in_width + 1];
d10 = input[tile_offset + 2 * in_width + 2];
d11 = input[tile_offset + 2 * in_width + 3];
d12 = input[tile_offset + 3 * in_width];
d13 = input[tile_offset + 3 * in_width + 1];
d14 = input[tile_offset + 3 * in_width + 2];
d15 = input[tile_offset + 3 * in_width + 3];
// s = BT * d * B
s0 = (d0 - d8) - (d2 - d10);
s1 = (d1 - d9) + (d2 - d10);
s2 = (d2 - d10) - (d1 - d9);
s3 = (d1 - d9) - (d3 - d11);
s4 = (d4 + d8) - (d6 + d10);
s5 = (d5 + d9) + (d6 + d10);
s6 = (d6 + d10) - (d5 + d9);
s7 = (d5 + d9) - (d7 + d11);
s8 = (d8 - d4) - (d10 - d6);
s9 = (d9 - d5) + (d10 - d6);
s10 = (d10 - d6) - (d9 - d5);
s11 = (d9 - d5) - (d11 - d7);
s12 = (d4 - d12) - (d6 - d14);
s13 = (d5 - d13) + (d6 - d14);
s14 = (d6 - d14) - (d5 - d13);
s15 = (d5 - d13) - (d7 - d15);
// store output
output[tile_index + 0 * stride] = s0;
output[tile_index + 1 * stride] = s1;
output[tile_index + 2 * stride] = s2;
output[tile_index + 3 * stride] = s3;
output[tile_index + 4 * stride] = s4;
output[tile_index + 5 * stride] = s5;
output[tile_index + 6 * stride] = s6;
output[tile_index + 7 * stride] = s7;
output[tile_index + 8 * stride] = s8;
output[tile_index + 9 * stride] = s9;
output[tile_index + 10 * stride] = s10;
output[tile_index + 11 * stride] = s11;
output[tile_index + 12 * stride] = s12;
output[tile_index + 13 * stride] = s13;
output[tile_index + 14 * stride] = s14;
output[tile_index + 15 * stride] = s15;
++tile_index;
}
}
}
}
// OCHW => TOC
// No need to optimize; this transform will move into the model converter.
void TransformFilter(const float *filter,
const index_t in_channels,
const index_t out_channels,
float *output) {
const index_t stride = out_channels * in_channels;
#pragma omp parallel for collapse(2)
for (index_t m = 0; m < out_channels; ++m) {
for (index_t c = 0; c < in_channels; ++c) {
float g0, g1, g2, g3, g4, g5, g6, g7, g8;
float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14,
s15;
// load filter
index_t filter_offset = (m * in_channels + c) * 9;
g0 = filter[filter_offset];
g1 = filter[filter_offset + 1];
g2 = filter[filter_offset + 2];
g3 = filter[filter_offset + 3];
g4 = filter[filter_offset + 4];
g5 = filter[filter_offset + 5];
g6 = filter[filter_offset + 6];
g7 = filter[filter_offset + 7];
g8 = filter[filter_offset + 8];
// s = G * g * GT
s0 = g0;
s1 = (g0 + g2 + g1) * 0.5f;
s2 = (g0 + g2 - g1) * 0.5f;
s3 = g2;
s4 = (g0 + g6 + g3) * 0.5f;
s5 = ((g0 + g6 + g3) + (g2 + g8 + g5) + (g1 + g7 + g4)) * 0.25f;
s6 = ((g0 + g6 + g3) + (g2 + g8 + g5) - (g1 + g7 + g4)) * 0.25f;
s7 = (g2 + g8 + g5) * 0.5f;
s8 = (g0 + g6 - g3) * 0.5f;
s9 = ((g0 + g6 - g3) + (g2 + g8 - g5) + (g1 + g7 - g4)) * 0.25f;
s10 = ((g0 + g6 - g3) + (g2 + g8 - g5) - (g1 + g7 - g4)) * 0.25f;
s11 = (g2 + g8 - g5) * 0.5f;
s12 = g6;
s13 = (g6 + g8 + g7) * 0.5f;
s14 = (g6 + g8 - g7) * 0.5f;
s15 = g8;
// store output
index_t output_offset = m * in_channels + c;
output[output_offset + 0 * stride] = s0;
output[output_offset + 1 * stride] = s1;
output[output_offset + 2 * stride] = s2;
output[output_offset + 3 * stride] = s3;
output[output_offset + 4 * stride] = s4;
output[output_offset + 5 * stride] = s5;
output[output_offset + 6 * stride] = s6;
output[output_offset + 7 * stride] = s7;
output[output_offset + 8 * stride] = s8;
output[output_offset + 9 * stride] = s9;
output[output_offset + 10 * stride] = s10;
output[output_offset + 11 * stride] = s11;
output[output_offset + 12 * stride] = s12;
output[output_offset + 13 * stride] = s13;
output[output_offset + 14 * stride] = s14;
output[output_offset + 15 * stride] = s15;
}
}
}
// TOC * TNCB => TNOB
void BatchGemm(const float *input,
const float *filter,
index_t batch,
index_t in_channels,
index_t out_channels,
index_t tile_count,
float *output) {
const index_t in_stride = batch * in_channels * tile_count;
const index_t in_channels_tile_count = in_channels * tile_count;
const index_t filter_stride = out_channels * in_channels;
const index_t out_stride = batch * out_channels * tile_count;
const index_t out_channels_tile_count = out_channels * tile_count;
if (batch == 1) {
Gemm(filter, input, 16, out_channels, in_channels, tile_count, output);
} else {
for (int i = 0; i < 16; ++i) {
for (int b = 0; b < batch; ++b) {
const float
*in_ptr = input + i * in_stride + b * in_channels_tile_count;
const float *filter_ptr = filter + i * filter_stride;
float *out_ptr = output + i * out_stride + b * out_channels_tile_count;
Gemm(filter_ptr,
in_ptr,
1,
out_channels, /* rows */
in_channels, /* K */
tile_count, /* cols */
out_ptr);
}
}
}
}
// TNOB => ToNOB => NOHoWo
void TransformOutput(const float *input,
index_t batch,
index_t out_height,
index_t out_width,
index_t out_channels,
index_t tile_count,
float *output) {
const index_t in_stride = batch * out_channels * tile_count;
#pragma omp parallel for
for (index_t nm = 0; nm < batch * out_channels; ++nm) {
index_t tile_offset = nm * tile_count;
for (index_t h = 0; h < out_height; h += 2) {
for (index_t w = 0; w < out_width; w += 2) {
float d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14,
d15;
float s0, s1, s2, s3, s4, s5, s6, s7;
float v0, v1, v2, v3;
d0 = input[tile_offset + 0 * in_stride];
d1 = input[tile_offset + 1 * in_stride];
d2 = input[tile_offset + 2 * in_stride];
d3 = input[tile_offset + 3 * in_stride];
d4 = input[tile_offset + 4 * in_stride];
d5 = input[tile_offset + 5 * in_stride];
d6 = input[tile_offset + 6 * in_stride];
d7 = input[tile_offset + 7 * in_stride];
d8 = input[tile_offset + 8 * in_stride];
d9 = input[tile_offset + 9 * in_stride];
d10 = input[tile_offset + 10 * in_stride];
d11 = input[tile_offset + 11 * in_stride];
d12 = input[tile_offset + 12 * in_stride];
d13 = input[tile_offset + 13 * in_stride];
d14 = input[tile_offset + 14 * in_stride];
d15 = input[tile_offset + 15 * in_stride];
s0 = d0 + d1 + d2;
s1 = d1 - d2 - d3;
s2 = d4 + d5 + d6;
s3 = d5 - d6 - d7;
s4 = d8 + d9 + d10;
s5 = d9 - d10 - d11;
s6 = d12 + d13 + d14;
s7 = d13 - d14 - d15;
v0 = s0 + s2 + s4;
v1 = s1 + s3 + s5;
v2 = s2 - s4 - s6;
v3 = s3 - s5 - s7;
index_t out_offset = nm * out_height * out_width + h * out_width + w;
output[out_offset] = v0;
output[out_offset + 1] = v1;
output[out_offset + out_width] = v2;
output[out_offset + out_width + 1] = v3;
++tile_offset;
}
}
}
}
void ConvRef3x3s1(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_channels,
float *output) {
index_t out_height = in_height - 2;
index_t out_width = in_width - 2;
#pragma omp parallel for collapse(4)
for (index_t b = 0; b < batch; ++b) {
for (index_t m = 0; m < out_channels; ++m) {
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w < out_width; ++w) {
index_t out_offset =
((b * out_channels + m) * out_height + h) * out_width + w;
output[out_offset] = 0;
for (index_t c = 0; c < in_channels; ++c) {
for (index_t kh = 0; kh < 3; ++kh) {
for (index_t kw = 0; kw < 3; ++kw) {
index_t ih = h + kh;
index_t iw = w + kw;
index_t in_offset =
((b * in_channels + c) * in_height + ih) * in_width + iw;
index_t
filter_offset = (((m * in_channels) + c) * 3 + kh) * 3 + kw;
output[out_offset] += input[in_offset] * filter[filter_offset];
}
}
}
}
}
}
}
}
} // namespace
void WinoGradConv3x3s1(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_channels,
float *transformed_input,
float *transformed_filter,
float *transformed_output,
bool is_filter_transformed,
float *output) {
index_t out_height = in_height - 2;
index_t out_width = in_width - 2;
index_t tile_height_count = (out_height + 1) / 2;
index_t tile_width_count = (out_width + 1) / 2;
index_t tile_count = tile_height_count * tile_width_count;
TransformInput(input,
batch,
in_height,
in_width,
in_channels,
tile_count,
transformed_input);
// TODO(liyin): move this into the model converter; it is fast and only runs
// once, so it is not a performance concern.
if (!is_filter_transformed) {
TransformFilter(filter, in_channels, out_channels, transformed_filter);
}
BatchGemm(transformed_input,
transformed_filter,
batch,
in_channels,
out_channels,
tile_count,
transformed_output);
TransformOutput(transformed_output,
batch,
out_height,
out_width,
out_channels,
tile_count,
output);
}
void WinoGradConv3x3s1(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_channels,
float *output) {
index_t out_height = in_height - 2;
index_t out_width = in_width - 2;
index_t tile_height_count = (out_height + 1) / 2;
index_t tile_width_count = (out_width + 1) / 2;
index_t tile_count = tile_height_count * tile_width_count;
index_t transformed_input_size = 16 * batch * in_channels * tile_count;
index_t transformed_filter_size = 16 * out_channels * in_channels;
index_t transformed_output_size = 16 * batch * out_channels * tile_count;
float *transformed_input = new float[transformed_input_size]; // TNCB
float *transformed_filter = new float[transformed_filter_size]; // TOC
float *transformed_output = new float[transformed_output_size];
WinoGradConv3x3s1(input,
filter,
batch,
in_height,
in_width,
in_channels,
out_channels,
transformed_input,
transformed_filter,
transformed_output,
false,
output);
delete[] transformed_input;
delete[] transformed_filter;
delete[] transformed_output;
}
} // namespace kernels
} // namespace mace
//
// Copyright (c) 2018 XiaoMi All rights reserved.
//
#ifndef MACE_KERNELS_ARM_CONV_WINOGRAD_H_
#define MACE_KERNELS_ARM_CONV_WINOGRAD_H_
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
#include <arm_neon.h>
#endif
#include "mace/core/types.h"
namespace mace {
namespace kernels {
void WinoGradConv3x3s1(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_channels,
float *output);
void WinoGradConv3x3s1(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_channels,
float *transformed_input,
float *transformed_filter,
float *transformed_output,
bool is_filter_transformed,
float *output);
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_ARM_CONV_WINOGRAD_H_
//
// Copyright (c) 2018 XiaoMi All rights reserved.
//
#include <gtest/gtest.h>
#include <random>
#include <algorithm>
#include "mace/kernels/arm/conv_winograd.h"
#include "mace/core/types.h"
namespace mace {
namespace kernels {
TEST(ConvWinogradTest, winograd) {
index_t batch = 1;
index_t in_height = 32;
index_t in_width = 32;
index_t in_channels = 64;
index_t out_channels = 128;
index_t out_height = in_height - 2;
index_t out_width = in_width - 2;
index_t input_size = batch * in_channels * in_height * in_width;
index_t filter_size = 3 * 3 * in_channels * out_channels;
index_t output_size = batch * out_channels * out_height * out_width;
float *input_data = new float[input_size];
float *filter_data = new float[filter_size];
float *output_data = new float[output_size];
float *output_data_ref = new float[output_size];
std::random_device rd;
std::mt19937 gen(rd());
std::normal_distribution<float> nd(0, 1);
std::generate(input_data, input_data + input_size,
[&gen, &nd] {
return std::max(-1.0f, std::min(1.0f, nd(gen)));
});
std::generate(filter_data, filter_data + filter_size,
[&gen, &nd] {
return std::max(-1.0f, std::min(1.0f, nd(gen)));
});
kernels::ConvRef3x3s1(input_data,
filter_data,
batch,
in_height,
in_width,
in_channels,
out_channels,
output_data_ref);
kernels::WinoGradConv3x3s1(input_data,
filter_data,
batch,
in_height,
in_width,
in_channels,
out_channels,
output_data);
// compare against the reference implementation
for (index_t i = 0; i < output_size; ++i) {
EXPECT_NEAR(output_data_ref[i], output_data[i], 0.1);
}
delete[] input_data;
delete[] filter_data;
delete[] output_data;
delete[] output_data_ref;
}
} // namespace kernels
} // namespace mace
//
// Copyright (c) 2018 XiaoMi All rights reserved.
//
#include "mace/kernels/depthwise_conv2d.h"
#include "mace/kernels/activation.h"
namespace mace {
namespace kernels {
namespace {
void DepthwiseConv2dNCHW(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const int filter_height,
const int filter_width,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int pad_top,
const int pad_left,
float *output) {
const index_t multiplier = out_channels / in_channels;
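// Depth-multiplier semantics: output channel m reads input channel
// m / multiplier and filter copy m % multiplier, so each input channel
// fans out to `multiplier` output channels.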
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t m = 0; m < out_channels; ++m) {
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w < out_width; ++w) {
index_t out_offset =
((b * out_channels + m) * out_height + h) * out_width + w;
index_t c = m / multiplier;
index_t o = m % multiplier;
float sum = 0;
for (index_t kh = 0; kh < filter_height; ++kh) {
for (index_t kw = 0; kw < filter_width; ++kw) {
index_t ih = h * stride_h + kh * dilation_h - pad_top;
index_t iw = w * stride_w + kw * dilation_w - pad_left;
if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) {
index_t in_offset =
((b * in_channels + c) * in_height + ih) * in_width + iw;
index_t filter_offset =
(((o * in_channels) + c) * filter_height + kh) * filter_width
+ kw;
sum += input[in_offset] * filter[filter_offset];
}
}
}
output[out_offset] = sum;
}
}
}
}
}
} // namespace
extern void DepthwiseConv2dNeonK3x3S1(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const int pad_top,
const int pad_left,
const int valid_h_start,
const int valid_h_stop,
const int valid_w_start,
const int valid_w_stop,
float *output);
void DepthwiseConv2dNeonK3x3S2(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const int pad_top,
const int pad_left,
const int valid_h_start,
const int valid_h_stop,
const int valid_w_start,
const int valid_w_stop,
float *output);
void DepthwiseConv2dFunctor<DeviceType::NEON,
float>::operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
MACE_CHECK_NOTNULL(input);
MACE_CHECK_NOTNULL(filter);
MACE_CHECK_NOTNULL(output);
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
std::vector<index_t> filter_shape
{filter->dim(0) * filter->dim(1), filter->dim(1), filter->dim(2),
filter->dim(3)};
if (paddings_.empty()) {
CalcNCHWPaddingAndOutputSize(input->shape().data(),
filter_shape.data(),
dilations_,
strides_,
padding_type_,
output_shape.data(),
paddings.data());
} else {
paddings = paddings_;
CalcNCHWOutputSize(input->shape().data(), filter_shape.data(),
paddings_.data(), dilations_, strides_, RoundType::FLOOR,
output_shape.data());
}
output->Resize(output_shape);
output->Clear();
index_t batch = output->dim(0);
index_t channels = output->dim(1);
index_t height = output->dim(2);
index_t width = output->dim(3);
index_t input_batch = input->dim(0);
index_t input_channels = input->dim(1);
index_t input_height = input->dim(2);
index_t input_width = input->dim(3);
index_t filter_h = filter_shape[2];
index_t filter_w = filter_shape[3];
MACE_CHECK(filter_shape[0] == channels, filter_shape[0], " != ", channels);
MACE_CHECK(filter_shape[1] == input_channels, filter_shape[1], " != ",
input_channels);
index_t stride_h = strides_[0];
index_t stride_w = strides_[1];
index_t dilation_h = dilations_[0];
index_t dilation_w = dilations_[1];
MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch");
int pad_top = paddings[0] >> 1;
int pad_bottom = paddings[0] - pad_top;
int pad_left = paddings[1] >> 1;
int pad_right = paddings[1] - pad_left;
int valid_h_start = pad_top == 0 ? 0 : (pad_top - 1) / stride_h + 1;
int valid_h_stop = pad_bottom == 0
? height
: height - ((pad_bottom - 1) / stride_h + 1);
int valid_w_start = pad_left == 0 ? 0 : (pad_left - 1) / stride_w + 1;
int valid_w_stop = pad_right == 0
? width
: width - ((pad_right - 1) / stride_w + 1);
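// valid_[hw]_start/stop delimit the interior where every 3x3 tap stays
// in bounds, letting the NEON loops skip per-pixel bounds checks; e.g.,
// pad_top = 1, stride_h = 1 gives valid_h_start = (1 - 1) / 1 + 1 = 1.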
std::function<void(const float *input, float *output)> conv_func;
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto bias_data = bias == nullptr ? nullptr : bias->data<float>();
auto output_data = output->mutable_data<float>();
if (filter_h == 3 && filter_w == 3 && stride_h == 1 && stride_w == 1
&& dilation_h == 1 && dilation_w == 1) {
conv_func = [=](const float *input, float *output) {
DepthwiseConv2dNeonK3x3S1(input,
filter_data,
batch,
input_height,
input_width,
input_channels,
height,
width,
channels,
pad_top,
pad_left,
valid_h_start,
valid_h_stop,
valid_w_start,
valid_w_stop,
output);
};
} else if (filter_h == 3 && filter_w == 3 && stride_h == 2 && stride_w == 2
&& dilation_h == 1 && dilation_w == 1) {
conv_func = [=](const float *input, float *output) {
DepthwiseConv2dNeonK3x3S2(input,
filter_data,
batch,
input_height,
input_width,
input_channels,
height,
width,
channels,
pad_top,
pad_left,
valid_h_start,
valid_h_stop,
valid_w_start,
valid_w_stop,
output);
};
} else {
conv_func = [=](const float *input, float *output) {
DepthwiseConv2dNCHW(input,
filter_data,
batch,
input_height,
input_width,
input_channels,
height,
width,
channels,
filter_h,
filter_w,
stride_h,
stride_w,
dilation_h,
dilation_w,
pad_top,
pad_left,
output);
};
}
conv_func(input_data, output_data);
if (bias_data != nullptr) {
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
for (index_t i = 0; i < height * width; ++i) {
output_data[(b * channels + c) * height * width + i] += bias_data[c];
}
}
}
}
DoActivation(output_data, output_data, output->size(), activation_,
relux_max_limit_);
}
} // namespace kernels
} // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
#include <arm_neon.h>
#endif
#include "mace/core/types.h"
namespace mace {
namespace kernels {
namespace {
void DepthwiseConv2dPixel(const float *in_base,
const float *filter,
const index_t out_h,
const index_t out_w,
const index_t in_h_start,
const index_t in_w_start,
const index_t out_width,
const index_t in_height,
const index_t in_width,
int filter_height,
int filter_width,
float *out_base) {
float sum = 0;
for (int i = 0; i < filter_height; ++i) {
for (int j = 0; j < filter_width; ++j) {
index_t in_h = in_h_start + i;
index_t in_w = in_w_start + j;
if (in_h >= 0 && in_h < in_height && in_w >= 0 && in_w < in_width) {
sum += in_base[in_h * in_width + in_w] * filter[i * filter_width + j];
}
}
}
out_base[out_h * out_width + out_w] = sum;
}
} // namespace
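// Border strategy: the scalar DepthwiseConv2dPixel above handles rows and
// columns whose 3x3 window touches padding, while the vectorized loops below
// cover only the valid interior region computed by the caller.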
// Ho = 2, Wo = 4, Co = 1
void DepthwiseConv2dNeonK3x3S1(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const int pad_top,
const int pad_left,
const int valid_h_start,
const int valid_h_stop,
const int valid_w_start,
const int valid_w_stop,
float *output) {
const index_t multiplier = out_channels / in_channels;
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t m = 0; m < out_channels; ++m) {
index_t c = m / multiplier;
index_t multi_index = m % multiplier;
const float *in_base = input + b * in_batch_size + c * in_image_size;
const float *filter_ptr = filter + multi_index * in_channels * 9 + c * 9;
float *out_base = output + b * out_batch_size + m * out_image_size;
index_t h, w;
// top
for (h = 0; h < valid_h_start; ++h) {
for (w = 0; w < out_width; ++w) {
DepthwiseConv2dPixel(in_base,
filter_ptr,
h,
w,
h - pad_top,
w - pad_left,
out_width,
in_height,
in_width,
3,
3,
out_base);
}
}
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
// load filter (1 outch x 3 height x 3 width): vf_outch_height
float32x4_t vf00, vf01, vf02;
vf00 = vld1q_f32(filter_ptr);
vf01 = vld1q_f32(filter_ptr + 3);
vf02 = vld1q_f32(filter_ptr + 6);
for (h = valid_h_start; h + 1 < valid_h_stop; h += 2) {
// left
for (w = 0; w < valid_w_start; ++w) {
DepthwiseConv2dPixel(in_base,
filter_ptr,
h,
w,
h - pad_top,
w - pad_left,
out_width,
in_height,
in_width,
3,
3,
out_base);
DepthwiseConv2dPixel(in_base,
filter_ptr,
h + 1,
w,
h + 1 - pad_top,
w - pad_left,
out_width,
in_height,
in_width,
3,
3,
out_base);
}
for (w = valid_w_start; w + 3 < valid_w_stop; w += 4) {
// input (4 height x 3 slide): vi_height_slide
float32x4_t vi00, vi01, vi02, vi0n;
float32x4_t vi10, vi11, vi12, vi1n;
float32x4_t vi20, vi21, vi22, vi2n;
float32x4_t vi30, vi31, vi32, vi3n;
// output (1 outch x 2 height x 4 width): vo_outch_height
float32x4_t vo00, vo01;
// load input
index_t in_h = h - pad_top;
index_t in_w = w - pad_left;
index_t in_offset = in_h * in_width + in_w;
vi00 = vld1q_f32(in_base + in_offset);
vi0n = vld1q_f32(in_base + in_offset + 4);
vi10 = vld1q_f32(in_base + in_offset + in_width);
vi1n = vld1q_f32(in_base + in_offset + in_width + 4);
vi20 = vld1q_f32(in_base + in_offset + 2 * in_width);
vi2n = vld1q_f32(in_base + in_offset + 2 * in_width + 4);
vi30 = vld1q_f32(in_base + in_offset + 3 * in_width);
vi3n = vld1q_f32(in_base + in_offset + 3 * in_width + 4);
vi01 = vextq_f32(vi00, vi0n, 1);
vi02 = vextq_f32(vi00, vi0n, 2);
vi11 = vextq_f32(vi10, vi1n, 1);
vi12 = vextq_f32(vi10, vi1n, 2);
vi21 = vextq_f32(vi20, vi2n, 1);
vi22 = vextq_f32(vi20, vi2n, 2);
vi31 = vextq_f32(vi30, vi3n, 1);
vi32 = vextq_f32(vi30, vi3n, 2);
// load output
index_t out_offset = h * out_width + w;
vo00 = vld1q_f32(out_base + out_offset);
vo01 = vld1q_f32(out_base + out_offset + out_width);
// outch 0, height 0
vo00 = vfmaq_laneq_f32(vo00, vi00, vf00, 0);
vo00 = vfmaq_laneq_f32(vo00, vi01, vf00, 1);
vo00 = vfmaq_laneq_f32(vo00, vi02, vf00, 2);
vo00 = vfmaq_laneq_f32(vo00, vi10, vf01, 0);
vo00 = vfmaq_laneq_f32(vo00, vi11, vf01, 1);
vo00 = vfmaq_laneq_f32(vo00, vi12, vf01, 2);
vo00 = vfmaq_laneq_f32(vo00, vi20, vf02, 0);
vo00 = vfmaq_laneq_f32(vo00, vi21, vf02, 1);
vo00 = vfmaq_laneq_f32(vo00, vi22, vf02, 2);
// outch 0, height 1
vo01 = vfmaq_laneq_f32(vo01, vi10, vf00, 0);
vo01 = vfmaq_laneq_f32(vo01, vi11, vf00, 1);
vo01 = vfmaq_laneq_f32(vo01, vi12, vf00, 2);
vo01 = vfmaq_laneq_f32(vo01, vi20, vf01, 0);
vo01 = vfmaq_laneq_f32(vo01, vi21, vf01, 1);
vo01 = vfmaq_laneq_f32(vo01, vi22, vf01, 2);
vo01 = vfmaq_laneq_f32(vo01, vi30, vf02, 0);
vo01 = vfmaq_laneq_f32(vo01, vi31, vf02, 1);
vo01 = vfmaq_laneq_f32(vo01, vi32, vf02, 2);
vst1q_f32(out_base + out_offset, vo00);
vst1q_f32(out_base + out_offset + out_width, vo01);
} // w
// right
for (; w < out_width; ++w) {
DepthwiseConv2dPixel(in_base,
filter_ptr,
h,
w,
h - pad_top,
w - pad_left,
out_width,
in_height,
in_width,
3,
3,
out_base);
DepthwiseConv2dPixel(in_base,
filter_ptr,
h + 1,
w,
h + 1 - pad_top,
w - pad_left,
out_width,
in_height,
in_width,
3,
3,
out_base);
}
} // h
#else
for (index_t ih = valid_h_start; ih < valid_h_stop; ++ih) {
for (index_t iw = 0; iw < out_width; ++iw) {
DepthwiseConv2dPixel(in_base,
filter_ptr,
ih,
iw,
ih - pad_top,
iw - pad_left,
out_width,
in_height,
in_width,
3,
3,
out_base);
}
}
#endif
// bottom
for (; h < out_height; ++h) {
for (w = 0; w < out_width; ++w) {
DepthwiseConv2dPixel(in_base,
filter_ptr,
h,
w,
h - pad_top,
w - pad_left,
out_width,
in_height,
in_width,
3,
3,
out_base);
}
}
} // m
} // b
}
void DepthwiseConv2dNeonK3x3S2(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const int pad_top,
const int pad_left,
const int valid_h_start,
const int valid_h_stop,
const int valid_w_start,
const int valid_w_stop,
float *output) {
const index_t multiplier = out_channels / in_channels;
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t m = 0; m < out_channels; ++m) {
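      // Output channel m reads input channel m / multiplier and uses the
      // filter at depth-multiplier index m % multiplier.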
index_t c = m / multiplier;
index_t multi_index = m % multiplier;
const float *in_base = input + b * in_batch_size + c * in_image_size;
const float *filter_ptr = filter + multi_index * in_channels * 9 + c * 9;
float *out_base = output + b * out_batch_size + m * out_image_size;
index_t h, w;
// top
for (h = 0; h < valid_h_start; ++h) {
for (w = 0; w < out_width; ++w) {
DepthwiseConv2dPixel(in_base,
filter_ptr,
h,
w,
h * 2 - pad_top,
w * 2 - pad_left,
out_width,
in_height,
in_width,
3,
3,
out_base);
}
}
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
// load filter (1 outch x 3 height x 3 width): vf_outch_height
float32x4_t vf00, vf01, vf02;
vf00 = vld1q_f32(filter_ptr);
vf01 = vld1q_f32(filter_ptr + 3);
vf02 = vld1q_f32(filter_ptr + 6);
for (h = valid_h_start; h < valid_h_stop; ++h) {
// left
for (w = 0; w < valid_w_start; ++w) {
DepthwiseConv2dPixel(in_base,
filter_ptr,
h,
w,
h * 2 - pad_top,
w * 2 - pad_left,
out_width,
in_height,
in_width,
3,
3,
out_base);
}
for (w = valid_w_start; w + 3 < valid_w_stop; w += 4) {
float32x4x2_t vi0, vi1, vi2;
float32x4_t vi0n, vi1n, vi2n;
// input (3 height x 3 slide): vi_height_slide
float32x4_t vi00, vi01, vi02;
float32x4_t vi10, vi11, vi12;
float32x4_t vi20, vi21, vi22;
// output (1 outch x 1 height x 4 width): vo
float32x4_t vo;
// load input
index_t in_h = h * 2 - pad_top;
index_t in_w = w * 2 - pad_left;
index_t in_offset = in_h * in_width + in_w;
vi0 = vld2q_f32(in_base + in_offset); // [0.2.4.6, 1.3.5.7]
vi1 = vld2q_f32(in_base + in_offset + in_width);
vi2 = vld2q_f32(in_base + in_offset + 2 * in_width);
vi0n = vld1q_f32(in_base + in_offset + 8); // [8.9.10.11]
vi1n = vld1q_f32(in_base + in_offset + in_width + 8);
vi2n = vld1q_f32(in_base + in_offset + 2 * in_width + 8);
          // load output
index_t out_offset = h * out_width + w;
vo = vld1q_f32(out_base + out_offset);
vi00 = vi0.val[0]; // [0.2.4.6]
vi01 = vi0.val[1]; // [1.3.5.7]
vi02 = vextq_f32(vi00, vi0n, 1); // [2.4.6.8]
vi10 = vi1.val[0];
vi11 = vi1.val[1];
vi12 = vextq_f32(vi10, vi1n, 1);
vi20 = vi2.val[0];
vi21 = vi2.val[1];
vi22 = vextq_f32(vi20, vi2n, 1);
// outch 0, height 0
vo = vfmaq_laneq_f32(vo, vi00, vf00, 0);
vo = vfmaq_laneq_f32(vo, vi01, vf00, 1);
vo = vfmaq_laneq_f32(vo, vi02, vf00, 2);
vo = vfmaq_laneq_f32(vo, vi10, vf01, 0);
vo = vfmaq_laneq_f32(vo, vi11, vf01, 1);
vo = vfmaq_laneq_f32(vo, vi12, vf01, 2);
vo = vfmaq_laneq_f32(vo, vi20, vf02, 0);
vo = vfmaq_laneq_f32(vo, vi21, vf02, 1);
vo = vfmaq_laneq_f32(vo, vi22, vf02, 2);
vst1q_f32(out_base + out_offset, vo);
} // w
// right
for (; w < out_width; ++w) {
DepthwiseConv2dPixel(in_base,
filter_ptr,
h,
w,
h * 2 - pad_top,
w * 2 - pad_left,
out_width,
in_height,
in_width,
3,
3,
out_base);
}
} // h
#else
      for (h = valid_h_start; h < valid_h_stop; ++h) {
        for (w = 0; w < out_width; ++w) {
          DepthwiseConv2dPixel(in_base,
                               filter_ptr,
                               h,
                               w,
                               h * 2 - pad_top,
                               w * 2 - pad_left,
                               out_width,
                               in_height,
                               in_width,
                               3,
                               3,
                               out_base);
        }
      }
#endif
// bottom
for (; h < out_height; ++h) {
for (w = 0; w < out_width; ++w) {
DepthwiseConv2dPixel(in_base,
filter_ptr,
h,
w,
h * 2 - pad_top,
w * 2 - pad_left,
out_width,
in_height,
in_width,
3,
3,
out_base);
}
}
} // m
} // b
}
} // namespace kernels
} // namespace mace
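// -- Illustrative sketch (not part of the original source) ------------------
// How the stride-2 kernel above gathers its input columns: vld2q_f32 loads
// 8 consecutive floats and de-interleaves them into even lanes [0,2,4,6]
// and odd lanes [1,3,5,7]; one vextq_f32 against the next 4 elements then
// yields [2,4,6,8]. These are the three stride-2 columns feeding 4 adjacent
// outputs. Stride2DeinterleaveDemo is a hypothetical aarch64-only helper.
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
#include <arm_neon.h>
static inline void Stride2DeinterleaveDemo() {
  const float row[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
  float32x4x2_t vi = vld2q_f32(row);               // [0,2,4,6], [1,3,5,7]
  float32x4_t vin = vld1q_f32(row + 8);            // [8,9,10,11]
  float32x4_t vi2 = vextq_f32(vi.val[0], vin, 1);  // [2,4,6,8]
  (void)vi;
  (void)vi2;
}
#endif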
//
// Copyright (c) 2018 XiaoMi All rights reserved.
//
#include "mace/kernels/pooling.h"
namespace mace {
namespace kernels {
namespace {
void MaxPooling(const float *input,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t channels,
const index_t out_height,
const index_t out_width,
const int filter_height,
const int filter_width,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int pad_top,
const int pad_left,
float *output) {
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = channels * in_image_size;
const index_t out_batch_size = channels * out_image_size;
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
const index_t out_base = b * out_batch_size + c * out_image_size;
const index_t in_base = b * in_batch_size + c * in_image_size;
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w < out_width; ++w) {
const index_t out_offset = out_base + h * out_width + w;
float res = std::numeric_limits<float>::lowest();
for (int fh = 0; fh < filter_height; ++fh) {
for (int fw = 0; fw < filter_width; ++fw) {
int inh = h * stride_h + dilation_h * fh - pad_top;
int inw = w * stride_w + dilation_w * fw - pad_left;
if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) {
index_t input_offset = in_base + inh * in_width + inw;
res = std::max(res, input[input_offset]);
}
}
}
output[out_offset] = res;
}
}
}
}
}
void AvgPooling(const float *input,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t channels,
const index_t out_height,
const index_t out_width,
const int filter_height,
const int filter_width,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int pad_top,
const int pad_left,
float *output) {
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = channels * in_image_size;
const index_t out_batch_size = channels * out_image_size;
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
const index_t out_base = b * out_batch_size + c * out_image_size;
const index_t in_base = b * in_batch_size + c * in_image_size;
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w < out_width; ++w) {
const index_t out_offset = out_base + h * out_width + w;
float res = 0;
int block_size = 0;
for (int fh = 0; fh < filter_height; ++fh) {
for (int fw = 0; fw < filter_width; ++fw) {
int inh = h * stride_h + dilation_h * fh - pad_top;
int inw = w * stride_w + dilation_w * fw - pad_left;
if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) {
index_t input_offset = in_base + inh * in_width + inw;
res += input[input_offset];
++block_size;
}
}
}
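          // Average over the taps that fell inside the input: padding is
          // excluded from the denominator.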
output[out_offset] = res / block_size;
}
}
}
}
}
} // namespace
void PoolingFunctor<DeviceType::NEON,
float>::operator()(const Tensor *input_tensor,
Tensor *output_tensor,
StatsFuture *future) {
std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {
input_tensor->dim(1), input_tensor->dim(1), kernels_[0], kernels_[1]};
std::vector<int> paddings(2);
if (paddings_.empty()) {
kernels::CalcNCHWPaddingAndOutputSize(
input_tensor->shape().data(), filter_shape.data(), dilations_,
strides_, padding_type_, output_shape.data(), paddings.data());
} else {
paddings = paddings_;
CalcNCHWOutputSize(input_tensor->shape().data(), filter_shape.data(),
paddings_.data(), dilations_, strides_, RoundType::CEIL,
output_shape.data());
}
output_tensor->Resize(output_shape);
const float *input = input_tensor->data<float>();
float *output = output_tensor->mutable_data<float>();
const index_t *input_shape = input_tensor->shape().data();
index_t batch = output_shape[0];
index_t channels = output_shape[1];
index_t height = output_shape[2];
index_t width = output_shape[3];
index_t input_height = input_shape[2];
index_t input_width = input_shape[3];
int filter_h = kernels_[0];
int filter_w = kernels_[1];
int stride_h = strides_[0];
int stride_w = strides_[1];
int dilation_h = dilations_[0];
int dilation_w = dilations_[1];
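  // paddings[] holds the total padding per dimension; the top/left side
  // gets half (rounded down) and the bottom/right side the remainder.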
int pad_top = paddings[0] / 2;
int pad_left = paddings[1] / 2;
if (pooling_type_ == PoolingType::MAX) {
MaxPooling(input,
batch,
input_height,
input_width,
channels,
height,
width,
filter_h,
filter_w,
stride_h,
stride_w,
dilation_h,
dilation_w,
pad_top,
pad_left,
output);
} else if (pooling_type_ == PoolingType::AVG) {
AvgPooling(input,
batch,
input_height,
input_width,
channels,
height,
width,
filter_h,
filter_w,
stride_h,
stride_w,
dilation_h,
dilation_w,
pad_top,
pad_left,
output);
} else {
MACE_NOT_IMPLEMENTED;
}
}
} // namespace kernels
} // namespace mace
//
// Copyright (c) 2018 XiaoMi All rights reserved.
//
#include "mace/kernels/softmax.h"
namespace mace {
namespace kernels {
void SoftmaxFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
Tensor *output,
StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t class_count = input->dim(1);
const index_t class_size = input->dim(2) * input->dim(3);
const float *input_data = input->data<float>();
float *output_data = output->mutable_data<float>();
for (index_t b = 0; b < batch; ++b) {
std::vector<float>
max_val(class_size, std::numeric_limits<float>::lowest());
std::vector<float> sum_val(class_size, 0.f);
// calculate max for each class
for (index_t c = 0; c < class_count; ++c) {
const float *input_ptr = input_data + (b * class_count + c) * class_size;
for (index_t k = 0; k < class_size; ++k) {
max_val[k] = std::max(max_val[k], input_ptr[k]);
}
}
// calculate data - max for each class
#pragma omp parallel for
for (index_t c = 0; c < class_count; ++c) {
const float *input_ptr = input_data + (b * class_count + c) * class_size;
float *output_ptr = output_data + (b * class_count + c) * class_size;
for (index_t k = 0; k < class_size; ++k) {
output_ptr[k] = ::exp(input_ptr[k] - max_val[k]);
}
}
// calculate sum for each class
for (index_t c = 0; c < class_count; ++c) {
float *output_ptr = output_data + (b * class_count + c) * class_size;
for (index_t k = 0; k < class_size; ++k) {
sum_val[k] += output_ptr[k];
}
}
// calculate (data - max) / sum for each class
for (index_t c = 0; c < class_count; ++c) {
float *output_ptr = output_data + (b * class_count + c) * class_size;
for (index_t k = 0; k < class_size; ++k) {
output_ptr[k] /= sum_val[k];
}
}
}
}
} // namespace kernels
} // namespace mace
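// -- Illustrative sketch (not part of the original source) ------------------
// The functor above is the standard four-pass, numerically stable softmax:
// subtracting the per-position max before exp() keeps the exponentials in
// range without changing the result. SoftmaxSliceDemo is a hypothetical
// scalar helper over one slice of class_count values.
#include <math.h>
#include <algorithm>
static inline void SoftmaxSliceDemo(const float *in, float *out, int n) {
  float max_val = in[0];
  for (int i = 1; i < n; ++i) max_val = std::max(max_val, in[i]);  // pass 1
  float sum = 0.f;
  for (int i = 0; i < n; ++i) {
    out[i] = ::exp(in[i] - max_val);  // pass 2: shifted exponentials
    sum += out[i];                    // pass 3: accumulate
  }
  for (int i = 0; i < n; ++i) out[i] /= sum;  // pass 4: normalize
}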
......@@ -133,14 +133,21 @@ struct BatchNormFunctor : BatchNormFunctorBase {
};
template <>
void BatchNormFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
const Tensor *scale,
const Tensor *offset,
const Tensor *mean,
const Tensor *var,
const float epsilon,
Tensor *output,
StatsFuture *future);
struct BatchNormFunctor<DeviceType::NEON, float> : BatchNormFunctorBase {
BatchNormFunctor(const bool folded_constant,
const ActivationType activation,
const float relux_max_limit)
: BatchNormFunctorBase(folded_constant, activation, relux_max_limit) {}
void operator()(const Tensor *input,
const Tensor *scale,
const Tensor *offset,
const Tensor *mean,
const Tensor *var,
const float epsilon,
Tensor *output,
StatsFuture *future);
};
template <typename T>
struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
......
......@@ -212,6 +212,12 @@ switch (w_count) { \
case 2: \
MACE_DO_CONV2D(CC, CH, 2); \
break; \
case 3: \
MACE_DO_CONV2D(CC, CH, 3); \
break; \
case 4: \
MACE_DO_CONV2D(CC, CH, 4); \
break; \
default: \
LOG(FATAL) << "Unsupported w tile: " << w_count; \
}
......@@ -242,6 +248,42 @@ switch (c_count) { \
case 4: \
MACE_CASE_H_CONV2D(4); \
break; \
case 5: \
MACE_CASE_H_CONV2D(5); \
break; \
case 6: \
MACE_CASE_H_CONV2D(6); \
break; \
case 7: \
MACE_CASE_H_CONV2D(7); \
break; \
case 8: \
MACE_CASE_H_CONV2D(8); \
break; \
case 9: \
MACE_CASE_H_CONV2D(9); \
break; \
case 10: \
MACE_CASE_H_CONV2D(10); \
break; \
case 11: \
MACE_CASE_H_CONV2D(11); \
break; \
case 12: \
MACE_CASE_H_CONV2D(12); \
break; \
case 13: \
MACE_CASE_H_CONV2D(13); \
break; \
case 14: \
MACE_CASE_H_CONV2D(14); \
break; \
case 15: \
MACE_CASE_H_CONV2D(15); \
break; \
case 16: \
MACE_CASE_H_CONV2D(16); \
break; \
default: \
LOG(FATAL) << "Unsupported c tile: " << c_count; \
}
......@@ -373,11 +415,35 @@ struct Conv2dFunctor : Conv2dFunctorBase {
};
template <>
void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output,
StatsFuture *future);
struct Conv2dFunctor<DeviceType::NEON, float> : Conv2dFunctorBase {
Conv2dFunctor(const int *strides,
const Padding &padding_type,
const std::vector<int> &paddings,
const int *dilations,
const ActivationType activation,
const float relux_max_limit)
: Conv2dFunctorBase(strides,
padding_type,
paddings,
dilations,
activation,
relux_max_limit),
is_filter_transformed_(false) {}
void operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output,
StatsFuture *future);
// TODO(liyin): share tmp buffers among ops
Tensor padded_input_;
Tensor padded_output_;
Tensor transformed_input_;
Tensor transformed_filter_;
Tensor transformed_output_;
bool is_filter_transformed_;
};
template <typename T>
struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
......
......@@ -9,7 +9,7 @@
namespace mace {
namespace kernels {
void CalcPaddingAndOutputSize(const index_t *input_shape, // NCHW
void CalcNCHWPaddingAndOutputSize(const index_t *input_shape, // NCHW
const index_t *filter_shape, // OIHW
const int *dilations,
const int *strides,
......@@ -186,6 +186,55 @@ void CalcOutputSize(const index_t *input_shape, // NHWC
output_shape[3] = filter_shape[2];
}
void CalcNCHWOutputSize(const index_t *input_shape, // NCHW
const index_t *filter_shape, // OIHW
const int *padding_size,
const int *dilations,
const int *strides,
const RoundType round_type,
index_t *output_shape) {
MACE_CHECK(dilations[0] > 0 && dilations[1] > 0,
"Invalid dilations, must >= 1");
MACE_CHECK((dilations[0] == 1 || strides[0] == 1) &&
(dilations[1] == 1 || strides[1] == 1),
"If dilations > 1, strides should be 1");
MACE_CHECK_NOTNULL(output_shape);
MACE_CHECK_NOTNULL(padding_size);
/*
* Convolution arithmetic:
* o = floor((i + 2 * p - k - (k - 1) * (d - 1)) / s) + 1
* Pooling arithmetic:
* o = ceil((i + 2 * p - k - (k - 1) * (d - 1)) / s) + 1
* For details, see https://arxiv.org/pdf/1603.07285.pdf or
* http://deeplearning.net/software/theano/tutorial/conv_arithmetic.html
*/
output_shape[0] = input_shape[0];
if (round_type == FLOOR) {
output_shape[2] = static_cast<index_t>(
std::floor(1.0 * (input_shape[2] + padding_size[0] - filter_shape[2] -
(filter_shape[2] - 1) * (dilations[0] - 1)) /
strides[0]) +
1);
output_shape[3] = static_cast<index_t>(
std::floor(1.0 * (input_shape[3] + padding_size[1] - filter_shape[3] -
(filter_shape[3] - 1) * (dilations[1] - 1)) /
strides[1]) +
1);
} else {
output_shape[2] = static_cast<index_t>(
std::ceil(1.0 * (input_shape[2] + padding_size[0] - filter_shape[2] -
(filter_shape[2] - 1) * (dilations[0] - 1)) /
strides[0]) +
1);
output_shape[3] = static_cast<index_t>(
std::ceil(1.0 * (input_shape[3] + padding_size[1] - filter_shape[3] -
(filter_shape[3] - 1) * (dilations[1] - 1)) /
strides[1]) +
1);
}
output_shape[1] = filter_shape[0];
}
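// Worked example (illustrative, not from the original source): with the
// FLOOR rule above, input 7x7, filter 3x3, stride 2, dilation 1 and zero
// total padding give floor((7 + 0 - 3 - 0) / 2) + 1 = 3, i.e. a 3x3 output;
// with total padding 2 the output grows to floor((7 + 2 - 3) / 2) + 1 = 4.
// Note that padding_size here is the total padding (top + bottom,
// left + right), so the formula uses p, not 2 * p.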
void CalPaddingSize(const index_t *input_shape, // NCHW
const index_t *filter_shape, // OIHW
const int *dilations,
......@@ -230,10 +279,11 @@ void CalPaddingSize(const index_t *input_shape, // NCHW
0, (output_width - 1) * strides[1] + k_extent_width - input_shape[3]);
}
void ConstructInputWithPadding(const Tensor *input_tensor,
const int *paddings,
Tensor *output_tensor,
bool padding_same_value) {
void ConstructNCHWInputWithPadding(const Tensor *input_tensor,
const int *paddings,
Tensor *output_tensor,
bool padding_same_value) {
Tensor::MappingGuard input_mapper(input_tensor);
const float *input = input_tensor->data<float>();
const index_t *input_shape = input_tensor->shape().data();
......@@ -244,7 +294,7 @@ void ConstructInputWithPadding(const Tensor *input_tensor,
index_t width = input_shape[3];
std::vector<index_t> output_shape(
{batch, channels, paddings[0] + height, paddings[1] + width});
const index_t output_width = output_shape[3];
const int padded_top = paddings[0] / 2;
......@@ -268,6 +318,7 @@ void ConstructInputWithPadding(const Tensor *input_tensor,
const int padded_bottom = paddings[0] - padded_top;
const int padded_right = paddings[1] - padded_left;
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
for (int k = 0; k < padded_top; ++k) {
......@@ -301,6 +352,51 @@ void ConstructInputWithPadding(const Tensor *input_tensor,
}
}
void ConstructNCHWInputWithSpecificPadding(const Tensor *input_tensor,
const int pad_top,
const int pad_bottom,
const int pad_left,
const int pad_right,
Tensor *output_tensor) {
Tensor::MappingGuard input_mapper(input_tensor);
const float *input = input_tensor->data<float>();
const index_t *input_shape = input_tensor->shape().data();
index_t batch = input_shape[0];
index_t channels = input_shape[1];
index_t height = input_shape[2];
index_t width = input_shape[3];
const int pad_height = pad_top + pad_bottom;
const int pad_width = pad_left + pad_right;
std::vector<index_t> output_shape(
{batch, channels, height + pad_height, width + pad_width});
output_tensor->Resize(output_shape);
Tensor::MappingGuard padded_output_mapper(output_tensor);
float *output_data = output_tensor->mutable_data<float>();
const index_t output_height = output_shape[2];
const index_t output_width = output_shape[3];
const index_t in_image_size = height * width;
const index_t out_image_size = output_height * output_width;
const index_t in_batch_size = channels * in_image_size;
const index_t out_batch_size = channels * out_image_size;
#pragma omp parallel for collapse(2)
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
for (int k = 0; k < height; ++k) {
memcpy(output_data + i * out_batch_size + j * out_image_size
+ (pad_top + k) * output_width + pad_left,
input + i * in_batch_size + j * in_image_size + k * width,
width * sizeof(float));
}
// Skip the padded bottom in this channel and top in the next channel
}
}
}
void ConstructNHWCInputWithPadding(const Tensor *input_tensor,
const int *paddings,
Tensor *output_tensor,
......
......@@ -22,16 +22,16 @@ enum RoundType {
namespace kernels {
void CalcPaddingAndOutputSize(const index_t *input_shape, // NCHW
const index_t *filter_shape, // OIHW
void CalcNCHWPaddingAndOutputSize(const index_t *input_shape,
const index_t *filter_shape,
const int *dilations,
const int *strides,
Padding padding,
index_t *output_shape,
int *padding_size);
void CalcNHWCPaddingAndOutputSize(const index_t *input_shape, // NCHW
const index_t *filter_shape, // OIHW
void CalcNHWCPaddingAndOutputSize(const index_t *input_shape,
const index_t *filter_shape,
const int *dilations,
const int *strides,
Padding padding,
......@@ -46,6 +46,14 @@ void CalcOutputSize(const index_t *input_shape, // NHWC
const RoundType round_type,
index_t *output_shape);
void CalcNCHWOutputSize(const index_t *input_shape,
const index_t *filter_shape,
const int *padding_size,
const int *dilations,
const int *strides,
const RoundType round_type,
index_t *output_shape);
void CalPaddingSize(const index_t *input_shape, // NCHW
const index_t *filter_shape, // OIHW
const int *dilations,
......@@ -53,10 +61,15 @@ void CalPaddingSize(const index_t *input_shape, // NCHW
Padding padding,
int *padding_size);
void ConstructInputWithPadding(const Tensor *input,
const int *paddings,
Tensor *output_tensor,
bool padding_same_value = false);
void ConstructNCHWInputWithSpecificPadding(const Tensor *input,
const int pad_top, const int pad_bottom,
const int pad_left, const int pad_right,
Tensor *output_tensor);
void ConstructNCHWInputWithPadding(const Tensor *input,
const int *paddings,
Tensor *output_tensor,
bool padding_same_value = false);
void ConstructNHWCInputWithPadding(const Tensor *input,
const int *paddings,
......
......@@ -14,6 +14,7 @@
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/kernels/activation.h"
#include "mace/public/mace.h"
namespace mace {
......@@ -407,12 +408,27 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase {
};
template <>
void DepthwiseConv2dFunctor<DeviceType::NEON, float>::operator()(
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output,
StatsFuture *future);
struct DepthwiseConv2dFunctor<DeviceType::NEON, float>
: DepthwiseConv2dFunctorBase {
DepthwiseConv2dFunctor(const int *strides,
const Padding padding_type,
const std::vector<int> &paddings,
const int *dilations,
const ActivationType activation,
const float relux_max_limit)
: DepthwiseConv2dFunctorBase(strides,
padding_type,
paddings,
dilations,
activation,
relux_max_limit) {}
void operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output,
StatsFuture *future);
};
template <typename T>
struct DepthwiseConv2dFunctor<DeviceType::OPENCL, T>
......
//
// Copyright (c) 2018 XiaoMi All rights reserved.
//
#include <math.h>
#include <string.h>
#include <algorithm>
#include "mace/kernels/gemm.h"
#include "mace/utils/utils.h"
#include "mace/utils/logging.h"
namespace mace {
namespace kernels {
void GemmRef(const float *A,
const float *B,
const index_t height,
const index_t K,
const index_t width,
float *C) {
memset(C, 0, sizeof(float) * height * width);
for (int i = 0; i < height; ++i) {
for (int j = 0; j < width; ++j) {
for (int k = 0; k < K; ++k) {
C[i * width + j] += A[i * K + k] * B[k * width + j];
}
}
}
}
namespace {
inline void GemmBlock(const float *A,
const float *B,
const index_t height,
const index_t K,
const index_t width,
const index_t stride_k,
const index_t stride_w,
float *C) {
for (int i = 0; i < height; ++i) {
for (int j = 0; j < width; ++j) {
for (int k = 0; k < K; ++k) {
C[i * stride_w + j] += A[i * stride_k + k] * B[k * stride_w + j];
}
}
}
}
// TODO(liyin): may need to implement an 8x8x3 kernel (883) for RGB
inline void Gemm884(const float *a_ptr,
const float *b_ptr,
index_t stride_w,
index_t stride_k,
float *c_ptr) {
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
float32x4_t a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14,
a15;
float32x4_t b0, b1, b2, b3, b4, b5, b6, b7;
float32x4_t c0, c1, c2, c3, c4, c5, c6, c7;
a0 = vld1q_f32(a_ptr);
a1 = vld1q_f32(a_ptr + 4);
a2 = vld1q_f32(a_ptr + 1 * stride_k);
a3 = vld1q_f32(a_ptr + 1 * stride_k + 4);
a4 = vld1q_f32(a_ptr + 2 * stride_k);
a5 = vld1q_f32(a_ptr + 2 * stride_k + 4);
a6 = vld1q_f32(a_ptr + 3 * stride_k);
a7 = vld1q_f32(a_ptr + 3 * stride_k + 4);
a8 = vld1q_f32(a_ptr + 4 * stride_k);
a9 = vld1q_f32(a_ptr + 4 * stride_k + 4);
a10 = vld1q_f32(a_ptr + 5 * stride_k);
a11 = vld1q_f32(a_ptr + 5 * stride_k + 4);
a12 = vld1q_f32(a_ptr + 6 * stride_k);
a13 = vld1q_f32(a_ptr + 6 * stride_k + 4);
a14 = vld1q_f32(a_ptr + 7 * stride_k);
a15 = vld1q_f32(a_ptr + 7 * stride_k + 4);
b0 = vld1q_f32(b_ptr);
b1 = vld1q_f32(b_ptr + 1 * stride_w);
b2 = vld1q_f32(b_ptr + 2 * stride_w);
b3 = vld1q_f32(b_ptr + 3 * stride_w);
b4 = vld1q_f32(b_ptr + 4 * stride_w);
b5 = vld1q_f32(b_ptr + 5 * stride_w);
b6 = vld1q_f32(b_ptr + 6 * stride_w);
b7 = vld1q_f32(b_ptr + 7 * stride_w);
c0 = vld1q_f32(c_ptr);
c1 = vld1q_f32(c_ptr + 1 * stride_w);
c2 = vld1q_f32(c_ptr + 2 * stride_w);
c3 = vld1q_f32(c_ptr + 3 * stride_w);
c4 = vld1q_f32(c_ptr + 4 * stride_w);
c5 = vld1q_f32(c_ptr + 5 * stride_w);
c6 = vld1q_f32(c_ptr + 6 * stride_w);
c7 = vld1q_f32(c_ptr + 7 * stride_w);
#define MACE_CONV_1x1_REG_CAL(RC, RA, RAN) \
c##RC = vfmaq_laneq_f32(c##RC, b0, a##RA, 0); \
c##RC = vfmaq_laneq_f32(c##RC, b1, a##RA, 1); \
c##RC = vfmaq_laneq_f32(c##RC, b2, a##RA, 2); \
c##RC = vfmaq_laneq_f32(c##RC, b3, a##RA, 3); \
c##RC = vfmaq_laneq_f32(c##RC, b4, a##RAN, 0); \
c##RC = vfmaq_laneq_f32(c##RC, b5, a##RAN, 1); \
c##RC = vfmaq_laneq_f32(c##RC, b6, a##RAN, 2); \
c##RC = vfmaq_laneq_f32(c##RC, b7, a##RAN, 3);
MACE_CONV_1x1_REG_CAL(0, 0, 1);
MACE_CONV_1x1_REG_CAL(1, 2, 3);
MACE_CONV_1x1_REG_CAL(2, 4, 5);
MACE_CONV_1x1_REG_CAL(3, 6, 7);
MACE_CONV_1x1_REG_CAL(4, 8, 9);
MACE_CONV_1x1_REG_CAL(5, 10, 11);
MACE_CONV_1x1_REG_CAL(6, 12, 13);
MACE_CONV_1x1_REG_CAL(7, 14, 15);
vst1q_f32(c_ptr, c0);
vst1q_f32(c_ptr + 1 * stride_w, c1);
vst1q_f32(c_ptr + 2 * stride_w, c2);
vst1q_f32(c_ptr + 3 * stride_w, c3);
vst1q_f32(c_ptr + 4 * stride_w, c4);
vst1q_f32(c_ptr + 5 * stride_w, c5);
vst1q_f32(c_ptr + 6 * stride_w, c6);
vst1q_f32(c_ptr + 7 * stride_w, c7);
#else
GemmBlock(a_ptr, b_ptr, 8, 8, 4, stride_k, stride_w, c_ptr);
#endif
}
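// Register budget of the 8x8x4 micro-kernel above: 16 vectors of A, 8 of B
// and 8 accumulators of C occupy all 32 of the 128-bit NEON registers on
// aarch64, which is why it is guarded by __aarch64__ (AArch32 has only 16).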
inline void GemmTile(const float *A,
const float *B,
const index_t height,
const index_t K,
const index_t width,
const index_t stride_k,
const index_t stride_w,
float *C) {
index_t h, w, k;
for (h = 0; h + 7 < height; h += 8) {
for (w = 0; w + 3 < width; w += 4) {
for (k = 0; k + 7 < K; k += 8) {
const float *a_ptr = A + (h * stride_k + k);
const float *b_ptr = B + (k * stride_w + w);
float *c_ptr = C + (h * stride_w + w);
Gemm884(a_ptr, b_ptr, stride_w, stride_k, c_ptr);
}
if (k < K) {
const float *a_ptr = A + (h * stride_k + k);
const float *b_ptr = B + (k * stride_w + w);
float *c_ptr = C + (h * stride_w + w);
GemmBlock(a_ptr, b_ptr, 8, K - k, 4, stride_k, stride_w, c_ptr);
}
}
if (w < width) {
const float *a_ptr = A + h * stride_k;
const float *b_ptr = B + w;
float *c_ptr = C + (h * stride_w + w);
GemmBlock(a_ptr,
b_ptr,
8,
K,
width - w,
stride_k,
stride_w,
c_ptr);
}
}
if (h < height) {
// TODO(liyin): may use Gemm444
const float *a_ptr = A + (h * stride_k);
const float *b_ptr = B;
float *c_ptr = C + h * stride_w;
GemmBlock(a_ptr,
b_ptr,
height - h,
K,
width,
stride_k,
stride_w,
c_ptr);
}
}
} // namespace
void Gemm(const float *A,
const float *B,
const index_t batch,
const index_t height,
const index_t K,
const index_t width,
float *C) {
memset(C, 0, sizeof(float) * batch * height * width);
  // It is better to use a large block size if it fits in the fast cache.
  // Assuming a 32KB L1 cache and three resident blocks (A, B, C), the block
  // size should be about sqrt(32768 / sizeof(float) / 3) ~= 52; 48 keeps it
  // a multiple of the 8x4 register tile.
const index_t block_size = 48;
const index_t block_tile_height = RoundUpDiv(height, block_size);
const index_t block_tile_width = RoundUpDiv(width, block_size);
const index_t block_tile_k = RoundUpDiv(K, block_size);
const index_t remain_height = height % block_size;
const index_t remain_width = width % block_size;
const index_t remain_k = K % block_size;
#pragma omp parallel for collapse(3)
for (index_t n = 0; n < batch; ++n) {
for (index_t bh = 0; bh < block_tile_height; ++bh) {
for (index_t bw = 0; bw < block_tile_width; ++bw) {
const float *a_base = A + n * height * K;
const float *b_base = B + n * K * width;
float *c_base = C + n * height * width;
const index_t ih_begin = bh * block_size;
const index_t ih_end =
bh * block_size + (bh == block_tile_height - 1 && remain_height > 0
? remain_height : block_size);
const index_t iw_begin = bw * block_size;
const index_t iw_end =
bw * block_size
+ (bw == block_tile_width - 1 && remain_width > 0 ? remain_width
: block_size);
for (index_t bk = 0; bk < block_tile_k; ++bk) {
const index_t ik_begin = bk * block_size;
const index_t ik_end =
bk * block_size
+ (bk == block_tile_k - 1 && remain_k > 0 ? remain_k
: block_size);
// inside block:
// calculate C[bh, bw] += A[bh, bk] * B[bk, bw] for one k
GemmTile(a_base + (ih_begin * K + ik_begin),
b_base + (ik_begin * width + iw_begin),
ih_end - ih_begin,
ik_end - ik_begin,
iw_end - iw_begin,
K,
width,
c_base + (ih_begin * width + iw_begin));
} // bk
} // bw
} // bh
} // n
}
} // namespace kernels
} // namespace mace
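// -- Illustrative usage (not part of the original source) -------------------
// Gemm() computes batched row-major C = A * B with A of shape
// [batch, height, K] and B of shape [batch, K, width]. A single-batch call:
//
//   std::vector<float> A(h * k), B(k * w), C(h * w);
//   mace::kernels::Gemm(A.data(), B.data(), 1, h, k, w, C.data());
//
// With block_size = 48, a 100x100x100 problem is split into
// RoundUpDiv(100, 48) = 3 tiles per dimension, the last tile covering the
// 100 % 48 = 4 remaining rows/columns.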
//
// Copyright (c) 2018 XiaoMi All rights reserved.
//
#ifndef MACE_KERNELS_GEMM_H_
#define MACE_KERNELS_GEMM_H_
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
#include <arm_neon.h>
#endif
#include "mace/core/types.h"
namespace mace {
namespace kernels {
void Gemm(const float *A,
const float *B,
const index_t batch,
const index_t height,
const index_t K,
const index_t width,
          float *C);
void GemmRef(const float *A,
             const float *B,
             const index_t height,
             const index_t K,
             const index_t width,
             float *C);
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_GEMM_H_
//
// Copyright (c) 2018 XiaoMi All rights reserved.
//
#include <gtest/gtest.h>
#include <algorithm>
#include <random>
#include "mace/kernels/gemm.h"
#include "mace/core/types.h"
namespace mace {
TEST(GEMMTest, gemm) {
index_t N = 17;
index_t M = 33;
index_t K = 64;
float *A = new float[N * K];
float *B = new float[K * M];
float *C = new float[N * M];
float *C_ref = new float[N * M];
std::random_device rd;
std::mt19937 gen(rd());
std::normal_distribution<float> nd(0, 1);
std::generate(A, A + N * K,
[&gen, &nd] {
return nd(gen);
});
std::generate(B, B + K * M,
[&gen, &nd] {
return nd(gen);
});
kernels::Gemm(A, B, 1, N, K, M, C);
kernels::GemmRef(A, B, N, K, M, C_ref);
for (int i = 0; i < N * M; ++i) {
EXPECT_NEAR(C_ref[i], C[i], 0.1);
}
delete[] A;
delete[] B;
delete[] C;
delete[] C_ref;
}
} // namespace mace
......@@ -16,142 +16,12 @@
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/kernels/gemm.h"
#include "mace/utils/utils.h"
namespace mace {
namespace kernels {
template<typename T,
int register_tile_size,
int h_count,
int w_count,
int k_count>
inline void MatMulKernelFunc(const T *A,
const T *B,
T *C,
index_t offset_h,
index_t offset_w,
index_t offset_k,
index_t stride_h,
index_t stride_w,
index_t stride_k) {
T a_tmp[register_tile_size][register_tile_size] = {0};
T b_tmp[register_tile_size][register_tile_size] = {0};
T c_tmp[register_tile_size][register_tile_size] = {0};
for (int h = 0; h < h_count; ++h) {
for (int k = 0; k < k_count; ++k) {
a_tmp[h][k] = A[(offset_h + h) * stride_k + (offset_k + k)];
}
}
for (int k = 0; k < k_count; ++k) {
for (int w = 0; w < w_count; ++w) {
b_tmp[k][w] = B[(offset_k + k) * stride_w + (offset_w + w)];
}
}
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
static_assert(register_tile_size == 4, "register tile size must be 4");
float32x4_t a_dup;
float32x4_t b_vec[4] =
{vld1q_f32(b_tmp[0]), vld1q_f32(b_tmp[1]), vld1q_f32(b_tmp[2]),
vld1q_f32(b_tmp[3])};
float32x4_t
c_vec[4] = {vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0)};
for (int h = 0; h < register_tile_size; ++h) {
for (int k = 0; k < register_tile_size; ++k) {
a_dup = vdupq_n_f32(a_tmp[h][k]);
c_vec[h] = vfmaq_f32(c_vec[h], a_dup, b_vec[k]);
}
}
for (int h = 0; h < register_tile_size; ++h) {
vst1q_f32(c_tmp[h], c_vec[h]);
}
#else
for (int h = 0; h < register_tile_size; ++h) {
for (int w = 0; w < register_tile_size; ++w) {
for (int k = 0; k < register_tile_size; ++k) {
c_tmp[h][w] += a_tmp[h][k] * b_tmp[k][w];
}
}
}
#endif
for (int h = 0; h < h_count; ++h) {
for (int w = 0; w < w_count; ++w) {
C[(offset_h + h) * stride_w + (offset_w + w)] += c_tmp[h][w];
}
}
}
#define MACE_DO_MATMUL(HC, WC, KC) \
MatMulKernelFunc<T, register_tile_size, HC, WC, KC>(a_ptr_batch_base, \
b_ptr_batch_base, \
c_ptr_batch_base, \
ih, \
iw, \
ik, \
height, \
width, \
K);
#define MACE_CASE_K_MATMUL(HC, WC) \
switch (k_count) { \
case 1: \
MACE_DO_MATMUL(HC, WC, 1); \
break; \
case 2: \
MACE_DO_MATMUL(HC, WC, 2); \
break; \
case 3: \
MACE_DO_MATMUL(HC, WC, 3); \
break; \
case 4: \
MACE_DO_MATMUL(HC, WC, 4); \
break; \
default: \
LOG(FATAL) << "Unsupported k tile: " << k_count; \
}
#define MACE_CASE_W_MATMUL(HC) \
switch (w_count) { \
case 1: \
MACE_CASE_K_MATMUL(HC, 1); \
break; \
case 2: \
MACE_CASE_K_MATMUL(HC, 2); \
break; \
case 3: \
MACE_CASE_K_MATMUL(HC, 3); \
break; \
case 4: \
MACE_CASE_K_MATMUL(HC, 4); \
break; \
default: \
LOG(FATAL) << "Unsupported w tile: " << w_count; \
}
#define MACE_CASE_H_MATMUL \
switch (h_count) { \
case 1: \
MACE_CASE_W_MATMUL(1); \
break; \
case 2: \
MACE_CASE_W_MATMUL(2); \
break; \
case 3: \
MACE_CASE_W_MATMUL(3); \
break; \
case 4: \
MACE_CASE_W_MATMUL(4); \
break; \
default: \
LOG(FATAL) << "Unsupported h tile: " << h_count; \
}
template<DeviceType D, typename T>
struct MatMulFunctor {
void operator()(const Tensor *A,
......@@ -185,51 +55,7 @@ struct MatMulFunctor {
constexpr index_t register_tile_size = 4;
memset(c_ptr_base, 0, batch * height * width * sizeof(T));
#pragma omp parallel for collapse(3)
for (index_t n = 0; n < batch; ++n) {
// handle block
for (index_t bh = 0; bh < block_tile_height; ++bh) {
for (index_t bw = 0; bw < block_tile_width; ++bw) {
const T *a_ptr_batch_base = a_ptr_base + n * height * K;
const T *b_ptr_batch_base = b_ptr_base + n * K * width;
T *c_ptr_batch_base = c_ptr_base + n * height * width;
const index_t ih_begin = bh * block_size;
const index_t ih_end =
bh * block_size + (bh == block_tile_height - 1 && remain_height > 0
? remain_height : block_size);
const index_t iw_begin = bw * block_size;
const index_t iw_end =
bw * block_size
+ (bw == block_tile_width - 1 && remain_width > 0 ? remain_width
: block_size);
for (index_t bk = 0; bk < block_tile_k; ++bk) {
const index_t ik_begin = bk * block_size;
const index_t ik_end =
bk * block_size
+ (bk == block_tile_k - 1 && remain_k > 0 ? remain_k
: block_size);
// inside block:
// calculate C[bh, bw] += A[bh, bk] * B[bk, bw] for one k
for (index_t ih = ih_begin; ih < ih_end;
ih += register_tile_size) {
for (index_t iw = iw_begin; iw < iw_end;
iw += register_tile_size) {
for (index_t ik = ik_begin; ik < ik_end;
ik += register_tile_size) {
const int h_count = std::min(register_tile_size, ih_end - ih);
const int w_count = std::min(register_tile_size, iw_end - iw);
const int k_count = std::min(register_tile_size, ik_end - ik);
MACE_CASE_H_MATMUL;
} // ik
} // iw
} // ih
} // bk
} // bw
} // bh
} // n
Gemm(a_ptr_base, b_ptr_base, batch, height, K, width, c_ptr_base);
}
};
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include <arm_neon.h>
#include <float.h>
#include <limits>
namespace mace {
namespace kernels {
void PoolingAvgNeonK2x2S2x2(const float *input,
const index_t *in_shape,
float *output,
const index_t *out_shape,
const int *paddings) {
index_t batch = in_shape[0];
index_t channels = in_shape[1];
index_t in_height = in_shape[2];
index_t in_width = in_shape[3];
index_t out_height = out_shape[2];
index_t out_width = out_shape[3];
int padding_top = paddings[0] / 2;
int padding_bottom = paddings[0] - padding_top;
int padding_left = paddings[1] / 2;
int padding_right = paddings[1] - padding_left;
int in_image_size = in_height * in_width;
int out_image_size = out_height * out_width;
float avg_factors[4] = {0.25, 0.25, 0.25, 0.25};
#pragma omp parallel for collapse(2)
for (int b = 0; b < batch; ++b) {
for (int c = 0; c < channels; ++c) {
      const index_t input_offset = (b * channels + c) * in_image_size;
      const index_t output_offset = (b * channels + c) * out_image_size;
      float *outptr = output + output_offset;
const float *r0, *r1;
for (int h = 0; h < out_height; ++h) {
int w = 0;
int num_vectors = 0;
if (!((h == 0 && padding_top > 0) ||
(h == out_height - 1 && padding_bottom > 0))) {
r0 = input + input_offset + (h * 2 - padding_top) * in_width;
r1 = r0 + in_width;
if (padding_left > 0) {
*outptr = (r0[0] + r1[0]) * 0.25;
++r0;
++r1;
++outptr;
++w;
}
if (padding_right > 0) {
num_vectors = (out_width - w - 1) >> 2;
} else {
num_vectors = (out_width - w) >> 2;
}
}
w += num_vectors << 2;
float32x4_t factors = vld1q_f32(avg_factors);
for (; num_vectors > 0; --num_vectors) {
float32x4_t r00 = vld1q_f32(r0);
float32x4_t r10 = vld1q_f32(r1);
float32x4_t r01 = vld1q_f32(r0 + 4);
float32x4_t r11 = vld1q_f32(r1 + 4);
float32x4_t sum0 = vaddq_f32(r00, r10);
float32x4_t sum1 = vaddq_f32(r01, r11);
float32x4_t sum_result = vpaddq_f32(sum0, sum1);
float32x4_t avg_result = vmulq_f32(sum_result, factors);
vst1q_f32(outptr, avg_result);
r0 += 8;
r1 += 8;
outptr += 4;
}
for (; w < out_width; ++w) {
float sum = 0.0;
for (int kh = 0; kh < 2; ++kh) {
for (int kw = 0; kw < 2; ++kw) {
int inh = h * 2 - padding_top + kh;
int inw = w * 2 - padding_left + kw;
if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) {
sum += input[input_offset + inh * in_width + inw];
}
}
}
*outptr = sum * 0.25;
++outptr;
}
}
}
}
}
// assume the input has already been padded
void PoolingAvgNeonK2x2S2x2Padded(const float *input,
const index_t *in_shape,
float *output,
const index_t *out_shape) {
index_t batch = in_shape[0];
index_t channels = in_shape[1];
index_t in_height = in_shape[2];
index_t in_width = in_shape[3];
index_t out_height = out_shape[2];
index_t out_width = out_shape[3];
int in_image_size = in_height * in_width;
int out_image_size = out_height * out_width;
float avg_factors[4] = {0.25, 0.25, 0.25, 0.25};
#pragma omp parallel for collapse(2)
for (int b = 0; b < batch; ++b) {
for (int c = 0; c < channels; ++c) {
      const float *img0 = input + (b * channels + c) * in_image_size;
      float *outptr = output + (b * channels + c) * out_image_size;
const float *r0 = img0;
const float *r1 = img0 + in_width;
for (int h = 0; h < out_height; ++h) {
int num_vectors = out_width >> 2;
int remain = out_width - (num_vectors << 2);
float32x4_t factors = vld1q_f32(avg_factors);
for (; num_vectors > 0; --num_vectors) {
float32x4_t r00 = vld1q_f32(r0);
float32x4_t r10 = vld1q_f32(r1);
float32x4_t r01 = vld1q_f32(r0 + 4);
float32x4_t r11 = vld1q_f32(r1 + 4);
float32x4_t sum0 = vaddq_f32(r00, r10);
float32x4_t sum1 = vaddq_f32(r01, r11);
float32x4_t sum_result = vpaddq_f32(sum0, sum1);
float32x4_t avg_result = vmulq_f32(sum_result, factors);
vst1q_f32(outptr, avg_result);
r0 += 8;
r1 += 8;
outptr += 4;
}
for (; remain > 0; --remain) {
*outptr = (r0[0] + r0[1] + r1[0] + r1[1]) * 0.25;
r0 += 2;
r1 += 2;
outptr++;
}
r0 += in_width;
r1 += in_width;
}
}
}
}
} // namespace kernels
} // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/kernels/batch_norm.h"
#include <arm_neon.h>
#include <cmath>
namespace mace {
namespace kernels {
template <>
void BatchNormFunctor<DeviceType::NEON, float>::operator()(
const Tensor *input,
const Tensor *scale,
const Tensor *offset,
const Tensor *mean,
const Tensor *var,
const float epsilon,
Tensor *output,
StatsFuture *future) {
  // Batch normalization in the paper https://arxiv.org/abs/1502.03167 .
  // The calculation formula for inference is
  //   Y = scale / sqrt(var + epsilon) * X
  //       + (offset - scale * mean / sqrt(var + epsilon))
  // which folds into
  //   new_scale  = scale / sqrt(var + epsilon)
  //   new_offset = offset - mean * new_scale
  //   Y = new_scale * X + new_offset
const index_t n = input->dim(0);
const index_t sample_size = input->dim(1) * input->dim(2);
const index_t channel = input->dim(3);
const float *input_ptr = input->data<float>();
const float *scale_ptr = scale->data<float>();
const float *offset_ptr = offset->data<float>();
const float *mean_ptr = mean->data<float>();
const float *var_ptr = var->data<float>();
float *output_ptr = output->mutable_data<float>();
const index_t ch_blks = channel >> 2;
const index_t remain_chs = channel - (ch_blks << 2);
std::vector<float> new_scale(channel);
std::vector<float> new_offset(channel);
#pragma omp parallel for
for (index_t c = 0; c < channel; ++c) {
new_scale[c] = scale_ptr[c] / std::sqrt(var_ptr[c] + epsilon);
new_offset[c] = offset_ptr[c] - mean_ptr[c] * new_scale[c];
}
#pragma omp parallel for collapse(2)
for (index_t i = 0; i < n; ++i) {
for (index_t j = 0; j < sample_size; ++j) {
const float *input_sample_ptr =
input_ptr + (i * sample_size + j) * channel;
float *output_sample_ptr = output_ptr + (i * sample_size + j) * channel;
const float *new_scale_ptr = new_scale.data();
const float *new_offset_ptr = new_offset.data();
for (index_t cb = 0; cb < ch_blks; ++cb) {
float32x4_t new_scale_f = vld1q_f32(new_scale_ptr);
float32x4_t new_offset_f = vld1q_f32(new_offset_ptr);
float32x4_t input_f = vld1q_f32(input_sample_ptr);
float32x4_t output_f = vfmaq_f32(new_offset_f, input_f, new_scale_f);
vst1q_f32(output_sample_ptr, output_f);
input_sample_ptr += 4;
output_sample_ptr += 4;
new_scale_ptr += 4;
new_offset_ptr += 4;
}
for (index_t c = (ch_blks << 2); c < channel; ++c) {
*output_sample_ptr = new_scale[c] * *input_sample_ptr + new_offset[c];
++output_sample_ptr;
++input_sample_ptr;
++new_scale_ptr;
++new_offset_ptr;
}
}
}
}
} // namespace kernels
} // namespace mace
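// -- Illustrative sketch (not part of the original source) ------------------
// The folding above in scalar form; BatchNormFoldedDemo is a hypothetical
// helper showing that inference batch norm reduces to one multiply-add per
// element, which is what vfmaq_f32 computes per lane in the kernel.
#include <math.h>
static inline float BatchNormFoldedDemo(float x, float scale, float offset,
                                        float mean, float var, float eps) {
  const float new_scale = scale / sqrtf(var + eps);    // fold 1/sqrt(var+eps)
  const float new_offset = offset - mean * new_scale;  // fold the mean shift
  return new_scale * x + new_offset;
}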
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/conv_pool_2d_util.h"
namespace mace {
namespace kernels {
extern void Conv2dNeonK1x1S1(const float *input,
const index_t *input_shape,
const float *filter,
const index_t *filter_shape,
const float *bias,
float *output,
const index_t *output_shape);
extern void Conv2dNeonK3x3S1(const float *input,
const index_t *input_shape,
const float *filter,
const index_t *filter_shape,
const float *bias,
float *output,
const index_t *output_shape);
extern void Conv2dNeonK3x3S2(const float *input,
const index_t *input_shape,
const float *filter,
const index_t *filter_shape,
const float *bias,
float *output,
const index_t *output_shape);
extern void Conv2dNeonK5x5S1(const float *input,
const index_t *input_shape,
const float *filter,
const index_t *filter_shape,
const float *bias,
float *output,
const index_t *output_shape);
template <>
void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
MACE_CHECK_NOTNULL(input);
MACE_CHECK_NOTNULL(filter);
MACE_CHECK_NOTNULL(output);
std::vector<index_t> output_shape_vec(4);
std::vector<int> paddings(2);
kernels::CalcPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), dilations_, strides_,
paddings_, output_shape_vec.data(), paddings.data());
output->Resize(output_shape_vec);
typedef void (*Conv2dNeonFunction)(
const float *input, const index_t *input_shape, const float *filter,
const index_t *filter_shape, const float *bias, float *output,
const index_t *output_shape);
// Selection matrix: kernel_size x stride_size
static const Conv2dNeonFunction selector[5][2] = {
{Conv2dNeonK1x1S1, nullptr},
{nullptr, nullptr},
{Conv2dNeonK3x3S1, Conv2dNeonK3x3S2},
{nullptr, nullptr},
{Conv2dNeonK5x5S1, nullptr}};
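  // Dispatch indexes selector[kernel_h - 1][strides_[0] - 1]; shapes that
  // hit a nullptr entry fall back to the generic CPU functor below.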
  // nullptr entries are not implemented yet
index_t kernel_h = filter->dim(2);
index_t kernel_w = filter->dim(3);
if (kernel_h != kernel_w || kernel_h > 5 || strides_[0] != strides_[1] ||
strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1 ||
selector[kernel_h - 1][strides_[0] - 1] == nullptr) {
LOG(WARNING) << "NEON conv2d kernel with "
<< "filter" << kernel_h << "x" << kernel_w << ","
<< " stride " << strides_[0] << "x" << strides_[1]
<< " is not implemented yet, using slow version";
Conv2dFunctor<DeviceType::CPU, float>(strides_, paddings_, dilations_)(
input, filter, bias, output, future);
return;
}
Tensor padded_input;
// Keep this alive during kernel execution
if (paddings[0] > 0 || paddings[1] > 0) {
ConstructInputWithPadding(input, paddings.data(), &padded_input);
input = &padded_input;
}
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard bias_mapper(bias);
Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>();
auto input_shape = input->shape().data();
auto filter_data = filter->data<float>();
auto bias_data = bias == nullptr ? nullptr : bias->data<float>();
auto output_data = output->mutable_data<float>();
auto output_shape = output->shape().data();
auto conv2d_neon_func = selector[kernel_h - 1][strides_[0] - 1];
conv2d_neon_func(input_data, input_shape, filter_data, nullptr, bias_data,
output_data, output_shape);
}
} // namespace kernels
} // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include <arm_neon.h>
#include <algorithm>
#include "mace/utils/utils.h"
namespace mace {
namespace kernels {
static constexpr index_t kInputChannelBlockSize = 2;
static constexpr index_t kOutputChannelBlockSize = 4;
static __attribute__((__aligned__(64)))
int32_t mask_array[8] = {0, 0, 0, 0, -1, -1, -1, -1};
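// Loading from &mask_array[r] (r = remaining pixels, 1-3) yields a lane mask
// whose leading 4 - r lanes are zero; the remainder paths below re-read the
// last four elements through this mask so already-stored lanes accumulate
// zero instead of being processed twice.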
static inline void NeonConv2x4Kernel(index_t input_channels,
index_t pixel_size,
const float *input,
const float *filter,
float *output) {
const float *input0 = input;
const float *input1 = input + pixel_size;
const float32x2_t vfilter0x = vld1_f32(filter);
filter += input_channels;
const float32x2_t vfilter1x = vld1_f32(filter);
filter += input_channels;
const float32x2_t vfilter2x = vld1_f32(filter);
filter += input_channels;
const float32x2_t vfilter3x = vld1_f32(filter);
float *output0 = output;
float *output1 = output0 + pixel_size;
float *output2 = output1 + pixel_size;
float *output3 = output2 + pixel_size;
while (pixel_size >= 4) {
float32x4_t voutput0 = vld1q_f32(output0);
float32x4_t voutput1 = vld1q_f32(output1);
float32x4_t voutput2 = vld1q_f32(output2);
float32x4_t voutput3 = vld1q_f32(output3);
const float32x4_t vinput0 = vld1q_f32(input0);
input0 += 4;
voutput0 = vfmaq_lane_f32(voutput0, vinput0, vfilter0x, 0);
voutput1 = vfmaq_lane_f32(voutput1, vinput0, vfilter1x, 0);
voutput2 = vfmaq_lane_f32(voutput2, vinput0, vfilter2x, 0);
voutput3 = vfmaq_lane_f32(voutput3, vinput0, vfilter3x, 0);
const float32x4_t vinput1 = vld1q_f32(input1);
input1 += 4;
voutput0 = vfmaq_lane_f32(voutput0, vinput1, vfilter0x, 1);
voutput1 = vfmaq_lane_f32(voutput1, vinput1, vfilter1x, 1);
voutput2 = vfmaq_lane_f32(voutput2, vinput1, vfilter2x, 1);
voutput3 = vfmaq_lane_f32(voutput3, vinput1, vfilter3x, 1);
vst1q_f32(output0, voutput0);
output0 += 4;
vst1q_f32(output1, voutput1);
output1 += 4;
vst1q_f32(output2, voutput2);
output2 += 4;
vst1q_f32(output3, voutput3);
output3 += 4;
pixel_size -= 4;
}
if (pixel_size != 0) {
const int32x4_t vmask = vld1q_s32(&mask_array[pixel_size]);
output0 = output0 + pixel_size - 4;
float32x4_t voutput0 = vld1q_f32(output0);
output1 = output1 + pixel_size - 4;
float32x4_t voutput1 = vld1q_f32(output1);
output2 = output2 + pixel_size - 4;
float32x4_t voutput2 = vld1q_f32(output2);
output3 = output3 + pixel_size - 4;
float32x4_t voutput3 = vld1q_f32(output3);
const float32x4_t vinput0 = vreinterpretq_f32_s32(vandq_s32(
vmask, vreinterpretq_s32_f32(vld1q_f32(&input0[pixel_size - 4]))));
voutput0 = vfmaq_lane_f32(voutput0, vinput0, vfilter0x, 0);
voutput1 = vfmaq_lane_f32(voutput1, vinput0, vfilter1x, 0);
voutput2 = vfmaq_lane_f32(voutput2, vinput0, vfilter2x, 0);
voutput3 = vfmaq_lane_f32(voutput3, vinput0, vfilter3x, 0);
const float32x4_t vinput1 = vreinterpretq_f32_s32(vandq_s32(
vmask, vreinterpretq_s32_f32(vld1q_f32(&input1[pixel_size - 4]))));
voutput0 = vfmaq_lane_f32(voutput0, vinput1, vfilter0x, 1);
voutput1 = vfmaq_lane_f32(voutput1, vinput1, vfilter1x, 1);
voutput2 = vfmaq_lane_f32(voutput2, vinput1, vfilter2x, 1);
voutput3 = vfmaq_lane_f32(voutput3, vinput1, vfilter3x, 1);
vst1q_f32(output0, voutput0);
vst1q_f32(output1, voutput1);
vst1q_f32(output2, voutput2);
vst1q_f32(output3, voutput3);
}
}
static inline void NeonConv2x4SubBlockKernel(
index_t input_channels_subblock_size,
index_t output_channels_subblock_size,
index_t input_channels,
index_t pixel_size,
const float *input,
const float *filter,
float *output) {
const float *input0 = input;
const float *input1 = input + pixel_size;
float32x2_t vfilter0x, vfilter1x, vfilter2x, vfilter3x;
vfilter0x = vld1_dup_f32(&filter[0]);
if (input_channels_subblock_size > 1) {
vfilter0x = vld1_lane_f32(&filter[1], vfilter0x, 1);
}
if (output_channels_subblock_size > 1) {
filter += input_channels;
vfilter1x = vld1_dup_f32(&filter[0]);
if (input_channels_subblock_size > 1) {
vfilter1x = vld1_lane_f32(&filter[1], vfilter1x, 1);
}
if (output_channels_subblock_size > 2) {
filter += input_channels;
vfilter2x = vld1_dup_f32(&filter[0]);
if (input_channels_subblock_size > 1) {
vfilter2x = vld1_lane_f32(&filter[1], vfilter2x, 1);
}
if (output_channels_subblock_size > 3) {
filter += input_channels;
vfilter3x = vld1_dup_f32(&filter[0]);
if (input_channels_subblock_size > 1) {
vfilter3x = vld1_lane_f32(&filter[1], vfilter3x, 1);
}
}
}
}
float *output0 = output;
float *output1 = output0 + pixel_size;
float *output2 = output1 + pixel_size;
float *output3 = output2 + pixel_size;
while (pixel_size >= 4) {
float32x4_t voutput0, voutput1, voutput2, voutput3;
voutput0 = vld1q_f32(output0);
if (output_channels_subblock_size > 1) {
voutput1 = vld1q_f32(output1);
if (output_channels_subblock_size > 2) {
voutput2 = vld1q_f32(output2);
if (output_channels_subblock_size > 3) {
voutput3 = vld1q_f32(output3);
}
}
}
const float32x4_t vinput0 = vld1q_f32(input0);
input0 += 4;
voutput0 = vfmaq_lane_f32(voutput0, vinput0, vfilter0x, 0);
voutput1 = vfmaq_lane_f32(voutput1, vinput0, vfilter1x, 0);
voutput2 = vfmaq_lane_f32(voutput2, vinput0, vfilter2x, 0);
voutput3 = vfmaq_lane_f32(voutput3, vinput0, vfilter3x, 0);
if (input_channels_subblock_size > 1) {
const float32x4_t vinput1 = vld1q_f32(input1);
input1 += 4;
voutput0 = vfmaq_lane_f32(voutput0, vinput1, vfilter0x, 1);
voutput1 = vfmaq_lane_f32(voutput1, vinput1, vfilter1x, 1);
voutput2 = vfmaq_lane_f32(voutput2, vinput1, vfilter2x, 1);
voutput3 = vfmaq_lane_f32(voutput3, vinput1, vfilter3x, 1);
}
vst1q_f32(output0, voutput0);
output0 += 4;
if (output_channels_subblock_size > 1) {
vst1q_f32(output1, voutput1);
output1 += 4;
if (output_channels_subblock_size > 2) {
vst1q_f32(output2, voutput2);
output2 += 4;
if (output_channels_subblock_size > 3) {
vst1q_f32(output3, voutput3);
output3 += 4;
}
}
}
pixel_size -= 4;
}
if (pixel_size != 0) {
const int32x4_t vmask = vld1q_s32(&mask_array[pixel_size]);
float32x4_t voutput0, voutput1, voutput2, voutput3;
output0 += pixel_size - 4;
voutput0 = vld1q_f32(output0);
if (output_channels_subblock_size > 1) {
output1 += pixel_size - 4;
voutput1 = vld1q_f32(output1);
if (output_channels_subblock_size > 2) {
output2 += pixel_size - 4;
voutput2 = vld1q_f32(output2);
if (output_channels_subblock_size > 3) {
output3 += pixel_size - 4;
voutput3 = vld1q_f32(output3);
}
}
}
const float32x4_t vinput0 = vreinterpretq_f32_s32(vandq_s32(
vmask, vreinterpretq_s32_f32(vld1q_f32(&input0[pixel_size - 4]))));
voutput0 = vfmaq_lane_f32(voutput0, vinput0, vfilter0x, 0);
voutput1 = vfmaq_lane_f32(voutput1, vinput0, vfilter1x, 0);
voutput2 = vfmaq_lane_f32(voutput2, vinput0, vfilter2x, 0);
voutput3 = vfmaq_lane_f32(voutput3, vinput0, vfilter3x, 0);
if (input_channels_subblock_size > 1) {
const float32x4_t vinput1 = vreinterpretq_f32_s32(vandq_s32(
vmask, vreinterpretq_s32_f32(vld1q_f32(&input1[pixel_size - 4]))));
voutput0 = vfmaq_lane_f32(voutput0, vinput1, vfilter0x, 1);
voutput1 = vfmaq_lane_f32(voutput1, vinput1, vfilter1x, 1);
voutput2 = vfmaq_lane_f32(voutput2, vinput1, vfilter2x, 1);
voutput3 = vfmaq_lane_f32(voutput3, vinput1, vfilter3x, 1);
}
vst1q_f32(output0, voutput0);
if (output_channels_subblock_size > 1) {
vst1q_f32(output1, voutput1);
if (output_channels_subblock_size > 2) {
vst1q_f32(output2, voutput2);
if (output_channels_subblock_size > 3) {
vst1q_f32(output3, voutput3);
}
}
}
}
}
void Conv2dNeonK1x1S1(const float *input, // NCHW
const index_t *input_shape,
const float *filter, // c_out, c_in, filter_h, filter_w
const index_t *filter_shape,
const float *bias, // c_out
float *output, // NCHW
const index_t *output_shape) {
const index_t batch = output_shape[0];
const index_t channels = output_shape[1];
const index_t height = output_shape[2];
const index_t width = output_shape[3];
const index_t input_batch = input_shape[0];
const index_t input_channels = input_shape[1];
const index_t input_height = input_shape[2];
const index_t input_width = input_shape[3];
MACE_CHECK(input_batch == batch && input_height == height &&
input_width == width);
const index_t total_pixels = height * width;
const index_t round_up_channels = RoundUp(channels, kOutputChannelBlockSize);
#pragma omp parallel for collapse(2)
for (index_t n = 0; n < batch; ++n) {
for (int i = 0; i < channels; ++i) {
float *output_ptr_base =
output + n * channels * total_pixels + i * total_pixels;
std::fill(output_ptr_base, output_ptr_base + total_pixels,
bias ? bias[i] : 0);
}
}
#pragma omp parallel for collapse(2)
for (index_t n = 0; n < batch; ++n) {
for (index_t c = 0; c < round_up_channels; c += kOutputChannelBlockSize) {
const float *input_ptr = input + n * input_channels * total_pixels;
const float *filter_ptr = filter + c * input_channels;
float *output_ptr =
output + n * channels * total_pixels + c * total_pixels;
const index_t output_channel_block_size =
std::min(channels - c, kOutputChannelBlockSize);
index_t remain_input_channels = input_channels;
if (c + kOutputChannelBlockSize <= channels) {
while (remain_input_channels >= kInputChannelBlockSize) {
NeonConv2x4Kernel(input_channels, total_pixels, input_ptr, filter_ptr,
output_ptr);
input_ptr += kInputChannelBlockSize * total_pixels;
filter_ptr += kInputChannelBlockSize;
remain_input_channels -= kInputChannelBlockSize;
}
}
while (remain_input_channels != 0) {
const index_t input_channel_block_size =
std::min(remain_input_channels, kInputChannelBlockSize);
NeonConv2x4SubBlockKernel(
input_channel_block_size, output_channel_block_size, input_channels,
total_pixels, input_ptr, filter_ptr, output_ptr);
input_ptr += kInputChannelBlockSize * total_pixels;
filter_ptr += kInputChannelBlockSize;
remain_input_channels -= input_channel_block_size;
}
}
}
}
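// Note (illustrative): a 1x1 stride-1 convolution on NCHW data is exactly
// the matrix product of the [c_out, c_in] filter with the [c_in, H*W] input
// image. NeonConv2x4Kernel above is a 2-input x 4-output-channel register
// tile of that product, and NeonConv2x4SubBlockKernel handles the ragged
// edges where fewer than 2 input or 4 output channels remain.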
void Conv2dNeonPixelK1x1S1(
const float *input, // NCHW
const index_t *input_shape,
const float *filter, // c_out, c_in, kernel_h, kernel_w
const index_t *filter_shape,
const float *bias, // c_out
float *output, // NCHW
const index_t *output_shape) {
const index_t batch = output_shape[0];
const index_t channels = output_shape[1];
const index_t height = output_shape[2];
const index_t width = output_shape[3];
const index_t input_batch = input_shape[0];
const index_t input_channels = input_shape[1];
const index_t input_height = input_shape[2];
const index_t input_width = input_shape[3];
MACE_CHECK(input_batch == batch && input_height == height &&
input_width == width);
const index_t total_pixels = height * width;
// Process 4 * 2 = 8 pixels for each innermost loop
  // TODO(heliangliang): Does a 64-bit vs. 32-bit index matter? Needs a
  // benchmark.
const index_t total_loops = total_pixels >> 3;
const index_t loop_remaining = total_pixels & 7;
#pragma omp parallel for collapse(2)
for (index_t n = 0; n < batch; ++n) {
for (index_t c = 0; c < channels; ++c) {
const float *filter_ptr = filter + c * input_channels;
// TODO(heliangliang): Will GCC opt these out?
float *channel_output_start =
output + n * channels * height * width + c * height * width;
const float *input_ptr =
input + n * input_channels * input_height * input_width;
// Fill with bias
float *output_ptr = channel_output_start;
std::fill(output_ptr, output_ptr + total_pixels, bias ? bias[c] : 0);
index_t inc = 0;
// Process 4 input channels in batch
for (; inc + 3 < input_channels; inc += 4) {
float *output_ptr = channel_output_start;
        // The beginning of each input feature map channel
MACE_ASSERT(input_ptr ==
input + n * input_channels * input_height * input_width +
inc * input_height * input_width);
const float *input_ptr1 = input_ptr + total_pixels;
const float *input_ptr2 = input_ptr1 + total_pixels;
const float *input_ptr3 = input_ptr2 + total_pixels;
// filter is in c_out, c_in, 1, 1 order
MACE_ASSERT(filter_ptr == filter + c * input_channels + inc);
const float k0 = filter_ptr[0];
const float k1 = filter_ptr[1];
const float k2 = filter_ptr[2];
const float k3 = filter_ptr[3];
filter_ptr += 4;
const float32x4_t vk0 = vdupq_n_f32(k0);
const float32x4_t vk1 = vdupq_n_f32(k1);
const float32x4_t vk2 = vdupq_n_f32(k2);
const float32x4_t vk3 = vdupq_n_f32(k3);
index_t loop_itr = total_loops;
for (; loop_itr > 0; --loop_itr) {
// Process 2 group of 4 floats
float32x4_t out0 = vld1q_f32(output_ptr);
float32x4_t out4 = vld1q_f32(output_ptr + 4);
const float32x4_t in00 = vld1q_f32(input_ptr);
const float32x4_t in04 = vld1q_f32(input_ptr + 4);
out0 = vfmaq_f32(out0, in00, vk0);
out4 = vfmaq_f32(out4, in04, vk0);
const float32x4_t in10 = vld1q_f32(input_ptr1);
const float32x4_t in14 = vld1q_f32(input_ptr1 + 4);
out0 = vfmaq_f32(out0, in10, vk1);
out4 = vfmaq_f32(out4, in14, vk1);
const float32x4_t in20 = vld1q_f32(input_ptr2);
const float32x4_t in24 = vld1q_f32(input_ptr2 + 4);
out0 = vfmaq_f32(out0, in20, vk2);
out4 = vfmaq_f32(out4, in24, vk2);
const float32x4_t in30 = vld1q_f32(input_ptr3);
const float32x4_t in34 = vld1q_f32(input_ptr3 + 4);
out0 = vfmaq_f32(out0, in30, vk3);
out4 = vfmaq_f32(out4, in34, vk3);
// Save output
vst1q_f32(output_ptr, out0);
vst1q_f32(output_ptr + 4, out4);
output_ptr += 8;
input_ptr += 8;
input_ptr1 += 8;
input_ptr2 += 8;
input_ptr3 += 8;
}
// Process the remaining pixels
index_t remaining_pixels = loop_remaining;
for (; remaining_pixels > 0; --remaining_pixels) {
const float mul = *input_ptr * k0;
const float mul1 = *input_ptr1 * k1;
const float mul2 = *input_ptr2 * k2;
const float mul3 = *input_ptr3 * k3;
*output_ptr += mul + mul1 + mul2 + mul3;
++output_ptr;
++input_ptr;
++input_ptr1;
++input_ptr2;
++input_ptr3;
}
// Skip these 4 feature maps
input_ptr += 3 * total_pixels;
}
// Process the remaining channels
for (; inc < input_channels; ++inc) {
float *output_ptr = channel_output_start;
MACE_ASSERT(input_ptr ==
input + n * input_channels * input_height * input_width +
inc * input_height * input_width);
MACE_ASSERT(filter_ptr == filter + c * input_channels + inc);
const float k0 = filter_ptr[0];
++filter_ptr;
const float32x4_t vk0 = vdupq_n_f32(k0);
index_t loop_itr = total_loops;
for (; loop_itr > 0; --loop_itr) {
float32x4_t out0 = vld1q_f32(output_ptr);
float32x4_t out4 = vld1q_f32(output_ptr + 4);
const float32x4_t in0 = vld1q_f32(input_ptr);
const float32x4_t in4 = vld1q_f32(input_ptr + 4);
out0 = vfmaq_f32(out0, in0, vk0);
out4 = vfmaq_f32(out4, in4, vk0);
// Save output
vst1q_f32(output_ptr, out0);
vst1q_f32(output_ptr + 4, out4);
output_ptr += 8;
input_ptr += 8;
}
// Process the remaining pixels
index_t remaining_pixels = loop_remaining;
for (; remaining_pixels > 0; --remaining_pixels) {
const float mul = *input_ptr * k0;
*output_ptr += mul;
++output_ptr;
++input_ptr;
}
}
}
}
}
} // namespace kernels
} // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include <arm_neon.h>
namespace mace {
namespace kernels {
static const int kRegisterSize = 4;
static const int kFilterSize = 9;
void Conv2dNeonK3x3S1(const float *input, // NCHW
const index_t *input_shape,
const float *filter, // c_out, c_in, kernel_h, kernel_w
const index_t *filter_shape,
const float *bias, // c_out
float *output, // NCHW
const index_t *output_shape) {
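  // Round the output height down to an even count: the main loop emits two
  // output rows per iteration, and an odd final row is handled afterwards.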
int height_count = (output_shape[2] >> 1) << 1;
int output_batch = output_shape[0];
int output_channels = output_shape[1];
int output_height = output_shape[2];
int output_width = output_shape[3];
int input_batch = input_shape[0];
int input_channels = input_shape[1];
int input_height = input_shape[2];
int input_width = input_shape[3];
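  // filter_shape == nullptr selects the dense conv path (each output channel
  // reads every input channel); otherwise this is the depthwise path, where
  // output channel oc reads only input channel oc / multiplier.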
int multiplier = filter_shape == nullptr ? 0 : filter_shape[0];
int filter_in_channels = filter_shape == nullptr ? input_channels : 1;
#pragma omp parallel for collapse(2)
for (int b = 0; b < output_batch; ++b) {
for (int oc = 0; oc < output_channels; ++oc) {
float *output_ptr_base =
output + b * output_channels * output_height * output_width;
const float *filter_ptr = filter + oc * filter_in_channels * kFilterSize;
const float *input_ptr =
input + b * input_channels * input_height * input_width;
if (filter_shape != nullptr) {
input_ptr += (oc / multiplier) * input_height * input_width;
}
float *output_ptr = output_ptr_base + oc * output_height * output_width;
std::fill(output_ptr, output_ptr + output_height * output_width,
bias ? bias[oc] : 0);
for (int ic = 0; ic < filter_in_channels; ++ic) {
float32x4_t n_filter_v[3] = {vld1q_f32(filter_ptr),
vld1q_f32(filter_ptr + 3),
vld1q_f32(filter_ptr + 6)};
const float *row_ptr_v[kRegisterSize] = {
input_ptr, input_ptr + input_width, input_ptr + 2 * input_width,
input_ptr + 3 * input_width};
float *output_ptr_v[] = {output_ptr, output_ptr + output_width};
for (int h = 0; h < height_count; h += 2) {
int count = output_width >> 2;
int remain_count = output_width & 3;
for (; count > 0; --count) {
float32x4_t n_sum0 = vdupq_n_f32(.0f);
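            // One 8-pixel load pair plus two vextq shifts yields the three
            // horizontally shifted windows needed by a 3-tap filter row.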
float32x4_t n_row_former = vld1q_f32(row_ptr_v[0]);
float32x4_t n_row_latter = vld1q_f32(row_ptr_v[0] + kRegisterSize);
float32x4_t n_row_ext0 = vextq_f32(n_row_former, n_row_latter, 1);
float32x4_t n_row_ext1 = vextq_f32(n_row_former, n_row_latter, 2);
n_sum0 = vfmaq_laneq_f32(n_sum0, n_row_former, n_filter_v[0], 0);
n_sum0 = vfmaq_laneq_f32(n_sum0, n_row_ext0, n_filter_v[0], 1);
n_sum0 = vfmaq_laneq_f32(n_sum0, n_row_ext1, n_filter_v[0], 2);
float32x4_t n_row1_former = vld1q_f32(row_ptr_v[1]);
float32x4_t n_row1_latter = vld1q_f32(row_ptr_v[1] + kRegisterSize);
float32x4_t n_row1_ext0 =
vextq_f32(n_row1_former, n_row1_latter, 1);
float32x4_t n_row1_ext1 =
vextq_f32(n_row1_former, n_row1_latter, 2);
n_sum0 = vfmaq_laneq_f32(n_sum0, n_row1_former, n_filter_v[1], 0);
n_sum0 = vfmaq_laneq_f32(n_sum0, n_row1_ext0, n_filter_v[1], 1);
n_sum0 = vfmaq_laneq_f32(n_sum0, n_row1_ext1, n_filter_v[1], 2);
n_row_former = vld1q_f32(row_ptr_v[2]);
n_row_latter = vld1q_f32(row_ptr_v[2] + kRegisterSize);
n_row_ext0 = vextq_f32(n_row_former, n_row_latter, 1);
n_row_ext1 = vextq_f32(n_row_former, n_row_latter, 2);
n_sum0 = vfmaq_laneq_f32(n_sum0, n_row_former, n_filter_v[2], 0);
n_sum0 = vfmaq_laneq_f32(n_sum0, n_row_ext0, n_filter_v[2], 1);
n_sum0 = vfmaq_laneq_f32(n_sum0, n_row_ext1, n_filter_v[2], 2);
// second row
float32x4_t n_sum1 = vdupq_n_f32(.0f);
n_sum1 = vfmaq_laneq_f32(n_sum1, n_row1_former, n_filter_v[0], 0);
n_sum1 = vfmaq_laneq_f32(n_sum1, n_row1_ext0, n_filter_v[0], 1);
n_sum1 = vfmaq_laneq_f32(n_sum1, n_row1_ext1, n_filter_v[0], 2);
n_sum1 = vfmaq_laneq_f32(n_sum1, n_row_former, n_filter_v[1], 0);
n_sum1 = vfmaq_laneq_f32(n_sum1, n_row_ext0, n_filter_v[1], 1);
n_sum1 = vfmaq_laneq_f32(n_sum1, n_row_ext1, n_filter_v[1], 2);
n_row1_former = vld1q_f32(row_ptr_v[3]);
n_row1_latter = vld1q_f32(row_ptr_v[3] + kRegisterSize);
n_row1_ext0 = vextq_f32(n_row1_former, n_row1_latter, 1);
n_row1_ext1 = vextq_f32(n_row1_former, n_row1_latter, 2);
n_sum1 = vfmaq_laneq_f32(n_sum1, n_row1_former, n_filter_v[2], 0);
n_sum1 = vfmaq_laneq_f32(n_sum1, n_row1_ext0, n_filter_v[2], 1);
n_sum1 = vfmaq_laneq_f32(n_sum1, n_row1_ext1, n_filter_v[2], 2);
float32x4_t n_output_row = vld1q_f32(output_ptr_v[0]);
float32x4_t n_output_row1 = vld1q_f32(output_ptr_v[1]);
n_output_row = vaddq_f32(n_output_row, n_sum0);
n_output_row1 = vaddq_f32(n_output_row1, n_sum1);
vst1q_f32(output_ptr_v[0], n_output_row);
vst1q_f32(output_ptr_v[1], n_output_row1);
output_ptr_v[0] += kRegisterSize;
output_ptr_v[1] += kRegisterSize;
for (int i = 0; i < kRegisterSize; ++i) {
row_ptr_v[i] += kRegisterSize;
}
}
for (; remain_count > 0; --remain_count) {
float32x4_t n_row_v[] = {vld1q_f32(row_ptr_v[0]),
vld1q_f32(row_ptr_v[1]),
vld1q_f32(row_ptr_v[2])};
float32x4_t n_sum0 = vmulq_f32(n_row_v[0], n_filter_v[0]);
n_sum0 = vmlaq_f32(n_sum0, n_row_v[1], n_filter_v[1]);
n_sum0 = vmlaq_f32(n_sum0, n_row_v[2], n_filter_v[2]);
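            // Lane 3 of each filter vector holds the next row's first tap, so
            // its product must be discarded: overwrite lane 3 of the sum with
            // the current output, then vaddvq yields the 3-tap dot product
            // plus the accumulator.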
n_sum0 = vsetq_lane_f32(*output_ptr_v[0], n_sum0, 3);
*output_ptr_v[0] = vaddvq_f32(n_sum0);
float32x4_t n_row3 = vld1q_f32(row_ptr_v[3]);
float32x4_t n_sum1 = vmulq_f32(n_row_v[1], n_filter_v[0]);
n_sum1 = vmlaq_f32(n_sum1, n_row_v[2], n_filter_v[1]);
n_sum1 = vmlaq_f32(n_sum1, n_row3, n_filter_v[2]);
n_sum1 = vsetq_lane_f32(*output_ptr_v[1], n_sum1, 3);
*output_ptr_v[1] = vaddvq_f32(n_sum1);
++output_ptr_v[0];
++output_ptr_v[1];
for (int i = 0; i < kRegisterSize; ++i) {
row_ptr_v[i] += 1;
}
}
output_ptr_v[0] += output_width;
output_ptr_v[1] += output_width;
for (int i = 0; i < kRegisterSize; ++i) {
row_ptr_v[i] += 2 + input_width;
}
}
if (output_height != height_count) {
int count = output_width >> 2;
int remain_count = output_width & 3;
for (; count > 0; --count) {
float32x4_t n_sum = vdupq_n_f32(.0f);
float32x4_t n_row_former = vld1q_f32(row_ptr_v[0]);
float32x4_t n_row_latter = vld1q_f32(row_ptr_v[0] + kRegisterSize);
float32x4_t n_row_ext1 = vextq_f32(n_row_former, n_row_latter, 1);
float32x4_t n_row_ext2 = vextq_f32(n_row_former, n_row_latter, 2);
n_sum = vfmaq_laneq_f32(n_sum, n_row_former, n_filter_v[0], 0);
n_sum = vfmaq_laneq_f32(n_sum, n_row_ext1, n_filter_v[0], 1);
n_sum = vfmaq_laneq_f32(n_sum, n_row_ext2, n_filter_v[0], 2);
n_row_former = vld1q_f32(row_ptr_v[1]);
n_row_latter = vld1q_f32(row_ptr_v[1] + kRegisterSize);
n_row_ext1 = vextq_f32(n_row_former, n_row_latter, 1);
n_row_ext2 = vextq_f32(n_row_former, n_row_latter, 2);
n_sum = vfmaq_laneq_f32(n_sum, n_row_former, n_filter_v[1], 0);
n_sum = vfmaq_laneq_f32(n_sum, n_row_ext1, n_filter_v[1], 1);
n_sum = vfmaq_laneq_f32(n_sum, n_row_ext2, n_filter_v[1], 2);
n_row_former = vld1q_f32(row_ptr_v[2]);
n_row_latter = vld1q_f32(row_ptr_v[2] + kRegisterSize);
n_row_ext1 = vextq_f32(n_row_former, n_row_latter, 1);
n_row_ext2 = vextq_f32(n_row_former, n_row_latter, 2);
n_sum = vfmaq_laneq_f32(n_sum, n_row_former, n_filter_v[2], 0);
n_sum = vfmaq_laneq_f32(n_sum, n_row_ext1, n_filter_v[2], 1);
n_sum = vfmaq_laneq_f32(n_sum, n_row_ext2, n_filter_v[2], 2);
float32x4_t n_output_row = vld1q_f32(output_ptr_v[0]);
n_output_row = vaddq_f32(n_output_row, n_sum);
vst1q_f32(output_ptr_v[0], n_output_row);
output_ptr_v[0] += kRegisterSize;
for (int i = 0; i < 3; ++i) {
row_ptr_v[i] += kRegisterSize;
}
}
for (; remain_count > 0; --remain_count) {
float32x4_t n_row_v[] = {
vld1q_f32(row_ptr_v[0]), vld1q_f32(row_ptr_v[1]),
vld1q_f32(row_ptr_v[2]),
};
float32x4_t n_sum = vmulq_f32(n_row_v[0], n_filter_v[0]);
n_sum = vmlaq_f32(n_sum, n_row_v[1], n_filter_v[1]);
n_sum = vmlaq_f32(n_sum, n_row_v[2], n_filter_v[2]);
n_sum = vsetq_lane_f32(*output_ptr_v[0], n_sum, 3);
*output_ptr_v[0] = vaddvq_f32(n_sum);
++output_ptr_v[0];
for (int i = 0; i < 3; ++i) {
row_ptr_v[i] += 1;
}
}
}
filter_ptr += kFilterSize;
input_ptr += input_height * input_width;
}
}
}
}
void Conv2dNeonK3x3S2(const float *input, // NCHW
const index_t *input_shape,
const float *filter, // c_out, c_in, kernel_h, kernel_w
const index_t *filter_shape,
const float *bias, // c_out
float *output, // NCHW
const index_t *output_shape) {
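  // Each output row consumes 2 * output_width input pixels in the inner
  // loops; tail_step then advances the row pointers over the remainder of
  // the two input rows a stride-2 output row spans.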
int tail_step = 2 * (input_shape[3] - output_shape[3]);
int output_batch = output_shape[0];
int output_channels = output_shape[1];
int output_height = output_shape[2];
int output_width = output_shape[3];
int input_batch = input_shape[0];
int input_channels = input_shape[1];
int input_height = input_shape[2];
int input_width = input_shape[3];
int multiplier = filter_shape == nullptr ? 0 : filter_shape[0];
int filter_in_channels = filter_shape == nullptr ? input_channels : 1;
#pragma omp parallel for collapse(2)
for (int b = 0; b < output_batch; ++b) {
for (int oc = 0; oc < output_channels; ++oc) {
float *output_ptr_base =
output + b * output_channels * output_height * output_width;
const float *filter_ptr = filter + oc * filter_in_channels * kFilterSize;
const float *input_ptr =
input + b * input_channels * input_height * input_width;
if (filter_shape != nullptr) {
input_ptr += (oc / multiplier) * input_height * input_width;
}
float *output_ptr = output_ptr_base + oc * output_height * output_width;
std::fill(output_ptr, output_ptr + output_height * output_width,
bias ? bias[oc] : 0);
for (int ic = 0; ic < filter_in_channels; ++ic) {
float32x4_t n_filter_v[3] = {vld1q_f32(filter_ptr),
vld1q_f32(filter_ptr + 3),
vld1q_f32(filter_ptr + 6)};
const float *row_ptr_v[3] = {input_ptr, input_ptr + input_width,
input_ptr + 2 * input_width};
float *output_ptr_inner = output_ptr;
for (int h = 0; h < output_height; ++h) {
int count = output_width >> 2;
int remain_count = output_width & 3;
for (; count > 0; --count) {
float32x4_t n_sum = vdupq_n_f32(.0f);
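          // vld2q_f32 deinterleaves eight consecutive pixels into even/odd
          // lanes, matching a stride-2 horizontal walk; vextq against the
          // next load supplies the third tap of each filter row.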
float32x4x2_t n_row_former = vld2q_f32(row_ptr_v[0]);
float32x4_t n_row_latter = vld1q_f32(row_ptr_v[0] + 8);
float32x4_t n_row_ext =
vextq_f32(n_row_former.val[0], n_row_latter, 1);
n_sum =
vfmaq_laneq_f32(n_sum, n_row_former.val[0], n_filter_v[0], 0);
n_sum =
vfmaq_laneq_f32(n_sum, n_row_former.val[1], n_filter_v[0], 1);
n_sum = vfmaq_laneq_f32(n_sum, n_row_ext, n_filter_v[0], 2);
float32x4x2_t n_row1_former = vld2q_f32(row_ptr_v[1]);
float32x4_t n_row1_latter = vld1q_f32(row_ptr_v[1] + 8);
float32x4_t n_row1_ext =
vextq_f32(n_row1_former.val[0], n_row1_latter, 1);
n_sum =
vfmaq_laneq_f32(n_sum, n_row1_former.val[0], n_filter_v[1], 0);
n_sum =
vfmaq_laneq_f32(n_sum, n_row1_former.val[1], n_filter_v[1], 1);
n_sum = vfmaq_laneq_f32(n_sum, n_row1_ext, n_filter_v[1], 2);
float32x4x2_t n_row2_former = vld2q_f32(row_ptr_v[2]);
float32x4_t n_row2_latter = vld1q_f32(row_ptr_v[2] + 8);
float32x4_t n_row2_ext =
vextq_f32(n_row2_former.val[0], n_row2_latter, 1);
n_sum =
vfmaq_laneq_f32(n_sum, n_row2_former.val[0], n_filter_v[2], 0);
n_sum =
vfmaq_laneq_f32(n_sum, n_row2_former.val[1], n_filter_v[2], 1);
n_sum = vfmaq_laneq_f32(n_sum, n_row2_ext, n_filter_v[2], 2);
float32x4_t n_output_row = vld1q_f32(output_ptr_inner);
n_output_row = vaddq_f32(n_output_row, n_sum);
vst1q_f32(output_ptr_inner, n_output_row);
output_ptr_inner += kRegisterSize;
for (int i = 0; i < 3; ++i) {
row_ptr_v[i] += 2 * kRegisterSize;
}
}
for (; remain_count > 0; --remain_count) {
float32x4_t n_row_v[] = {vld1q_f32(row_ptr_v[0]),
vld1q_f32(row_ptr_v[1]),
vld1q_f32(row_ptr_v[2])};
float32x4_t n_sum = vmulq_f32(n_row_v[0], n_filter_v[0]);
n_sum = vmlaq_f32(n_sum, n_row_v[1], n_filter_v[1]);
n_sum = vmlaq_f32(n_sum, n_row_v[2], n_filter_v[2]);
n_sum = vsetq_lane_f32(*output_ptr_inner, n_sum, 3);
*output_ptr_inner = vaddvq_f32(n_sum);
++output_ptr_inner;
for (int i = 0; i < 3; ++i) {
row_ptr_v[i] += 2;
}
}
for (int i = 0; i < 3; ++i) {
row_ptr_v[i] += tail_step;
}
}
filter_ptr += kFilterSize;
input_ptr += input_height * input_width;
}
}
}
}
} // namespace kernels
} // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_KERNELS_NEON_CONV_2D_NEON_5X5_H_
#define MACE_KERNELS_NEON_CONV_2D_NEON_5X5_H_
#include <arm_neon.h>
namespace mace {
namespace kernels {
void Conv2dNeonK5x5S1(const float *input, // NCHW
const index_t *input_shape,
const float *filter, // c_out, c_in, kernel_h, kernel_w
const index_t *filter_shape,
const float *bias, // c_out
float *output, // NCHW
const index_t *output_shape) {
const index_t batch = output_shape[0];
const index_t channels = output_shape[1];
const index_t height = output_shape[2];
const index_t width = output_shape[3];
const index_t input_batch = input_shape[0];
const index_t input_channels = input_shape[1];
const index_t input_height = input_shape[2];
const index_t input_width = input_shape[3];
MACE_ASSERT(input_batch == batch);
const index_t input_total_pixels_per_channel = input_height * input_width;
const index_t output_total_pixels_per_channel = height * width;
const index_t input_total_pixels_per_batch =
input_total_pixels_per_channel * input_channels;
const index_t output_total_pixels_per_batch =
output_total_pixels_per_channel * channels;
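  // Each output channel owns input_channels 5x5 filter slices (25 taps each).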
const index_t patch_size = input_channels * 25;
#pragma omp parallel for collapse(2)
for (index_t n = 0; n < batch; ++n) {
for (index_t c = 0; c < channels; ++c) {
float *output_ptr = output + n * output_total_pixels_per_batch +
c * output_total_pixels_per_channel;
const float *input_ptr = input + n * input_total_pixels_per_batch;
// Fill with bias
std::fill(output_ptr, output_ptr + output_total_pixels_per_channel,
bias ? bias[c] : 0);
for (index_t inc = 0; inc < input_channels; ++inc) {
float *outptr = output_ptr;
float *outptr2 = outptr + width;
const float *inptr = input_ptr + inc * input_total_pixels_per_channel;
const float *filter_ptr = filter + c * patch_size + inc * 25;
const float *r0 = inptr;
const float *r1 = inptr + input_width;
const float *r2 = inptr + input_width * 2;
const float *r3 = inptr + input_width * 3;
const float *r4 = inptr + input_width * 4;
const float *r5 = inptr + input_width * 5;
const float *k0 = filter_ptr;
const float *k1 = filter_ptr + 5;
const float *k2 = filter_ptr + 10;
const float *k3 = filter_ptr + 15;
const float *k4 = filter_ptr + 20;
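        // All 25 taps stay resident in registers: six vectors cover taps
        // 0..23, and tap 24 is broadcast into its own vector below.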
float32x4_t _k0123 = vld1q_f32(filter_ptr);
float32x4_t _k4567 = vld1q_f32(filter_ptr + 4);
float32x4_t _k891011 = vld1q_f32(filter_ptr + 8);
float32x4_t _k12131415 = vld1q_f32(filter_ptr + 12);
float32x4_t _k16171819 = vld1q_f32(filter_ptr + 16);
float32x4_t _k20212223 = vld1q_f32(filter_ptr + 20);
float32x4_t _k24242424 = vdupq_n_f32(filter_ptr[24]);
// height_block_size = 2, width_block_size = 4
int h = 0;
for (; h + 1 < height; h += 2) {
int width_blocks = width >> 2;
int remain = width - (width_blocks << 2);
for (; width_blocks > 0; --width_blocks) {
float32x4_t _sum = vld1q_f32(outptr);
float32x4_t _sum2 = vld1q_f32(outptr2);
float32x4_t _r00 = vld1q_f32(r0);
float32x4_t _r04 = vld1q_f32(r0 + 4);
float32x4_t _r01 = vextq_f32(_r00, _r04, 1);
float32x4_t _r02 = vextq_f32(_r00, _r04, 2);
float32x4_t _r03 = vextq_f32(_r00, _r04, 3);
float32x4_t _r10 = vld1q_f32(r1);
float32x4_t _r14 = vld1q_f32(r1 + 4);
float32x4_t _r11 = vextq_f32(_r10, _r14, 1);
float32x4_t _r12 = vextq_f32(_r10, _r14, 2);
float32x4_t _r13 = vextq_f32(_r10, _r14, 3);
float32x4_t _r20 = vld1q_f32(r2);
float32x4_t _r24 = vld1q_f32(r2 + 4);
float32x4_t _r21 = vextq_f32(_r20, _r24, 1);
float32x4_t _r22 = vextq_f32(_r20, _r24, 2);
float32x4_t _r23 = vextq_f32(_r20, _r24, 3);
float32x4_t _r30 = vld1q_f32(r3);
float32x4_t _r34 = vld1q_f32(r3 + 4);
float32x4_t _r31 = vextq_f32(_r30, _r34, 1);
float32x4_t _r32 = vextq_f32(_r30, _r34, 2);
float32x4_t _r33 = vextq_f32(_r30, _r34, 3);
float32x4_t _r40 = vld1q_f32(r4);
float32x4_t _r44 = vld1q_f32(r4 + 4);
float32x4_t _r41 = vextq_f32(_r40, _r44, 1);
float32x4_t _r42 = vextq_f32(_r40, _r44, 2);
float32x4_t _r43 = vextq_f32(_r40, _r44, 3);
float32x4_t _r50 = vld1q_f32(r5);
float32x4_t _r54 = vld1q_f32(r5 + 4);
float32x4_t _r51 = vextq_f32(_r50, _r54, 1);
float32x4_t _r52 = vextq_f32(_r50, _r54, 2);
float32x4_t _r53 = vextq_f32(_r50, _r54, 3);
_sum = vfmaq_laneq_f32(_sum, _r00, _k0123, 0);
_sum = vfmaq_laneq_f32(_sum, _r01, _k0123, 1);
_sum = vfmaq_laneq_f32(_sum, _r02, _k0123, 2);
_sum = vfmaq_laneq_f32(_sum, _r03, _k0123, 3);
_sum = vfmaq_laneq_f32(_sum, _r04, _k4567, 0);
_sum = vfmaq_laneq_f32(_sum, _r10, _k4567, 1);
_sum = vfmaq_laneq_f32(_sum, _r11, _k4567, 2);
_sum = vfmaq_laneq_f32(_sum, _r12, _k4567, 3);
_sum = vfmaq_laneq_f32(_sum, _r13, _k891011, 0);
_sum = vfmaq_laneq_f32(_sum, _r14, _k891011, 1);
_sum = vfmaq_laneq_f32(_sum, _r20, _k891011, 2);
_sum = vfmaq_laneq_f32(_sum, _r21, _k891011, 3);
_sum = vfmaq_laneq_f32(_sum, _r22, _k12131415, 0);
_sum = vfmaq_laneq_f32(_sum, _r23, _k12131415, 1);
_sum = vfmaq_laneq_f32(_sum, _r24, _k12131415, 2);
_sum = vfmaq_laneq_f32(_sum, _r30, _k12131415, 3);
_sum = vfmaq_laneq_f32(_sum, _r31, _k16171819, 0);
_sum = vfmaq_laneq_f32(_sum, _r32, _k16171819, 1);
_sum = vfmaq_laneq_f32(_sum, _r33, _k16171819, 2);
_sum = vfmaq_laneq_f32(_sum, _r34, _k16171819, 3);
_sum = vfmaq_laneq_f32(_sum, _r40, _k20212223, 0);
_sum = vfmaq_laneq_f32(_sum, _r41, _k20212223, 1);
_sum = vfmaq_laneq_f32(_sum, _r42, _k20212223, 2);
_sum = vfmaq_laneq_f32(_sum, _r43, _k20212223, 3);
_sum = vfmaq_laneq_f32(_sum, _r44, _k24242424, 0);
_sum2 = vfmaq_laneq_f32(_sum2, _r10, _k0123, 0);
_sum2 = vfmaq_laneq_f32(_sum2, _r11, _k0123, 1);
_sum2 = vfmaq_laneq_f32(_sum2, _r12, _k0123, 2);
_sum2 = vfmaq_laneq_f32(_sum2, _r13, _k0123, 3);
_sum2 = vfmaq_laneq_f32(_sum2, _r14, _k4567, 0);
_sum2 = vfmaq_laneq_f32(_sum2, _r20, _k4567, 1);
_sum2 = vfmaq_laneq_f32(_sum2, _r21, _k4567, 2);
_sum2 = vfmaq_laneq_f32(_sum2, _r22, _k4567, 3);
_sum2 = vfmaq_laneq_f32(_sum2, _r23, _k891011, 0);
_sum2 = vfmaq_laneq_f32(_sum2, _r24, _k891011, 1);
_sum2 = vfmaq_laneq_f32(_sum2, _r30, _k891011, 2);
_sum2 = vfmaq_laneq_f32(_sum2, _r31, _k891011, 3);
_sum2 = vfmaq_laneq_f32(_sum2, _r32, _k12131415, 0);
_sum2 = vfmaq_laneq_f32(_sum2, _r33, _k12131415, 1);
_sum2 = vfmaq_laneq_f32(_sum2, _r34, _k12131415, 2);
_sum2 = vfmaq_laneq_f32(_sum2, _r40, _k12131415, 3);
_sum2 = vfmaq_laneq_f32(_sum2, _r41, _k16171819, 0);
_sum2 = vfmaq_laneq_f32(_sum2, _r42, _k16171819, 1);
_sum2 = vfmaq_laneq_f32(_sum2, _r43, _k16171819, 2);
_sum2 = vfmaq_laneq_f32(_sum2, _r44, _k16171819, 3);
_sum2 = vfmaq_laneq_f32(_sum2, _r50, _k20212223, 0);
_sum2 = vfmaq_laneq_f32(_sum2, _r51, _k20212223, 1);
_sum2 = vfmaq_laneq_f32(_sum2, _r52, _k20212223, 2);
_sum2 = vfmaq_laneq_f32(_sum2, _r53, _k20212223, 3);
_sum2 = vfmaq_laneq_f32(_sum2, _r54, _k24242424, 0);
vst1q_f32(outptr, _sum);
vst1q_f32(outptr2, _sum2);
r0 += 4;
r1 += 4;
r2 += 4;
r3 += 4;
r4 += 4;
r5 += 4;
outptr += 4;
outptr2 += 4;
}
for (; remain > 0; --remain) {
float sum = 0;
float sum2 = 0;
float32x4_t _r1 = vld1q_f32(r1);
float32x4_t _k1 = vld1q_f32(k1);
float32x4_t _sum = vmulq_f32(_r1, _k1);
float32x4_t _sum2 = vmulq_f32(_r1, _k0123);
float32x4_t _r2 = vld1q_f32(r2);
float32x4_t _k2 = vld1q_f32(k2);
_sum = vmlaq_f32(_sum, _r2, _k2);
_sum2 = vmlaq_f32(_sum2, _r2, _k1);
float32x4_t _r3 = vld1q_f32(r3);
float32x4_t _k3 = vld1q_f32(k3);
_sum = vmlaq_f32(_sum, _r3, _k3);
_sum2 = vmlaq_f32(_sum2, _r3, _k2);
float32x4_t _r4 = vld1q_f32(r4);
_sum = vmlaq_f32(_sum, _r4, _k20212223);
_sum2 = vmlaq_f32(_sum2, _r4, _k3);
float32x4_t _r0 = vld1q_f32(r0);
_sum = vmlaq_f32(_sum, _r0, _k0123);
float32x4_t _r5 = vld1q_f32(r5);
_sum2 = vmlaq_f32(_sum2, _r5, _k20212223);
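            // Columns 0..3 of every filter row were consumed above with whole
            // vectors; column 4 is gathered across rows so the scalar tail
            // still uses a vector multiply-accumulate.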
float32x4_t _k_t4;
_k_t4 = vsetq_lane_f32(k0[4], _k_t4, 0);
_k_t4 = vsetq_lane_f32(k1[4], _k_t4, 1);
_k_t4 = vsetq_lane_f32(k2[4], _k_t4, 2);
_k_t4 = vsetq_lane_f32(k3[4], _k_t4, 3);
float32x4_t _r_t4;
_r_t4 = vsetq_lane_f32(r0[4], _r_t4, 0);
_r_t4 = vsetq_lane_f32(r1[4], _r_t4, 1);
_r_t4 = vsetq_lane_f32(r2[4], _r_t4, 2);
_r_t4 = vsetq_lane_f32(r3[4], _r_t4, 3);
_sum = vmlaq_f32(_sum, _r_t4, _k_t4);
sum = r4[4] * k4[4];
_r_t4 = vextq_f32(_r_t4, _r_t4, 1);
_r_t4 = vsetq_lane_f32(r4[4], _r_t4, 3);
_sum2 = vmlaq_f32(_sum2, _r_t4, _k_t4);
sum2 = r5[4] * k4[4];
float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
float32x2_t _ss2 =
vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2));
float32x2_t _ss_ss2 = vpadd_f32(_ss, _ss2);
sum += vget_lane_f32(_ss_ss2, 0);
sum2 += vget_lane_f32(_ss_ss2, 1);
*outptr += sum;
*outptr2 += sum2;
++r0;
++r1;
++r2;
++r3;
++r4;
++r5;
++outptr;
++outptr2;
}
r0 += 4 + input_width; // 4 = 5 - 1
r1 += 4 + input_width;
r2 += 4 + input_width;
r3 += 4 + input_width;
r4 += 4 + input_width;
r5 += 4 + input_width;
outptr += width;
outptr2 += width;
}
for (; h < height; ++h) {
          // An odd output height leaves one final row to process.
int width_blocks = width >> 2;
int remain = width - (width_blocks << 2);
for (; width_blocks > 0; --width_blocks) {
float32x4_t _sum = vld1q_f32(outptr);
float32x4_t _r00 = vld1q_f32(r0);
float32x4_t _r04 = vld1q_f32(r0 + 4);
float32x4_t _r01 = vextq_f32(_r00, _r04, 1);
float32x4_t _r02 = vextq_f32(_r00, _r04, 2);
float32x4_t _r03 = vextq_f32(_r00, _r04, 3);
float32x4_t _r10 = vld1q_f32(r1);
float32x4_t _r14 = vld1q_f32(r1 + 4);
float32x4_t _r11 = vextq_f32(_r10, _r14, 1);
float32x4_t _r12 = vextq_f32(_r10, _r14, 2);
float32x4_t _r13 = vextq_f32(_r10, _r14, 3);
float32x4_t _r20 = vld1q_f32(r2);
float32x4_t _r24 = vld1q_f32(r2 + 4);
float32x4_t _r21 = vextq_f32(_r20, _r24, 1);
float32x4_t _r22 = vextq_f32(_r20, _r24, 2);
float32x4_t _r23 = vextq_f32(_r20, _r24, 3);
float32x4_t _r30 = vld1q_f32(r3);
float32x4_t _r34 = vld1q_f32(r3 + 4);
float32x4_t _r31 = vextq_f32(_r30, _r34, 1);
float32x4_t _r32 = vextq_f32(_r30, _r34, 2);
float32x4_t _r33 = vextq_f32(_r30, _r34, 3);
float32x4_t _r40 = vld1q_f32(r4);
float32x4_t _r44 = vld1q_f32(r4 + 4);
float32x4_t _r41 = vextq_f32(_r40, _r44, 1);
float32x4_t _r42 = vextq_f32(_r40, _r44, 2);
float32x4_t _r43 = vextq_f32(_r40, _r44, 3);
_sum = vfmaq_laneq_f32(_sum, _r00, _k0123, 0);
_sum = vfmaq_laneq_f32(_sum, _r01, _k0123, 1);
_sum = vfmaq_laneq_f32(_sum, _r02, _k0123, 2);
_sum = vfmaq_laneq_f32(_sum, _r03, _k0123, 3);
_sum = vfmaq_laneq_f32(_sum, _r04, _k4567, 0);
_sum = vfmaq_laneq_f32(_sum, _r10, _k4567, 1);
_sum = vfmaq_laneq_f32(_sum, _r11, _k4567, 2);
_sum = vfmaq_laneq_f32(_sum, _r12, _k4567, 3);
_sum = vfmaq_laneq_f32(_sum, _r13, _k891011, 0);
_sum = vfmaq_laneq_f32(_sum, _r14, _k891011, 1);
_sum = vfmaq_laneq_f32(_sum, _r20, _k891011, 2);
_sum = vfmaq_laneq_f32(_sum, _r21, _k891011, 3);
_sum = vfmaq_laneq_f32(_sum, _r22, _k12131415, 0);
_sum = vfmaq_laneq_f32(_sum, _r23, _k12131415, 1);
_sum = vfmaq_laneq_f32(_sum, _r24, _k12131415, 2);
_sum = vfmaq_laneq_f32(_sum, _r30, _k12131415, 3);
_sum = vfmaq_laneq_f32(_sum, _r31, _k16171819, 0);
_sum = vfmaq_laneq_f32(_sum, _r32, _k16171819, 1);
_sum = vfmaq_laneq_f32(_sum, _r33, _k16171819, 2);
_sum = vfmaq_laneq_f32(_sum, _r34, _k16171819, 3);
_sum = vfmaq_laneq_f32(_sum, _r40, _k20212223, 0);
_sum = vfmaq_laneq_f32(_sum, _r41, _k20212223, 1);
_sum = vfmaq_laneq_f32(_sum, _r42, _k20212223, 2);
_sum = vfmaq_laneq_f32(_sum, _r43, _k20212223, 3);
_sum = vfmaq_laneq_f32(_sum, _r44, _k24242424, 0);
vst1q_f32(outptr, _sum);
r0 += 4;
r1 += 4;
r2 += 4;
r3 += 4;
r4 += 4;
r5 += 4;
outptr += 4;
}
for (; remain > 0; --remain) {
float sum = 0;
float32x4_t _r0 = vld1q_f32(r0);
float32x4_t _sum = vmulq_f32(_r0, _k0123);
float32x4_t _r1 = vld1q_f32(r1);
_sum = vmlaq_f32(_sum, _r1, vld1q_f32(k1));
float32x4_t _r2 = vld1q_f32(r2);
_sum = vmlaq_f32(_sum, _r2, vld1q_f32(k2));
float32x4_t _r3 = vld1q_f32(r3);
_sum = vmlaq_f32(_sum, _r3, vld1q_f32(k3));
float32x4_t _r4 = vld1q_f32(r4);
_sum = vmlaq_f32(_sum, _r4, _k20212223);
float32x4_t _k_t4;
_k_t4 = vsetq_lane_f32(k0[4], _k_t4, 0);
_k_t4 = vsetq_lane_f32(k1[4], _k_t4, 1);
_k_t4 = vsetq_lane_f32(k2[4], _k_t4, 2);
_k_t4 = vsetq_lane_f32(k3[4], _k_t4, 3);
float32x4_t _r_t4;
_r_t4 = vsetq_lane_f32(r0[4], _r_t4, 0);
_r_t4 = vsetq_lane_f32(r1[4], _r_t4, 1);
_r_t4 = vsetq_lane_f32(r2[4], _r_t4, 2);
_r_t4 = vsetq_lane_f32(r3[4], _r_t4, 3);
_sum = vmlaq_f32(_sum, _r_t4, _k_t4);
sum = r4[4] * k4[4];
float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
_ss = vpadd_f32(_ss, _ss);
sum += vget_lane_f32(_ss, 0);
*outptr += sum;
++r0;
++r1;
++r2;
++r3;
++r4;
++outptr;
}
r0 += 4;
r1 += 4;
r2 += 4;
r3 += 4;
r4 += 4;
}
}
}
}
}
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_NEON_CONV_2D_NEON_5X5_H_
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/depthwise_conv2d.h"
namespace mace {
namespace kernels {
extern void Conv2dNeonK3x3S1(const float *input,
const index_t *input_shape,
const float *filter,
const index_t *filter_shape,
const float *bias,
float *output,
const index_t *output_shape);
extern void Conv2dNeonK3x3S2(const float *input,
const index_t *input_shape,
const float *filter,
const index_t *filter_shape,
const float *bias,
float *output,
const index_t *output_shape);
template <>
void DepthwiseConv2dFunctor<DeviceType::NEON, float>::operator()(
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
typedef void (*Conv2dNeonFunction)(
const float *input, const index_t *input_shape, const float *filter,
const index_t *filter_shape, const float *bias, float *output,
const index_t *output_shape);
// Selection matrix: kernel_size x stride_size
static const Conv2dNeonFunction selector[5][2] = {
{nullptr, nullptr},
{nullptr, nullptr},
{Conv2dNeonK3x3S1, Conv2dNeonK3x3S2},
{nullptr, nullptr},
{nullptr, nullptr}};
  // Fall back to the slow generic implementation when no NEON kernel matches.
index_t kernel_h = filter->dim(2);
index_t kernel_w = filter->dim(3);
if (kernel_h != kernel_w || kernel_h > 5 || strides_[0] != strides_[1] ||
strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1 ||
selector[kernel_h - 1][strides_[0] - 1] == nullptr) {
    LOG(WARNING) << "Depthwise-Conv2d NEON kernel with "
                 << "filter " << kernel_h << "x" << kernel_w << ","
                 << " stride " << strides_[0] << "x" << strides_[1]
                 << " is not implemented yet, using the slow version";
DepthwiseConv2dFunctor<DeviceType::CPU, float>(
strides_, paddings_, dilations_)(input, filter, bias, output, future);
return;
}
const float *input_ptr = input->data<float>();
const index_t *input_shape = input->shape().data();
const float *filter_ptr = filter->data<float>();
const index_t *filter_shape = filter->shape().data();
const float *bias_ptr = bias->data<float>();
float *output_ptr = output->mutable_data<float>();
const index_t *output_shape = output->shape().data();
// Keep this alive during kernel execution
Tensor padded_input;
if (paddings_[0] > 0 || paddings_[1] > 0) {
ConstructInputWithPadding(input, paddings_.data(), &padded_input);
input_ptr = padded_input.data<float>();
input_shape = padded_input.shape().data();
}
auto conv2d_neon_func = selector[kernel_h - 1][strides_[0] - 1];
conv2d_neon_func(input_ptr, input_shape, filter_ptr, filter_shape, bias_ptr,
output_ptr, output_shape);
}
} // namespace kernels
} // namespace mace
......@@ -166,8 +166,20 @@ struct PoolingFunctor : PoolingFunctorBase {
};
template <>
struct PoolingFunctor<DeviceType::NEON, float> : PoolingFunctorBase {
PoolingFunctor(const PoolingType pooling_type,
const int *kernels,
const int *strides,
const Padding padding_type,
const std::vector<int> &paddings,
const int *dilations)
: PoolingFunctorBase(
pooling_type, kernels, strides, padding_type, paddings, dilations) {
}
void operator()(const Tensor *input_tensor,
Tensor *output_tensor,
StatsFuture *future);
};
template <typename T>
struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
......
......@@ -56,6 +56,11 @@ struct SoftmaxFunctor {
}
};
template <>
struct SoftmaxFunctor<DeviceType::NEON, float> {
void operator()(const Tensor *logits, Tensor *output, StatsFuture *future);
};
template <typename T>
struct SoftmaxFunctor<DeviceType::OPENCL, T> {
void operator()(const Tensor *logits, Tensor *output, StatsFuture *future);
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_KERNELS_TRANSPOSE_H_
#define MACE_KERNELS_TRANSPOSE_H_
#include <vector>
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/public/mace.h"
#include "mace/utils/utils.h"
namespace mace {
namespace kernels {
template<DeviceType D, typename T>
struct TransposeFunctor {
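  // dims_ is the output-to-input axis permutation: output axis i indexes
  // input axis dims_[i].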
explicit TransposeFunctor(const std::vector<int> &dims) : dims_(dims) {}
void operator()(const Tensor *input, Tensor *output, StatsFuture *future) {
Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard output_guard(output);
const std::vector<index_t> &input_shape = input->shape();
const std::vector<index_t> &output_shape = output->shape();
const T *input_data = input->data<T>();
T *output_data = output->mutable_data<T>();
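    // Row-major strides for the fixed rank-4 shapes; the innermost axis is
    // contiguous.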
std::vector<index_t>
in_stride{input_shape[1] * input_shape[2] * input_shape[3],
input_shape[2] * input_shape[3], input_shape[3], 1};
std::vector<index_t>
out_stride{output_shape[1] * output_shape[2] * output_shape[3],
output_shape[2] * output_shape[3], output_shape[3], 1};
std::vector<index_t> idim(4, 0);
std::vector<index_t> odim(4, 0);
for (odim[0] = 0; odim[0] < output_shape[0]; ++odim[0]) {
for (odim[1] = 0; odim[1] < output_shape[1]; ++odim[1]) {
for (odim[2] = 0; odim[2] < output_shape[2]; ++odim[2]) {
for (odim[3] = 0; odim[3] < output_shape[3]; ++odim[3]) {
idim[dims_[0]] = odim[0];
idim[dims_[1]] = odim[1];
idim[dims_[2]] = odim[2];
idim[dims_[3]] = odim[3];
output_data[odim[0] * out_stride[0] + odim[1] * out_stride[1]
+ odim[2] * out_stride[2] + odim[3]] =
input_data[idim[0] * in_stride[0] + idim[1] * in_stride[1]
+ idim[2] * in_stride[2] + idim[3]];
}
}
}
}
}
std::vector<int> dims_;
};
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_TRANSPOSE_H_
......@@ -25,6 +25,11 @@ void Register_Activation(OperatorRegistry *op_registry) {
.TypeConstraint<half>("T")
.Build(),
ActivationOp<DeviceType::OPENCL, half>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation")
.Device(DeviceType::NEON)
.TypeConstraint<float>("T")
.Build(),
ActivationOp<DeviceType::NEON, float>);
}
} // namespace ops
......
......@@ -25,6 +25,11 @@ void Register_BatchNorm(OperatorRegistry *op_registry) {
.TypeConstraint<half>("T")
.Build(),
BatchNormOp<DeviceType::OPENCL, half>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm")
.Device(DeviceType::NEON)
.TypeConstraint<float>("T")
.Build(),
BatchNormOp<DeviceType::NEON, float>);
}
} // namespace ops
......
......@@ -11,7 +11,7 @@ namespace test {
class BatchNormOpTest : public OpsTestBase {};
template<DeviceType D>
void Simple() {
OpsTestNet net;
......@@ -36,14 +36,14 @@ void Simple() {
kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.AddFloatArg("epsilon", 1e-3)
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.AddFloatArg("epsilon", 1e-3)
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
......@@ -52,22 +52,22 @@ void Simple() {
kernels::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-3)
.Output("Output")
.Finalize(net.NewOperatorDef());
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-3)
.Output("Output")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
// Check
auto expected =
      CreateTensor<float>({1, 6, 2, 1}, {-3.86, -3.86, -1.51, -1.51, 0.83, 0.83,
                                         3.17, 3.17, 5.51, 5.51, 7.86, 7.86});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-2);
}
......@@ -87,18 +87,18 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
// Construct graph
OpsTestNet net;
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-3)
.Output("Output")
.Finalize(net.NewOperatorDef());
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-3)
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::OPENCL, float>(
"Input", {batch, height, width, channels});
"Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Mean", {channels});
......@@ -124,14 +124,14 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.AddFloatArg("epsilon", 1e-3)
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.AddFloatArg("epsilon", 1e-3)
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
// Tuning
setenv("MACE_TUNING", "1", 1);
......@@ -158,18 +158,18 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
// Construct graph
OpsTestNet net;
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-3)
.Output("Output")
.Finalize(net.NewOperatorDef());
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-3)
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::OPENCL, float>(
"Input", {batch, height, width, channels});
"Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Mean", {channels});
......@@ -195,15 +195,15 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.AddFloatArg("epsilon", 1e-3)
.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.AddFloatArg("epsilon", 1e-3)
.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
// Tuning
setenv("MACE_TUNING", "1", 1);
......@@ -230,18 +230,18 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
// Construct graph
OpsTestNet net;
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-3)
.Output("Output")
.Finalize(net.NewOperatorDef());
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-3)
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::OPENCL, float>(
"Input", {batch, height, width, channels});
"Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Mean", {channels});
......@@ -267,14 +267,14 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.AddFloatArg("epsilon", 1e-3)
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.AddFloatArg("epsilon", 1e-3)
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
// tuning
setenv("MACE_TUNING", "1", 1);
......@@ -301,18 +301,18 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
// Construct graph
OpsTestNet net;
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-3)
.Output("Output")
.Finalize(net.NewOperatorDef());
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-3)
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::OPENCL, float>(
"Input", {batch, height, width, channels});
"Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Mean", {channels});
......@@ -338,15 +338,15 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
kernels::BufferType::ARGUMENT);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.AddFloatArg("epsilon", 1e-3)
.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Input("MeanImage")
.Input("VarImage")
.AddFloatArg("epsilon", 1e-3)
.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
// tuning
setenv("MACE_TUNING", "1", 1);
......@@ -362,6 +362,63 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.5);
}
TEST_F(BatchNormOpTest, NEONTest) {
  unsigned int seed = time(NULL);
// generate random input
index_t batch = 1 + rand_r(&seed) % 10;
index_t channels = 3 + rand_r(&seed) % 50;
index_t height = 64;
index_t width = 64;
// Construct graph
OpsTestNet net;
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-3)
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::CPU, float>(
"Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::CPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::CPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::CPU, float>("Mean", {channels});
net.AddRandomInput<DeviceType::CPU, float>("Var", {channels}, true);
// run cpu
net.RunOp();
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputNeon")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-3)
.Output("OutputNeon")
.Finalize(net.NewOperatorDef());
net.FillNHWCInputToNCHWInput<DeviceType::CPU, float>("InputNeon", "Input");
// Run on neon
net.RunOp(DeviceType::NEON);
net.Sync();
net.FillNHWCInputToNCHWInput<DeviceType::CPU, float>("OutputExptected",
"Output");
ExpectTensorNear<float>(*net.GetOutput("OutputExptected"),
*net.GetOutput("OutputNeon"),
0.001);
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -25,6 +25,12 @@ void Register_Conv2D(OperatorRegistry *op_registry) {
.TypeConstraint<half>("T")
.Build(),
Conv2dOp<DeviceType::OPENCL, half>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D")
.Device(DeviceType::NEON)
.TypeConstraint<float>("T")
.Build(),
Conv2dOp<DeviceType::NEON, float>);
}
} // namespace ops
......
......@@ -30,10 +30,19 @@ static void Conv2d(int iters,
OpsTestNet net;
// Add input data
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
net.AddRandomInput<D, float>("Filter",
{kernel_h, kernel_w, output_channels, channels});
net.AddRandomInput<D, float>("Bias", {output_channels});
if (D == DeviceType::NEON) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
net.AddRandomInput<D, float>("Filter",
{output_channels, channels, kernel_h,
kernel_w});
net.AddRandomInput<D, float>("Bias", {output_channels});
} else {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
net.AddRandomInput<D, float>("Filter",
{kernel_h, kernel_w, output_channels,
channels});
net.AddRandomInput<D, float>("Bias", {output_channels});
}
if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(&net, "Input", "InputImage",
......@@ -65,15 +74,17 @@ static void Conv2d(int iters,
.Finalize(net.NewOperatorDef());
}
net.Setup(D);
// Warm-up
for (int i = 0; i < 2; ++i) {
net.Run();
net.Sync();
}
mace::testing::StartTiming();
while (iters--) {
net.Run();
net.Sync();
}
}
......@@ -112,7 +123,8 @@ static void Conv2d(int iters,
#define BM_CONV_2D(N, C, H, W, KH, KW, S, D, P, OC) \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, CPU); \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, OPENCL); \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, half, OPENCL); \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, NEON);
BM_CONV_2D(1, 256, 64, 64, 3, 3, 1, 1, VALID, 256);
......@@ -133,6 +145,8 @@ BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, 1, VALID, 128);
BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, 1, VALID, 128); // Test bad alignments
BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, 1, SAME, 128);
BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, 1, SAME, 128);
BM_CONV_2D(1, 3, 224, 224, 3, 3, 2, 1, SAME, 32);
BM_CONV_2D(1, 3, 224, 224, 3, 3, 2, 1, VALID, 32);
BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, 1, SAME, 128);
BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, 1, SAME, 128);
......
......@@ -14,17 +14,17 @@ namespace test {
class Conv2dOpTest : public OpsTestBase {};
template<DeviceType D, typename T>
void TestNHWCSimple3x3VALID() {
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, T>(
"Input", {1, 3, 3, 2},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
"Input", {1, 3, 3, 2},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<D, T>(
"Filter", {3, 3, 1, 2},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
"Filter", {3, 3, 1, 2},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
net.AddInputFromArray<D, T>("Bias", {1}, {0.1f});
if (D == DeviceType::OPENCL) {
......@@ -35,15 +35,15 @@ void TestNHWCSimple3x3VALID() {
BufferToImage<D, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
net.RunOp(D);
......@@ -53,15 +53,15 @@ void TestNHWCSimple3x3VALID() {
} else {
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
......@@ -70,18 +70,18 @@ void TestNHWCSimple3x3VALID() {
ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 0.01);
}
template<DeviceType D, typename T>
void TestNHWCSimple3x3SAME() {
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, T>(
"Input", {1, 3, 3, 2},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
"Input", {1, 3, 3, 2},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<D, T>(
"Filter", {3, 3, 1, 2},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
"Filter", {3, 3, 1, 2},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
net.AddInputFromArray<D, T>("Bias", {1}, {0.1f});
if (D == DeviceType::OPENCL) {
......@@ -92,15 +92,15 @@ void TestNHWCSimple3x3SAME() {
BufferToImage<D, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
......@@ -110,22 +110,22 @@ void TestNHWCSimple3x3SAME() {
} else {
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
auto expected = CreateTensor<float>(
      {1, 3, 3, 1},
      {8.1f, 12.1f, 8.1f, 12.1f, 18.1f, 12.1f, 8.1f, 12.1f, 8.1f});
ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 0.01);
}
......@@ -140,18 +140,18 @@ TEST_F(Conv2dOpTest, OPENCLSimple) {
TestNHWCSimple3x3SAME<DeviceType::OPENCL, float>();
}
template<DeviceType D, typename T>
void TestNHWCSimple3x3WithoutBias() {
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, T>(
"Input", {1, 3, 3, 2},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
"Input", {1, 3, 3, 2},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<D, T>(
"Filter", {3, 3, 1, 2},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
"Filter", {3, 3, 1, 2},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(&net, "Input", "InputImage",
......@@ -160,14 +160,14 @@ void TestNHWCSimple3x3WithoutBias() {
kernels::BufferType::CONV2D_FILTER);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("FilterImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
......@@ -175,14 +175,14 @@ void TestNHWCSimple3x3WithoutBias() {
kernels::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
.Input("Filter")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("Input")
.Input("Filter")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
......@@ -202,21 +202,21 @@ TEST_F(Conv2dOpTest, OPENCLWithoutBias) {
TestNHWCSimple3x3WithoutBias<DeviceType::OPENCL, float>();
}
template<DeviceType D, typename T>
static void TestNHWCCombined3x3() {
// Construct graph
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, T>(
"Input", {1, 5, 5, 2}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
"Input", {1, 5, 5, 2}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<D, T>(
"Filter", {3, 3, 2, 2},
{1.0f, 1.0f, 0.5f, 0.5f, 1.0f, 1.0f, 0.5f, 0.5f, 1.0f, 1.0f, 0.5f, 0.5f,
1.0f, 1.0f, 0.5f, 0.5f, 1.0f, 1.0f, 0.5f, 0.5f, 1.0f, 1.0f, 0.5f, 0.5f,
1.0f, 1.0f, 0.5f, 0.5f, 1.0f, 1.0f, 0.5f, 0.5f, 1.0f, 1.0f, 0.5f, 0.5f});
"Filter", {3, 3, 2, 2},
{1.0f, 1.0f, 0.5f, 0.5f, 1.0f, 1.0f, 0.5f, 0.5f, 1.0f, 1.0f, 0.5f, 0.5f,
1.0f, 1.0f, 0.5f, 0.5f, 1.0f, 1.0f, 0.5f, 0.5f, 1.0f, 1.0f, 0.5f, 0.5f,
1.0f, 1.0f, 0.5f, 0.5f, 1.0f, 1.0f, 0.5f, 0.5f, 1.0f, 1.0f, 0.5f, 0.5f});
net.AddInputFromArray<D, T>("Bias", {2}, {0.1f, 0.2f});
if (D == DeviceType::OPENCL) {
......@@ -228,15 +228,15 @@ static void TestNHWCCombined3x3() {
kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
......@@ -244,23 +244,23 @@ static void TestNHWCCombined3x3() {
kernels::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
// Check
auto expected = CreateTensor<float>(
      {1, 3, 3, 2}, {8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f, 12.1f, 6.2f, 18.1f,
                     9.2f, 12.1f, 6.2f, 8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f});
ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 0.01);
}
......@@ -272,24 +272,24 @@ TEST_F(Conv2dOpTest, OPENCLStride2) {
TestNHWCCombined3x3<DeviceType::OPENCL, float>();
}
template<DeviceType D>
void TestConv1x1() {
// Construct graph
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, float>(
"Input", {1, 3, 10, 5},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
"Input", {1, 3, 10, 5},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<D, float>(
"Filter", {1, 1, 2, 5},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f});
"Filter", {1, 1, 2, 5},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f});
net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f});
if (D == DeviceType::OPENCL) {
......@@ -301,14 +301,14 @@ void TestConv1x1() {
kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
......@@ -316,27 +316,27 @@ void TestConv1x1() {
kernels::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
// Check
auto expected = CreateTensor<float>(
      {1, 3, 10, 2},
      {5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
       5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
       5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
       5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
       5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
       5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
......@@ -345,7 +345,7 @@ TEST_F(Conv2dOpTest, CPUConv1x1) { TestConv1x1<DeviceType::CPU>(); }
TEST_F(Conv2dOpTest, OPENCLConv1x1) { TestConv1x1<DeviceType::OPENCL>(); }
template<DeviceType D, typename T>
static void TestComplexConvNxNS12(const std::vector<index_t> &shape,
const int stride) {
testing::internal::LogToStderr();
......@@ -361,20 +361,20 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape,
// Construct graph
OpsTestNet net;
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, T>(
"Filter", {kernel_h, kernel_w, output_channels, input_channels});
"Filter", {kernel_h, kernel_w, output_channels, input_channels});
net.AddRandomInput<D, T>("Bias", {output_channels});
// run on cpu
......@@ -392,15 +392,15 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape,
kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
......@@ -430,7 +430,7 @@ TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS34) {
TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 32, 13, 17}, 4);
}
template<DeviceType D>
static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
const std::vector<index_t> &filter_shape,
const std::vector<int> &dilations) {
......@@ -449,30 +449,30 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
// Construct graph
OpsTestNet net;
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {dilations[0], dilations[1]})
.Finalize(net.NewOperatorDef());
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {dilations[0], dilations[1]})
.Finalize(net.NewOperatorDef());
std::vector<float> float_input_data;
GenerateRandomRealTypeData({batch, height, width, input_channels},
&float_input_data);
std::vector<float> float_filter_data;
GenerateRandomRealTypeData(
{kernel_h, kernel_w, output_channels, input_channels},
&float_filter_data);
std::vector<float> float_bias_data;
GenerateRandomRealTypeData({output_channels}, &float_bias_data);
// Add input data
net.AddInputFromArray<D, float>(
"Input", {batch, height, width, input_channels}, float_input_data);
"Input", {batch, height, width, input_channels}, float_input_data);
net.AddInputFromArray<D, float>(
"Filter", {kernel_h, kernel_w, output_channels, input_channels},
float_filter_data);
"Filter", {kernel_h, kernel_w, output_channels, input_channels},
float_filter_data);
net.AddInputFromArray<D, float>("Bias", {output_channels}, float_bias_data);
// run on cpu
......@@ -490,15 +490,15 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {dilations[0], dilations[1]})
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {dilations[0], dilations[1]})
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
......@@ -566,7 +566,7 @@ TEST_F(Conv2dOpTest, OPENCLHalfConv7x7Dilation4) {
{4, 4});
}
template<DeviceType D, typename T>
static void TestDilationConvNxN(const std::vector<index_t> &shape,
const int dilation_rate) {
testing::internal::LogToStderr();
......@@ -583,20 +583,20 @@ static void TestDilationConvNxN(const std::vector<index_t> &shape,
// Construct graph
OpsTestNet net;
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {dilation_rate, dilation_rate})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {dilation_rate, dilation_rate})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, T>(
"Filter", {kernel_h, kernel_w, output_channels, input_channels});
"Filter", {kernel_h, kernel_w, output_channels, input_channels});
net.AddRandomInput<D, T>("Bias", {output_channels});
// run on cpu
......@@ -614,15 +614,15 @@ static void TestDilationConvNxN(const std::vector<index_t> &shape,
kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {dilation_rate, dilation_rate})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {dilation_rate, dilation_rate})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
......@@ -651,7 +651,7 @@ TEST_F(Conv2dOpTest, OPENCLUnalignedDilation4) {
TestDilationConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7}, 4);
}
template<DeviceType D, typename T>
static void TestArbitraryPadConvNxN(const std::vector<index_t> &shape,
const std::vector<int> &paddings) {
testing::internal::LogToStderr();
......@@ -667,19 +667,19 @@ static void TestArbitraryPadConvNxN(const std::vector<index_t> &shape,
// Construct graph
OpsTestNet net;
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntsArg("padding_values", paddings)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntsArg("padding_values", paddings)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, T>(
"Filter", {kernel_h, kernel_w, output_channels, input_channels});
"Filter", {kernel_h, kernel_w, output_channels, input_channels});
net.AddRandomInput<D, T>("Bias", {output_channels});
// run on cpu
......@@ -697,14 +697,14 @@ static void TestArbitraryPadConvNxN(const std::vector<index_t> &shape,
kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntsArg("padding_values", paddings)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntsArg("padding_values", paddings)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
......@@ -733,6 +733,83 @@ TEST_F(Conv2dOpTest, OPENCLUnalignedPad4) {
TestArbitraryPadConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7}, {4, 4});
}
static void TestNeonArbitraryPadConvNxN(const std::vector<index_t> &shape,
const std::vector<int> &paddings) {
testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w) {
srand(time(NULL));
// generate random input
index_t batch = 1;
index_t height = shape[0];
index_t width = shape[1];
index_t input_channels = shape[2];
index_t output_channels = shape[3];
// Construct graph
OpsTestNet net;
OpDefBuilder("Conv2D", "Conv2dTestCPU")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntsArg("padding_values", paddings)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<float>::value))
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::CPU, float>("Input",
{batch, height, width,
input_channels});
net.AddRandomInput<DeviceType::CPU, float>(
"Filter", {kernel_h, kernel_w, output_channels, input_channels});
net.AddRandomInput<DeviceType::CPU, float>("Bias", {output_channels});
// run cpu
net.RunOp();
// run neon
OpDefBuilder("Conv2D", "Conv2dTestNEON")
.Input("InputNeon")
.Input("FilterNeon")
.Input("Bias")
.Output("OutputNeon")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntsArg("padding_values", paddings)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<float>::value))
.Finalize(net.NewOperatorDef());
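// NEON expects NCHW input and OIHW filters, so transpose the NHWC/HWOI
// buffers used by the CPU reference run before feeding the NEON op.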
net.FillNHWCInputToNCHWInput<DeviceType::CPU, float>("InputNeon", "Input");
net.FillHWOIInputToOIHWInput<DeviceType::CPU, float>("FilterNeon",
"Filter");
// Run on device
net.RunOp(DeviceType::NEON);
net.FillNHWCInputToNCHWInput<DeviceType::CPU, float>("OutputExpected",
"Output");
ExpectTensorNear<float>(*net.GetOutput("OutputExpected"),
*net.GetOutput("OutputNeon"),
0.001);
};
for (int kernel_size : {1, 3, 5}) {
for (int stride : {1, 2}) {
if (stride < kernel_size) {
func(kernel_size, kernel_size, stride, stride);
}
}
}
}
TEST_F(Conv2dOpTest, NEONTest) {
TestNeonArbitraryPadConvNxN({32, 34, 32, 64}, {0, 0});
TestNeonArbitraryPadConvNxN({32, 32, 32, 64}, {1, 1});
TestNeonArbitraryPadConvNxN({128, 128, 16, 16}, {2, 2});
TestNeonArbitraryPadConvNxN({107, 113, 5, 7}, {4, 4});
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -25,6 +25,12 @@ void Register_DepthwiseConv2d(OperatorRegistry *op_registry) {
.TypeConstraint<half>("T")
.Build(),
DepthwiseConv2dOp<DeviceType::OPENCL, half>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d")
.Device(DeviceType::NEON)
.TypeConstraint<float>("T")
.Build(),
DepthwiseConv2dOp<DeviceType::NEON, float>);
}
} // namespace ops
......
......@@ -29,10 +29,19 @@ static void DepthwiseConv2d(int iters,
OpsTestNet net;
// Add input data
net.AddRandomInput<D, float>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, float>(
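// NEON kernels consume NCHW inputs and (multiplier, in_channels, kh, kw)
// filters, so the benchmark allocates those layouts directly; other devices
// keep NHWC.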
if (D == DeviceType::NEON) {
net.AddRandomInput<D, float>("Input",
{batch, input_channels, height, width});
net.AddRandomInput<D, float>(
"Filter", {multiplier, input_channels, kernel_h, kernel_w});
net.AddRandomInput<D, float>("Bias", {input_channels * multiplier});
} else {
net.AddRandomInput<D, float>("Input",
{batch, height, width, input_channels});
net.AddRandomInput<D, float>(
"Filter", {kernel_h, kernel_w, input_channels, multiplier});
net.AddRandomInput<D, float>("Bias", {input_channels * multiplier});
net.AddRandomInput<D, float>("Bias", {input_channels * multiplier});
}
if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(&net, "Input", "InputImage",
......@@ -64,15 +73,17 @@ static void DepthwiseConv2d(int iters,
.Finalize(net.NewOperatorDef());
}
net.Setup(D);
// Warm-up
for (int i = 0; i < 2; ++i) {
net.RunOp(D);
net.Run();
net.Sync();
}
mace::testing::StartTiming();
while (iters--) {
net.RunOp(D);
net.Run();
net.Sync();
}
}
......@@ -108,10 +119,16 @@ static void DepthwiseConv2d(int iters,
#define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, M) \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, CPU); \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, OPENCL); \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, half, OPENCL); \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, NEON);
BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 1, SAME, 1);
BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 2, SAME, 1);
BM_DEPTHWISE_CONV_2D(1, 32, 56, 56, 3, 3, 2, VALID, 1);
BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 2, VALID, 1);
BM_DEPTHWISE_CONV_2D(1, 32, 224, 224, 3, 3, 2, VALID, 1);
BM_DEPTHWISE_CONV_2D(1, 64, 56, 56, 3, 3, 2, VALID, 1);
BM_DEPTHWISE_CONV_2D(1, 64, 112, 112, 3, 3, 2, VALID, 1);
BM_DEPTHWISE_CONV_2D(1, 64, 224, 224, 3, 3, 2, VALID, 1);
BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 1);
BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 1);
BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 1);
......@@ -124,6 +141,10 @@ BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 1);
BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 1);
BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 1);
BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, SAME, 1);
BM_DEPTHWISE_CONV_2D(1, 3, 112, 112, 3, 3, 2, VALID, 1);
BM_DEPTHWISE_CONV_2D(1, 3, 224, 224, 3, 3, 2, SAME, 1);
BM_DEPTHWISE_CONV_2D(1, 8, 224, 224, 3, 3, 2, SAME, 1);
} // namespace test
} // namespace ops
......
......@@ -11,7 +11,7 @@ namespace test {
class DepthwiseConv2dOpTest : public OpsTestBase {};
template<DeviceType D, typename T>
void SimpleValidTest() {
testing::internal::LogToStderr();
// Construct graph
......@@ -19,10 +19,10 @@ void SimpleValidTest() {
// Add input data
net.AddInputFromArray<D, float>(
"Input", {1, 3, 3, 2},
{1, 2, 2, 4, 3, 6, 4, 8, 5, 10, 6, 12, 7, 14, 8, 16, 9, 18});
"Input", {1, 3, 3, 2},
{1, 2, 2, 4, 3, 6, 4, 8, 5, 10, 6, 12, 7, 14, 8, 16, 9, 18});
net.AddInputFromArray<D, float>(
"Filter", {2, 2, 2, 1}, {1.0f, 2.0f, 2.0f, 4.0f, 3.0f, 6.0f, 4.0f, 8.0f});
"Filter", {2, 2, 2, 1}, {1.0f, 2.0f, 2.0f, 4.0f, 3.0f, 6.0f, 4.0f, 8.0f});
net.AddInputFromArray<D, float>("Bias", {2}, {.1f, .2f});
if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(&net, "Input", "InputImage",
......@@ -32,15 +32,15 @@ void SimpleValidTest() {
BufferToImage<D, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
net.RunOp(D);
......@@ -50,23 +50,23 @@ void SimpleValidTest() {
} else {
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
// Check
auto expected = CreateTensor<T>(
{1, 2, 2, 2}, VectorStaticCast<T>({37.1f, 148.2f, 47.1f, 188.2f, 67.1f,
268.2f, 77.1f, 308.2f}));
ExpectTensorNear<T>(*expected, *net.GetOutput("Output"), 1e-5);
}
......@@ -83,7 +83,7 @@ TEST_F(DepthwiseConv2dOpTest, SimpleOpenCLHalf) {
SimpleValidTest<DeviceType::OPENCL, half>();
}
template<DeviceType D, typename T>
void ComplexValidTest() {
testing::internal::LogToStderr();
// Construct graph
......@@ -91,41 +91,41 @@ void ComplexValidTest() {
// Add input data
net.AddInputFromArray<D, float>(
"Input", {1, 10, 10, 3},
{0.0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11,
0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23,
0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32, 0.33, 0.34, 0.35,
0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47,
0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59,
0.6, 0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67, 0.68, 0.69, 0.7, 0.71,
0.72, 0.73, 0.74, 0.75, 0.76, 0.77, 0.78, 0.79, 0.8, 0.81, 0.82, 0.83,
0.84, 0.85, 0.86, 0.87, 0.88, 0.89, 0.9, 0.91, 0.92, 0.93, 0.94, 0.95,
0.96, 0.97, 0.98, 0.99, 1.0, 1.01, 1.02, 1.03, 1.04, 1.05, 1.06, 1.07,
1.08, 1.09, 1.1, 1.11, 1.12, 1.13, 1.14, 1.15, 1.16, 1.17, 1.18, 1.19,
1.2, 1.21, 1.22, 1.23, 1.24, 1.25, 1.26, 1.27, 1.28, 1.29, 1.3, 1.31,
1.32, 1.33, 1.34, 1.35, 1.36, 1.37, 1.38, 1.39, 1.4, 1.41, 1.42, 1.43,
1.44, 1.45, 1.46, 1.47, 1.48, 1.49, 1.5, 1.51, 1.52, 1.53, 1.54, 1.55,
1.56, 1.57, 1.58, 1.59, 1.6, 1.61, 1.62, 1.63, 1.64, 1.65, 1.66, 1.67,
1.68, 1.69, 1.7, 1.71, 1.72, 1.73, 1.74, 1.75, 1.76, 1.77, 1.78, 1.79,
1.8, 1.81, 1.82, 1.83, 1.84, 1.85, 1.86, 1.87, 1.88, 1.89, 1.9, 1.91,
1.92, 1.93, 1.94, 1.95, 1.96, 1.97, 1.98, 1.99, 2.0, 2.01, 2.02, 2.03,
2.04, 2.05, 2.06, 2.07, 2.08, 2.09, 2.1, 2.11, 2.12, 2.13, 2.14, 2.15,
2.16, 2.17, 2.18, 2.19, 2.2, 2.21, 2.22, 2.23, 2.24, 2.25, 2.26, 2.27,
2.28, 2.29, 2.3, 2.31, 2.32, 2.33, 2.34, 2.35, 2.36, 2.37, 2.38, 2.39,
2.4, 2.41, 2.42, 2.43, 2.44, 2.45, 2.46, 2.47, 2.48, 2.49, 2.5, 2.51,
2.52, 2.53, 2.54, 2.55, 2.56, 2.57, 2.58, 2.59, 2.6, 2.61, 2.62, 2.63,
2.64, 2.65, 2.66, 2.67, 2.68, 2.69, 2.7, 2.71, 2.72, 2.73, 2.74, 2.75,
2.76, 2.77, 2.78, 2.79, 2.8, 2.81, 2.82, 2.83, 2.84, 2.85, 2.86, 2.87,
2.88, 2.89, 2.9, 2.91, 2.92, 2.93, 2.94, 2.95, 2.96, 2.97, 2.98, 2.99});
"Input", {1, 10, 10, 3},
{0.0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11,
0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23,
0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32, 0.33, 0.34, 0.35,
0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47,
0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59,
0.6, 0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67, 0.68, 0.69, 0.7, 0.71,
0.72, 0.73, 0.74, 0.75, 0.76, 0.77, 0.78, 0.79, 0.8, 0.81, 0.82, 0.83,
0.84, 0.85, 0.86, 0.87, 0.88, 0.89, 0.9, 0.91, 0.92, 0.93, 0.94, 0.95,
0.96, 0.97, 0.98, 0.99, 1.0, 1.01, 1.02, 1.03, 1.04, 1.05, 1.06, 1.07,
1.08, 1.09, 1.1, 1.11, 1.12, 1.13, 1.14, 1.15, 1.16, 1.17, 1.18, 1.19,
1.2, 1.21, 1.22, 1.23, 1.24, 1.25, 1.26, 1.27, 1.28, 1.29, 1.3, 1.31,
1.32, 1.33, 1.34, 1.35, 1.36, 1.37, 1.38, 1.39, 1.4, 1.41, 1.42, 1.43,
1.44, 1.45, 1.46, 1.47, 1.48, 1.49, 1.5, 1.51, 1.52, 1.53, 1.54, 1.55,
1.56, 1.57, 1.58, 1.59, 1.6, 1.61, 1.62, 1.63, 1.64, 1.65, 1.66, 1.67,
1.68, 1.69, 1.7, 1.71, 1.72, 1.73, 1.74, 1.75, 1.76, 1.77, 1.78, 1.79,
1.8, 1.81, 1.82, 1.83, 1.84, 1.85, 1.86, 1.87, 1.88, 1.89, 1.9, 1.91,
1.92, 1.93, 1.94, 1.95, 1.96, 1.97, 1.98, 1.99, 2.0, 2.01, 2.02, 2.03,
2.04, 2.05, 2.06, 2.07, 2.08, 2.09, 2.1, 2.11, 2.12, 2.13, 2.14, 2.15,
2.16, 2.17, 2.18, 2.19, 2.2, 2.21, 2.22, 2.23, 2.24, 2.25, 2.26, 2.27,
2.28, 2.29, 2.3, 2.31, 2.32, 2.33, 2.34, 2.35, 2.36, 2.37, 2.38, 2.39,
2.4, 2.41, 2.42, 2.43, 2.44, 2.45, 2.46, 2.47, 2.48, 2.49, 2.5, 2.51,
2.52, 2.53, 2.54, 2.55, 2.56, 2.57, 2.58, 2.59, 2.6, 2.61, 2.62, 2.63,
2.64, 2.65, 2.66, 2.67, 2.68, 2.69, 2.7, 2.71, 2.72, 2.73, 2.74, 2.75,
2.76, 2.77, 2.78, 2.79, 2.8, 2.81, 2.82, 2.83, 2.84, 2.85, 2.86, 2.87,
2.88, 2.89, 2.9, 2.91, 2.92, 2.93, 2.94, 2.95, 2.96, 2.97, 2.98, 2.99});
net.AddInputFromArray<D, float>(
"Filter", {5, 5, 3, 1},
{0.0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1,
0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21,
0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32,
0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43,
0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.54,
0.55, 0.56, 0.57, 0.58, 0.59, 0.6, 0.61, 0.62, 0.63, 0.64, 0.65,
0.66, 0.67, 0.68, 0.69, 0.7, 0.71, 0.72, 0.73, 0.74});
"Filter", {5, 5, 3, 1},
{0.0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1,
0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21,
0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32,
0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43,
0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.54,
0.55, 0.56, 0.57, 0.58, 0.59, 0.6, 0.61, 0.62, 0.63, 0.64, 0.65,
0.66, 0.67, 0.68, 0.69, 0.7, 0.71, 0.72, 0.73, 0.74});
net.AddInputFromArray<D, float>("Bias", {6},
{0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f});
if (D == DeviceType::OPENCL) {
......@@ -136,15 +136,15 @@ void ComplexValidTest() {
BufferToImage<D, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
net.RunOp(D);
......@@ -154,38 +154,38 @@ void ComplexValidTest() {
} else {
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
// Check
auto expected = CreateTensor<T>(
{1, 5, 5, 3},
VectorStaticCast<T>(
{4.48200035, 4.63479996, 4.79079962, 5.85899973, 6.05599976,
6.25699997, 6.38100004, 6.59000015, 6.80300045, 6.90299988,
7.1239996, 7.34899998, 4.03559971, 4.16820002, 4.30319977,
8.90999985, 9.1760006, 9.44599915, 11.20499992, 11.54500103,
11.89000034, 11.74499989, 12.09999943, 12.46000004, 12.28499985,
12.65500069, 13.03000069, 7.00200033, 7.22399998, 7.44900036,
13.4100008, 13.79599953, 14.18599987, 16.60500145, 17.09499741,
17.59000015, 17.14500046, 17.65000153, 18.15999794, 17.68499947,
18.20499992, 18.72999954, 9.97200012, 10.28399944, 10.59899998,
17.90999985, 18.41600037, 18.92599869, 22.00500107, 22.64500046,
23.28999901, 22.54500008, 23.19999886, 23.8599987, 23.0850029,
23.75500107, 24.43000031, 12.94200039, 13.34400082, 13.7489996,
6.97500038, 7.29659986, 7.62060022, 8.32049942, 8.72700024,
9.13650036, 8.5095005, 8.92500019, 9.34349918, 8.69849968,
9.12300014, 9.55049992, 4.55220032, 4.80690002, 5.06340027}));
ExpectTensorNear<T>(*expected, *net.GetOutput("Output"), 0.2);
}
......@@ -202,7 +202,7 @@ TEST_F(DepthwiseConv2dOpTest, ComplexOpenCLHalf) {
ComplexValidTest<DeviceType::OPENCL, half>();
}
template<DeviceType D, typename T>
void TestNxNS12(const index_t height, const index_t width) {
testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
......@@ -219,18 +219,18 @@ void TestNxNS12(const index_t height, const index_t width) {
net.AddRandomInput<D, float>("Input",
{batch, height, width, input_channels});
net.AddRandomInput<D, float>(
"Filter", {kernel_h, kernel_w, input_channels, multiplier});
"Filter", {kernel_h, kernel_w, input_channels, multiplier});
net.AddRandomInput<D, float>("Bias", {multiplier * input_channels});
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<float>::value))
.Finalize(net.NewOperatorDef());
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<float>::value))
.Finalize(net.NewOperatorDef());
// Run on cpu
net.RunOp();
......@@ -246,15 +246,15 @@ void TestNxNS12(const index_t height, const index_t width) {
BufferToImage<D, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
net.RunOp(D);
......@@ -263,15 +263,15 @@ void TestNxNS12(const index_t height, const index_t width) {
kernels::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("DeviceOutput")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("DeviceOutput")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
......@@ -315,6 +315,86 @@ TEST_F(DepthwiseConv2dOpTest, OpenCLUnalignedNxNS12Half) {
TestNxNS12<DeviceType::OPENCL, half>(107, 113);
}
void TestNEONNxNS12(const index_t height,
const index_t width,
const index_t input_channels,
const index_t multiplier) {
testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
Padding type) {
// generate random input
index_t batch = 1;
// Construct graph
OpsTestNet net;
// Add input data
net.AddRandomInput<CPU, float>("Input",
{batch, height, width, input_channels});
net.AddRandomInput<CPU, float>(
"Filter", {kernel_h, kernel_w, input_channels, multiplier});
net.AddRandomInput<CPU, float>("Bias", {multiplier * input_channels});
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<float>::value))
.Finalize(net.NewOperatorDef());
// Run on cpu
net.RunOp();
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("InputNeon")
.Input("FilterNeon")
.Input("Bias")
.Output("OutputNeon")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<float>::value))
.Finalize(net.NewOperatorDef());
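// Convert the NHWC input and HWIO depthwise filter into the NCHW/OIHW
// layouts consumed by the NEON kernel.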
net.FillNHWCInputToNCHWInput<DeviceType::CPU, float>("InputNeon", "Input");
net.FillHWIOInputToOIHWInput<DeviceType::CPU, float>("FilterNeon",
"Filter");
// Run
net.RunOp(NEON);
net.FillNHWCInputToNCHWInput<DeviceType::CPU, float>("OutputExpected",
"Output");
// Check
ExpectTensorNear<float>(*net.GetOutput("OutputExpected"),
*net.GetOutput("OutputNeon"),
0.001);
};
for (int kernel_size : {1, 3, 5}) {
for (int stride : {1, 2}) {
if (kernel_size > stride) {
func(kernel_size, kernel_size, stride, stride, VALID);
func(kernel_size, kernel_size, stride, stride, SAME);
}
}
}
}
TEST_F(DepthwiseConv2dOpTest, NEONTest) {
TestNEONNxNS12(4, 4, 32, 1);
TestNEONNxNS12(64, 64, 32, 1);
TestNEONNxNS12(112, 112, 32, 1);
TestNEONNxNS12(128, 128, 15, 1);
TestNEONNxNS12(107, 113, 15, 1);
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -25,6 +25,11 @@ void Register_FoldedBatchNorm(OperatorRegistry *op_registry) {
.TypeConstraint<half>("T")
.Build(),
FoldedBatchNormOp<DeviceType::OPENCL, half>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm")
.Device(DeviceType::NEON)
.TypeConstraint<float>("T")
.Build(),
FoldedBatchNormOp<DeviceType::NEON, float>);
}
} // namespace ops
......
......@@ -25,6 +25,11 @@ void Register_FusedConv2D(OperatorRegistry *op_registry) {
.TypeConstraint<half>("T")
.Build(),
FusedConv2dOp<DeviceType::OPENCL, half>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FusedConv2D")
.Device(DeviceType::NEON)
.TypeConstraint<float>("T")
.Build(),
FusedConv2dOp<DeviceType::NEON, float>);
}
} // namespace ops
......
......@@ -13,17 +13,17 @@ namespace test {
class FusedConv2dOpTest : public OpsTestBase {};
template<DeviceType D, typename T>
void TestNHWCSimple3x3VALID() {
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, T>(
"Input", {1, 3, 3, 2},
{-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1});
"Input", {1, 3, 3, 2},
{-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1});
net.AddInputFromArray<D, T>(
"Filter", {3, 3, 2, 1},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
"Filter", {3, 3, 2, 1},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
net.AddInputFromArray<D, T>("Bias", {1}, {-0.1f});
if (D == DeviceType::OPENCL) {
......@@ -34,15 +34,15 @@ void TestNHWCSimple3x3VALID() {
BufferToImage<D, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
net.RunOp(D);
......@@ -52,15 +52,15 @@ void TestNHWCSimple3x3VALID() {
} else {
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
......@@ -69,18 +69,18 @@ void TestNHWCSimple3x3VALID() {
ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 0.01);
}
template<DeviceType D, typename T>
void TestNHWCSimple3x3SAME() {
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, T>(
"Input", {1, 3, 3, 2},
{-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1});
"Input", {1, 3, 3, 2},
{-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1});
net.AddInputFromArray<D, T>(
"Filter", {3, 3, 2, 1},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
"Filter", {3, 3, 2, 1},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
net.AddInputFromArray<D, T>("Bias", {1}, {-0.1f});
if (D == DeviceType::OPENCL) {
......@@ -91,15 +91,15 @@ void TestNHWCSimple3x3SAME() {
BufferToImage<D, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
......@@ -109,21 +109,21 @@ void TestNHWCSimple3x3SAME() {
} else {
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
auto expected = CreateTensor<float>(
{1, 3, 3, 1}, {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f});
ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 0.01);
}
......@@ -138,18 +138,18 @@ TEST_F(FusedConv2dOpTest, OPENCLSimple) {
TestNHWCSimple3x3SAME<DeviceType::OPENCL, float>();
}
template<DeviceType D, typename T>
void TestNHWCSimple3x3WithoutBias() {
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, T>(
"Input", {1, 3, 3, 2},
{-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1});
"Input", {1, 3, 3, 2},
{-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1});
net.AddInputFromArray<D, T>(
"Filter", {3, 3, 2, 1},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
"Filter", {3, 3, 2, 1},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(&net, "Input", "InputImage",
......@@ -158,14 +158,14 @@ void TestNHWCSimple3x3WithoutBias() {
kernels::BufferType::CONV2D_FILTER);
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("FilterImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
......@@ -173,14 +173,14 @@ void TestNHWCSimple3x3WithoutBias() {
kernels::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("Input")
.Input("Filter")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("Input")
.Input("Filter")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
......@@ -200,24 +200,24 @@ TEST_F(FusedConv2dOpTest, OPENCLWithoutBias) {
TestNHWCSimple3x3WithoutBias<DeviceType::OPENCL, float>();
}
template<DeviceType D>
void TestConv1x1() {
// Construct graph
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, float>(
"Input", {1, 3, 10, 5},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
"Input", {1, 3, 10, 5},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<D, float>(
"Filter", {1, 1, 5, 2},
{1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f});
"Filter", {1, 1, 5, 2},
{1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f});
net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f});
if (D == DeviceType::OPENCL) {
......@@ -229,14 +229,14 @@ void TestConv1x1() {
kernels::BufferType::ARGUMENT);
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
......@@ -244,27 +244,27 @@ void TestConv1x1() {
kernels::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
// Check
auto expected = CreateTensor<float>(
{1, 3, 10, 2},
{5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
......@@ -273,7 +273,7 @@ TEST_F(FusedConv2dOpTest, CPUConv1x1) { TestConv1x1<DeviceType::CPU>(); }
TEST_F(FusedConv2dOpTest, OPENCLConv1x1) { TestConv1x1<DeviceType::OPENCL>(); }
template<DeviceType D, typename T>
static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
......@@ -288,20 +288,20 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
// Construct graph
OpsTestNet net;
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, T>(
"Filter", {kernel_h, kernel_w, output_channels, input_channels});
"Filter", {kernel_h, kernel_w, output_channels, input_channels});
net.AddRandomInput<D, T>("Bias", {output_channels});
// run on cpu
......@@ -319,15 +319,15 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
kernels::BufferType::ARGUMENT);
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
......@@ -348,7 +348,7 @@ TEST_F(FusedConv2dOpTest, OPENCLUnalignedConvNxNS12) {
TestComplexConvNxNS12<DeviceType::OPENCL, float>({107, 113, 5, 7});
}
template<DeviceType D>
static void TestHalfComplexConvNxNS12(const std::vector<index_t> &shape) {
testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
......@@ -363,30 +363,30 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &shape) {
// Construct graph
OpsTestNet net;
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
std::vector<float> float_input_data;
GenerateRandomRealTypeData({batch, height, width, input_channels},
&float_input_data);
std::vector<float> float_filter_data;
GenerateRandomRealTypeData(
{kernel_h, kernel_w, output_channels, input_channels},
&float_filter_data);
std::vector<float> float_bias_data;
GenerateRandomRealTypeData({output_channels}, &float_bias_data);
// Add input data
net.AddInputFromArray<D, float>(
"Input", {batch, height, width, input_channels}, float_input_data);
"Input", {batch, height, width, input_channels}, float_input_data);
net.AddInputFromArray<D, float>(
"Filter", {kernel_h, kernel_w, output_channels, input_channels},
float_filter_data);
"Filter", {kernel_h, kernel_w, output_channels, input_channels},
float_filter_data);
net.AddInputFromArray<D, float>("Bias", {output_channels}, float_bias_data);
// run on cpu
......@@ -404,15 +404,15 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &shape) {
kernels::BufferType::ARGUMENT);
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
......@@ -433,7 +433,7 @@ TEST_F(FusedConv2dOpTest, OPENCLHalfAlignedConvNxNS12) {
TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32, 32, 64});
}
template<DeviceType D, typename T>
static void TestGeneralConvNxNS12(const std::vector<index_t> &image_shape,
const std::vector<index_t> &filter_shape) {
testing::internal::LogToStderr();
......@@ -451,20 +451,20 @@ static void TestGeneralConvNxNS12(const std::vector<index_t> &image_shape,
// Construct graph
OpsTestNet net;
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, T>(
"Filter", {kernel_h, kernel_w, output_channels, input_channels});
"Filter", {kernel_h, kernel_w, output_channels, input_channels});
net.AddRandomInput<D, T>("Bias", {output_channels});
// run on cpu
......@@ -482,15 +482,15 @@ static void TestGeneralConvNxNS12(const std::vector<index_t> &image_shape,
kernels::BufferType::ARGUMENT);
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
......@@ -513,7 +513,7 @@ TEST_F(FusedConv2dOpTest, OPENCL15X1ConvNxNS12) {
TestGeneralConvNxNS12<DeviceType::OPENCL, float>({40, 40}, {15, 1, 32, 64});
}
template<DeviceType D, typename T>
static void TestAtrousConvNxN(const std::vector<index_t> &shape,
const int dilation) {
testing::internal::LogToStderr();
......@@ -530,20 +530,20 @@ static void TestAtrousConvNxN(const std::vector<index_t> &shape,
// Construct graph
OpsTestNet net;
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {dilation, dilation})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {dilation, dilation})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, T>(
"Filter", {kernel_h, kernel_w, output_channels, input_channels});
"Filter", {kernel_h, kernel_w, output_channels, input_channels});
net.AddRandomInput<D, T>("Bias", {output_channels});
// run on cpu
......@@ -561,15 +561,15 @@ static void TestAtrousConvNxN(const std::vector<index_t> &shape,
kernels::BufferType::ARGUMENT);
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {dilation, dilation})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {dilation, dilation})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
......@@ -598,7 +598,7 @@ TEST_F(FusedConv2dOpTest, OPENCLUnalignedAtrousConvNxN) {
TestAtrousConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7}, 2);
}
template<DeviceType D>
static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
const std::vector<index_t> &filter_shape,
const std::vector<int> &dilations) {
......@@ -617,20 +617,20 @@ static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
// Construct graph
OpsTestNet net;
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, float>("Input",
{batch, height, width, input_channels});
net.AddRandomInput<D, float>(
"Filter", {kernel_h, kernel_w, output_channels, input_channels});
"Filter", {kernel_h, kernel_w, output_channels, input_channels});
net.AddRandomInput<D, float>("Bias", {output_channels});
// run on cpu
......@@ -648,15 +648,15 @@ static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
kernels::BufferType::ARGUMENT);
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<half>::value))
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<half>::value))
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(D);
......@@ -679,6 +679,79 @@ TEST_F(FusedConv2dOpTest, OPENCL15X15AtrousConvD4) {
{2, 2});
}
static void TestNEONGeneralConvNxNS12(
const std::vector<index_t> &image_shape,
const std::vector<index_t> &filter_shape) {
testing::internal::LogToStderr();
auto func = [&](int stride_h, int stride_w, Padding type) {
srand(time(NULL));
// generate random input
index_t batch = 1;
index_t height = image_shape[0];
index_t width = image_shape[1];
index_t input_channels = filter_shape[2];
index_t output_channels = filter_shape[3];
index_t kernel_h = filter_shape[0];
index_t kernel_w = filter_shape[1];
// Construct graph
OpsTestNet net;
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<float>::value))
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<CPU, float>("Input",
{batch, height, width, input_channels});
net.AddRandomInput<CPU, float>(
"Filter", {kernel_h, kernel_w, output_channels, input_channels});
net.AddRandomInput<CPU, float>("Bias", {output_channels});
// run on cpu
net.RunOp();
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputNeon")
.Input("FilterNeon")
.Input("Bias")
.Output("OutputNeon")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<float>::value))
.Finalize(net.NewOperatorDef());
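// As in the Conv2D test, transpose the NHWC input and HWOI filter into
// NCHW/OIHW before running on NEON.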
net.FillNHWCInputToNCHWInput<DeviceType::CPU, float>("InputNeon", "Input");
net.FillHWOIInputToOIHWInput<DeviceType::CPU, float>("FilterNeon",
"Filter");
// Run on device
net.RunOp(DeviceType::NEON);
net.FillNHWCInputToNCHWInput<DeviceType::CPU, float>("OutputExpected",
"Output");
ExpectTensorNear<float>(*net.GetOutput("OutputExpected"),
*net.GetOutput("OutputNeon"),
0.001);
};
for (int stride : {1, 2}) {
func(stride, stride, VALID);
func(stride, stride, SAME);
}
}
TEST_F(FusedConv2dOpTest, NEONTest) {
TestNEONGeneralConvNxNS12({32, 32}, {7, 7, 3, 64});
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -105,12 +105,12 @@ class OpsTestNet {
public:
OpsTestNet() : op_registry_(new OperatorRegistry()) {}
template<DeviceType D, typename T>
void AddInputFromArray(const std::string &name,
const std::vector<index_t> &shape,
const std::vector<T> &data) {
Tensor *input =
ws_.CreateTensor(name, GetDeviceAllocator(D), DataTypeToEnum<T>::v());
input->Resize(shape);
Tensor::MappingGuard input_mapper(input);
T *input_data = input->mutable_data<T>();
......@@ -118,24 +118,24 @@ class OpsTestNet {
memcpy(input_data, data.data(), data.size() * sizeof(T));
}
template<DeviceType D, typename T>
void AddRepeatedInput(const std::string &name,
const std::vector<index_t> &shape,
const T data) {
Tensor *input =
ws_.CreateTensor(name, GetDeviceAllocator(D), DataTypeToEnum<T>::v());
input->Resize(shape);
Tensor::MappingGuard input_mapper(input);
T *input_data = input->mutable_data<T>();
std::fill(input_data, input_data + input->size(), data);
}
template<DeviceType D, typename T>
void AddRandomInput(const std::string &name,
const std::vector<index_t> &shape,
bool positive = false) {
Tensor *input =
ws_.CreateTensor(name, GetDeviceAllocator(D), DataTypeToEnum<T>::v());
input->Resize(shape);
Tensor::MappingGuard input_mapper(input);
T *input_data = input->mutable_data<T>();
......@@ -145,10 +145,10 @@ class OpsTestNet {
std::normal_distribution<float> nd(0, 1);
if (DataTypeToEnum<T>::value == DT_HALF) {
std::generate(
input_data, input_data + input->size(), [&gen, &nd, positive] {
return half_float::half_cast<half>(positive ? std::abs(nd(gen))
: nd(gen));
});
} else {
std::generate(input_data, input_data + input->size(),
[&gen, &nd, positive] {
......@@ -157,6 +157,84 @@ class OpsTestNet {
}
}
template<DeviceType D, typename T>
void FillNHWCInputToNCHWInput(const std::string &name_nchw,
const std::string &name_nhwc) {
Tensor *input = ws_.GetTensor(name_nhwc);
Tensor *output = ws_.CreateTensor(name_nchw,
GetDeviceAllocator(D),
DataTypeToEnum<T>::v());
const std::vector<index_t> input_shape = input->shape();
index_t batch = input_shape[0];
index_t height = input_shape[1];
index_t width = input_shape[2];
index_t channels = input_shape[3];
output->Resize({batch, channels, height, width});
const T *input_data = input->data<T>();
T *output_data = output->mutable_data<T>();
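    // The NHWC offset ((b * H + h) * W + w) * C + c maps to the
    // NCHW offset ((b * C + c) * H + h) * W + w.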
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
for (index_t h = 0; h < height; ++h) {
for (index_t w = 0; w < width; ++w) {
output_data[((b * channels + c) * height + h) * width + w] =
input_data[((b * height + h) * width + w) * channels + c];
}
}
}
}
}
template<DeviceType D, typename T>
void FillHWOIInputToOIHWInput(const std::string &name_oihw,
const std::string &name_hwoi) {
Tensor *input = ws_.GetTensor(name_hwoi);
Tensor *output = ws_.CreateTensor(name_oihw,
GetDeviceAllocator(D),
DataTypeToEnum<T>::v());
const std::vector<index_t> input_shape = input->shape();
index_t height = input_shape[0];
index_t width = input_shape[1];
index_t out_channels = input_shape[2];
index_t in_channels = input_shape[3];
index_t hw = height * width;
index_t oi = out_channels * in_channels;
output->Resize({out_channels, in_channels, height, width});
const T *input_data = input->data<T>();
T *output_data = output->mutable_data<T>();
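    // View HWOI as an (H * W) x (O * I) matrix; a plain 2-D transpose of it
    // yields the OIHW layout.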
for (index_t i = 0; i < oi; ++i) {
for (index_t j = 0; j < hw; ++j) {
output_data[i * height * width + j] =
input_data[j * out_channels * in_channels + i];
}
}
}
template<DeviceType D, typename T>
void FillHWIOInputToOIHWInput(const std::string &name_oihw,
const std::string &name_hwio) {
Tensor *input = ws_.GetTensor(name_hwio);
Tensor *output = ws_.CreateTensor(name_oihw,
GetDeviceAllocator(D),
DataTypeToEnum<T>::v());
const std::vector<index_t> input_shape = input->shape();
index_t height = input_shape[0];
index_t width = input_shape[1];
index_t in_channels = input_shape[2];
index_t out_channels = input_shape[3];
index_t hw = height * width;
output->Resize({out_channels, in_channels, height, width});
const T *input_data = input->data<T>();
T *output_data = output->mutable_data<T>();
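    // The HWIO offset k * (O * I) + c * O + m maps to the
    // OIHW offset (m * I + c) * (H * W) + k.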
for (index_t m = 0; m < out_channels; ++m) {
for (index_t c = 0; c < in_channels; ++c) {
for (index_t k = 0; k < hw; ++k) {
output_data[((m * in_channels) + c) * height * width + k] =
input_data[k * out_channels * in_channels + c * out_channels + m];
}
}
}
}
OperatorDef *NewOperatorDef() {
op_defs_.clear();
op_defs_.emplace_back(OperatorDef());
......@@ -165,17 +243,35 @@ class OpsTestNet {
Workspace *ws() { return &ws_; }
bool RunOp(DeviceType device) {
bool Setup(DeviceType device) {
NetDef net_def;
for (auto &op_def_ : op_defs_) {
net_def.add_op()->CopyFrom(op_def_);
}
net_ = CreateNet(op_registry_, net_def, &ws_, device);
device_ = device;
return net_ != nullptr;
}
bool Run() {
MACE_CHECK_NOTNULL(net_);
return net_->Run();
}
bool RunOp() { return RunOp(DeviceType::CPU); }
  // DEPRECATED(liyin):
  // Tests and benchmarks should set up the model once and run it multiple
  // times; setup time should not be counted during benchmarking.
bool RunOp(DeviceType device) {
Setup(device);
return Run();
}
  // DEPRECATED(liyin):
  // Tests and benchmarks should set up the model once and run it multiple
  // times; setup time should not be counted during benchmarking.
bool RunOp() {
return RunOp(DeviceType::CPU);
}
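  // A minimal benchmark sketch using the Setup/Run split above ("iterations"
  // is illustrative):
  //
  //   OpsTestNet net;
  //   // ... build the op defs ...
  //   net.Setup(DeviceType::NEON);         // one-time graph construction
  //   for (int i = 0; i < iterations; ++i) {
  //     net.Run();                         // only this loop should be timed
  //   }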
Tensor *GetOutput(const char *output_name) {
return ws_.GetTensor(output_name);
......@@ -210,7 +306,7 @@ class OpsTestBase : public ::testing::Test {
}
};
template <typename T>
template<typename T>
void GenerateRandomRealTypeData(const std::vector<index_t> &shape,
std::vector<T> *res) {
MACE_CHECK_NOTNULL(res);
......@@ -231,7 +327,7 @@ void GenerateRandomRealTypeData(const std::vector<index_t> &shape,
}
}
template <typename T>
template<typename T>
void GenerateRandomIntTypeData(const std::vector<index_t> &shape,
std::vector<T> *res,
const T a = 0,
......@@ -249,7 +345,7 @@ void GenerateRandomIntTypeData(const std::vector<index_t> &shape,
std::generate(res->begin(), res->end(), [&gen, &nd] { return nd(gen); });
}
template <typename T>
template<typename T>
std::vector<T> VectorStaticCast(const std::vector<float> &&src) {
std::vector<T> dest;
dest.reserve(src.size());
......@@ -259,11 +355,11 @@ std::vector<T> VectorStaticCast(const std::vector<float> &&src) {
return std::move(dest);
}
template <typename T>
template<typename T>
std::unique_ptr<Tensor> CreateTensor(const std::vector<index_t> &shape,
const std::vector<T> &data) {
std::unique_ptr<Tensor> res(
new Tensor(GetDeviceAllocator(DeviceType::CPU), DataTypeToEnum<T>::v()));
new Tensor(GetDeviceAllocator(DeviceType::CPU), DataTypeToEnum<T>::v()));
res->Resize(shape);
T *input_data = res->mutable_data<T>();
memcpy(input_data, data.data(), data.size() * sizeof(T));
......@@ -293,24 +389,24 @@ inline std::string ShapeToString(const Tensor &x) {
return std::string(stream.str());
}
template <typename T>
template<typename T>
struct is_floating_point_type {
static const bool value = std::is_same<T, float>::value ||
std::is_same<T, double>::value ||
std::is_same<T, half>::value;
std::is_same<T, double>::value ||
std::is_same<T, half>::value;
};
template <typename T>
template<typename T>
inline void ExpectEqual(const T &a, const T &b) {
EXPECT_EQ(a, b);
}
template <>
template<>
inline void ExpectEqual<float>(const float &a, const float &b) {
EXPECT_FLOAT_EQ(a, b);
}
template <>
template<>
inline void ExpectEqual<double>(const double &a, const double &b) {
EXPECT_DOUBLE_EQ(a, b);
}
......@@ -320,13 +416,13 @@ inline void AssertSameDims(const Tensor &x, const Tensor &y) {
<< "y.shape [ " << ShapeToString(y) << "]";
}
template <typename EXP_TYPE,
typename RES_TYPE,
bool is_fp = is_floating_point_type<EXP_TYPE>::value>
template<typename EXP_TYPE,
typename RES_TYPE,
bool is_fp = is_floating_point_type<EXP_TYPE>::value>
struct Expector;
// Partial specialization for float and double.
template <typename EXP_TYPE, typename RES_TYPE>
template<typename EXP_TYPE, typename RES_TYPE>
struct Expector<EXP_TYPE, RES_TYPE, true> {
static void Equal(const EXP_TYPE &a, const RES_TYPE &b) { ExpectEqual(a, b); }
......@@ -373,22 +469,22 @@ struct Expector<EXP_TYPE, RES_TYPE, true> {
}
};
template <typename T>
template<typename T>
void ExpectTensorNear(const Tensor &x, const Tensor &y, const double abs_err) {
static_assert(is_floating_point_type<T>::value,
"T is not a floating point type");
Expector<T, T>::Near(x, y, abs_err);
}
template <typename EXP_TYPE, typename RES_TYPE>
template<typename EXP_TYPE, typename RES_TYPE>
void ExpectTensorNear(const Tensor &x, const Tensor &y, const double abs_err) {
static_assert(is_floating_point_type<EXP_TYPE>::value &&
is_floating_point_type<RES_TYPE>::value,
is_floating_point_type<RES_TYPE>::value,
"T is not a floating point type");
Expector<EXP_TYPE, RES_TYPE>::Near(x, y, abs_err);
}
template <DeviceType D, typename T>
template<DeviceType D, typename T>
void BufferToImage(OpsTestNet *net,
const std::string &input_name,
const std::string &output_name,
......@@ -396,11 +492,11 @@ void BufferToImage(OpsTestNet *net,
MACE_CHECK_NOTNULL(net);
OpDefBuilder("BufferToImage", "BufferToImageTest")
.Input(input_name)
.Output(output_name)
.AddIntArg("buffer_type", type)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net->NewOperatorDef());
.Input(input_name)
.Output(output_name)
.AddIntArg("buffer_type", type)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net->NewOperatorDef());
// Run
net->RunOp(D);
......@@ -408,7 +504,7 @@ void BufferToImage(OpsTestNet *net,
net->Sync();
}
template <DeviceType D, typename T>
template<DeviceType D, typename T>
void ImageToBuffer(OpsTestNet *net,
const std::string &input_name,
const std::string &output_name,
......@@ -416,11 +512,11 @@ void ImageToBuffer(OpsTestNet *net,
MACE_CHECK_NOTNULL(net);
OpDefBuilder("ImageToBuffer", "ImageToBufferTest")
.Input(input_name)
.Output(output_name)
.AddIntArg("buffer_type", type)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net->NewOperatorDef());
.Input(input_name)
.Output(output_name)
.AddIntArg("buffer_type", type)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net->NewOperatorDef());
// Run
net->RunOp(D);
......
......@@ -29,6 +29,11 @@ void Register_Pooling(OperatorRegistry *op_registry) {
.TypeConstraint<half>("T")
.Build(),
PoolingOp<DeviceType::OPENCL, half>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling")
.Device(DeviceType::NEON)
.TypeConstraint<float>("T")
.Build(),
PoolingOp<DeviceType::NEON, float>);
}
} // namespace ops
......
......@@ -19,27 +19,27 @@ TEST_F(PoolingOpTest, MAX_VALID) {
// Construct graph
OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
.AddIntsArg("kernels", {2, 2})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("pooling_type", PoolingType::MAX)
.Finalize(net.NewOperatorDef());
.Input("Input")
.Output("Output")
.AddIntsArg("kernels", {2, 2})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("pooling_type", PoolingType::MAX)
.Finalize(net.NewOperatorDef());
// Add input data
net.AddInputFromArray<DeviceType::CPU, float>(
"Input", {1, 4, 4, 2},
{0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23,
8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31});
"Input", {1, 4, 4, 2},
{0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23,
8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31});
// Run
net.RunOp();
// Check
auto expected =
CreateTensor<float>({1, 2, 2, 2}, {5, 21, 7, 23, 13, 29, 15, 31});
CreateTensor<float>({1, 2, 2, 2}, {5, 21, 7, 23, 13, 29, 15, 31});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
......@@ -48,14 +48,14 @@ TEST_F(PoolingOpTest, MAX_SAME) {
// Construct graph
OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
.AddIntsArg("kernels", {2, 2})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("pooling_type", PoolingType::MAX)
.Finalize(net.NewOperatorDef());
.Input("Input")
.Output("Output")
.AddIntsArg("kernels", {2, 2})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("pooling_type", PoolingType::MAX)
.Finalize(net.NewOperatorDef());
// Add input data
net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 3, 3, 1},
......@@ -74,19 +74,19 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) {
// Construct graph
OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
.AddIntsArg("kernels", {2, 2})
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {2, 2})
.AddIntArg("pooling_type", PoolingType::MAX)
.Finalize(net.NewOperatorDef());
.Input("Input")
.Output("Output")
.AddIntsArg("kernels", {2, 2})
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {2, 2})
.AddIntArg("pooling_type", PoolingType::MAX)
.Finalize(net.NewOperatorDef());
// Add input data
net.AddInputFromArray<DeviceType::CPU, float>(
"Input", {1, 4, 4, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
"Input", {1, 4, 4, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
// Run
net.RunOp();
......@@ -101,19 +101,19 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) {
// Construct graph
OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
.AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {2, 2})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
.Input("Input")
.Output("Output")
.AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {2, 2})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Add input data
net.AddInputFromArray<DeviceType::CPU, float>(
"Input", {1, 2, 9, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17});
"Input", {1, 2, 9, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17});
// Run
net.RunOp();
......@@ -123,43 +123,43 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) {
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
template <DeviceType D>
template<DeviceType D>
static void SimpleMaxPooling3S2() {
// Construct graph
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, float>(
"Input", {1, 3, 9, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26});
"Input", {1, 3, 9, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26});
if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Pooling", "PoolingTest")
.Input("InputImage")
.Output("OutputImage")
.AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Output("OutputImage")
.AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
} else {
// Run
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
.AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
.Input("Input")
.Output("Output")
.AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
net.RunOp(D);
}
......@@ -175,22 +175,22 @@ TEST_F(PoolingOpTest, OPENCLSimpleMaxPooling3S2) {
SimpleMaxPooling3S2<OPENCL>();
}
template <DeviceType D, typename T>
template<DeviceType D, typename T>
static void MaxPooling3S2(const std::vector<index_t> &input_shape,
const std::vector<int> strides,
Padding padding) {
// Construct graph
OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
.AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", strides)
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("Input")
.Output("Output")
.AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", strides)
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, T>("Input", input_shape);
......@@ -203,15 +203,15 @@ static void MaxPooling3S2(const std::vector<index_t> &input_shape,
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Pooling", "PoolingTest")
.Input("InputImage")
.Output("OutputImage")
.AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", strides)
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Output("OutputImage")
.AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", strides)
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
net.RunOp(D);
ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
......@@ -250,52 +250,52 @@ TEST_F(PoolingOpTest, AVG_VALID) {
// Construct graph
OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
.AddIntsArg("kernels", {2, 2})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("pooling_type", PoolingType::AVG)
.Finalize(net.NewOperatorDef());
.Input("Input")
.Output("Output")
.AddIntsArg("kernels", {2, 2})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("pooling_type", PoolingType::AVG)
.Finalize(net.NewOperatorDef());
// Add input data
net.AddInputFromArray<DeviceType::CPU, float>(
"Input", {1, 4, 4, 2},
{0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23,
8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31});
"Input", {1, 4, 4, 2},
{0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23,
8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31});
// Run
net.RunOp();
// Check
auto expected = CreateTensor<float>(
{1, 2, 2, 2}, {2.5, 18.5, 4.5, 20.5, 10.5, 26.5, 12.5, 28.5});
{1, 2, 2, 2}, {2.5, 18.5, 4.5, 20.5, 10.5, 26.5, 12.5, 28.5});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
template <DeviceType D>
template<DeviceType D>
static void SimpleAvgPoolingTest() {
// Construct graph
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, float>(
"Input", {1, 2, 8, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
"Input", {1, 2, 8, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Pooling", "PoolingTest")
.Input("InputImage")
.Output("OutputImage")
.AddIntArg("pooling_type", PoolingType::AVG)
.AddIntsArg("kernels", {2, 2})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Output("OutputImage")
.AddIntArg("pooling_type", PoolingType::AVG)
.AddIntsArg("kernels", {2, 2})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
......@@ -311,7 +311,7 @@ TEST_F(PoolingOpTest, OPENCLSimpleAvgPooling) {
SimpleAvgPoolingTest<OPENCL>();
}
template <DeviceType D, typename T>
template<DeviceType D, typename T>
static void AvgPoolingTest(const std::vector<index_t> &shape,
const std::vector<int> &kernels,
const std::vector<int> &strides,
......@@ -319,14 +319,14 @@ static void AvgPoolingTest(const std::vector<index_t> &shape,
// Construct graph
OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
.AddIntArg("pooling_type", PoolingType::AVG)
.AddIntsArg("kernels", kernels)
.AddIntsArg("strides", strides)
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
.Input("Input")
.Output("Output")
.AddIntArg("pooling_type", PoolingType::AVG)
.AddIntsArg("kernels", kernels)
.AddIntsArg("strides", strides)
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, float>("Input", shape);
......@@ -339,15 +339,15 @@ static void AvgPoolingTest(const std::vector<index_t> &shape,
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Pooling", "PoolingTest")
.Input("InputImage")
.Output("OutputImage")
.AddIntArg("pooling_type", PoolingType::AVG)
.AddIntsArg("kernels", kernels)
.AddIntsArg("strides", strides)
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Output("OutputImage")
.AddIntArg("pooling_type", PoolingType::AVG)
.AddIntsArg("kernels", kernels)
.AddIntsArg("strides", strides)
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
net.RunOp(D);
ImageToBuffer<D, T>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
......@@ -396,6 +396,62 @@ TEST_F(PoolingOpTest, OPENCLUnAlignedLargeKernelAvgPooling) {
Padding::SAME);
}
static void PoolingNEONTest(const std::vector<index_t> &shape,
                            const std::vector<int> &kernels,
                            const std::vector<int> &strides,
                            Padding padding,
                            PoolingType pooling_type) {
// Construct graph
OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
.AddIntArg("pooling_type", pooling_type)
.AddIntsArg("kernels", kernels)
.AddIntsArg("strides", strides)
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<CPU, float>("Input", shape);
  // Run on CPU
net.RunOp();
OpDefBuilder("Pooling", "PoolingTest")
.Input("InputNeon")
.Output("OutputNeon")
.AddIntArg("pooling_type", pooling_type)
.AddIntsArg("kernels", kernels)
.AddIntsArg("strides", strides)
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
net.FillNHWCInputToNCHWInput<DeviceType::CPU, float>("InputNeon", "Input");
  // Run on NEON
  net.RunOp(DeviceType::NEON);
  net.FillNHWCInputToNCHWInput<DeviceType::CPU, float>("OutputExpected",
                                                       "Output");
  ExpectTensorNear<float>(*net.GetOutput("OutputExpected"),
*net.GetOutput("OutputNeon"),
0.01);
}
TEST_F(PoolingOpTest, NEONTest) {
  PoolingNEONTest({3, 31, 37, 128}, {8, 8}, {8, 8},
                  Padding::VALID, PoolingType::MAX);
  PoolingNEONTest({3, 31, 37, 128}, {8, 8}, {8, 8},
                  Padding::SAME, PoolingType::MAX);
  PoolingNEONTest({3, 31, 37, 128}, {8, 8}, {8, 8},
                  Padding::VALID, PoolingType::AVG);
  PoolingNEONTest({3, 31, 37, 128}, {8, 8}, {8, 8},
                  Padding::SAME, PoolingType::AVG);
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -25,6 +25,11 @@ void Register_Softmax(OperatorRegistry *op_registry) {
.TypeConstraint<half>("T")
.Build(),
SoftmaxOp<DeviceType::OPENCL, half>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax")
.Device(DeviceType::NEON)
.TypeConstraint<float>("T")
.Build(),
SoftmaxOp<DeviceType::NEON, float>);
}
} // namespace ops
......
......@@ -11,7 +11,7 @@ namespace test {
class SoftmaxOpTest : public OpsTestBase {};
template <DeviceType D>
template<DeviceType D>
void Simple() {
// Construct graph
OpsTestNet net;
......@@ -24,9 +24,9 @@ void Simple() {
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Softmax", "SoftmaxTest")
.Input("InputImage")
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
......@@ -36,17 +36,17 @@ void Simple() {
kernels::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Softmax", "SoftmaxTest")
.Input("Input")
.Output("Output")
.Finalize(net.NewOperatorDef());
.Input("Input")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
auto expected = CreateTensor<float>(
{1, 1, 2, 4},
{0.25, 0.25, 0.25, 0.25, 0.0320586, 0.08714432, 0.23688282, 0.64391426});
{1, 1, 2, 4},
{0.25, 0.25, 0.25, 0.25, 0.0320586, 0.08714432, 0.23688282, 0.64391426});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-7);
}
......@@ -54,7 +54,7 @@ void Simple() {
TEST_F(SoftmaxOpTest, CPUSimple) { Simple<DeviceType::CPU>(); }
TEST_F(SoftmaxOpTest, OPENCLSimple) { Simple<DeviceType::OPENCL>(); }
template <DeviceType D>
template<DeviceType D>
void Complex(const std::vector<index_t> &logits_shape) {
// Construct graph
OpsTestNet net;
......@@ -62,9 +62,9 @@ void Complex(const std::vector<index_t> &logits_shape) {
net.AddRandomInput<D, float>("Input", logits_shape);
OpDefBuilder("Softmax", "SoftmaxTest")
.Input("Input")
.Output("Output")
.Finalize(net.NewOperatorDef());
.Input("Input")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Run on cpu
net.RunOp();
......@@ -75,9 +75,9 @@ void Complex(const std::vector<index_t> &logits_shape) {
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("Softmax", "SoftmaxTest")
.Input("InputImage")
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
// Run on gpu
net.RunOp(D);
......@@ -104,6 +104,45 @@ TEST_F(SoftmaxOpTest, OPENCLUnAligned) {
Complex<DeviceType::OPENCL>({5, 211, 107, 1});
}
void SoftmaxNEONTest(const std::vector<index_t> &logits_shape) {
// Construct graph
OpsTestNet net;
// Add input data
net.AddRandomInput<CPU, float>("Input", logits_shape);
OpDefBuilder("Softmax", "SoftmaxTest")
.Input("Input")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Run on cpu
net.RunOp();
OpDefBuilder("Softmax", "SoftmaxTest")
.Input("InputNeon")
.Output("OutputNeon")
.Finalize(net.NewOperatorDef());
net.FillNHWCInputToNCHWInput<DeviceType::CPU, float>("InputNeon", "Input");
  // Run on NEON
  net.RunOp(DeviceType::NEON);
  net.FillNHWCInputToNCHWInput<DeviceType::CPU, float>("OutputExpected",
                                                       "Output");
  ExpectTensorNear<float>(*net.GetOutput("OutputExpected"),
*net.GetOutput("OutputNeon"),
0.01);
}
TEST_F(SoftmaxOpTest, NEONTest) {
  SoftmaxNEONTest({5, 64, 64, 3});
  SoftmaxNEONTest({8, 128, 128, 8});
  SoftmaxNEONTest({1, 113, 107, 13});
  SoftmaxNEONTest({5, 211, 107, 1});
}
} // namespace test
} // namespace ops
} // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/ops/transpose.h"
namespace mace {
namespace ops {
void Register_Transpose(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Transpose")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
TransposeOp<DeviceType::CPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Transpose")
.Device(DeviceType::NEON)
.TypeConstraint<float>("T")
.Build(),
TransposeOp<DeviceType::NEON, float>);
}
} // namespace ops
} // namespace mace
//
// Copyright (c) 2018 XiaoMi All rights reserved.
//
#ifndef MACE_OPS_TRANSPOSE_H_
#define MACE_OPS_TRANSPOSE_H_
#include <vector>
#include "mace/core/operator.h"
#include "mace/kernels/transpose.h"
#include "mace/kernels/softmax.h"
namespace mace {
template<DeviceType D, class T>
class TransposeOp : public Operator<D, T> {
public:
TransposeOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws),
dims_(OperatorBase::GetRepeatedArgument<int>(
"dims")),
functor_(dims_) {}
bool Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
Tensor *output = this->Output(OUTPUT);
const std::vector<index_t> &input_shape = input->shape();
MACE_CHECK(input_shape.size() == 4 && dims_.size() == 4,
"rank should be 4");
std::vector<index_t> output_shape;
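    // output_shape[i] = input_shape[dims_[i]]; e.g. dims_ = {0, 3, 1, 2}
    // permutes an NHWC shape into NCHW.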
    for (size_t i = 0; i < dims_.size(); ++i) {
output_shape.push_back(input_shape[dims_[i]]);
}
output->Resize(output_shape);
functor_(input, output, future);
return true;
}
protected:
std::vector<int> dims_;
kernels::TransposeFunctor<D, T> functor_;
OP_INPUT_TAGS(INPUT);
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace mace
#endif // MACE_OPS_TRANSPOSE_H_
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/core/operator.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
class TransposeOpTest : public OpsTestBase {};
void TransposeNCHWTest(const std::vector<index_t> &input_shape) {
// Construct graph
OpsTestNet net;
// Add input data
net.AddRandomInput<CPU, float>("Input", input_shape);
OpDefBuilder("Transpose", "TransposeNCHWTest")
.Input("Input")
.Output("Output")
.AddIntsArg("dims", {0, 3, 1, 2})
.Finalize(net.NewOperatorDef());
// Run on cpu
net.RunOp();
net.FillNHWCInputToNCHWInput<DeviceType::CPU, float>("InputNCHW", "Input");
ExpectTensorNear<float>(*net.GetOutput("InputNCHW"),
*net.GetOutput("Output"),
0.01);
}
TEST_F(TransposeOpTest, NCHW) {
TransposeNCHWTest({3, 64, 64, 128});
TransposeNCHWTest({1, 64, 48, 128});
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -8,7 +8,6 @@ from mace.python.tools import memory_optimizer
from tensorflow.core.framework import graph_pb2
from tensorflow.core.framework import tensor_shape_pb2
# TODO: support NCHW format; currently only NHWC is supported.
padding_mode = {
'VALID': 0,
'SAME': 1,
......@@ -133,7 +132,7 @@ class TFConverter(object):
arg.i = self.dt
return output_name
def add_input_transform(self, names):
def add_gpu_input_transform(self, names):
for name in names:
new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0"
op_def = self.net_def.op.add()
......@@ -150,7 +149,24 @@ class TFConverter(object):
arg.name = 'T'
arg.i = self.dt
def add_output_transform(self, names):
def add_neon_input_transform(self, names):
for name in names:
new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0"
op_def = self.net_def.op.add()
op_def.name = name
op_def.type = 'Transpose'
op_def.input.extend([new_input_name])
op_def.output.extend([name+':0'])
dims_arg = op_def.arg.add()
dims_arg.name = 'dims'
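            # [0, 3, 1, 2] transposes the NHWC input into the NCHW layout
            # consumed by the NEON kernels.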
dims_arg.ints.extend([0, 3, 1, 2])
arg = op_def.arg.add()
arg.name = 'T'
arg.i = self.dt
def add_gpu_output_transform(self, names):
for name in names:
output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0"
op_def = self.net_def.op.add()
......@@ -163,6 +179,19 @@ class TFConverter(object):
epsilon_arg.name = 'buffer_type'
epsilon_arg.i = buffer_type_map['IN_OUT_CHANNEL']
def add_neon_output_transform(self, names):
for name in names:
output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0"
op_def = self.net_def.op.add()
op_def.name = output_name[:-2]
op_def.type = 'Transpose'
op_def.input.extend([name+':0'])
op_def.output.extend([output_name])
dims_arg = op_def.arg.add()
dims_arg.name = 'dims'
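            # [0, 2, 3, 1] transposes the NCHW output back to NHWC.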
dims_arg.ints.extend([0, 2, 3, 1])
@staticmethod
def add_output_shape(outputs, op):
output_shapes = []
......@@ -335,9 +364,14 @@ class TFConverter(object):
op_def.name = op.name
if op.type == 'DepthwiseConv2dNative':
op_def.type = 'DepthwiseConv2d'
if self.device == 'neon':
self.transpose_filter_tensor[get_input_tensor(op, 1).name] = (3, 2, 0, 1)
else:
op_def.type = op.type
self.transpose_filter_tensor[get_input_tensor(op, 1).name] = (0, 1, 3, 2)
if self.device == 'neon':
self.transpose_filter_tensor[get_input_tensor(op, 1).name] = (3, 2, 0, 1)
else:
self.transpose_filter_tensor[get_input_tensor(op, 1).name] = (0, 1, 3, 2)
if self.device == 'gpu':
op_def.input.extend([op.inputs[0].name])
buffer_type = "DW_CONV2D_FILTER" if op_def.type == 'DepthwiseConv2d' else "CONV2D_FILTER"
......@@ -354,7 +388,10 @@ class TFConverter(object):
strides_arg.ints.extend(op.get_attr('strides')[1:3])
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
data_format_arg.s = 'NHWC'
if self.device == 'neon':
data_format_arg.s = 'NCHW'
else:
data_format_arg.s = 'NHWC'
final_op = op
self.resolved_ops[op.name] = 1
......@@ -394,7 +431,10 @@ class TFConverter(object):
arg.i = self.dt
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
data_format_arg.s = 'NHWC'
if self.device == 'neon':
data_format_arg.s = 'NCHW'
else:
data_format_arg.s = 'NHWC'
op_def.name = op.name
op_def.type = 'FoldedBatchNorm'
......@@ -497,7 +537,10 @@ class TFConverter(object):
epsilon_arg.f = get_input_tensor(op, 1).eval().astype(np.float)
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
data_format_arg.s = 'NHWC'
if self.device == 'neon':
data_format_arg.s = 'NCHW'
else:
data_format_arg.s = 'NHWC'
self.unused_tensor.add(get_input_tensor(op, 1).name)
self.net_def.op.extend([op_def])
......@@ -528,7 +571,10 @@ class TFConverter(object):
kernels_arg.ints.extend(op.get_attr('ksize')[1:3])
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
data_format_arg.s = 'NHWC'
if self.device == 'neon':
data_format_arg.s = 'NCHW'
else:
data_format_arg.s = 'NHWC'
self.resolved_ops[op.name] = 1
def convert_global_avg_pooling(self, op):
......@@ -555,7 +601,10 @@ class TFConverter(object):
kernels_arg.ints.extend(op.inputs[0].shape.as_list()[1:3])
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
data_format_arg.s = 'NHWC'
if self.device == 'neon':
data_format_arg.s = 'NCHW'
else:
data_format_arg.s = 'NHWC'
self.resolved_ops[op.name] = 1
def convert_activation(self, op):
......@@ -771,7 +820,10 @@ class TFConverter(object):
strides_arg.ints.extend([1, 1])
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
data_format_arg.s = 'NHWC'
if self.device == 'neon':
data_format_arg.s = 'NCHW'
else:
data_format_arg.s = 'NHWC'
final_op = conv_op
self.resolved_ops[op.name] = 1
self.resolved_ops[conv_op.name] = 1
......@@ -879,7 +931,9 @@ class TFConverter(object):
def convert(self, input_nodes, output_nodes):
if self.device == 'gpu':
self.add_input_transform(input_nodes)
self.add_gpu_input_transform(input_nodes)
if self.device == 'neon':
self.add_neon_input_transform(input_nodes)
for op in self.tf_ops:
if self.resolved_ops[op.name] == 1:
......@@ -957,7 +1011,10 @@ class TFConverter(object):
raise Exception('Unknown Op: %s, type: %s' % (op.name, op.type))
if self.device == 'gpu':
self.add_output_transform(output_nodes)
self.add_gpu_output_transform(output_nodes)
if self.device == 'neon':
self.add_neon_output_transform(output_nodes)
if self.device == 'cpu':
self.replace_in_out_name(input_nodes, output_nodes)
......@@ -1007,12 +1064,20 @@ class Optimizer:
scale_tensor = self.tensor_map[scale_buffer_name]
weight_shape = weight_tensor.dims
idx = 0
for i in range(weight_shape[0]):
for j in range(weight_shape[1]):
for ic in range(weight_shape[2]):
for oc in range(weight_shape[3]):
weight_tensor.float_data[idx] *= scale_tensor.float_data[ic * weight_shape[3] + oc]
idx += 1
if self.device == 'neon': # OIHW
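          # scale_tensor.float_data is indexed as ic * out_channels + oc
          # in both weight layouts.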
for oc in range(weight_shape[0]):
for ic in range(weight_shape[1]):
for i in range(weight_shape[2]):
for j in range(weight_shape[3]):
weight_tensor.float_data[idx] *= scale_tensor.float_data[ic * weight_shape[0] + oc]
idx += 1
else: # HWIO
for i in range(weight_shape[0]):
for j in range(weight_shape[1]):
for ic in range(weight_shape[2]):
for oc in range(weight_shape[3]):
weight_tensor.float_data[idx] *= scale_tensor.float_data[ic * weight_shape[3] + oc]
idx += 1
new_tensors.append(weight_tensor)
unused_tensors.add(weight_tensor.name)
......
#!/bin/bash
set -x
Usage() {
echo "Usage: bash tools/benchmark.sh target_soc model_output_dir option_args"
}
......@@ -70,6 +71,7 @@ else
--copt="-DMACE_OBFUSCATE_LITERALS" \
--copt="-DMACE_MODEL_TAG=${MODEL_TAG}" \
--define openmp=true \
--define neon=true \
--copt="-O3" \
--define production=true || exit 1
......@@ -85,7 +87,7 @@ else
adb -s $DEVICE_ID push ${MODEL_OUTPUT_DIR}/benchmark_model \
${PHONE_DATA_DIR} > /dev/null || exit 1
if [ "$EMBED_MODEL_DATA" = 0 ]; then
adb -s $DEVICE_ID push ${MODEL_OUTPUT_DIR}/${MODEL_TAG}.data
adb -s $DEVICE_ID push ${MODEL_OUTPUT_DIR}/${MODEL_TAG}.data \
${PHONE_DATA_DIR} > /dev/null || exit 1
fi
......
......@@ -57,6 +57,7 @@ else
--copt="-DMACE_OBFUSCATE_LITERALS" \
--copt="-DMACE_MODEL_TAG=${MODEL_TAG}" \
--define openmp=true \
--define neon=true \
--copt="-O3" \
$NEON_ENABLE_FLAG \
$PRODUCTION_MODE_BUILD_FLAGS \
......
......@@ -23,6 +23,9 @@ elif [ x"$RUNTIME" = x"gpu" ]; then
elif [ x"$RUNTIME" = x"cpu" ]; then
DATA_TYPE="DT_FLOAT"
DEVICE_TYPE="CPU"
elif [ x"$RUNTIME" = x"neon" ]; then
DATA_TYPE="DT_FLOAT"
DEVICE_TYPE="NEON"
fi
GENERATED_MODEL_LIB_NAME="libgenerated_models.a"
......
......@@ -50,6 +50,8 @@ def get_global_runtime(configs):
global_runtime = "gpu"
elif "cpu" in runtime_list:
global_runtime = "cpu"
elif "neon" in runtime_list:
global_runtime = "neon"
else:
raise Exception("Not found available RUNTIME in config files!")
......@@ -379,3 +381,4 @@ def main(unused_args):
if __name__ == "__main__":
FLAGS, unparsed = parse_args()
main(unused_args=[sys.argv[0]] + unparsed)