Commit de985851 authored by liuqi

Finish depthwise conv2d cpu kernel and 3x3 neon kernel.

Parent 5b21653b
......@@ -71,6 +71,51 @@ void CalcPaddingAndOutputSize(const index_t *input_shape, // NCHW
output_shape[3] = output_width;
}
void CalPaddingSize(const index_t *input_shape, // NCHW
const index_t *filter_shape, // OIHW
const int *dilations,
const int *strides,
Padding padding,
int *padding_size) {
MACE_CHECK(dilations[0] > 0 && dilations[1] > 0,
"Invalid dilations, must >= 1");
MACE_CHECK((dilations[0] == 1 || strides[0] == 1) &&
(dilations[1] == 1 || strides[1] == 1),
"If dilations > 1, strides should be 1");
MACE_CHECK_NOTNULL(padding_size);
index_t output_height, output_width;
index_t k_extent_height = (filter_shape[2] - 1) * dilations[0] + 1;
index_t k_extent_width = (filter_shape[3] - 1) * dilations[1] + 1;
switch (padding) {
case VALID:
output_height = (input_shape[2] - k_extent_height) / strides[0] + 1;
output_width = (input_shape[3] - k_extent_width) / strides[1] + 1;
break;
case SAME:
output_height = (input_shape[2] - 1) / strides[0] + 1;
output_width = (input_shape[3] - 1) / strides[1] + 1;
break;
case FULL:
output_height = (input_shape[2] + k_extent_height - 2) / strides[0] + 1;
output_width = (input_shape[3] + k_extent_width - 2) / strides[1] + 1;
break;
default:
MACE_CHECK(false, "Unsupported padding type: ", padding);
}
// Note: TensorFlow may pad one more pixel on the right/bottom side.
// TODO: maybe it's better to also truncate the left/top to
// utilize the more centered features. We need to benchmark
// based on the model accuracy.
padding_size[0] =
(output_height - 1) * strides[0] + k_extent_height - input_shape[2];
padding_size[1] =
(output_width - 1) * strides[1] + k_extent_width - input_shape[3];
}
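namespace {
// A minimal worked sketch of the arithmetic above; ExampleCalPaddingSize is a
// hypothetical helper with made-up sizes (SAME padding, 3x3 filter, stride 1,
// dilation 1 on a 7x7 input), not part of the kernel code.
void ExampleCalPaddingSize() {
  const index_t input_shape[4] = {1, 1, 7, 7};   // NCHW
  const index_t filter_shape[4] = {1, 1, 3, 3};  // OIHW
  const int dilations[2] = {1, 1};
  const int strides[2] = {1, 1};
  int padding_size[2] = {0, 0};
  CalPaddingSize(input_shape, filter_shape, dilations, strides, SAME,
                 padding_size);
  // SAME keeps the output at 7x7, so in each dimension:
  //   padding = (7 - 1) * 1 + 3 - 7 = 2,
  // split as 1 on the top/left and 1 on the bottom/right; when the total is
  // odd, the extra pixel lands on the right/bottom, matching the note above.
}
}  // namespace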
void ConstructInputWithPadding(const float *input,
const index_t *input_shape,
const int *paddings,
......
......@@ -25,6 +25,13 @@ void CalcPaddingAndOutputSize(const index_t *input_shape, // NCHW
index_t *output_shape,
int *padding_size);
void CalPaddingSize(const index_t *input_shape, // NCHW
const index_t *filter_shape, // OIHW
const int *dilations,
const int *strides,
Padding padding,
int *padding_size);
void ConstructInputWithPadding(const float *input,
const index_t *input_shape,
const int *paddings,
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_KERNELS_DEPTHWISE_CONV_H_
#define MACE_KERNELS_DEPTHWISE_CONV_H_
#include "mace/proto/mace.pb.h"
#include "mace/core/common.h"
#include "mace/kernels/conv_pool_2d_util.h"
namespace mace {
namespace kernels {
template<DeviceType D, typename T>
class DepthwiseConv2dFunctor {
public:
DepthwiseConv2dFunctor(const int* strides,
Padding paddings,
const int* dilations) :
strides_(strides),
padding_(paddings),
dilations_(dilations) {}
void operator()(const T* input, // NCHW
const index_t* input_shape,
const T* filter, // c_out, c_in, kernel_h, kernel_w
const index_t* filter_shape,
const T* bias, // c_out
T* output, // NCHW
const index_t* output_shape) {
MACE_CHECK_NOTNULL(output);
index_t batch = output_shape[0];
index_t channels = output_shape[1];
index_t height = output_shape[2];
index_t width = output_shape[3];
index_t input_batch = input_shape[0];
index_t input_channels = input_shape[1];
index_t input_height = input_shape[2];
index_t input_width = input_shape[3];
index_t kernel_h = filter_shape[2];
index_t kernel_w = filter_shape[3];
int stride_h = strides_[0];
int stride_w = strides_[1];
int dilation_h = dilations_[0];
int dilation_w = dilations_[1];
MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch");
vector<int> paddings_size(2, 0);
CalPaddingSize(input_shape, filter_shape, dilations_, strides_, padding_, paddings_size.data());
// The top-left offset of the padded input
int padded_h_start = 0 - paddings_size[0] / 2;
int padded_w_start = 0 - paddings_size[1] / 2;
index_t padded_h_stop = input_height + paddings_size[0] - paddings_size[0] / 2;
index_t padded_w_stop = input_width + paddings_size[1] - paddings_size[1] / 2;
index_t kernel_size = filter_shape[1] * kernel_h * kernel_w;
index_t multiplier = channels / input_channels;
#pragma omp parallel for collapse(2)
for (int n = 0; n < batch; ++n) {
for (int c = 0; c < channels; ++c) {
for (int h = 0; h < height; ++h) {
for (int w = 0; w < width; ++w) {
index_t offset = n * channels * height * width +
c * height * width + h * width + w;
T sum = 0;
const T* filter_ptr = filter + c * kernel_size;
for (int kh = 0; kh < kernel_h; ++kh) {
for (int kw = 0; kw < kernel_w; ++kw) {
int inh = padded_h_start + h * stride_h + dilation_h * kh;
int inw = padded_w_start + w * stride_w + dilation_w * kw;
if (inh < 0 || inh >= input_height || inw < 0 ||
inw >= input_width) {
MACE_CHECK(inh >= padded_h_start && inh < padded_h_stop &&
inw >= padded_w_start && inw < padded_w_stop,
"Out of range read from input: ", inh, ", ",
inw);
// else padding with 0:
// sum += 0;
} else {
index_t input_offset =
n * input_channels * input_height * input_width +
(c / multiplier) * input_height * input_width + inh * input_width +
inw;
sum += input[input_offset] * *filter_ptr;
}
++filter_ptr;
}
}
output[offset] = sum + bias[c];
}
}
}
}
}
private:
const int* strides_; // [stride_h, stride_w]
Padding padding_;
const int* dilations_; // [dilation_h, dilation_w]
};
template <>
void DepthwiseConv2dFunctor<DeviceType::NEON, float>::operator()(const float* input,
const index_t* input_shape,
const float* filter,
const index_t* filter_shape,
const float* bias,
float* output,
const index_t* output_shape);
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_DEPTHWISE_CONV_H_
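A minimal usage sketch of the CPU functor above, with shapes and values borrowed from the Simple_VALID unit test further below; real callers go through DepthwiseConv2dOp, and ExampleDepthwiseConv2d is only an illustrative name:
#include "mace/kernels/depthwise_conv2d.h"
using namespace mace;
void ExampleDepthwiseConv2d() {
  const int strides[2] = {1, 1};
  const int dilations[2] = {1, 1};
  const index_t input_shape[4] = {1, 2, 2, 3};   // NCHW
  const index_t filter_shape[4] = {4, 1, 2, 2};  // multiplier folded into c_out
  const index_t output_shape[4] = {1, 4, 1, 2};
  float input[12] = {1, 3, 5, 7, 9, 11, 2, 4, 6, 8, 10, 12};
  float filter[16] = {1, 5, 9, 13, 2, 6, 10, 14,
                      3, 7, 11, 15, 4, 8, 12, 16};
  float bias[4] = {0.1f, 0.2f, 0.3f, 0.4f};
  float output[8];  // -> {196.1, 252.1, 216.2, 280.2, 272.3, 344.3, 296.4, 376.4}
  kernels::DepthwiseConv2dFunctor<DeviceType::CPU, float> functor(
      strides, VALID, dilations);
  functor(input, input_shape, filter, filter_shape, bias, output, output_shape);
}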
......@@ -11,6 +11,7 @@ namespace kernels {
extern void Conv2dNeonK1x1S1(const float *input,
const index_t *input_shape,
const float *filter,
const index_t *filter_shape,
const float *bias,
float *output,
const index_t *output_shape);
......@@ -18,6 +19,7 @@ extern void Conv2dNeonK1x1S1(const float *input,
extern void Conv2dNeonK3x3S1(const float *input,
const index_t *input_shape,
const float *filter,
const index_t *filter_shape,
const float *bias,
float *output,
const index_t *output_shape);
......@@ -25,6 +27,7 @@ extern void Conv2dNeonK3x3S1(const float *input,
extern void Conv2dNeonK3x3S2(const float *input,
const index_t *input_shape,
const float *filter,
const index_t *filter_shape,
const float *bias,
float *output,
const index_t *output_shape);
......@@ -32,6 +35,7 @@ extern void Conv2dNeonK3x3S2(const float *input,
extern void Conv2dNeonK5x5S1(const float *input,
const index_t *input_shape,
const float *filter,
const index_t *filter_shape,
const float *bias,
float *output,
const index_t *output_shape);
......@@ -48,6 +52,7 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const float *input,
const float *input,
const index_t *input_shape,
const float *filter,
const index_t *filter_shape,
const float *bias,
float *output,
const index_t *output_shape);
......@@ -81,7 +86,7 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const float *input,
input_shape = padded_input.shape().data();
}
auto conv2d_neon_func = selector[kernel_h - 1][strides_[0] - 1];
conv2d_neon_func(input, input_shape, filter, bias, output, output_shape);
conv2d_neon_func(input, input_shape, filter, nullptr, bias, output, output_shape);
}
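// Passing nullptr for filter_shape keeps the regular (non-depthwise) indexing
// inside the shared NEON kernels; the depthwise path below supplies a real
// filter shape instead.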
} // namespace kernels
......
......@@ -11,6 +11,7 @@ namespace kernels {
void Conv2dNeonK1x1S1(const float* input, // NCHW
const index_t* input_shape,
const float* filter, // c_out, c_in, kernel_h, kernel_w
const index_t* filter_shape,
const float* bias, // c_out
float* output, // NCHW
const index_t* output_shape) {
......
......@@ -17,30 +17,36 @@ namespace kernels {
int input_channels = input_shape[1]; \
int input_height = input_shape[2]; \
int input_width = input_shape[3]; \
int kernel_h = 3; \
int kernel_w = 3; \
int multiplier = filter_shape == nullptr ? 0 : (filter_shape[0] / input_channels); \
int filter_in_channels = filter_shape == nullptr ? input_channels : filter_shape[1]; \
for (int b = 0; b < output_batch; ++b) { \
float* output_ptr_base = output + b * output_channels * output_height * output_width; \
for (int oc = 0; oc < output_channels; ++oc) { \
const float* filter_ptr = filter + oc * input_channels * kernel_h * kernel_w; \
const float* filter_ptr = filter + oc * filter_in_channels * kFilterSize; \
const float* input_ptr = input + b * input_channels * input_height * input_width; \
if (filter_shape != nullptr) { \
input_ptr += (oc / multiplier) * input_height * input_width; \
} \
float* output_ptr = output_ptr_base + oc * output_height * output_width; \
std::fill(output_ptr, output_ptr + output_height * output_width, bias[oc]); \
for (int ic = 0; ic < input_channels; ++ic) { \
for (int ic = 0; ic < filter_in_channels; ++ic) { \
float32x4_t n_filter_v[3] = {vld1q_f32(filter_ptr), vld1q_f32(filter_ptr+3), vld1q_f32(filter_ptr+6)};
#define KERNEL_TAIL_CODE \
filter_ptr += 9; \
filter_ptr += kFilterSize; \
input_ptr += input_height * input_width; \
} \
} \
}
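// These macros are shared by the regular and depthwise 3x3 kernels: a nullptr
// filter_shape selects the regular path (filter_in_channels == input_channels,
// all input channels accumulated per output channel), while a depthwise
// filter_shape makes each output channel read only input channel
// (oc / multiplier).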
static const int kRegisterSize = 4;
static const int kFilterSize = 9;
void Conv2dNeonK3x3S1(const float *input, // NCHW
const index_t *input_shape,
const float *filter, // c_out, c_in, kernel_h, kernel_w
const index_t *filter_shape,
const float *bias, // c_out
float *output, // NCHW
const index_t *output_shape) {
......@@ -213,6 +219,7 @@ void Conv2dNeonK3x3S1(const float *input, // NCHW
void Conv2dNeonK3x3S2(const float *input, // NCHW
const index_t *input_shape,
const float *filter, // c_out, c_in, kernel_h, kernel_w
const index_t *filter_shape,
const float *bias, // c_out
float *output, // NCHW
const index_t *output_shape) {
......@@ -287,7 +294,6 @@ void Conv2dNeonK3x3S2(const float *input, // NCHW
KERNEL_TAIL_CODE
}
#undef KERNEL_HEAD_CODE
#undef KERNEL_TAIL_CODE
......
......@@ -13,6 +13,7 @@ namespace kernels {
void Conv2dNeonK5x5S1(const float* input, // NCHW
const index_t* input_shape,
const float* filter, // c_out, c_in, kernel_h, kernel_w
const index_t* filter_shape,
const float* bias, // c_out
float* output, // NCHW
const index_t* output_shape) {
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/kernels/depthwise_conv2d.h"
#include "mace/kernels/conv_2d.h"
namespace mace {
namespace kernels {
extern void Conv2dNeonK3x3S1(const float *input,
const index_t *input_shape,
const float *filter,
const index_t *filter_shape,
const float *bias,
float *output,
const index_t *output_shape);
extern void Conv2dNeonK3x3S2(const float *input,
const index_t *input_shape,
const float *filter,
const index_t *filter_shape,
const float *bias,
float *output,
const index_t *output_shape);
template<>
void DepthwiseConv2dFunctor<DeviceType::NEON, float>::operator()(const float* input, // NCHW
const index_t* input_shape,
const float* filter, // c_out, c_in, kernel_h, kernel_w
const index_t* filter_shape,
const float* bias, // c_out
float* output, // NCHW
const index_t* output_shape) {
typedef void (*Conv2dNeonFunction)(
const float *input,
const index_t *input_shape,
const float *filter,
const index_t *filter_shape,
const float *bias,
float *output,
const index_t *output_shape);
// Selection matrix: kernel_size x stride_size
static const Conv2dNeonFunction selector[5][2] = {
{nullptr, nullptr},
{nullptr, nullptr},
{Conv2dNeonK3x3S1, Conv2dNeonK3x3S2},
{nullptr, nullptr},
{nullptr, nullptr}};
// nullptr entries are not implemented yet
index_t kernel_h = filter_shape[2];
index_t kernel_w = filter_shape[3];
if (kernel_h != kernel_w || kernel_h > 5 || strides_[0] != strides_[1] ||
strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1 ||
selector[kernel_h - 1][strides_[0] - 1] == nullptr) {
LOG(WARNING) << "Depthwise-Conv2d NEON kernel with "
<< "filter " << kernel_h << "x" << kernel_w << ","
<< " stride " << strides_[0] << "x" << strides_[1]
<< " is not implemented yet, using the slow version";
DepthwiseConv2dFunctor<DeviceType::CPU, float>(strides_, padding_, dilations_)(
input, input_shape, filter, filter_shape, bias, output, output_shape);
return;
}
// Keep this alive during kernel execution
vector<int> paddings_size(2, 0);
CalPaddingSize(input_shape, filter_shape, dilations_, strides_, padding_, paddings_size.data());
Tensor padded_input;
if (paddings_size[0] > 0 || paddings_size[1] > 0) {
ConstructInputWithPadding(input, input_shape, paddings_size.data(), &padded_input);
input = padded_input.data<float>();
input_shape = padded_input.shape().data();
}
auto conv2d_neon_func = selector[kernel_h - 1][strides_[0] - 1];
conv2d_neon_func(input, input_shape, filter, filter_shape, bias, output, output_shape);
}
} // namespace kernels
} // namespace mace
\ No newline at end of file
......@@ -3,7 +3,6 @@
//
#include "mace/ops/conv_2d.h"
#include "mace/proto/mace.pb.h"
namespace mace {
......
......@@ -173,10 +173,10 @@ TEST_F(Conv2dOpTest, ConvNxNS12) {
// generate random input
index_t batch = 1 + rand() % 10;
index_t input_channels = 1 + rand() % 50;
index_t height = 11 + rand() % 100;
index_t width = 11 + rand() % 100;
index_t output_channels = 1 + rand() % 50;
index_t input_channels = 1 + rand() % 10;
index_t height = 107;
index_t width = 113;
index_t output_channels = 1 + rand() % 10;
// Construct graph
auto& net = test_net();
OpDefBuilder("Conv2d", "Conv2dTest")
......
......@@ -20,6 +20,50 @@ class ConvPool2dOpBase : public Operator<D, T> {
"padding", static_cast<int>(SAME)))),
dilations_(OperatorBase::GetRepeatedArgument<int>("dilations")) {}
void CalOutputSize(const index_t *input_shape, // NCHW
const index_t *filter_shape, // OIHW
const int *dilations,
const int *strides,
Padding padding,
index_t *output_shape) {
MACE_CHECK(dilations[0] > 0 && dilations[1] > 0,
"Invalid dilations, must >= 1");
MACE_CHECK((dilations[0] == 1 || strides[0] == 1) &&
(dilations[1] == 1 || strides[1] == 1),
"If dilations > 1, strides should be 1");
MACE_CHECK_NOTNULL(output_shape);
/*
* Convolution/pooling arithmetic:
* o = (i + 2 * p - k - (k - 1) * (d - 1)) / s + 1
* For details, see https://arxiv.org/pdf/1603.07285.pdf or
* http://deeplearning.net/software/theano/tutorial/conv_arithmetic.html
*/
index_t output_height, output_width;
switch (padding) {
case VALID:
output_height = (input_shape[2] - (filter_shape[2] - 1) * dilations[0] - 1) / strides[0] + 1;
output_width = (input_shape[3] - (filter_shape[3] - 1) * dilations[1] - 1) / strides[1] + 1;
break;
case SAME:
output_height = (input_shape[2] - 1) / strides[0] + 1;
output_width = (input_shape[3] - 1) / strides[1] + 1;
break;
case FULL:
output_height = (input_shape[2] + (filter_shape[2] - 1) * dilations[0] - 1) / strides[0] + 1;
output_width = (input_shape[3] + (filter_shape[3] - 1) * dilations[1] - 1) / strides[1] + 1;
break;
default:
MACE_CHECK(false, "Unsupported padding type: ", padding);
}
output_shape[0] = input_shape[0];
output_shape[1] = filter_shape[0];
output_shape[2] = output_height;
output_shape[3] = output_width;
}
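// Quick check of the VALID branch above, using the sizes from the random
// tests below (i = 107, k = 3, d = 1, s = 2):
//   o = (107 - (3 - 1) * 1 - 1) / 2 + 1 = 104 / 2 + 1 = 53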
protected:
std::vector<int> strides_;
Padding padding_;
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/ops/depthwise_conv2d.h"
namespace mace {
REGISTER_CPU_OPERATOR(DepthwiseConv2d, DepthwiseConv2dOp<DeviceType::CPU, float>);
#if __ARM_NEON
REGISTER_NEON_OPERATOR(DepthwiseConv2d, DepthwiseConv2dOp<DeviceType::NEON, float>);
#endif // __ARM_NEON
} // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_OPS_DEPTHWISE_CONV_H_
#define MACE_OPS_DEPTHWISE_CONV_H_
#include <memory>
#include "mace/core/operator.h"
#include "mace/kernels/conv_2d.h"
#include "mace/ops/conv_pool_2d_base.h"
#include "mace/kernels/depthwise_conv2d.h"
namespace mace {
template <DeviceType D, typename T>
class DepthwiseConv2dOp : public ConvPool2dOpBase<D, T> {
public:
DepthwiseConv2dOp(const OperatorDef& op_def, Workspace* ws)
: ConvPool2dOpBase<D, T>(op_def, ws),
functor_(this->strides_.data(), this->padding_, this->dilations_.data()) {}
bool Run() override {
const Tensor* input = this->Input(INPUT);
const Tensor* filter = this->Input(FILTER);
const Tensor* bias = this->Input(BIAS);
Tensor* output = this->Output(OUTPUT);
// Reshape the filter: fold the depth multiplier into the output-channel
// dimension, (multiplier, in_channels, kh, kw) -> (multiplier * in_channels, 1, kh, kw).
std::vector<index_t> filter_shape(filter->shape().begin(), filter->shape().end());
filter_shape[0] *= filter_shape[1];
filter_shape[1] = 1;
std::vector<index_t> output_shape(4);
this->CalOutputSize(
input->shape().data(), filter_shape.data(), this->dilations_.data(),
this->strides_.data(), this->padding_, output_shape.data());
output->Resize(output_shape);
functor_(input->data<T>(), input->shape().data(), filter->data<T>(),
filter_shape.data(), bias->data<T>(), output->mutable_data<T>(),
output->shape().data());
return true;
}
private:
kernels::DepthwiseConv2dFunctor<D, T> functor_;
protected:
OP_INPUT_TAGS(INPUT, FILTER, BIAS);
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace mace
#endif // MACE_OPS_DEPTHWISE_CONV_H_
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/ops/conv_2d.h"
#include "mace/ops/ops_test_util.h"
using namespace mace;
class DepthwiseConv2dOpTest : public OpsTestBase {};
TEST_F(DepthwiseConv2dOpTest, Simple_VALID) {
// Construct graph
auto& net = test_net();
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.Finalize(net.operator_def());
// Add args
net.AddIntsArg("strides", {1, 1});
net.AddIntArg("padding", Padding::VALID);
net.AddIntsArg("dilations", {1, 1});
// Add input data
net.AddInputFromArray<float>(
"Input", {1, 2, 2, 3},
{1, 3, 5, 7, 9, 11, 2, 4, 6, 8, 10, 12});
net.AddInputFromArray<float>(
"Filter", {2, 2, 2, 2},
{1.0f, 5.0f, 9.0f, 13.0f,
2.0f, 6.0f, 10.0f, 14.0f,
3.0f, 7.0f, 11.0f, 15.0f,
4.0f, 8.0f, 12.0f, 16.0f});
net.AddInputFromArray<float>("Bias", {4}, {.1f, .2f, .3f, .4f});
// Run
net.RunOp();
// Check
auto expected = CreateTensor<float>({1, 4, 1, 2},
{196.1f, 252.1f, 216.2f, 280.2f,
272.3f, 344.3f, 296.4f, 376.4f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
}
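// Hand check of the first expected value above: output channel 0 pairs input
// channel 0 (c / multiplier == 0) with filter block {1, 5, 9, 13}, so at
// output position (0, 0):
//   1*1 + 3*5 + 7*9 + 9*13 = 1 + 15 + 63 + 117 = 196, plus bias 0.1 = 196.1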
TEST_F(DepthwiseConv2dOpTest, ConvNxNS12) {
testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
Padding type) {
srand(time(NULL));
// generate random input
index_t batch = 2 + rand() % 10;
index_t input_channels = 3 + rand() % 10;
index_t height = 107;
index_t width = 113;
index_t multiplier = 3 + rand() % 10;
// Construct graph
auto& net = test_net();
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.Finalize(net.operator_def());
// Add args
net.AddIntsArg("strides", {stride_h, stride_w});
net.AddIntArg("padding", type);
net.AddIntsArg("dilations", {1, 1});
// Add input data
net.AddRandomInput<float>("Input", {batch, input_channels, height, width});
net.AddRandomInput<float>(
"Filter", {multiplier, input_channels, kernel_h, kernel_w});
net.AddRandomInput<float>("Bias", {multiplier * input_channels});
// run cpu
net.RunOp();
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// Run NEON
net.RunOp(DeviceType::NEON);
ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 1e-3);
};
for (int kernel_size : {3}) {
for (int stride : {1, 2}) {
func(kernel_size, kernel_size, stride, stride, VALID);
func(kernel_size, kernel_size, stride, stride, SAME);
}
}
}