Commit 8bb5716a authored by Liangliang He

Merge branch 'conv2d-neon' into 'master'

Neon conv2d 3x3 stride 2 kernel.

See merge request !44
@@ -22,6 +22,13 @@ extern void Conv2dNeonK3x3S1(const float *input,
                             float *output,
                             const index_t *output_shape);
+extern void Conv2dNeonK3x3S2(const float *input,
+                             const index_t *input_shape,
+                             const float *filter,
+                             const float *bias,
+                             float *output,
+                             const index_t *output_shape);
+
extern void Conv2dNeonK5x5S1(const float *input,
                             const index_t *input_shape,
                             const float *filter,
@@ -30,27 +37,25 @@ extern void Conv2dNeonK5x5S1(const float *input,
                             const index_t *output_shape);

template <>
-void Conv2dFunctor<DeviceType::NEON,
-                   float>::
-operator()(const float *input,  // NCHW
-           const index_t *input_shape,
-           const float *filter,  // c_out, c_in, kernel_h, kernel_w
-           const index_t *filter_shape,
-           const float *bias,  // c_out
-           float *output,  // NCHW
-           const index_t *output_shape) {
+void Conv2dFunctor<DeviceType::NEON, float>::operator()(const float *input,
+                                                        const index_t *input_shape,
+                                                        const float *filter,
+                                                        const index_t *filter_shape,
+                                                        const float *bias,
+                                                        float *output,
+                                                        const index_t *output_shape) {
  typedef void (*Conv2dNeonFunction)(
-      const float *input,  // NCHW
+      const float *input,
      const index_t *input_shape,
-      const float *filter,  // c_out, c_in, kernel_h, kernel_w
-      const float *bias,    // c_out
-      float *output,        // NCHW
+      const float *filter,
+      const float *bias,
+      float *output,
      const index_t *output_shape);
  // Selection matrix: kernel_size x stride_size
  static const Conv2dNeonFunction selector[5][2] = {
      {Conv2dNeonK1x1S1, nullptr},
      {nullptr, nullptr},
-      {Conv2dNeonK3x3S1, nullptr},
+      {Conv2dNeonK3x3S1, Conv2dNeonK3x3S2},
      {nullptr, nullptr},
      {Conv2dNeonK5x5S1, nullptr}};
  // not implement yet
@@ -59,7 +64,10 @@ operator()(const float *input, // NCHW
  if (kernel_h != kernel_w || kernel_h > 5 || strides_[0] != strides_[1] ||
      strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1 ||
      selector[kernel_h - 1][strides_[0] - 1] == nullptr) {
-    LOG(WARNING) << "NEON conv2d kernel not implementated, using slow vesion";
+    LOG(WARNING) << "NEON conv2d kernel with "
+                 << "filter" << kernel_h << "x" << kernel_w << ","
+                 << " stride " << strides_[0] << "x" << strides_[1]
+                 << " is not implemented yet, using slow version";
    Conv2dFunctor<DeviceType::CPU, float>(strides_, paddings_, dilations_)(
        input, input_shape, filter, filter_shape, bias, output, output_shape);
    return;
......
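The body of the new Conv2dNeonK3x3S2 kernel is elided from this view. As a rough sketch of the usual approach for a stride-2 3x3 NEON kernel (an illustration under assumed names, not the merged MACE implementation), vld2q_f32 can de-interleave even and odd input columns so that four stride-2 outputs are accumulated per iteration:

#include <arm_neon.h>
#include <cstdint>

typedef int64_t index_t;  // stands in for mace's index_t (assumption)

// Illustrative only: one output row of a single-channel 3x3 stride-2
// convolution with VALID padding; assumes out_width is a multiple of 4.
static void Conv3x3S2RowSketch(const float *in, index_t in_width,
                               const float *filter,  // 9 taps, row-major
                               float *out, index_t out_width) {
  for (index_t ox = 0; ox < out_width; ox += 4) {
    const float *p = in + 2 * ox;             // horizontal stride of 2
    float32x4_t acc = vdupq_n_f32(0.f);
    for (int ky = 0; ky < 3; ++ky) {
      const float *row = p + ky * in_width;
      float32x4x2_t v0 = vld2q_f32(row);      // val[0]: even cols, val[1]: odd cols
      float32x4x2_t v2 = vld2q_f32(row + 2);  // even cols shifted by two
      acc = vmlaq_n_f32(acc, v0.val[0], filter[3 * ky + 0]);  // tap (ky, 0)
      acc = vmlaq_n_f32(acc, v0.val[1], filter[3 * ky + 1]);  // tap (ky, 1)
      acc = vmlaq_n_f32(acc, v2.val[0], filter[3 * ky + 2]);  // tap (ky, 2)
    }
    vst1q_f32(out + ox, acc);
  }
}

With the selector table above, the new kernel is reached as selector[2][1] (kernel_h == 3, strides_[0] == 2); all other newly indexed combinations remain nullptr and fall through to the CPU version.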
@@ -61,8 +61,7 @@ static void Conv2d(int iters,
    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
    mace::testing::ItemsProcessed(tot); \
    mace::testing::BytesProcessed(tot*(sizeof(TYPE))); \
-    Conv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, mace::Padding::P, \
-                         OC); \
+    Conv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, mace::Padding::P, OC); \
  } \
  BENCHMARK( \
      BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_OC##_##TYPE##_##DEVICE)
@@ -77,6 +76,10 @@ BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 128, float);
BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 128, float);
BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 128, float);
BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 128, float);
+BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 128, float);
+BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 128, float);
+BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 128, float);
+BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 128, float);
BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, VALID, 128, float);
BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, VALID, 128, float);
BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, SAME, 128, float);
......
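Each new stride-2 line above stamps out one benchmark. Judging from the visible fragment of the BM_CONV_2D macro (its full definition is truncated in this view, so the function header and the DEVICE handling below are assumptions), the first new case expands roughly to:

// Sketch of the expansion of BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 128, float)
// for an assumed CPU device; note the generated name keeps a literal "OC"
// per the token-pasting pattern shown above.
static void BM_CONV_2D_1_64_32_32_K3x3S2_VALID_OC_float_CPU(int iters) {
  const int64_t tot = static_cast<int64_t>(iters) * 1 * 64 * 32 * 32;
  mace::testing::ItemsProcessed(tot);
  mace::testing::BytesProcessed(tot * (sizeof(float)));
  Conv2d<DeviceType::CPU, float>(iters, 1, 64, 32, 32, 3, 3, 2,
                                 mace::Padding::VALID, 128);
}
BENCHMARK(BM_CONV_2D_1_64_32_32_K3x3S2_VALID_OC_float_CPU);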
@@ -174,8 +174,8 @@ TEST_F(Conv2dOpTest, ConvNxNS12) {
  // generate random input
  index_t batch = 1 + rand() % 10;
  index_t input_channels = 1 + rand() % 50;
-  index_t height = 7 + rand() % 100;
-  index_t width = 7 + rand() % 100;
+  index_t height = 11 + rand() % 100;
+  index_t width = 11 + rand() % 100;
  index_t output_channels = 1 + rand() % 50;
  // Construct graph
  auto& net = test_net();
......
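Raising the minimum random extent from 7 to 11 presumably keeps every kernel/stride combination the NxN sweep exercises comfortably sized. With VALID padding and no dilation, the output extent along one dimension is (in - kernel) / stride + 1, so even the largest tested kernel at the new stride (5x5, stride 2) retains at least four output rows and columns. A minimal check (illustrative, not part of the patch):

#include <cstdint>

// Output extent of a VALID-padded, dilation-1 convolution along one dimension.
constexpr int64_t ConvOutSize(int64_t in, int64_t kernel, int64_t stride) {
  return (in - kernel) / stride + 1;
}
static_assert(ConvOutSize(11, 5, 2) == 4, "worst case at the new minimum");
static_assert(ConvOutSize(7, 5, 2) == 2, "the old minimum was merely tight");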
@@ -155,9 +155,9 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) {
  net.RunOp(DeviceType::NEON);

  // Check
-  Tensor expected = CreateTensor<float>({1, 1, 2, 3}, {6, 8, 9, 16, 18, 19});
+  auto expected = CreateTensor<float>({1, 1, 2, 3}, {6, 8, 9, 16, 18, 19});
-  ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.001);
+  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
TEST_F(PoolingOpTest, MAX_k3x3s2x2) {
@@ -183,7 +183,7 @@ TEST_F(PoolingOpTest, MAX_k3x3s2x2) {
  net.RunOp(DeviceType::NEON);

  // Check
-  Tensor expected = CreateTensor<float>({1, 1, 2, 3}, {11, 13, 14, 16, 18, 19});
+  auto expected = CreateTensor<float>({1, 1, 2, 3}, {11, 13, 14, 16, 18, 19});
-  ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.001);
+  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
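The switch from a by-value Tensor to auto, together with the added dereferences in the ExpectTensorNear calls, indicates that CreateTensor now returns a pointer-like handle instead of a Tensor by value. A plausible declaration consistent with these call sites (an assumption; the declaration itself is not part of this diff):

#include <memory>
#include <vector>

// Hypothetical signature inferred from the call sites above; the real return
// type could equally be a raw pointer or another smart-pointer type.
template <typename T>
std::unique_ptr<Tensor> CreateTensor(const std::vector<index_t> &shape,
                                     const std::vector<T> &data);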