Commit dd501eea authored by Liangliang He

Fix boundary checking and tuning performance

Parent aba88a23
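The diff below splits each depthwise output map into an interior region, handled by DepthwiseConv2dNoOOBCheckKernel with no per-element bounds checks, and up to four border strips that still run through the checked DepthwiseConv2dKernel. The following is a minimal standalone sketch (not MACE code) of the valid-region arithmetic the change introduces; the padding, stride, and height values are made-up example inputs.

// Sketch: mirrors the valid_h_start / valid_h_stop computation in the diff,
// i.e. the range of output rows that can skip boundary checks.
#include <algorithm>
#include <cstdio>

int main() {
  const int height = 112;                           // assumed output height
  const int paddings_top = 1, paddings_bottom = 1;  // assumed per-side padding
  const int stride_h = 2;                           // assumed vertical stride

  // First output row treated as free of top-boundary checks.
  const int valid_h_start =
      paddings_top == 0 ? 0 : (paddings_top - 1) / stride_h + 1;
  // One past the last output row treated as free of bottom-boundary checks.
  const int valid_h_stop =
      paddings_bottom == 0 ? height
                           : height - ((paddings_bottom - 1) / stride_h + 1);

  std::printf("checked rows: [0, %d) and [%d, %d); unchecked rows: [%d, %d)\n",
              valid_h_start, std::max(valid_h_start, valid_h_stop), height,
              valid_h_start, valid_h_stop);
  return 0;
}

The same formula is applied along the width axis (valid_w_start / valid_w_stop), and std::max guards the degenerate case where the two checked strips overlap.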
......@@ -5,6 +5,10 @@
#ifndef MACE_KERNELS_DEPTHWISE_CONV2D_H_
#define MACE_KERNELS_DEPTHWISE_CONV2D_H_
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
#include <arm_neon.h>
#endif
#include "mace/core/common.h"
#include "mace/core/future.h"
#include "mace/core/public/mace.h"
......@@ -64,7 +68,9 @@ void DepthwiseConv2dKernel(const T *input_ptr,
inw >= input_width) {
            MACE_CHECK(inh >= padded_h_start && inh < padded_h_stop &&
                           inw >= padded_w_start && inw < padded_w_stop,
                       "Out of range read from input: ", padded_h_start,
                       " <= ", inh, " < ", padded_h_stop, ", ",
                       padded_w_start, " <= ", inw, " < ", padded_w_stop);
} else {
index_t input_offset =
n * input_height * input_width * input_channels +
......@@ -108,33 +114,120 @@ void DepthwiseConv2dNoOOBCheckKernel(const T *input_ptr,
int h_stop,
int w_start,
int w_stop) {
if (multiplier == 1) {
constexpr int c_tile_size = 4;
#pragma omp parallel for collapse(3)
for (int n = 0; n < batch; ++n) {
for (int h = h_start; h < h_stop; ++h) {
for (int w = w_start; w < w_stop; ++w) {
int c;
for (c = 0; c + c_tile_size <= channels; c += c_tile_size) {
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
static_assert(c_tile_size == 4, "channels tile size must be 4");
float32x4_t sum = vdupq_n_f32(0);
if (bias_ptr != nullptr) {
sum = vld1q_f32(bias_ptr + c);
}
#else
T sum[c_tile_size] = {0};
if (bias_ptr != nullptr) {
for (int ci = 0; ci < c_tile_size; ++ci) {
sum[ci] = bias_ptr[c + ci];
}
}
#endif
const T *filter_base = filter_ptr + c;
for (int kh = 0; kh < kernel_h; ++kh) {
for (int kw = 0; kw < kernel_w; ++kw) {
int inh = padded_h_start + h * stride_h + dilation_h * kh;
int inw = padded_w_start + w * stride_w + dilation_w * kw;
MACE_ASSERT(inh >= 0 && inh < input_height && inw >= 0 &&
inw < input_width);
index_t input_offset =
n * input_height * input_width * input_channels +
inh * input_width * input_channels + inw * input_channels +
c;
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
float32x4_t in = vld1q_f32(input_ptr + input_offset);
float32x4_t weights = vld1q_f32(filter_base);
sum = vfmaq_f32(sum, in, weights);
#else
for (int ci = 0; ci < c_tile_size; ++ci) {
sum[ci] +=
input_ptr[input_offset + ci] * filter_base[ci]; // HWIM
}
#endif
filter_base += input_channels;
}
}
index_t offset = n * height * width * channels +
h * width * channels + w * channels + c;
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
vst1q_f32(output_ptr + offset, sum);
#else
for (int ci = 0; ci < c_tile_size; ++ci) {
output_ptr[offset + ci] = sum[ci];
}
#endif
}
for (; c < channels; ++c) {
T bias_channel = bias_ptr ? bias_ptr[c] : 0;
index_t offset = n * height * width * channels +
h * width * channels + w * channels + c;
output_ptr[offset] = bias_channel;
T sum = 0;
const T *filter_base = filter_ptr + c;
for (int kh = 0; kh < kernel_h; ++kh) {
for (int kw = 0; kw < kernel_w; ++kw) {
int inh = padded_h_start + h * stride_h + dilation_h * kh;
int inw = padded_w_start + w * stride_w + dilation_w * kw;
MACE_ASSERT(inh >= 0 && inh < input_height && inw >= 0 &&
inw < input_width);
index_t input_offset =
n * input_height * input_width * input_channels +
inh * input_width * input_channels + inw * input_channels +
c;
sum += input_ptr[input_offset] * filter_base[0]; // HWIM
filter_base += input_channels * multiplier;
}
}
output_ptr[offset] += sum;
}
}
}
}
  } else {
#pragma omp parallel for collapse(4)
    for (int n = 0; n < batch; ++n) {
      for (int h = h_start; h < h_stop; ++h) {
        for (int w = w_start; w < w_stop; ++w) {
          for (int c = 0; c < channels; ++c) {
            const index_t inc = c / multiplier;
            const index_t m = c % multiplier;
            T bias_channel = bias_ptr ? bias_ptr[c] : 0;
            index_t offset = n * height * width * channels +
                             h * width * channels + w * channels + c;
            output_ptr[offset] = bias_channel;
            T sum = 0;
            const T *filter_base = filter_ptr + inc * multiplier + m;
            for (int kh = 0; kh < kernel_h; ++kh) {
              for (int kw = 0; kw < kernel_w; ++kw) {
                int inh = padded_h_start + h * stride_h + dilation_h * kh;
                int inw = padded_w_start + w * stride_w + dilation_w * kw;
                MACE_ASSERT(inh >= 0 && inh < input_height && inw >= 0 &&
                            inw < input_width);
                index_t input_offset =
                    n * input_height * input_width * input_channels +
                    inh * input_width * input_channels + inw * input_channels +
                    inc;
                sum += input_ptr[input_offset] * filter_base[0];  // HWIM
                filter_base += input_channels * multiplier;
              }
            }
            output_ptr[offset] += sum;
          }
        }
      }
    }
  }
}
......@@ -230,10 +323,15 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase {
MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch");
    int paddings_top = paddings[0] / 2;
    int paddings_bottom = paddings[0] - paddings_top;
    int paddings_left = paddings[1] / 2;
    int paddings_right = paddings[1] - paddings_left;

    // The upper-left-most offset of the padded input
    int padded_h_start = 0 - paddings_top;
    int padded_w_start = 0 - paddings_left;
    index_t padded_h_stop = input_height + paddings_bottom;
    index_t padded_w_stop = input_width + paddings_right;
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
......@@ -244,38 +342,59 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase {
const T *bias_ptr = bias == nullptr ? nullptr : bias->data<T>();
T *output_ptr = output->mutable_data<T>();
int valid_h_start =
paddings_top == 0 ? 0 : (paddings_top - 1) / stride_h + 1;
int valid_h_stop = paddings_bottom == 0
? height
: height - ((paddings_bottom - 1) / stride_h + 1);
int valid_w_start =
paddings_left == 0 ? 0 : (paddings_left - 1) / stride_w + 1;
int valid_w_stop = paddings_right == 0
? width
: width - ((paddings_right - 1) / stride_w + 1);
    // Calculate border elements with out-of-boundary checking
    if (valid_h_start > 0) {
      DepthwiseConv2dKernel<T>(
          input_ptr, filter_ptr, bias_ptr, output_ptr, batch, height, width,
          channels, input_height, input_width, input_channels, multiplier,
          padded_h_start, padded_h_stop, padded_w_start, padded_w_stop,
          kernel_h, kernel_w, stride_h, stride_w, dilation_h, dilation_w, 0,
          valid_h_start, 0, width);
    }
    if (valid_h_stop < height) {
      DepthwiseConv2dKernel<T>(
          input_ptr, filter_ptr, bias_ptr, output_ptr, batch, height, width,
          channels, input_height, input_width, input_channels, multiplier,
          padded_h_start, padded_h_stop, padded_w_start, padded_w_stop,
          kernel_h, kernel_w, stride_h, stride_w, dilation_h, dilation_w,
          std::max(valid_h_start, valid_h_stop), height, 0, width);
    }
    if (valid_w_start > 0) {
      DepthwiseConv2dKernel<T>(
          input_ptr, filter_ptr, bias_ptr, output_ptr, batch, height, width,
          channels, input_height, input_width, input_channels, multiplier,
          padded_h_start, padded_h_stop, padded_w_start, padded_w_stop,
          kernel_h, kernel_w, stride_h, stride_w, dilation_h, dilation_w,
          valid_h_start, valid_h_stop, 0, valid_w_start);
    }
    if (valid_w_stop < width) {
      DepthwiseConv2dKernel<T>(
          input_ptr, filter_ptr, bias_ptr, output_ptr, batch, height, width,
          channels, input_height, input_width, input_channels, multiplier,
          padded_h_start, padded_h_stop, padded_w_start, padded_w_stop,
          kernel_h, kernel_w, stride_h, stride_w, dilation_h, dilation_w,
          valid_h_start, valid_h_stop, std::max(valid_w_start, valid_w_stop),
          width);
    }

    // Calculate inner elements without out-of-boundary checking
    DepthwiseConv2dNoOOBCheckKernel<T>(
        input_ptr, filter_ptr, bias_ptr, output_ptr, batch, height, width,
        channels, input_height, input_width, input_channels, multiplier,
        padded_h_start, padded_h_stop, padded_w_start, padded_w_stop, kernel_h,
        kernel_w, stride_h, stride_w, dilation_h, dilation_w, valid_h_start,
        valid_h_stop, valid_w_start, valid_w_stop);
output_ptr = output->mutable_data<T>();
DoActivation(output_ptr, output_ptr, output->NumElements(), activation_,
......
......@@ -75,25 +75,27 @@ static void DepthwiseConv2d(int iters,
}
}
#define BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, OC, TYPE, \
                                   DEVICE)                                  \
  static void                                                               \
      BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE( \
          int iters) {                                                      \
    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;        \
    mace::testing::ItemsProcessed(tot);                                     \
    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                     \
    DepthwiseConv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE,        \
                                  mace::Padding::P, OC);                    \
  }                                                                         \
  BENCHMARK(                                                                \
      BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)
#define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, OC) \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, float, CPU); \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, float, OPENCL); \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, half, OPENCL);
BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 1, SAME, 1);
BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 2, SAME, 1);
BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 1);
BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 1);
BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 1);
......
......@@ -280,10 +280,10 @@ void TestNxNS12(const index_t height, const index_t width) {
ExpectTensorNear<float>(expected, *net.GetOutput("DeviceOutput"), 0.1);
};
  for (int kernel_size : {2, 3, 4}) {
    for (int stride : {1, 2}) {
      func(kernel_size, kernel_size, stride, stride, VALID);
      func(kernel_size, kernel_size, stride, stride, SAME);
}
}
}
......