提交 74f5a329 编写于 作者: B Bin Li

Optimize 7x7 convolution for strides 1, 2 and 3 using ARMv7 NEON kernels

上级 6ad006d0
......@@ -51,6 +51,39 @@ extern void Conv2dNeonK3x3S2(const float *input,
const index_t out_channels,
float *output);
// 7x7, stride-1 convolution kernel (ARMv7 NEON); definition lives in a
// separate NEON implementation file.  Dimension arguments mirror the
// Conv2dNeonK3x3S2 declaration above.
// NOTE(review): tensor memory layout (NCHW vs NHWC) is not visible from
// this declaration — it must match the other Conv2dNeonK* kernels.
extern void Conv2dNeonK7x7S1(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
float *output);
// 7x7, stride-2 convolution kernel (ARMv7 NEON); definition lives in a
// separate NEON implementation file.  Same argument contract as
// Conv2dNeonK7x7S1.
extern void Conv2dNeonK7x7S2(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
float *output);
// 7x7, stride-3 convolution kernel (ARMv7 NEON); definition lives in a
// separate NEON implementation file.  Same argument contract as
// Conv2dNeonK7x7S1.
extern void Conv2dNeonK7x7S3(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
float *output);
} // namespace kernels
} // namespace mace
......
此差异已折叠。
......@@ -224,6 +224,12 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
&& stride_h == 2 && stride_w == 2 && dilation_h == 1 && dilation_w == 1;
bool use_neon_1x1_s1 = filter_h == 1 && filter_w == 1
&& stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1;
// Dispatch flags for the specialized 7x7 NEON kernels: only selected for
// square 7x7 filters with unit dilation and matching square strides
// (1, 2 or 3).  Any other configuration falls through to the generic
// convolution path (Conv2dGeneral) further below.
bool use_neon_7x7_s1 = filter_h == 7 && filter_w == 7
&& stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1;
bool use_neon_7x7_s2 = filter_h == 7 && filter_w == 7
&& stride_h == 2 && stride_w == 2 && dilation_h == 1 && dilation_w == 1;
bool use_neon_7x7_s3 = filter_h == 7 && filter_w == 7
&& stride_h == 3 && stride_w == 3 && dilation_h == 1 && dilation_w == 1;
std::vector<index_t> transformed_input_shape;
std::vector<index_t> transformed_output_shape;
......@@ -288,6 +294,44 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
if (extra_input_width != padded_input_width) {
pad_right += (extra_input_width - padded_input_width);
}
} else if (use_neon_7x7_s1) {
// Stride 1: required input extent per dimension is out + (7 - 1).
// Output width is rounded up to a multiple of 4 — presumably so the
// NEON kernel can process 4 float lanes per iteration (TODO confirm
// against the kernel implementation).  Any extra extent beyond the
// already-padded input is added as bottom/right padding.
extra_output_height = height;
extra_input_height =
std::max(padded_input_height, extra_output_height + 6);
extra_output_width = RoundUp<index_t>(width, 4);
extra_input_width = std::max(padded_input_width, extra_output_width + 6);
if (extra_input_height != padded_input_height) {
pad_bottom += (extra_input_height - padded_input_height);
}
if (extra_input_width != padded_input_width) {
pad_right += (extra_input_width - padded_input_width);
}
} else if (use_neon_7x7_s2) {
// Stride 2: required input extent is (out - 1) * 2 + 7 per dimension,
// the standard convolution input-size formula for kernel size 7.
extra_output_height = height;
extra_input_height =
std::max(padded_input_height, (extra_output_height - 1) * 2 + 7);
extra_output_width = RoundUp<index_t>(width, 4);
extra_input_width =
std::max(padded_input_width, (extra_output_width - 1) * 2 + 7);
if (extra_input_height != padded_input_height) {
pad_bottom += (extra_input_height - padded_input_height);
}
if (extra_input_width != padded_input_width) {
pad_right += (extra_input_width - padded_input_width);
}
} else if (use_neon_7x7_s3) {
// Stride 3: required input extent is (out - 1) * 3 + 7 per dimension.
extra_output_height = height;
extra_input_height =
std::max(padded_input_height, (extra_output_height - 1) * 3 + 7);
extra_output_width = RoundUp<index_t>(width, 4);
extra_input_width =
std::max(padded_input_width, (extra_output_width - 1) * 3 + 7);
if (extra_input_height != padded_input_height) {
pad_bottom += (extra_input_height - padded_input_height);
}
if (extra_input_width != padded_input_width) {
pad_right += (extra_input_width - padded_input_width);
}
}
// decide scratch size before allocate it
......@@ -413,6 +457,45 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
channels,
pad_output);
};
} else if (use_neon_7x7_s1) {
// Bind the 7x7 stride-1 NEON kernel.  Note it is called with the
// extra_* (padded) dimensions computed earlier, not the raw tensor
// sizes; the [=] capture copies those index_t values into the lambda.
conv_func = [=](const float *pad_input, float *pad_output) {
Conv2dNeonK7x7S1(pad_input,
filter_data,
batch,
extra_input_height,
extra_input_width,
input_channels,
extra_output_height,
extra_output_width,
channels,
pad_output);
};
} else if (use_neon_7x7_s2) {
// Bind the 7x7 stride-2 NEON kernel (same argument pattern).
conv_func = [=](const float *pad_input, float *pad_output) {
Conv2dNeonK7x7S2(pad_input,
filter_data,
batch,
extra_input_height,
extra_input_width,
input_channels,
extra_output_height,
extra_output_width,
channels,
pad_output);
};
} else if (use_neon_7x7_s3) {
// Bind the 7x7 stride-3 NEON kernel (same argument pattern).
conv_func = [=](const float *pad_input, float *pad_output) {
Conv2dNeonK7x7S3(pad_input,
filter_data,
batch,
extra_input_height,
extra_input_width,
input_channels,
extra_output_height,
extra_output_width,
channels,
pad_output);
};
} else {
conv_func = [=](const float *pad_input, float *pad_output) {
Conv2dGeneral(pad_input,
......
......@@ -152,6 +152,9 @@ BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, 1, SAME, 128);
BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, 1, SAME, 128);
BM_CONV_2D(1, 64, 32, 31, 15, 1, 1, 1, SAME, 128);
BM_CONV_2D(1, 64, 32, 31, 1, 15, 1, 1, SAME, 128);
// Benchmarks exercising the new 7x7 NEON kernels at strides 1, 2 and 3
// (presumably the third-from-last argument is the stride, matching the
// 5x5/15x1 cases above — verify against the BM_CONV_2D macro definition).
BM_CONV_2D(1, 64, 32, 31, 7, 7, 1, 1, SAME, 128);
BM_CONV_2D(1, 64, 32, 31, 7, 7, 2, 1, SAME, 128);
BM_CONV_2D(1, 64, 32, 31, 7, 7, 3, 1, SAME, 128);
// 3 channels input
BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, 1, VALID, 3);
......
......@@ -878,7 +878,7 @@ void TestArbitraryPadConvNxN(const std::vector<index_t> &shape,
1e-4, 1e-4);
};
for (int kernel_size : {3, 5}) {
for (int kernel_size : {3, 5, 7}) {
for (int stride : {2, 3}) {
func(kernel_size, kernel_size, stride, stride);
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册