Unverified commit e14fb0d6, authored by Ruilong Liu, committed by GitHub

Merge pull request #517 from Eclipsess/develop

fix #516 update dwbnrelu
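This commit slims the fused depthwise conv + batch-norm + ReLU kernel: the separate bias/if_bias and if_bn paths are removed, on the premise that the elementwise-add bias and the BN affine have already been folded into the NewScale/NewBias tensors the kernel consumes. The diff itself does not show that folding; the sketch below is one standard way to do it, assuming the usual inference-time BN algebra (gamma, beta, mean, variance, epsilon are the BN parameters, conv_bias the fused elementwise add; all names here are illustrative, not from this repo):

    #include <cmath>
    #include <vector>

    // Hypothetical pre-pass: fold conv bias + BN into one per-channel
    // scale/bias pair so the kernel epilogue reduces to
    //   y = new_scale[c] * conv(x)[c] + new_bias[c].
    void FoldBNIntoScaleBias(const std::vector<float> &gamma,
                             const std::vector<float> &beta,
                             const std::vector<float> &mean,
                             const std::vector<float> &variance,
                             const std::vector<float> &conv_bias, float epsilon,
                             std::vector<float> *new_scale,
                             std::vector<float> *new_bias) {
      for (size_t c = 0; c < gamma.size(); ++c) {
        float inv_std = 1.f / std::sqrt(variance[c] + epsilon);
        (*new_scale)[c] = gamma[c] * inv_std;
        // the folded bias absorbs the conv's elementwise-add bias as well:
        // BN(x + b) = scale * x + (beta + (b - mean) * scale)
        (*new_bias)[c] = beta[c] + (conv_bias[c] - mean[c]) * (*new_scale)[c];
      }
    }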
@@ -108,7 +108,7 @@ void ConvAddBNReluBasic(const FusionConvAddBNReluParam &param) {
     Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
     math::matmul<float>(filter_slice, false, col_matrix, false,
                         static_cast<float>(1), &out_slice,
-                        static_cast<float>(1));
+                        static_cast<float>(0));
   }
 }
 /// todo : use neon in special case instead of 2for(300ms)
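The one-character change above is the core bug fix for #516. Assuming math::matmul follows the conventional GEMM contract C = alpha * A * B + beta * C, passing beta = 1 accumulates on top of whatever already sits in the output slice, while beta = 0 overwrites it. A toy illustration of the difference:

    #include <cstdio>

    int main() {
      float ab = 2.0f * 3.0f;   // the fresh product A x B
      float c_stale = 42.0f;    // stale data already in the output buffer
      float beta1 = 1.0f * ab + 1.0f * c_stale;  // 48.0 -- stale value leaks in
      float beta0 = 1.0f * ab + 0.0f * c_stale;  // 6.0  -- output overwritten
      std::printf("beta=1: %.1f, beta=0: %.1f\n", beta1, beta0);
      return 0;
    }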
@@ -131,15 +131,16 @@ void ConvAddBNReluCompute(const FusionConvAddBNReluParam &param) {
       param.Input()->dims()[1] == param.Output()->dims()[1] &&
       param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
       param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) {
-    math::DepthwiseConvAddBNRelu3x3s1p1(
-        param.Input(), param.Filter(), param.Output(), &Bias, 1,
-        param.NewScale(), param.NewBias(), 1, 1);
-  } else if (0 && param.Groups() == param.Input()->dims()[1] &&
+    math::DepthwiseConvAddBNRelu3x3s1p1(param.Input(), param.Filter(),
+                                        param.Output(), param.NewScale(),
+                                        param.NewBias(), 1);
+  } else if (param.Groups() == param.Input()->dims()[1] &&
              param.Input()->dims()[1] == param.Output()->dims()[1] &&
              param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
              param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) {
-    math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(),
-                           param.Filter(), &Bias, param.Output(), false);
+    math::DepthwiseConvAddBNRelu3x3s2p1(param.Input(), param.Filter(),
+                                        param.Output(), param.NewScale(),
+                                        param.NewBias(), 1);
   } else {
     ConvAddBNReluBasic(param);
   }
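Two things change in this dispatch: the stride-1 call is rewritten for the slimmer DepthwiseConvAddBNRelu3x3s1p1 signature, and the `0 &&` that permanently short-circuited the stride-2 branch is removed, so 3x3 stride-2 depthwise convolutions now reach the new fused kernel instead of falling through to the generic path. The shape test both branches share amounts to this hypothetical helper (names are illustrative, not from the diff):

    // A conv is a depthwise 3x3 candidate when every channel is its own
    // group and the filter is square with side 3.
    bool IsDepthwise3x3(int groups, int in_channels, int out_channels,
                        int filter_h, int filter_w) {
      return groups == in_channels && in_channels == out_channels &&
             filter_h == filter_w && filter_h == 3;
    }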
@@ -514,14 +514,11 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
 }
 void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
-                                   Tensor *output, Tensor *bias, bool if_bias,
-                                   const Tensor *new_scale,
-                                   const Tensor *new_bias, bool if_bn,
-                                   bool if_relu) {
+                                   Tensor *output, const Tensor *new_scale,
+                                   const Tensor *new_bias, bool if_relu) {
   const float *input_data = input->data<float>();
   const float *filter_data = filter->data<float>();
   float *output_data = output->data<float>();
-  const float *bias_data = bias->data<float>();
   const float *newscale_data = new_scale->data<float>();
   const float *newbias_data = new_bias->data<float>();
@@ -532,7 +529,6 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
   const int batch_size = static_cast<int>(input->dims()[0]);
   const int c = static_cast<int>(input->dims()[1]);
   const int hxw = h * w;
-  float32x4_t vbias = vdupq_n_f32(0.0);
   float32x4_t vnewbias = vdupq_n_f32(0.0);
   float32x4_t vnewscale = vdupq_n_f32(1.0);
   float32x4_t vzero = vdupq_n_f32(0);
@@ -541,13 +537,9 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
   const float *filter_data_tmp = filter_data;
   for (int j = 0; j < c; ++j) {
-    if (if_bias) {
-      vbias = vdupq_n_f32(bias_data[j]);
-    }
-    if (if_bn) {
-      vnewbias = vdupq_n_f32(newbias_data[j]);
-      vnewscale = vdupq_n_f32(newscale_data[j]);
-    }
+    vnewbias = vdupq_n_f32(newbias_data[j]);
+    vnewscale = vdupq_n_f32(newscale_data[j]);
     int l_mid = l - 2;  // l=1->l_mid=-1,l=2->l_mid=0
     float w00 = filter_data_tmp[0];
     float w01 = filter_data_tmp[1];
@@ -573,21 +565,14 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
                          w01 * input_data[(l - 2) * (l + 1) + 1] +
                          w10 * input_data[l * l - 2] +
                          w11 * input_data[l * l - 1];
-    if (if_bias) {
-      output_data[0] += bias_data[j];
-      output_data[l - 1] += bias_data[j];
-      output_data[(l - 1) * l] += bias_data[j];
-      output_data[l * l - 1] += bias_data[j];
-    }
-    if (if_bn) {
-      output_data[0] = output_data[0] * newscale_data[j] + newbias_data[j];
-      output_data[l - 1] =
-          output_data[l - 1] * newscale_data[j] + newbias_data[j];
-      output_data[(l - 1) * l] =
-          output_data[(l - 1) * l] * newscale_data[j] + newbias_data[j];
-      output_data[l * l - 1] =
-          output_data[l * l - 1] * newscale_data[j] + newbias_data[j];
-    }
+    output_data[0] = output_data[0] * newscale_data[j] + newbias_data[j];
+    output_data[l - 1] =
+        output_data[l - 1] * newscale_data[j] + newbias_data[j];
+    output_data[(l - 1) * l] =
+        output_data[(l - 1) * l] * newscale_data[j] + newbias_data[j];
+    output_data[l * l - 1] =
+        output_data[l * l - 1] * newscale_data[j] + newbias_data[j];
     if (if_relu) {
       output_data[0] = output_data[0] < 0 ? 0 : output_data[0];
       output_data[l - 1] = output_data[l - 1] < 0 ? 0 : output_data[l - 1];
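With the if_bias/if_bn flags gone, the scalar border code applies one unconditional affine transform per element. The four corner updates above all share the epilogue sketched by this hypothetical helper (not in the diff):

    inline float BNReluEpilogue(float acc, float scale, float bias,
                                bool if_relu) {
      float y = acc * scale + bias;          // folded batch-norm affine
      return (if_relu && y < 0.f) ? 0.f : y;  // optional fused ReLU
    }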
@@ -607,16 +592,11 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
                              w11 * input_data[i * l + l - 1] +
                              w20 * input_data[i * l + l - 1 + l - 1] +
                              w21 * input_data[i * l + l - 1 + l];
-      if (if_bias) {
-        output_data[i * l] += bias_data[j];
-        output_data[i * l + l - 1] += bias_data[j];
-      }
-      if (if_bn) {
-        output_data[i * l] =
-            output_data[i * l] * newscale_data[j] + newbias_data[j];
-        output_data[i * l + l - 1] =
-            output_data[i * l + l - 1] * newscale_data[j] + newbias_data[j];
-      }
+      output_data[i * l] =
+          output_data[i * l] * newscale_data[j] + newbias_data[j];
+      output_data[i * l + l - 1] =
+          output_data[i * l + l - 1] * newscale_data[j] + newbias_data[j];
       if (if_relu) {
         output_data[i * l] = output_data[i * l] < 0 ? 0 : output_data[i * l];
         output_data[i * l + l - 1] =
@@ -652,7 +632,6 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
       out0 = vmlaq_n_f32(out0, in2, w20);
       out0 = vmlaq_n_f32(out0, tmp2, w21);
       out0 = vmlaq_n_f32(out0, tmp3, w22);
-      out0 = vaddq_f32(out0, vbias);
       out0 = vmlaq_f32(vnewbias, vnewscale, out0);
       if (if_relu) {
         out0 = vmaxq_f32(out0, vzero);
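The same simplification repeats through every vector hunk below: the vaddq_f32 of vbias disappears, and vmlaq_f32(vnewbias, vnewscale, out0), which computes vnewbias + vnewscale * out0 lane-wise, applies the whole folded epilogue to four outputs in a single multiply-accumulate. A minimal sketch, assuming an ARM NEON target:

    #include <arm_neon.h>

    float32x4_t VecEpilogue(float32x4_t acc, float32x4_t vnewscale,
                            float32x4_t vnewbias, bool if_relu) {
      // vmlaq_f32(a, b, c) == a + b * c, per lane
      float32x4_t y = vmlaq_f32(vnewbias, vnewscale, acc);
      float32x4_t vzero = vdupq_n_f32(0.f);
      return if_relu ? vmaxq_f32(y, vzero) : y;  // fused ReLU
    }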
@@ -673,7 +652,6 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
       out0 = vmlaq_n_f32(out0, in6, w10);
       out0 = vmlaq_n_f32(out0, tmp2, w11);
       out0 = vmlaq_n_f32(out0, tmp3, w12);
-      out0 = vaddq_f32(out0, vbias);
       out0 = vmlaq_f32(vnewbias, vnewscale, out0);
       if (if_relu) {
         out0 = vmaxq_f32(out0, vzero);
@@ -705,7 +683,6 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
       out0 = vmlaq_n_f32(out0, in2, w20);
       out0 = vmlaq_n_f32(out0, tmp2, w21);
       out0 = vmlaq_n_f32(out0, tmp3, w22);
-      out0 = vaddq_f32(out0, vbias);
       out0 = vmlaq_f32(vnewbias, vnewscale, out0);
       if (if_relu) {
         out0 = vmaxq_f32(out0, vzero);
@@ -737,7 +714,6 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
       out0 = vmlaq_n_f32(out0, in6, w10);
       out0 = vmlaq_n_f32(out0, tmp2, w11);
       out0 = vmlaq_n_f32(out0, tmp3, w12);
-      out0 = vaddq_f32(out0, vbias);
       out0 = vmlaq_f32(vnewbias, vnewscale, out0);
       if (if_relu) {
         out0 = vmaxq_f32(out0, vzero);
@@ -783,7 +759,6 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
       out0 = vmlaq_n_f32(out0, in4_tmp, w20);
       out0 = vmlaq_n_f32(out0, tmp4, w21);
       out0 = vmlaq_n_f32(out0, tmp5, w22);
-      out0 = vaddq_f32(out0, vbias);
       out0 = vmlaq_f32(vnewbias, vnewscale, out0);
       if (if_relu) {
         out0 = vmaxq_f32(out0, vzero);
@@ -817,7 +792,6 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
       out0 = vmlaq_n_f32(out0, in4_tmp, w20);
       out0 = vmlaq_n_f32(out0, tmp4, w21);
       out0 = vmlaq_n_f32(out0, tmp5, w22);
-      out0 = vaddq_f32(out0, vbias);
       out0 = vmlaq_f32(vnewbias, vnewscale, out0);
       if (if_relu) {
         out0 = vmaxq_f32(out0, vzero);
@@ -840,6 +814,202 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
     }
   }
 }
void DepthwiseConvAddBNRelu3x3s2p1(const Tensor *input, const Tensor *filter,
Tensor *output, const Tensor *new_scale,
const Tensor *new_bias, bool if_relu) {
const int batch_size = input->dims()[0];
const int input_height = input->dims()[2];
const int input_width = input->dims()[3];
const int output_channels = output->dims()[1];
const int output_height = output->dims()[2];
const int output_width = output->dims()[3];
const int _kernel_size = 3;
const int stride_height = 2;
const int stride_width = 2;
const int padding_height = 1;
const int padding_width = 1;
const float zero = 0;
const int input_channel_stride = input_height * input_width;
const int output_channel_stride = output_height * output_width;
const int filter_channel_stride = 9;
const float *newscale_data = new_scale->data<float>();
const float *newbias_data = new_bias->data<float>();
const float *input_data = input->data<float>();
const float *filter_data = filter->data<float>();
float *output_data = output->mutable_data<float>();
const int input_batch_stride = output_channels * input_channel_stride;
const int output_batch_stride = output_channels * output_channel_stride;
const int filter_batch_stride = output_channels * output_channel_stride;
const float *pos1, *pos2, *pos3, *filter1, *filter2, *filter3, *output_ptr;
int hstart, wstart, hend, wend;
float result;
for (int i = 0; i < batch_size; ++i) {
for (int c = 0; c < output_channels; ++c) {
filter1 = filter_data;
filter2 = filter1 + 3;
filter3 = filter2 + 3;
for (int ph = 0; ph < output_height; ph++) {
for (int pw = 0; pw < output_width; pw++) {
hstart = ph * stride_height - padding_height;
wstart = pw * stride_width - padding_width;
hend = min(hstart + _kernel_size, input_height + padding_height);
wend = min(wstart + _kernel_size, input_width + padding_width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
hend = min(hend, input_height);
wend = min(wend, input_width);
pos1 = input_data + hstart * input_width + wstart;
pos2 = input_data + (hstart + 1) * input_width + wstart;
pos3 = input_data + (hstart + 2) * input_width + wstart;
output_ptr = output_data + ph * output_width + pw;
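// Border outputs: the 3x3 window hangs over the padded edge, so the
// in-bounds taps are copied into a zero-initialized 3x3 patch
// (fake_input) and reduced with a plain 9-tap dot product.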
if (hend - hstart != 3 || wend - wstart != 3) {
result = 0;
float fake_input[9] = {0};
if (hstart == 0 && wstart == 0) {
// top-left corner
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 3; ++k) {
if (j >= 3 - hend && k >= 3 - wend) {
fake_input[3 * j + k] =
input_data[(j - (3 - hend)) * input_width + k -
(3 - wend)];
}
}
}
} else if (hstart == 0 && wend == input_width) {
// top-right corner
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 3; ++k) {
if (j >= 3 - hend && k <= input_width - wstart - 1) {
fake_input[3 * j + k] =
input_data[(j - (3 - hend)) * input_width + k + wstart];
}
}
}
} else if (hend == input_height && wstart == 0) {
// bottom-left corner
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 3; ++k) {
if (j <= input_height - 1 - hstart && k >= 3 - wend) {
fake_input[3 * j + k] =
input_data[(j + hstart) * input_width + k - (3 - wend)];
}
}
}
} else if (hend == input_height && wend == input_width) {
// bottom-right corner
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 3; ++k) {
if (j <= input_height - hstart - 1 &&
k <= input_width - wstart - 1) {
fake_input[3 * j + k] =
input_data[(j + hstart) * input_width + k + wstart];
}
}
}
} else if (hstart == 0) {
// top edge
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 3; ++k) {
if (j >= 3 - hend) {
fake_input[3 * j + k] =
input_data[(j - (3 - hend)) * input_width + k + wstart];
}
}
}
} else if (hend == input_height) {
// bottom edge
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 3; ++k) {
if (j <= input_height - hstart - 1) {
fake_input[3 * j + k] =
input_data[(j + hstart) * input_width + k + wstart];
}
}
}
} else if (wstart == 0) {
// left edge
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 3; ++k) {
if (k >= 3 - wend) {
fake_input[3 * j + k] =
input_data[(j + hstart) * input_width +
(k - (3 - wend))];
}
}
}
} else if (wend == input_width) {
// right edge
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 3; ++k) {
if (k <= input_width - wstart - 1) {
fake_input[3 * j + k] =
input_data[(j + hstart) * input_width + k + wstart];
}
}
}
}
for (int l = 0; l < 9; ++l) {
result += fake_input[l] * filter1[l];
}
output_data[ph * output_width + pw] =
newscale_data[c] * result + newbias_data[c];
if (if_relu) {
output_data[ph * output_width + pw] =
output_data[ph * output_width + pw] < 0
? 0
: output_data[ph * output_width + pw];
}
} else {
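// Interior outputs: each vld1q_f32 pulls four floats, of which only the
// first three belong to the 3x3 window; the extra lane is zeroed below
// before the horizontal sum.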
const float32x4_t data1 = vld1q_f32(pos1);
const float32x4_t data2 = vld1q_f32(pos2);
const float32x4_t data3 = vld1q_f32(pos3);
const float32x4_t v_filter1 = vld1q_f32(filter1);
const float32x4_t v_filter2 = vld1q_f32(filter2);
const float32x4_t v_filter3 = vld1q_f32(filter3);
float32x4_t mula = vmulq_f32(data1, v_filter1);
mula = vmlaq_f32(mula, data2, v_filter2);
mula = vmlaq_f32(mula, data3, v_filter3);
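// Zero lane 3 (it lies outside the window), then two pairwise adds
// collapse lanes 0..2 into a single horizontal sum in lane 0.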
float32x2_t res = vpadd_f32(
vget_high_f32(vsetq_lane_f32(0, mula, 3)), vget_low_f32(mula));
res = vpadd_f32(res, res);
output_data[ph * output_width + pw] =
vget_lane_f32(res, 0) * newscale_data[c] + newbias_data[c];
if (if_relu) {
output_data[ph * output_width + pw] =
output_data[ph * output_width + pw] < 0
? 0
: output_data[ph * output_width + pw];
}
}
}
}
input_data += input_channel_stride;
output_data += output_channel_stride;
filter_data += filter_channel_stride;
}
input_data += input_batch_stride;
output_data += output_batch_stride;
}
}
}  // namespace math
}  // namespace operators
}  // namespace paddle_mobile
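For the stride-2, pad-1 kernel added above, the output extents the ph/pw loops iterate over follow the usual convolution size formula; a quick check of the arithmetic (helper name is illustrative):

    // out = (in + 2*pad - k) / stride + 1  (integer division)
    int ConvOutSize(int in, int k, int pad, int stride) {
      return (in + 2 * pad - k) / stride + 1;
    }
    // e.g. ConvOutSize(224, 3, 1, 2) == 112: hstart = 2*ph - 1 sweeps
    // -1, 1, ..., 221, so only ph == 0 (and likewise pw == 0) actually
    // needs the zero-padding border path.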
@@ -33,10 +33,11 @@ void DepthwiseConv3x3(const Tensor *input, vector<int> strides,
 void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
                           Tensor *output, Tensor *bias, bool if_bias);
 void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
-                                   Tensor *output, Tensor *bias, bool if_bias,
-                                   const Tensor *new_scale,
-                                   const Tensor *new_bias, bool if_bn,
-                                   bool if_relu);
+                                   Tensor *output, const Tensor *new_scale,
+                                   const Tensor *new_bias, bool if_relu);
+void DepthwiseConvAddBNRelu3x3s2p1(const Tensor *input, const Tensor *filter,
+                                   Tensor *output, const Tensor *new_scale,
+                                   const Tensor *new_bias, bool if_relu);
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle_mobile