Commit 686398d2 authored by eclipsess

fix dwconv in w!=h

Parent 2319a4ca
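The change removes the square-input assumption (`param.Input()->dims()[2] == param.Input()->dims()[3]`) from the depthwise-conv dispatch conditions and replaces the single side length `l` inside the 3x3 kernels with the separate height `h` and width `w`. As a reference for the intended indexing, here is a minimal scalar sketch (not the NEON code touched by this commit; the function name and layout are illustrative only) of a 3x3, stride-1, pad-1 depthwise channel that works for rectangular inputs because row offsets use `w`, not `h`:

#include <vector>

// Naive 3x3, stride-1, pad-1 depthwise convolution over one channel.
// `in` holds h * w floats in row-major order; `k` holds the 3x3 filter.
// Row offsets use the width w, so w != h is handled correctly.
std::vector<float> NaiveDepthwise3x3S1P1(const std::vector<float> &in,
                                         const float k[9], int h, int w) {
  std::vector<float> out(static_cast<size_t>(h) * w, 0.f);
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      float acc = 0.f;
      for (int ky = -1; ky <= 1; ++ky) {
        for (int kx = -1; kx <= 1; ++kx) {
          const int iy = y + ky;
          const int ix = x + kx;
          if (iy < 0 || iy >= h || ix < 0 || ix >= w) continue;  // zero padding
          acc += k[(ky + 1) * 3 + (kx + 1)] * in[iy * w + ix];   // row stride is w
        }
      }
      out[y * w + x] = acc;
    }
  }
  return out;
}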
@@ -124,8 +124,7 @@ void ConvAddCompute(const FusionConvAddParam<CPU> &param) {
   } else if (param.Groups() == param.Input()->dims()[1] &&
              param.Input()->dims()[1] == param.Output()->dims()[1] &&
              param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
-             param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2 &&
-             param.Input()->dims()[2] == param.Input()->dims()[3]) {
+             param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) {
     // math::DepthwiseConv3x3(param.Input(), param.Strides(),
     //                        param.Paddings(),
     //                        param.Filter(), param.Bias(),
...
@@ -118,16 +118,14 @@ void ConvAddBNReluCompute(const FusionConvAddBNReluParam<CPU> &param) {
   if (param.Groups() == param.Input()->dims()[1] &&
       param.Input()->dims()[1] == param.Output()->dims()[1] &&
       param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
-      param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1 &&
-      param.Input()->dims()[2] == param.Input()->dims()[3]) {
+      param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) {
     math::DepthwiseConvAddBNRelu3x3s1p1(param.Input(), param.Filter(),
                                         param.Output(), param.NewScale(),
                                         param.NewBias(), true);
   } else if (param.Groups() == param.Input()->dims()[1] &&
              param.Input()->dims()[1] == param.Output()->dims()[1] &&
              param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
-             param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2 &&
-             param.Input()->dims()[2] == param.Input()->dims()[3]) {
+             param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) {
     // math::DepthwiseConvAddBNRelu3x3s2p1(param.Input(), param.Filter(),
     //                                     param.Output(), param.NewScale(),
     //                                     param.NewBias(), 1);
...
@@ -130,8 +130,7 @@ void ConvCompute(const ConvParam<CPU> &param) {
   } else if (param.Groups() == param.Input()->dims()[1] &&
              param.Input()->dims()[1] == param.Output()->dims()[1] &&
              param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
-             param.Filter()->dims()[2] == 3 &&
-             param.Input()->dims()[2] == param.Input()->dims()[3]) {
+             param.Filter()->dims()[2] == 3) {
     math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(),
                            param.Filter(), nullptr, param.Output(), false);
   } else {
...
@@ -122,16 +122,14 @@ void ConvBNAddReluCompute(const FusionConvBNAddReluParam<CPU> &param) {
   if (param.Groups() == param.Input()->dims()[1] &&
       param.Input()->dims()[1] == param.Output()->dims()[1] &&
       param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
-      param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1 &&
-      param.Input()->dims()[2] == param.Input()->dims()[3]) {
+      param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) {
     math::DepthwiseConvAddBNRelu3x3s1p1(param.Input(), param.Filter(),
                                         param.Output(), param.NewScale(),
                                         param.NewBias(), true);
   } else if (param.Groups() == param.Input()->dims()[1] &&
              param.Input()->dims()[1] == param.Output()->dims()[1] &&
              param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
-             param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2 &&
-             param.Input()->dims()[2] == param.Input()->dims()[3]) {
+             param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) {
     // math::DepthwiseConvAddBNRelu3x3s2p1(param.Input(), param.Filter(),
     //                                     param.Output(), param.NewScale(),
     //                                     param.NewBias(), 1);
...
@@ -117,16 +117,14 @@ void ConvBNReluCompute(const FusionConvBNReluParam<CPU> &param) {
   if (param.Groups() == param.Input()->dims()[1] &&
       param.Input()->dims()[1] == param.Output()->dims()[1] &&
       param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
-      param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1 &&
-      param.Input()->dims()[2] == param.Input()->dims()[3]) {
+      param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) {
     math::DepthwiseConvAddBNRelu3x3s1p1(param.Input(), param.Filter(),
                                         param.Output(), param.NewScale(),
                                         param.NewBias(), true);
   } else if (param.Groups() == param.Input()->dims()[1] &&
              param.Input()->dims()[1] == param.Output()->dims()[1] &&
              param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
-             param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2 &&
-             param.Input()->dims()[2] == param.Input()->dims()[3]) {
+             param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) {
     // math::DepthwiseConvAddBNRelu3x3s2p1(param.Input(), param.Filter(),
     //                                     param.Output(), param.NewScale(),
     //                                     param.NewBias(), 1);
...
@@ -36,8 +36,7 @@ void DepthwiseConvCompute(const ConvParam<CPU> &param) {
   } else if (param.Groups() == param.Input()->dims()[1] &&
              param.Input()->dims()[1] == param.Output()->dims()[1] &&
              param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
-             param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2 &&
-             param.Input()->dims()[2] == param.Input()->dims()[3]) {
+             param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) {
     // math::DepthwiseConv3x3(param.Input(), param.Strides(),
     //                        param.Paddings(),
     //                        param.Filter(), &Bias, param.Output(), false);
...
@@ -115,16 +115,14 @@ void DWConvBNReluCompute(const FusionDWConvBNReluParam<CPU> &param) {
   if (param.Groups() == param.Input()->dims()[1] &&
       param.Input()->dims()[1] == param.Output()->dims()[1] &&
       param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
-      param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1 &&
-      param.Input()->dims()[2] == param.Input()->dims()[3]) {
+      param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) {
     math::DepthwiseConvAddBNRelu3x3s1p1(param.Input(), param.Filter(),
                                         param.Output(), param.NewScale(),
                                         param.NewBias(), true);
   } else if (param.Groups() == param.Input()->dims()[1] &&
              param.Input()->dims()[1] == param.Output()->dims()[1] &&
              param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
-             param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2 &&
-             param.Input()->dims()[2] == param.Input()->dims()[3]) {
+             param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) {
     // math::DepthwiseConvAddBNRelu3x3s2p1(param.Input(), param.Filter(),
     //                                     param.Output(), param.NewScale(),
     //                                     param.NewBias(), 1);
...
@@ -302,7 +302,7 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
     for (int i = 1; i < h - 1; ++i) {
       output_data[i * w] =
           w01 * input_data[i * w - w] + w02 * input_data[i * w - w + 1] +
-          w11 * input_data[i * w] + w12 * input_data[i * w + w] +
+          w11 * input_data[i * w] + w12 * input_data[i * w + 1] +
           w21 * input_data[i * w + w] + w22 * input_data[i * w + w + 1];
       output_data[i * w + w - 1] = w00 * input_data[i * w + w - 1 - w - 1] +
@@ -537,8 +537,9 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
   const int hxw = input_height * input_width;
-  const int l = input_height;
+  // const int l = input_height;
+  const int h = input_height;
+  const int w = input_width;
   float32x4_t vzero = vdupq_n_f32(0);
   for (int b = 0; b < batch_size; b++) {
@@ -624,54 +625,53 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
      }
      output_data[0] = w11 * input_data[0] + w12 * input_data[1] +
-                      w21 * input_data[l] + w22 * input_data[l + 1];
-     output_data[l - 1] = w10 * input_data[l - 2] + w11 * input_data[l - 1] +
-                          w20 * input_data[2 * l - 2] +
-                          w21 * input_data[2 * l - 1];
-     output_data[(l - 1) * l] =
-         w01 * input_data[(l - 2) * l] + w02 * input_data[(l - 2) * l + 1] +
-         w11 * input_data[(l - 1) * l] + w12 * input_data[(l - 1) * l + 1];
-     output_data[l * l - 1] = w00 * input_data[(l - 2) * (l + 1)] +
-                              w01 * input_data[(l - 2) * (l + 1) + 1] +
-                              w10 * input_data[l * l - 2] +
-                              w11 * input_data[l * l - 1];
+                      w21 * input_data[w] + w22 * input_data[w + 1];
+     output_data[w - 1] = w10 * input_data[w - 2] + w11 * input_data[w - 1] +
+                          w20 * input_data[2 * w - 2] +
+                          w21 * input_data[2 * w - 1];
+     output_data[(h - 1) * w] =
+         w01 * input_data[(h - 2) * w] + w02 * input_data[(h - 2) * w + 1] +
+         w11 * input_data[(h - 1) * w] + w12 * input_data[(h - 1) * w + 1];
+     output_data[h * w - 1] =
+         w00 * input_data[h * w - w - 2] + w01 * input_data[h * w - w - 1] +
+         w10 * input_data[h * w - 2] + w11 * input_data[h * w - 1];
      output_data[0] = output_data[0] * newscale_data[c] + newbias_data[c];
-     output_data[l - 1] =
-         output_data[l - 1] * newscale_data[c] + newbias_data[c];
-     output_data[(l - 1) * l] =
-         output_data[(l - 1) * l] * newscale_data[c] + newbias_data[c];
-     output_data[l * l - 1] =
-         output_data[l * l - 1] * newscale_data[c] + newbias_data[c];
+     output_data[w - 1] =
+         output_data[w - 1] * newscale_data[c] + newbias_data[c];
+     output_data[(h - 1) * w] =
+         output_data[(h - 1) * w] * newscale_data[c] + newbias_data[c];
+     output_data[h * w - 1] =
+         output_data[h * w - 1] * newscale_data[c] + newbias_data[c];
      if (if_relu) {
        output_data[0] = output_data[0] < 0 ? 0 : output_data[0];
-       output_data[l - 1] = output_data[l - 1] < 0 ? 0 : output_data[l - 1];
-       output_data[(l - 1) * l] =
-           output_data[(l - 1) * l] < 0 ? 0 : output_data[(l - 1) * l];
-       output_data[l * l - 1] =
-           output_data[l * l - 1] < 0 ? 0 : output_data[l * l - 1];
+       output_data[w - 1] = output_data[w - 1] < 0 ? 0 : output_data[w - 1];
+       output_data[(h - 1) * w] =
+           output_data[(h - 1) * w] < 0 ? 0 : output_data[(h - 1) * w];
+       output_data[h * w - 1] =
+           output_data[h * w - 1] < 0 ? 0 : output_data[h * w - 1];
      }
-     for (int i = 1; i < l - 1; ++i) {
-       output_data[i * l] =
-           w01 * input_data[i * l - l] + w02 * input_data[i * l - l + 1] +
-           w11 * input_data[i * l] + w12 * input_data[i * l + 1] +
-           w21 * input_data[i * l + l] + w22 * input_data[i * l + l + 1];
-       output_data[i * l + l - 1] = w00 * input_data[i * l + l - 1 - l - 1] +
-                                    w01 * input_data[i * l + l - 1 - l] +
-                                    w10 * input_data[i * l + l - 1 - 1] +
-                                    w11 * input_data[i * l + l - 1] +
-                                    w20 * input_data[i * l + l - 1 + l - 1] +
-                                    w21 * input_data[i * l + l - 1 + l];
-       output_data[i * l] =
-           output_data[i * l] * newscale_data[c] + newbias_data[c];
-       output_data[i * l + l - 1] =
-           output_data[i * l + l - 1] * newscale_data[c] + newbias_data[c];
+     for (int i = 1; i < h - 1; ++i) {
+       output_data[i * w] =
+           w01 * input_data[i * w - w] + w02 * input_data[i * w - w + 1] +
+           w11 * input_data[i * w] + w12 * input_data[i * w + 1] +
+           w21 * input_data[i * w + w] + w22 * input_data[i * w + w + 1];
+       output_data[i * w + w - 1] = w00 * input_data[i * w + w - 1 - w - 1] +
+                                    w01 * input_data[i * w + w - 1 - w] +
+                                    w10 * input_data[i * w + w - 1 - 1] +
+                                    w11 * input_data[i * w + w - 1] +
+                                    w20 * input_data[i * w + w - 1 + w - 1] +
+                                    w21 * input_data[i * w + w - 1 + w];
+       output_data[i * w] =
+           output_data[i * w] * newscale_data[c] + newbias_data[c];
+       output_data[i * w + w - 1] =
+           output_data[i * w + w - 1] * newscale_data[c] + newbias_data[c];
       if (if_relu) {
-         output_data[i * l] = output_data[i * l] < 0 ? 0 : output_data[i * l];
-         output_data[i * l + l - 1] =
-             output_data[i * l + l - 1] < 0 ? 0 : output_data[i * l + l - 1];
+         output_data[i * w] = output_data[i * w] < 0 ? 0 : output_data[i * w];
+         output_data[i * w + w - 1] =
+             output_data[i * w + w - 1] < 0 ? 0 : output_data[i * w + w - 1];
       }
     }
@@ -774,7 +774,7 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
   const int h = static_cast<int>(input->dims()[2]);
   const int w = static_cast<int>(input->dims()[3]);
-  const int l = h;
+  // const int l = h;
   const int batch_size = static_cast<int>(input->dims()[0]);
   const int c = static_cast<int>(input->dims()[1]);
@@ -790,7 +790,7 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
       vnewbias = vdupq_n_f32(newbias_data[j]);
       vnewscale = vdupq_n_f32(newscale_data[j]);
-      int l_mid = l - 2;  // l=1->l_mid=-1,l=2->l_mid=0
+      int w_mid = w - 2;  // l=1->l_mid=-1,l=2->l_mid=0
      float w00 = filter_data_tmp[0];
      float w01 = filter_data_tmp[1];
      float w02 = filter_data_tmp[2];
@@ -802,49 +802,49 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
      float w22 = filter_data_tmp[8];
      output_data[0] = w11 * input_data[0] + w12 * input_data[1] +
-                      w21 * input_data[l] + w22 * input_data[l + 1];
-     output_data[l - 1] = w10 * input_data[l - 2] + w11 * input_data[l - 1] +
-                          w20 * input_data[2 * l - 2] +
-                          w21 * input_data[2 * l - 1];
-     output_data[(l - 1) * l] =
-         w01 * input_data[(l - 2) * l] + w02 * input_data[(l - 2) * l + 1] +
-         w11 * input_data[(l - 1) * l] + w12 * input_data[(l - 1) * l + 1];
-     output_data[l * l - 1] = w00 * input_data[(l - 2) * (l + 1)] +
-                              w01 * input_data[(l - 2) * (l + 1) + 1] +
-                              w10 * input_data[l * l - 2] +
-                              w11 * input_data[l * l - 1];
+                      w21 * input_data[w] + w22 * input_data[w + 1];
+     output_data[w - 1] = w10 * input_data[w - 2] + w11 * input_data[w - 1] +
+                          w20 * input_data[2 * w - 2] +
+                          w21 * input_data[2 * w - 1];
+     output_data[(h - 1) * w] =
+         w01 * input_data[(h - 2) * w] + w02 * input_data[(h - 2) * w + 1] +
+         w11 * input_data[(h - 1) * w] + w12 * input_data[(h - 1) * w + 1];
+     output_data[h * w - 1] = w00 * input_data[h * w - w - 2] +
+                              w01 * input_data[h * w - w - 1] +
+                              w10 * input_data[h * w - 2] +
+                              w11 * input_data[h * w - 1];
      output_data[0] = output_data[0] * newscale_data[j] + newbias_data[j];
-     output_data[l - 1] =
-         output_data[l - 1] * newscale_data[j] + newbias_data[j];
-     output_data[(l - 1) * l] =
-         output_data[(l - 1) * l] * newscale_data[j] + newbias_data[j];
-     output_data[l * l - 1] =
-         output_data[l * l - 1] * newscale_data[j] + newbias_data[j];
+     output_data[w - 1] =
+         output_data[w - 1] * newscale_data[j] + newbias_data[j];
+     output_data[(h - 1) * w] =
+         output_data[(h - 1) * w] * newscale_data[j] + newbias_data[j];
+     output_data[h * w - 1] =
+         output_data[h * w - 1] * newscale_data[j] + newbias_data[j];
      if (if_relu) {
        output_data[0] = output_data[0] < 0 ? 0 : output_data[0];
-       output_data[l - 1] = output_data[l - 1] < 0 ? 0 : output_data[l - 1];
-       output_data[(l - 1) * l] =
-           output_data[(l - 1) * l] < 0 ? 0 : output_data[(l - 1) * l];
-       output_data[l * l - 1] =
-           output_data[l * l - 1] < 0 ? 0 : output_data[l * l - 1];
+       output_data[w - 1] = output_data[w - 1] < 0 ? 0 : output_data[w - 1];
+       output_data[(h - 1) * w] =
+           output_data[(h - 1) * w] < 0 ? 0 : output_data[(h - 1) * w];
+       output_data[h * w - 1] =
+           output_data[h * w - 1] < 0 ? 0 : output_data[h * w - 1];
      }
-     for (int i = 1; i < l - 1; ++i) {
-       output_data[i * l] =
-           w01 * input_data[i * l - l] + w02 * input_data[i * l - l + 1] +
-           w11 * input_data[i * l] + w12 * input_data[i * l + 1] +
-           w21 * input_data[i * l + l] + w22 * input_data[i * l + l + 1];
-       output_data[i * l + l - 1] = w00 * input_data[i * l + l - 1 - l - 1] +
-                                    w01 * input_data[i * l + l - 1 - l] +
-                                    w10 * input_data[i * l + l - 1 - 1] +
-                                    w11 * input_data[i * l + l - 1] +
-                                    w20 * input_data[i * l + l - 1 + l - 1] +
-                                    w21 * input_data[i * l + l - 1 + l];
-       output_data[i * l] =
-           output_data[i * l] * newscale_data[j] + newbias_data[j];
-       output_data[i * l + l - 1] =
-           output_data[i * l + l - 1] * newscale_data[j] + newbias_data[j];
+     for (int i = 1; i < h - 1; ++i) {
+       output_data[i * w] =
+           w01 * input_data[i * w - w] + w02 * input_data[i * w - w + 1] +
+           w11 * input_data[i * w] + w12 * input_data[i * w + 1] +
+           w21 * input_data[i * w + w] + w22 * input_data[i * w + w + 1];
+       output_data[i * w + w - 1] = w00 * input_data[i * w + w - 1 - w - 1] +
+                                    w01 * input_data[i * w + w - 1 - w] +
+                                    w10 * input_data[i * w + w - 1 - 1] +
+                                    w11 * input_data[i * w + w - 1] +
+                                    w20 * input_data[i * w + w - 1 + w - 1] +
+                                    w21 * input_data[i * w + w - 1 + w];
+       output_data[i * w] =
+           output_data[i * w] * newscale_data[j] + newbias_data[j];
+       output_data[i * w + w - 1] =
+           output_data[i * w + w - 1] * newscale_data[j] + newbias_data[j];
       if (if_relu) {
-         output_data[i * l] = output_data[i * l] < 0 ? 0 : output_data[i * l];
-         output_data[i * l + l - 1] =
-             output_data[i * l + l - 1] < 0 ? 0 : output_data[i * l + l - 1];
+         output_data[i * w] = output_data[i * w] < 0 ? 0 : output_data[i * w];
+         output_data[i * w + w - 1] =
+             output_data[i * w + w - 1] < 0 ? 0 : output_data[i * w + w - 1];
       }
     }
@@ -853,11 +853,11 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
        float32x4_t in0, in1, in2, in3, in4, in5, in6, in7, tmp0, tmp1,
            tmp2, tmp3, tmp4, tmp5, out0;
        in0 = vld1q_f32(input_tmp);
-       in2 = vld1q_f32(input_tmp + l);
-       const float *input_tmp_end = input_tmp + (l - 2) * l;
+       in2 = vld1q_f32(input_tmp + w);
+       const float *input_tmp_end = input_tmp + (h - 2) * w;
        in4 = vld1q_f32(input_tmp_end);
-       in6 = vld1q_f32(input_tmp_end + l);
-       int c_mid = l_mid;
+       in6 = vld1q_f32(input_tmp_end + w);
+       int c_mid = w_mid;
        auto output_ptr = output_data + 1;
        for (; c_mid > 3; c_mid -= 4) {
          in1 = vld1q_f32(input_tmp + 4);
-         in3 = vld1q_f32(input_tmp + l + 4);
+         in3 = vld1q_f32(input_tmp + w + 4);
          tmp0 = vextq_f32(in0, in1, 1);
          tmp1 = vextq_f32(in0, in1, 2);
@@ -878,7 +878,7 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
          vst1q_f32(output_ptr, out0);
          in5 = vld1q_f32(input_tmp_end + 4);
-         in7 = vld1q_f32(input_tmp_end + l + 4);
+         in7 = vld1q_f32(input_tmp_end + w + 4);
          tmp0 = vextq_f32(in4, in5, 1);
          tmp1 = vextq_f32(in4, in5, 2);
@@ -895,7 +895,7 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
          if (if_relu) {
            out0 = vmaxq_f32(out0, vzero);
          }
-         vst1q_f32(output_ptr + (l - 1) * l, out0);
+         vst1q_f32(output_ptr + (h - 1) * w, out0);
          // can optimize to each 8 stride.
          input_tmp += 4;
@@ -908,8 +908,8 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
        }
        // top right pad
-       float32x4_t pad0 = vdupq_n_f32(input_data[l - 1]);
-       float32x4_t pad1 = vdupq_n_f32(input_data[2 * l - 1]);
+       float32x4_t pad0 = vdupq_n_f32(input_data[w - 1]);
+       float32x4_t pad1 = vdupq_n_f32(input_data[2 * w - 1]);
        tmp0 = vextq_f32(in0, pad0, 1);
        tmp1 = vextq_f32(in0, pad0, 2);
@@ -939,8 +939,8 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
        }
        // bottom right pad
-       float32x4_t pad2 = vdupq_n_f32(input_data[l * l - 1 - l]);
-       float32x4_t pad3 = vdupq_n_f32(input_data[l * l - 1]);
+       float32x4_t pad2 = vdupq_n_f32(input_data[h * w - 1 - w]);
+       float32x4_t pad3 = vdupq_n_f32(input_data[h * w - 1]);
        tmp0 = vextq_f32(in4, pad2, 1);
        tmp1 = vextq_f32(in4, pad2, 2);
@@ -959,29 +959,29 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
        }
        for (int i = 0; i < c_mid; ++i) {
          if (i == 0) {
-           vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 0);
+           vst1q_lane_f32(output_ptr + (h - 1) * w + i, out0, 0);
          }
          if (i == 1) {
-           vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 1);
+           vst1q_lane_f32(output_ptr + (h - 1) * w + i, out0, 1);
          }
          if (i == 2) {
-           vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 2);
+           vst1q_lane_f32(output_ptr + (h - 1) * w + i, out0, 2);
          }
        }
        // mid
-       for (int i = 0; i < l - 2; ++i) {
-         auto output_ptr = output_data + (i + 1) * l + 1;
-         input_tmp = input_data + i * l;
+       for (int i = 0; i < h - 2; ++i) {
+         auto output_ptr = output_data + (i + 1) * w + 1;
+         input_tmp = input_data + i * w;
          auto in0_tmp = vld1q_f32(input_tmp);
-         auto in2_tmp = vld1q_f32(input_tmp + l);
-         auto in4_tmp = vld1q_f32(input_tmp + l + l);
-         c_mid = l_mid;
+         auto in2_tmp = vld1q_f32(input_tmp + w);
+         auto in4_tmp = vld1q_f32(input_tmp + w + w);
+         c_mid = w_mid;
          for (; c_mid > 3; c_mid -= 4) {
            auto in1_tmp = vld1q_f32(input_tmp + 4);
-           auto in3_tmp = vld1q_f32(input_tmp + l + 4);
-           auto in5_tmp = vld1q_f32(input_tmp + l + l + 4);
+           auto in3_tmp = vld1q_f32(input_tmp + w + 4);
+           auto in5_tmp = vld1q_f32(input_tmp + w + w + 4);
            tmp0 = vextq_f32(in0_tmp, in1_tmp, 1);
            tmp1 = vextq_f32(in0_tmp, in1_tmp, 2);
@@ -1012,9 +1012,9 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
            in4_tmp = in5_tmp;
          }
-         float32x4_t pad0 = vdupq_n_f32(input_data[i * l + l - 1]);
-         float32x4_t pad1 = vdupq_n_f32(input_data[i * l + l - 1 + l]);
-         float32x4_t pad2 = vdupq_n_f32(input_data[i * l + l - 1 + l + l]);
+         float32x4_t pad0 = vdupq_n_f32(input_data[i * w + w - 1]);
+         float32x4_t pad1 = vdupq_n_f32(input_data[i * w + w - 1 + w]);
+         float32x4_t pad2 = vdupq_n_f32(input_data[i * w + w - 1 + w + w]);
          tmp0 = vextq_f32(in0_tmp, pad0, 1);
          tmp1 = vextq_f32(in0_tmp, pad0, 2);
@@ -1058,6 +1058,7 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
 #endif
 }
+/// w!=h not fix
 void DepthwiseConvAddBNRelu3x3s2p1(const Tensor *input, const Tensor *filter,
                                    Tensor *output, const Tensor *new_scale,
                                    const Tensor *new_bias, bool if_relu) {
@@ -1273,7 +1274,8 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
   const int in_l = in_h;
   const int inhxw = in_h * in_w;
   const int outhxw = out_h * out_w;
-  const int if_pad = in_l - 1 == (out_l - 1) * 2 ? 1 : 0;
+  /// todo : fix if_pad when w != h
+  const int if_pad = in_w - 1 == (out_w - 1) * 2 ? 1 : 0;
   const int batch_size = static_cast<int>(input->dims()[0]);
   const int c = static_cast<int>(input->dims()[1]);
   const float *input_row_ptr;
@@ -1379,9 +1381,9 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
          if ((w4 != w_times)) {
            vst1q_f32(output_row_ptr, res3);
          } else {
-           if (out_l - 2 - w_times * 3 == 1) {
+           if (out_w - 2 - w_times * 3 == 1) {
              vst1q_lane_f32(output_row_ptr, res3, 0);
-           } else if (out_l - 2 - w_times * 3 == 2) {
+           } else if (out_w - 2 - w_times * 3 == 2) {
              vst1q_lane_f32(output_row_ptr, res3, 0);
              vst1q_lane_f32(output_row_ptr + 1, res3, 1);
            }
@@ -1391,28 +1393,28 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
      }
      output_data_tmp[0] = input_const[0] * w11 + input_const[1] * w12 +
-                          input_const[in_l] * w21 +
-                          input_const[in_l + 1] * w22;
-     out2in_mid = (out_l - 1) * 2;
-     output_data_tmp[out_l - 1] =
+                          input_const[in_w] * w21 +
+                          input_const[in_w + 1] * w22;
+     out2in_mid = (out_h - 1) * 2;
+     output_data_tmp[out_w - 1] =
          w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
          w20 * input_const[out2in_mid + in_w - 1] +
          w21 * input_const[out2in_mid + in_w] +
          (1 - if_pad) * (w12 * input_const[out2in_mid + 1] +
                          w22 * input_const[out2in_mid + in_w + 1]);
-     out2in_mid = (out_l - 1) * 2 * in_w;
-     output_data_tmp[out_l * (out_l - 1)] =
+     out2in_mid = (out_h - 1) * 2 * in_w;
+     output_data_tmp[out_w * (out_h - 1)] =
          w01 * input_const[out2in_mid - in_w] +
          w02 * input_const[out2in_mid - in_w + 1] +
          w11 * input_const[out2in_mid] + w12 * input_const[out2in_mid + 1] +
          (1 - if_pad) * (w21 * input_const[out2in_mid + in_w] +
                          w22 * input_const[out2in_mid + in_w + 1]);
-     out2in_mid = (out_l - 1) * 2 * in_w + (out_l - 1) * 2;
-     output_data_tmp[out_l * out_l - 1] =
+     out2in_mid = (out_h - 1) * 2 * in_w + (out_h - 1) * 2;
+     output_data_tmp[out_h * out_w - 1] =
          w00 * input_const[out2in_mid - in_w - 1] +
          w01 * input_const[out2in_mid - in_w] +
          w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
@@ -1423,21 +1425,21 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
                          w22 * input_const[out2in_mid + in_w + 1]);
      if (if_bias) {
        output_data_tmp[0] += bias_data[j];
-       output_data_tmp[out_l - 1] += bias_data[j];
-       output_data_tmp[out_l * (out_l - 1)] += bias_data[j];
-       output_data_tmp[out_l * out_l - 1] += bias_data[j];
+       output_data_tmp[out_w - 1] += bias_data[j];
+       output_data_tmp[out_w * (out_h - 1)] += bias_data[j];
+       output_data_tmp[out_h * out_w - 1] += bias_data[j];
      }
      for (int i = 1; i < out_h - 1; i++) {
        out2in_mid = i * 2 * in_w;
-       output_data_tmp[i * out_l] = w01 * input_const[out2in_mid - in_w] +
+       output_data_tmp[i * out_w] = w01 * input_const[out2in_mid - in_w] +
                                     w02 * input_const[out2in_mid - in_w + 1] +
                                     w11 * input_const[out2in_mid] +
                                     w12 * input_const[out2in_mid + 1] +
                                     w21 * input_const[out2in_mid + in_w] +
                                     w22 * input_const[out2in_mid + in_w + 1];
-       out2in_mid = i * 2 * in_w + (out_l - 1) * 2;
-       output_data_tmp[i * out_l + out_l - 1] =
+       out2in_mid = i * 2 * in_w + (out_h - 1) * 2;
+       output_data_tmp[i * out_w + out_w - 1] =
            w00 * input_const[out2in_mid - in_w - 1] +
            w01 * input_const[out2in_mid - in_w] +
            w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
@@ -1447,8 +1449,8 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
                            w12 * input_const[out2in_mid + 1] +
                            w22 * input_const[out2in_mid + in_w + 1]);
        if (if_bias) {
-         output_data_tmp[i * out_l] += bias_data[j];
-         output_data_tmp[i * out_l + out_l - 1] += bias_data[j];
+         output_data_tmp[i * out_w] += bias_data[j];
+         output_data_tmp[i * out_w + out_w - 1] += bias_data[j];
        }
      }
      filter_data_tmp += 9;
@@ -1655,11 +1657,12 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
   const int in_w = static_cast<int>(input->dims()[3]);
   const int out_h = static_cast<int>(output->dims()[2]);
   const int out_w = static_cast<int>(output->dims()[3]);
-  const int out_l = out_h;
-  const int in_l = in_h;
+  // const int out_l = out_h;
+  // const int in_l = in_h;
   const int inhxw = in_h * in_w;
   const int outhxw = out_h * out_w;
-  const int if_pad = in_l - 1 == (out_l - 1) * 2 ? 1 : 0;
+  /// todo : fix if_pad when w != h
+  const int if_pad = in_w - 1 == (out_w - 1) * 2 ? 1 : 0;
   const int batch_size = static_cast<int>(input->dims()[0]);
   const int c = static_cast<int>(input->dims()[1]);
   const int w_times = (out_w - 2) / 3;
@@ -1773,9 +1776,9 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
            vst1q_lane_f32(output_row_ptr + 1, res3, 1);
            vst1q_lane_f32(output_row_ptr + 2, res3, 2);
          } else {
-           if (out_l - 2 - w_times * 3 == 1) {
+           if (out_w - 2 - w_times * 3 == 1) {
              vst1q_lane_f32(output_row_ptr, res3, 0);
-           } else if (out_l - 2 - w_times * 3 == 2) {
+           } else if (out_w - 2 - w_times * 3 == 2) {
              vst1q_lane_f32(output_row_ptr, res3, 0);
              vst1q_lane_f32(output_row_ptr + 1, res3, 1);
            }
@@ -1785,28 +1788,28 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
      }
      output_data_tmp[0] = input_const[0] * w11 + input_const[1] * w12 +
-                          input_const[in_l] * w21 +
-                          input_const[in_l + 1] * w22;
-     out2in_mid = (out_l - 1) * 2;
-     output_data_tmp[out_l - 1] =
+                          input_const[in_w] * w21 +
+                          input_const[in_w + 1] * w22;
+     out2in_mid = (out_h - 1) * 2;
+     output_data_tmp[out_w - 1] =
          w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
          w20 * input_const[out2in_mid + in_w - 1] +
          w21 * input_const[out2in_mid + in_w] +
          (1 - if_pad) * (w12 * input_const[out2in_mid + 1] +
                          w22 * input_const[out2in_mid + in_w + 1]);
-     out2in_mid = (out_l - 1) * 2 * in_w;
-     output_data_tmp[out_l * (out_l - 1)] =
+     out2in_mid = (out_h - 1) * 2 * in_w;
+     output_data_tmp[out_w * (out_h - 1)] =
          w01 * input_const[out2in_mid - in_w] +
          w02 * input_const[out2in_mid - in_w + 1] +
          w11 * input_const[out2in_mid] + w12 * input_const[out2in_mid + 1] +
          (1 - if_pad) * (w21 * input_const[out2in_mid + in_w] +
                          w22 * input_const[out2in_mid + in_w + 1]);
-     out2in_mid = (out_l - 1) * 2 * in_w + (out_l - 1) * 2;
-     output_data_tmp[out_l * out_l - 1] =
+     out2in_mid = (out_h - 1) * 2 * in_w + (out_h - 1) * 2;
+     output_data_tmp[out_h * out_w - 1] =
          w00 * input_const[out2in_mid - in_w - 1] +
          w01 * input_const[out2in_mid - in_w] +
          w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
@@ -1817,38 +1820,38 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
                          w22 * input_const[out2in_mid + in_w + 1]);
      output_data_tmp[0] =
          output_data_tmp[0] * newscale_data[j] + newbias_data[j];
-     output_data_tmp[out_l - 1] =
-         output_data_tmp[out_l - 1] * newscale_data[j] + newbias_data[j];
-     output_data_tmp[out_l * (out_l - 1)] =
-         output_data_tmp[out_l * (out_l - 1)] * newscale_data[j] +
+     output_data_tmp[out_w - 1] =
+         output_data_tmp[out_w - 1] * newscale_data[j] + newbias_data[j];
+     output_data_tmp[out_w * (out_h - 1)] =
+         output_data_tmp[out_w * (out_h - 1)] * newscale_data[j] +
          newbias_data[j];
-     output_data_tmp[out_l * out_l - 1] =
-         output_data_tmp[out_l * out_l - 1] * newscale_data[j] +
+     output_data_tmp[out_h * out_w - 1] =
+         output_data_tmp[out_h * out_w - 1] * newscale_data[j] +
          newbias_data[j];
      if (if_relu) {
        output_data_tmp[0] = output_data_tmp[0] < 0 ? 0 : output_data_tmp[0];
-       output_data_tmp[out_l - 1] =
-           output_data_tmp[out_l - 1] < 0 ? 0 : output_data_tmp[out_l - 1];
-       output_data_tmp[out_l * (out_l - 1)] =
-           output_data_tmp[out_l * (out_l - 1)] < 0
+       output_data_tmp[out_w - 1] =
+           output_data_tmp[out_w - 1] < 0 ? 0 : output_data_tmp[out_w - 1];
+       output_data_tmp[out_w * (out_h - 1)] =
+           output_data_tmp[out_w * (out_h - 1)] < 0
               ? 0
-              : output_data_tmp[out_l * (out_l - 1)];
-       output_data_tmp[out_l * out_l - 1] =
-           output_data_tmp[out_l * out_l - 1] < 0
+              : output_data_tmp[out_w * (out_h - 1)];
+       output_data_tmp[out_h * out_w - 1] =
+           output_data_tmp[out_h * out_w - 1] < 0
               ? 0
-              : output_data_tmp[out_l * out_l - 1];
+              : output_data_tmp[out_h * out_w - 1];
      }
      for (int i = 1; i < out_h - 1; i++) {
        out2in_mid = i * 2 * in_w;
-       output_data_tmp[i * out_l] = w01 * input_const[out2in_mid - in_w] +
+       output_data_tmp[i * out_w] = w01 * input_const[out2in_mid - in_w] +
                                     w02 * input_const[out2in_mid - in_w + 1] +
                                     w11 * input_const[out2in_mid] +
                                     w12 * input_const[out2in_mid + 1] +
                                     w21 * input_const[out2in_mid + in_w] +
                                     w22 * input_const[out2in_mid + in_w + 1];
-       out2in_mid = i * 2 * in_w + (out_l - 1) * 2;
-       output_data_tmp[i * out_l + out_l - 1] =
+       out2in_mid = i * 2 * in_w + (out_h - 1) * 2;
+       output_data_tmp[i * out_w + out_w - 1] =
           w00 * input_const[out2in_mid - in_w - 1] +
           w01 * input_const[out2in_mid - in_w] +
           w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
@@ -1857,18 +1860,18 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
           (1 - if_pad) * (w02 * input_const[out2in_mid - in_w + 1] +
                           w12 * input_const[out2in_mid + 1] +
                           w22 * input_const[out2in_mid + in_w + 1]);
-       output_data_tmp[i * out_l] =
-           output_data_tmp[i * out_l] * newscale_data[j] + newbias_data[j];
-       output_data_tmp[i * out_l + out_l - 1] =
-           output_data_tmp[i * out_l + out_l - 1] * newscale_data[j] +
+       output_data_tmp[i * out_w] =
+           output_data_tmp[i * out_w] * newscale_data[j] + newbias_data[j];
+       output_data_tmp[i * out_w + out_w - 1] =
+           output_data_tmp[i * out_w + out_w - 1] * newscale_data[j] +
           newbias_data[j];
       if (if_relu) {
-         output_data_tmp[i * out_l] =
-             output_data_tmp[i * out_l] < 0 ? 0 : output_data_tmp[i * out_l];
-         output_data_tmp[i * out_l + out_l - 1] =
-             output_data_tmp[i * out_l + out_l - 1] < 0
+         output_data_tmp[i * out_w] =
+             output_data_tmp[i * out_w] < 0 ? 0 : output_data_tmp[i * out_w];
+         output_data_tmp[i * out_w + out_w - 1] =
+             output_data_tmp[i * out_w + out_w - 1] < 0
                 ? 0
-                : output_data_tmp[i * out_l + out_l - 1];
+                : output_data_tmp[i * out_w + out_w - 1];
       }
     }
   }
...
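The stride-2 paths above still carry a `/// todo : fix if_pad when w != h`: `if_pad` is now derived from the width only, while the bottom edge of a rectangular input may need its own flag. One possible follow-up, shown here only as a sketch and not as part of this commit (the struct and helper name are hypothetical), is to compute the flag per dimension:

// Hypothetical helper for the remaining todo, assuming stride 2 and pad 1:
// a flag is 1 when the last output row (or column) is centered on the last
// input row (or column), so its outer taps fall in the zero padding and are
// dropped via the existing (1 - if_pad) factor.
struct EdgePad {
  int if_pad_h;  // bottom edge of the output
  int if_pad_w;  // right edge of the output
};

inline EdgePad ComputeEdgePad(int in_h, int in_w, int out_h, int out_w) {
  EdgePad pad;
  pad.if_pad_h = (in_h - 1 == (out_h - 1) * 2) ? 1 : 0;
  pad.if_pad_w = (in_w - 1 == (out_w - 1) * 2) ? 1 : 0;
  return pad;
}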