add openmp in depthwise_conv3x3_s1p1

3cb0cb18 · zhaojiaying01 · 67bd7f3f · 3cb0cb18
隐藏空白更改
内联并排

Showing with 13 addition and 21 deletion

src/operators/math/depthwise_conv3x3.cpp src/operators/math/depthwise_conv3x3.cpp +13 -21

未找到文件。
--- a/src/operators/math/depthwise_conv3x3.cpp
+++ b/src/operators/math/depthwise_conv3x3.cpp
@@ -253,34 +253,29 @@ void DepthwiseConv3x3s1p1(const framework::Tensor *input,
                          framework::Tensor *output, framework::Tensor *bias,
                          bool if_bias, bool if_relu) {
 #if __ARM_NEON
-  const float *input_data = input->data<float>();
-  const float *filter_data = filter->data<float>();
-  float *output_data = output->mutable_data<float>();
-  const float *bias_data;
-  if (if_bias) {
-    bias_data = bias->data<float>();
-  }
-
-  const int h = static_cast<int>(input->dims()[2]);
-  const int w = static_cast<int>(input->dims()[3]);
-  //  const int l = h;
+  const float *bias_data = bias->data<float>();
  const int batch_size = static_cast<int>(input->dims()[0]);
  const int c = static_cast<int>(input->dims()[1]);
+  const int h = static_cast<int>(input->dims()[2]);
+  const int w = static_cast<int>(input->dims()[3]);
  const int hxw = h * w;
-  float32x4_t vbias = vdupq_n_f32(0.0);
+  // const int l = h;

  // leftTop, rightTop, leftBottom, rightBottom
-  int lt = 0;
-  int rt = w - 1;
-  int lb = (h - 1) * w;
-  int rb = h * w - 1;
+  const int lt = 0;
+  const int rt = w - 1;
+  const int lb = (h - 1) * w;
+  const int rb = h * w - 1;

  float32x4_t zero = vdupq_n_f32(0.0);

  for (int b = 0; b < batch_size; ++b) {
-    const float *filter_data_tmp = filter_data;
-
+#pragma omp parallel for
    for (int j = 0; j < c; ++j) {
+      const float *filter_data_tmp = filter->data<float>() + j * 9;
+      const float *input_data = input->data<float>() + j * hxw;
+      float *output_data = output->mutable_data<float>() + j * hxw;
+      float32x4_t vbias;
      if (if_bias) {
        vbias = vdupq_n_f32(bias_data[j]);
      }
@@ -552,9 +547,6 @@ void DepthwiseConv3x3s1p1(const framework::Tensor *input,
          }
        }
      }
-      output_data += hxw;
-      input_data += hxw;
-      filter_data_tmp += 9;
    }
  }
 #endif