Optimize deconv OpenMP parallelization

e3a8a08e · liutuo · 4d9c948a · e3a8a08e · e3a8a08e · e3a8a08e
5 changed files
--- a/mace/kernels/arm/deconv_2d_neon.h
+++ b/mace/kernels/arm/deconv_2d_neon.h
@@ -26,28 +26,24 @@ namespace kernels {

 void Deconv2dNeonK3x3S1(const float *input,
                        const float *filter,
-                        const float *bias,
                        const index_t *in_shape,
                        const index_t *out_shape,
                        float *output);

 void Deconv2dNeonK3x3S2(const float *input,
                        const float *filter,
-                        const float *bias,
                        const index_t *in_shape,
                        const index_t *out_shape,
                        float *output);

 void Deconv2dNeonK4x4S1(const float *input,
                        const float *filter,
-                        const float *bias,
                        const index_t *in_shape,
                        const index_t *out_shape,
                        float *output);

 void Deconv2dNeonK4x4S2(const float *input,
                        const float *filter,
-                        const float *bias,
                        const index_t *in_shape,
                        const index_t *out_shape,
                        float *output);

--- a/mace/kernels/arm/deconv_2d_neon_3x3.cc
+++ b/mace/kernels/arm/deconv_2d_neon_3x3.cc
@@ -20,7 +20,6 @@ namespace kernels {

 void Deconv2dNeonK3x3S1(const float *input,
                        const float *filter,
-                        const float *bias,
                        const index_t *in_shape,
                        const index_t *out_shape,
                        float *output) {
@@ -40,12 +39,6 @@ void Deconv2dNeonK3x3S1(const float *input,
      if (oc + 1 < outch) {
        float *out_base0 = output + (b * outch + oc) * out_img_size;
        float *out_base1 = out_base0 + out_img_size;
-
-        const float bias_value0 = bias ? bias[oc] : 0.f;
-        const float bias_value1 = bias ? bias[oc + 1] : 0.f;
-        std::fill_n(out_base0, out_img_size, bias_value0);
-        std::fill_n(out_base1, out_img_size, bias_value1);
-
        for (index_t ic = 0; ic < inch; ++ic) {
          const float *input_base = input + (b * inch + ic) * h * w;
          const float *kernel_base0 = filter + (oc * inch + ic) * 9;
@@ -197,8 +190,6 @@ void Deconv2dNeonK3x3S1(const float *input,
        }
      } else {
        float *out_base0 = output + (b * outch + oc) * outh * outw;
-        const float bias_value0 = bias ? bias[oc] : 0.f;
-        std::fill_n(out_base0, outh * outw, bias_value0);
        for (index_t ic = 0; ic < inch; ++ic) {
          const float *input_base = input + (b * inch + ic) * h * w;
          const float *kernel_base0 = filter + (oc * inch + ic) * 9;
@@ -290,7 +281,6 @@ void Deconv2dNeonK3x3S1(const float *input,

 void Deconv2dNeonK3x3S2(const float *input,
                        const float *filter,
-                        const float *bias,
                        const index_t *in_shape,
                        const index_t *out_shape,
                        float *output) {
@@ -303,15 +293,11 @@ void Deconv2dNeonK3x3S2(const float *input,
  const index_t outw = out_shape[3];
  const index_t out_img_size = outh * outw;

-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(3)
  for (index_t b = 0; b < out_shape[0]; ++b) {
    for (index_t oc = 0; oc < outch; ++oc) {
-      float *out_base = output + (b * outch + oc) * out_img_size;
-
-      const float bias_value = bias ? bias[oc] : 0.f;
-      std::fill_n(out_base, out_img_size, bias_value);
-
      for (index_t ic = 0; ic < inch; ++ic) {
+        float *out_base = output + (b * outch + oc) * out_img_size;
        const float *input_base = input + (b * inch + ic) * h * w;
        const float *kernel_base = filter + (oc * inch + ic) * 9;
        const float *in = input_base;

--- a/mace/kernels/arm/deconv_2d_neon_4x4.cc
+++ b/mace/kernels/arm/deconv_2d_neon_4x4.cc
@@ -20,7 +20,6 @@ namespace kernels {

 void Deconv2dNeonK4x4S1(const float *input,
                        const float *filter,
-                        const float *bias,
                        const index_t *in_shape,
                        const index_t *out_shape,
                        float *output) {
@@ -32,16 +31,12 @@ void Deconv2dNeonK4x4S1(const float *input,
  const index_t outw = out_shape[3];
  const index_t outch = out_shape[1];
  const index_t out_img_size = outh * outw;
-#pragma omp parallel for
+#pragma omp parallel for collapse(2)
  for (int b = 0; b < out_shape[0]; ++b) {
    for (int oc = 0; oc < outch; oc += 2) {
      if (oc + 1 < outch) {
        float *out_base = output + (b * outch + oc) * out_img_size;
        float *out_base1 = out_base + out_img_size;
-        const float bias_value = bias ? bias[oc] : 0.f;
-        std::fill_n(out_base, out_img_size, bias_value);
-        const float bias_value1 = bias ? bias[oc + 1] : 0.f;
-        std::fill_n(out_base1, out_img_size, bias_value1);
        for (int q = 0; q < inch; q++) {
          const float *input_base = input + (b * inch + q) * h * w;
          const float *in = input_base;
@@ -257,8 +252,6 @@ void Deconv2dNeonK4x4S1(const float *input,
        }
      } else {
        float *out_base = output + (b * outch + oc) * out_img_size;
-        const float bias_value = bias ? bias[oc] : 0.f;
-        std::fill_n(out_base, out_img_size, bias_value);
        for (int q = 0; q < inch; q++) {
          const float *input_base = input + (b * inch + q) * h * w;
          const float *kernel_base = filter + (oc * inch + q) * 16;
@@ -381,7 +374,6 @@ void Deconv2dNeonK4x4S1(const float *input,

 void Deconv2dNeonK4x4S2(const float *input,
                        const float *filter,
-                        const float *bias,
                        const index_t *in_shape,
                        const index_t *out_shape,
                        float *output) {
@@ -394,14 +386,11 @@ void Deconv2dNeonK4x4S2(const float *input,
  const index_t outch = out_shape[1];
  const index_t out_img_size = outh * outw;

-#pragma omp parallel for
+#pragma omp parallel for collapse(3)
  for (int b = 0; b < out_shape[0]; ++b) {
    for (int p = 0; p < outch; p++) {
-      float *out_base = output + (b * outch + p) * out_img_size;
-      const float bias_value = bias ? bias[p] : 0.f;
-      std::fill_n(out_base, outh * outw, bias_value);
-
      for (int q = 0; q < inch; q++) {
+        float *out_base = output + (b * outch + p) * out_img_size;
        const float *input_base = input + (b * inch + q) * h * w;
        const float *kernel_base = filter + (p * inch + q) * 16;
        const float *in = input_base;

--- a/mace/kernels/deconv_2d.h
+++ b/mace/kernels/deconv_2d.h
@@ -184,7 +184,6 @@ struct Deconv2dFunctor<DeviceType::CPU, float>: Deconv2dFunctorBase {

  void Deconv2dGeneral(const float *input,
                       const float *filter,
-                       const float *bias,
                       const index_t kernel_h,
                       const index_t kernel_w,
                       const int *strides,
@@ -206,23 +205,25 @@ struct Deconv2dFunctor<DeviceType::CPU, float>: Deconv2dFunctorBase {
      }
    }

-#pragma omp parallel for
-    for (int b = 0; b < in_shape[0]; ++b) {
-      for (int oc = 0; oc < out_shape[1]; ++oc) {
-        float *out_base =
-            output + (b * out_shape[1] + oc) * out_img_size;
-        const float bias_value = bias ? bias[oc] : 0.f;
-        std::fill_n(out_base, out_img_size, bias_value);
+    const index_t batch = in_shape[0];
+    const index_t out_channels = out_shape[1];
+    const index_t in_channels = in_shape[1];
+
+#pragma omp parallel for collapse(4)
+    for (int b = 0; b < batch; ++b) {
+      for (int oc = 0; oc < out_channels; ++oc) {
        for (int i = 0; i < in_height; ++i) {
          for (int j = 0; j < in_width; ++j) {
+            float *out_base =
+                output + (b * out_channels + oc) * out_img_size;
            const index_t out_offset =
                i * strides[0] * out_width + j * strides[1];
-            for (int ic = 0; ic < in_shape[1]; ++ic) {
+            for (int ic = 0; ic < in_channels; ++ic) {
              const index_t input_idx =
-                  (b * in_shape[1] + ic) * in_img_size + i * in_width + j;
+                  (b * in_channels + ic) * in_img_size + i * in_width + j;
              const float val = input[input_idx];
              const index_t kernel_offset =
-                  (oc * in_shape[1] + ic) * kernel_size;
+                  (oc * in_channels + ic) * kernel_size;
              for (int k = 0; k < kernel_size; ++k) {
                const index_t out_idx = out_offset + index_map[k];
                const index_t kernel_idx = kernel_offset + k;
@@ -248,7 +249,7 @@ struct Deconv2dFunctor<DeviceType::CPU, float>: Deconv2dFunctorBase {

    const index_t out_height = out_shape[2];
    const index_t out_width = out_shape[3];
-#pragma omp parallel for
+#pragma omp parallel for collapse(3)
    for (int i = 0; i < batch; ++i) {
      for (int j = 0; j < channel; ++j) {
        for (int k = 0; k < out_height; ++k) {
@@ -324,7 +325,6 @@ struct Deconv2dFunctor<DeviceType::CPU, float>: Deconv2dFunctorBase {
               "Input/Output batch size mismatch");
    std::function<void(const float *input,
                       const float *filter,
-                       const float *bias,
                       const index_t *in_shape,
                       const index_t *out_shape,
                       float *output)> deconv_func;
@@ -354,6 +354,8 @@ struct Deconv2dFunctor<DeviceType::CPU, float>: Deconv2dFunctorBase {
    scratch->Rewind();
    scratch->GrowSize(padded_out_size);
    Tensor padded_out(scratch->Scratch(padded_out_size), DT_FLOAT);
+    padded_out.Reshape(padded_out_shape);
+    padded_out.Clear();
    auto *padded_out_data = padded_out.mutable_data<float>();

    bool use_neon_3x3_s1 = kernel_h == kernel_w && kernel_h == 3 &&
@@ -369,13 +371,11 @@ struct Deconv2dFunctor<DeviceType::CPU, float>: Deconv2dFunctorBase {
    if (use_neon_3x3_s1) {
      deconv_func = [=](const float *input,
                        const float *filter,
-                        const float *bias,
                        const index_t *in_shape,
                        const index_t *padded_out_shape,
                        float *padded_output) {
        Deconv2dNeonK3x3S1(input,
                           filter,
-                           bias,
                           in_shape,
                           padded_out_shape,
                           padded_output);
@@ -383,13 +383,11 @@ struct Deconv2dFunctor<DeviceType::CPU, float>: Deconv2dFunctorBase {
    } else if (use_neon_3x3_s2) {
      deconv_func = [=](const float *input,
                        const float *filter,
-                        const float *bias,
                        const index_t *in_shape,
                        const index_t *padded_out_shape,
                        float *padded_output) {
        Deconv2dNeonK3x3S2(input,
                           filter,
-                           bias,
                           in_shape,
                           padded_out_shape,
                           padded_output);
@@ -397,13 +395,11 @@ struct Deconv2dFunctor<DeviceType::CPU, float>: Deconv2dFunctorBase {
    } else if (use_neon_4x4_s1) {
      deconv_func = [=](const float *input,
                        const float *filter,
-                        const float *bias,
                        const index_t *in_shape,
                        const index_t *padded_out_shape,
                        float *padded_output) {
        Deconv2dNeonK4x4S1(input,
                           filter,
-                           bias,
                           in_shape,
                           padded_out_shape,
                           padded_output);
@@ -411,13 +407,11 @@ struct Deconv2dFunctor<DeviceType::CPU, float>: Deconv2dFunctorBase {
    } else if (use_neon_4x4_s2) {
      deconv_func = [=](const float *input,
                        const float *filter,
-                        const float *bias,
                        const index_t *in_shape,
                        const index_t *padded_out_shape,
                        float *padded_output) {
        Deconv2dNeonK4x4S2(input,
                           filter,
-                           bias,
                           in_shape,
                           padded_out_shape,
                           padded_output);
@@ -425,13 +419,11 @@ struct Deconv2dFunctor<DeviceType::CPU, float>: Deconv2dFunctorBase {
    } else {
      deconv_func = [=](const float *input,
                        const float *filter,
-                        const float *bias,
                        const index_t *in_shape,
                        const index_t *padded_out_shape,
                        float *padded_output) {
        Deconv2dGeneral(input,
                        filter,
-                        bias,
                        kernel_h,
                        kernel_w,
                        strides_.data(),
@@ -444,9 +436,24 @@ struct Deconv2dFunctor<DeviceType::CPU, float>: Deconv2dFunctorBase {
    bool no_pad =
        padded_out_h == output_shape[2] && padded_out_w == output_shape[3];
    float *out_data = no_pad ? output_data : padded_out_data;
+
+    if (bias_data != nullptr) {
+      const index_t batch = output_shape[0];
+      const index_t channels = output_shape[1];
+      const index_t img_size = output_shape[2] * output_shape[3];
+#pragma omp parallel for collapse(3)
+      for (index_t b = 0; b < batch; ++b) {
+        for (index_t c = 0; c < channels; ++c) {
+          for (index_t i = 0; i < img_size; ++i) {
+            output_data[(b * channels + c) * img_size + i] +=
+                bias_data[c];
+          }
+        }
+      }
+    }
+
    deconv_func(input_data,
                filter_data,
-                bias_data,
                in_shape,
                padded_out_shape.data(),
                out_data);
@@ -459,6 +466,8 @@ struct Deconv2dFunctor<DeviceType::CPU, float>: Deconv2dFunctorBase {
                 output_data);
    }

+
+
    DoActivation<float>(output_data,
                 output_data,
                 output->size(),

--- a/mace/ops/deconv_2d_benchmark.cc
+++ b/mace/ops/deconv_2d_benchmark.cc
@@ -120,15 +120,12 @@ static void Deconv2d(int iters,
  MACE_BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, S, OH, OW, P, OC, float, GPU); \
  MACE_BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, S, OH, OW, P, OC, half, GPU);

-MACE_BM_DECONV_2D(1, 128, 15, 15, 1, 1, 1, 15, 15, VALID, 256);
 MACE_BM_DECONV_2D(1, 32, 60, 60, 1, 1, 1, 60, 60, VALID, 128);

 MACE_BM_DECONV_2D(1, 128, 60, 60, 3, 3, 1, 62, 62, VALID, 128);
 MACE_BM_DECONV_2D(1, 32, 60, 60, 3, 3, 1, 60, 60, SAME, 32);

-MACE_BM_DECONV_2D(1, 128, 60, 60, 4, 4, 1, 63, 63, VALID, 128);
 MACE_BM_DECONV_2D(1, 32, 60, 60, 4, 4, 1, 60, 60, SAME, 32);
-MACE_BM_DECONV_2D(1, 3, 224, 224, 4, 4, 2, 448, 448, SAME, 32);
 MACE_BM_DECONV_2D(1, 3, 224, 224, 4, 4, 2, 450, 450, VALID, 32);
 MACE_BM_DECONV_2D(1, 3, 512, 512, 7, 7, 2, 1023, 1023, SAME, 32);
 MACE_BM_DECONV_2D(1, 128, 16, 16, 5, 5, 1, 20, 20, VALID, 32);
@@ -138,10 +135,10 @@ MACE_BM_DECONV_2D(1, 3, 480, 480, 1, 1, 1, 480, 480, VALID, 3);

 MACE_BM_DECONV_2D(1, 64, 32, 32, 1, 1, 1, 32, 32, VALID, 128);
 MACE_BM_DECONV_2D(1, 64, 33, 32, 3, 3, 2, 65, 63, SAME, 128);
-MACE_BM_DECONV_2D(1, 3, 224, 224, 3, 3, 2, 447, 447, SAME, 32);
-MACE_BM_DECONV_2D(1, 3, 224, 224, 3, 3, 2, 449, 449, VALID, 32);
 MACE_BM_DECONV_2D(1, 3, 224, 224, 3, 3, 2, 448, 448, SAME, 32);

+MACE_BM_DECONV_2D(1, 32, 1014, 762, 9, 9, 2, 2035, 1531, VALID, 1);
+
 }  // namespace test
 }  // namespace ops
 }  // namespace mace