Commit 0436bb8e authored by liuqi

Fix OpenMP bugs in the CPU kernels.

Parent c5d83495
@@ -51,3 +51,11 @@ config_setting(
     },
     visibility = ["//visibility:public"],
 )
+config_setting(
+    name = "openmp_enabled",
+    define_values = {
+        "openmp": "true",
+    },
+    visibility = ["//visibility:public"],
+)
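For reference, the new setting matches any build invoked with the OpenMP define, which is what the updated build scripts at the end of this diff pass. A minimal invocation would look like the following (the helloworld target appears in the next hunk; the full package label is an assumption):

    bazel build //mace/examples:helloworld --define openmp=true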
 # Examples
-load("//mace:mace.bzl", "if_android", "if_neon_enabled")
+load("//mace:mace.bzl", "if_android", "if_neon_enabled", "if_openmp_enabled")
 cc_binary(
     name = "helloworld",
     srcs = [
         "helloworld.cc",
     ],
-    linkopts = if_neon_enabled(["-fopenmp"]),
+    linkopts = if_openmp_enabled(["-fopenmp"]),
     deps = [
         "//mace/core",
         "//mace/ops",
@@ -17,7 +17,7 @@ cc_test(
     name = "benchmark_example",
     testonly = 1,
     srcs = ["benchmark_example.cc"],
-    linkopts = if_neon_enabled(["-fopenmp"]),
+    linkopts = if_openmp_enabled(["-fopenmp"]),
     linkstatic = 1,
     deps = [
         "//mace/core",
...
@@ -7,7 +7,7 @@ package(
 licenses(["notice"])  # Apache 2.0
-load("//mace:mace.bzl", "if_android", "if_neon_enabled")
+load("//mace:mace.bzl", "if_android", "if_neon_enabled", "if_openmp_enabled")
 cc_library(
     name = "kernels",
@@ -23,7 +23,7 @@ cc_library(
     ]) + if_neon_enabled(glob([
         "neon/*.h",
     ])),
-    copts = if_neon_enabled(["-fopenmp"]),
+    copts = if_openmp_enabled(["-fopenmp"]),
     linkopts = if_android(["-lm"]),
     deps = [
         "//mace/core",
...
@@ -86,19 +86,18 @@ struct BatchNormFunctor : BatchNormFunctorBase {
       }
     }
-    index_t pos = 0;
-#pragma omp parallel for
+#pragma omp parallel for collapse(4)
     for (index_t n = 0; n < batch; ++n) {
       for (index_t h = 0; h < height; ++h) {
         for (index_t w = 0; w < width; ++w) {
           for (index_t c = 0; c < channels; ++c) {
+            index_t pos = (((n * height) + h) * width + w) * channels + c;
             if (folded_constant_) {
               output_ptr[pos] = scale_ptr[c] * input_ptr[pos] + offset_ptr[c];
             } else {
               output_ptr[pos] = new_scale[c] * input_ptr[pos] + new_offset[c];
             }
-            ++pos;
           }
         }
       }
...
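The hunk above shows the bug class this commit fixes; the bias-add, conv2d, pooling, and softmax hunks below repeat the same fix. The old code threaded a single running index (or pointer) through the loop nest, so once `#pragma omp parallel for` splits the outer iterations across threads, all threads read and increment the same counter and write through racy, incorrect offsets. Deriving the offset from the loop variables makes every iteration independent, which is also what permits the stronger collapse(4). A self-contained sketch of the broken and fixed patterns (illustrative only, not code from this repository):

    #include <cstdint>
    using index_t = std::int64_t;

    // Broken: `pos` is shared across threads, so the increments race and
    // most elements are written through stale offsets.
    void scale_racy(const float *in, float *out, const float *scale,
                    index_t batch, index_t height, index_t width,
                    index_t channels) {
      index_t pos = 0;
    #pragma omp parallel for
      for (index_t n = 0; n < batch; ++n) {
        for (index_t h = 0; h < height; ++h) {
          for (index_t w = 0; w < width; ++w) {
            for (index_t c = 0; c < channels; ++c) {
              out[pos] = scale[c] * in[pos];
              ++pos;  // data race under OpenMP
            }
          }
        }
      }
    }

    // Fixed: the NHWC offset is a pure function of the loop indices, so
    // iterations are independent and the whole nest can be collapsed.
    void scale_fixed(const float *in, float *out, const float *scale,
                     index_t batch, index_t height, index_t width,
                     index_t channels) {
    #pragma omp parallel for collapse(4)
      for (index_t n = 0; n < batch; ++n) {
        for (index_t h = 0; h < height; ++h) {
          for (index_t w = 0; w < width; ++w) {
            for (index_t c = 0; c < channels; ++c) {
              index_t pos = (((n * height) + h) * width + w) * channels + c;
              out[pos] = scale[c] * in[pos];
            }
          }
        }
      }
    }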
@@ -33,14 +33,13 @@ struct BiasAddFunctor {
     T *output_ptr = output->mutable_data<T>();
-    index_t pos = 0;
-#pragma omp parallel for
+#pragma omp parallel for collapse(4)
     for (index_t n = 0; n < batch; ++n) {
       for (index_t h = 0; h < height; ++h) {
         for (index_t w = 0; w < width; ++w) {
           for (index_t c = 0; c < channels; ++c) {
+            index_t pos = (((n * height) + h) * width + w) * channels + c;
             output_ptr[pos] = input_ptr[pos] + bias_ptr[c];
-            ++pos;
           }
         }
       }
...
@@ -103,13 +103,15 @@ struct Conv2dFunctor : Conv2dFunctorBase {
     auto bias_data = bias == nullptr ? nullptr : bias->data<T>();
     auto output_data = output->mutable_data<T>();
+#pragma omp parallel for collapse(4)
     for (int n = 0; n < batch; ++n) {
       for (int h = 0; h < height; ++h) {
         for (int w = 0; w < width; ++w) {
           for (int c = 0; c < channels; ++c) {
+            const int out_idx = ((n * height + h) * width + w) * channels + c;
             T bias_channel = 0.0f;
             if (bias) bias_channel = bias_data[c];
-            *output_data = bias_channel;
+            output_data[out_idx] = bias_channel;
             T sum = 0.0f;
             const T *filter_ptr = filter_data + c;
             for (int kh = 0; kh < kernel_h; ++kh) {
@@ -123,8 +125,6 @@ struct Conv2dFunctor : Conv2dFunctorBase {
                         inw >= padded_w_start && inw < padded_w_stop,
                         "Out of range read from input: ", inh, ", ",
                         inw);
-                  // else padding with 0:
-                  // sum += 0;
                 } else {
                   index_t input_offset =
                       n * input_height * input_width * input_channels +
@@ -136,13 +136,11 @@ struct Conv2dFunctor : Conv2dFunctorBase {
                 }
               }
             }
-            *output_data += sum;
-            output_data++;
+            output_data[out_idx] += sum;
           }
         }
       }
     }
-    output_data = output->mutable_data<T>();
     DoActivation(output_data, output_data, output->NumElements(), activation_,
                  relux_max_limit_, prelu_alpha_);
   }
...
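Two details in the conv2d hunk above are easy to miss. First, collapse(4) requires the collapsed loops to form a perfectly nested canonical loop nest, with no statements between the for levels, which is why out_idx is computed inside the innermost loop body rather than hoisted between loops. Second, because output_data is no longer bumped element by element, the old reset of the pointer before DoActivation is unnecessary and is deleted. A minimal sketch of the nesting constraint (hypothetical code, not from this repository):

    // Valid for collapse(2): the two loops are perfectly nested, and the
    // row offset is derived inside the innermost body. Hoisting
    // `int row_base = i * cols;` between the two loops would make the
    // nest imperfect and the pragma ill-formed.
    void zero_matrix(float *out, int rows, int cols) {
    #pragma omp parallel for collapse(2)
      for (int i = 0; i < rows; ++i) {
        for (int j = 0; j < cols; ++j) {
          out[i * cols + j] = 0.0f;
        }
      }
    }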
@@ -114,7 +114,7 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase {
     const T *bias_ptr = bias == nullptr ? nullptr : bias->data<T>();
     T *output_ptr = output->mutable_data<T>();
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(4)
     for (int n = 0; n < batch; ++n) {
       for (int h = 0; h < height; ++h) {
         for (int w = 0; w < width; ++w) {
...
@@ -191,6 +191,10 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
         {1, kwg_size / 32, 32, 1},
         {1, kwg_size / 64, 64, 1},
         {1, kwg_size / 128, 128, 1},
+        {4, kwg_size / 16, 4, 1},
+        {4, kwg_size / 28, 7, 1},
+        {4, kwg_size / 32, 8, 1},
+        {4, kwg_size / 56, 14, 1},
         {3, 15, 9, 1},
         {7, 15, 9, 1},
         {9, 7, 15, 1},
...
@@ -96,11 +96,12 @@ struct PoolingFunctor : PoolingFunctorBase {
     int padded_w_start = 0 - paddings[1] / 2;
     if (pooling_type_ == MAX) {
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(4)
       for (int b = 0; b < batch; ++b) {
         for (int h = 0; h < height; ++h) {
           for (int w = 0; w < width; ++w) {
             for (int c = 0; c < channels; ++c) {
+              index_t out_offset = (((b * height) + h) * width + w) * channels + c;
               index_t in_offset = b * in_image_size * input_channels + c;
               T res = std::numeric_limits<T>::lowest();
               for (int kh = 0; kh < kernel_h; ++kh) {
@@ -114,18 +115,18 @@ struct PoolingFunctor : PoolingFunctorBase {
                 }
               }
-              *output = res;
-              output++;
+              output[out_offset] = res;
             }
           }
         }
       }
     } else if (pooling_type_ == AVG) {
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(4)
       for (int b = 0; b < batch; ++b) {
         for (int h = 0; h < height; ++h) {
           for (int w = 0; w < width; ++w) {
             for (int c = 0; c < channels; ++c) {
+              index_t out_offset = (((b * height) + h) * width + w) * channels + c;
               index_t in_offset = b * in_image_size * input_channels + c;
               T sum = 0;
               int block_size = 0;
@@ -141,8 +142,7 @@ struct PoolingFunctor : PoolingFunctorBase {
                 }
               }
-              *output = sum / block_size;
-              output++;
+              output[out_offset] = sum / block_size;
             }
           }
         }
...
@@ -29,21 +29,20 @@ struct SoftmaxFunctor {
     const index_t num_classes = logits_shape.back();
 #pragma omp parallel for
     for (index_t i = 0; i < batch_size; ++i) {
-      T max_value = *logits_ptr;
+      const index_t pos = i * num_classes;
+      T max_value = logits_ptr[pos];
       for (index_t c = 1; c < num_classes; ++c) {
-        max_value = std::max(max_value, logits_ptr[c]);
+        max_value = std::max(max_value, logits_ptr[pos + c]);
       }
       // TODO: check overflow?
       T sum = 0;
       std::vector<T> exp_data(num_classes);
       for (index_t c = 0; c < num_classes; ++c) {
-        exp_data[c] = ::exp((*logits_ptr - max_value));
+        exp_data[c] = ::exp((logits_ptr[pos + c] - max_value));
         sum += exp_data[c];
-        logits_ptr++;
       }
       for (index_t c = 0; c < num_classes; ++c) {
-        *output_ptr = exp_data[c] / sum;
-        output_ptr++;
+        output_ptr[pos + c] = exp_data[c] / sum;
       }
     }
   }
...
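As a side note, the rewritten loop is the standard numerically stabilized softmax: subtracting the row maximum keeps every argument of exp() at or below zero, which addresses the overflow half of the `// TODO: check overflow?` comment (underflow of very small terms is still possible, and the sum stays at least 1, so the division is safe). With pos = i * num_classes, each row i computes:

    m_i = max_k logits_ptr[pos + k]
    output_ptr[pos + c] = exp(logits_ptr[pos + c] - m_i) / sum_k exp(logits_ptr[pos + k] - m_i)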
@@ -53,3 +53,9 @@ def if_not_hexagon_enabled(a):
         "//mace:hexagon_enabled": [],
         "//conditions:default": a,
     })
+def if_openmp_enabled(a):
+    return select({
+        "//mace:openmp_enabled": a,
+        "//conditions:default": [],
+    })
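Taken together with the config_setting added at the top of this diff, the helper completes Bazel's usual define-to-flag wiring. A condensed sketch of the three cooperating pieces, assembled from the hunks in this commit (file placement is assumed from the //mace labels):

    # mace/BUILD: matches builds invoked with --define openmp=true
    config_setting(
        name = "openmp_enabled",
        define_values = {"openmp": "true"},
    )

    # mace.bzl: expands to its argument only when the setting matches
    def if_openmp_enabled(a):
        return select({
            "//mace:openmp_enabled": a,
            "//conditions:default": [],
        })

    # A consuming target: -fopenmp is applied only in OpenMP builds.
    cc_library(
        name = "kernels",
        copts = if_openmp_enabled(["-fopenmp"]),
    )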
@@ -7,7 +7,7 @@ package(
 licenses(["notice"])  # Apache 2.0
-load("//mace:mace.bzl", "if_android", "if_neon_enabled")
+load("//mace:mace.bzl", "if_android", "if_neon_enabled", "if_openmp_enabled")
 cc_library(
     name = "test",
@@ -34,7 +34,7 @@ cc_library(
         ["*.h"],
         exclude = ["ops_test_util.h"],
     ),
-    copts = if_neon_enabled(["-DMACE_ENABLE_NEON"]),
+    copts = if_openmp_enabled(["-fopenmp"]) + if_neon_enabled(["-DMACE_ENABLE_NEON"]),
     deps = [
         "//mace/kernels",
     ],
...
@@ -43,7 +43,8 @@ bazel build -c opt $STRIP --verbose_failures $BAZEL_TARGET \
     --copt="-D_GLIBCXX_USE_C99_MATH_TR1" \
     --copt="-DMACE_DISABLE_NO_TUNING_WARNING" \
     --copt="-Werror=return-type" \
-    --define neon=false
+    --define neon=false \
+    --define openmp=true
 if [ $? -ne 0 ]; then
     exit 1
...
@@ -55,7 +55,8 @@ build_target()
     --copt="-std=c++11" \
     --copt="-D_GLIBCXX_USE_C99_MATH_TR1" \
     --copt="-Werror=return-type" \
-    --copt="-DMACE_OBFUSCATE_LITERALS" || exit -1
+    --copt="-DMACE_OBFUSCATE_LITERALS" \
+    --define openmp=true || exit -1
 }
 merge_libs()
...