Commit 909f89d9 authored by 叶剑武

Merge branch 'cpu' into 'master'

Support running on a local PC.

See merge request !232
......@@ -51,3 +51,11 @@ config_setting(
},
visibility = ["//visibility:public"],
)
+ config_setting(
+     name = "openmp_enabled",
+     define_values = {
+         "openmp": "true",
+     },
+     visibility = ["//visibility:public"],
+ )
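Note: the new `openmp_enabled` setting is selected with `--define openmp=true` (the build scripts at the end of this diff pass exactly that flag), and any target that then receives `-fopenmp` can detect it at compile time through the standard `_OPENMP` macro. A minimal sketch, independent of MACE:

```cpp
#include <cstdio>
#ifdef _OPENMP
#include <omp.h>
#endif

int main() {
  // _OPENMP is predefined by the compiler whenever -fopenmp is in effect,
  // which is what the openmp_enabled config_setting above toggles.
#ifdef _OPENMP
  std::printf("OpenMP enabled, max threads: %d\n", omp_get_max_threads());
#else
  std::printf("OpenMP disabled, running single-threaded\n");
#endif
  return 0;
}
```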
......@@ -42,9 +42,8 @@ cc_library(
"runtime/opencl/*.h",
"runtime/hexagon/*.h",
]),
- linkopts = if_android([
+ linkopts = ["-ldl",] + if_android([
"-pie",
- "-ldl",
"-lm",
]),
deps = [
......
......@@ -94,7 +94,6 @@ class Operator : public OperatorBase {
for (const string &output_str : operator_def.output()) {
if (ws->HasTensor(output_str)) {
- Tensor *found_tensor = ws->GetTensor(output_str);
outputs_.push_back(ws->GetTensor(output_str));
} else {
outputs_.push_back(MACE_CHECK_NOTNULL(ws->CreateTensor(
......
......@@ -158,7 +158,6 @@ bool HexagonControlWrapper::TeardownGraph() {
void HexagonControlWrapper::PrintLog() {
char *buf;
- unsigned char *p;
if ((buf = new char[PRINT_BUFSIZE]) == NULL) return;
hexagon_nn_getlog(nn_id_, reinterpret_cast<unsigned char*>(buf), PRINT_BUFSIZE);
LOG(INFO) << string(buf);
......@@ -168,7 +167,6 @@ void HexagonControlWrapper::PrintLog() {
void HexagonControlWrapper::PrintGraph() {
LOG(INFO) << "Print Graph";
char *buf;
- unsigned char *p;
if ((buf = new char[PRINT_BUFSIZE]) == NULL) return;
hexagon_nn_snpprint(nn_id_, reinterpret_cast<unsigned char*>(buf), PRINT_BUFSIZE);
LOG(INFO) << string(buf);
......
# Examples
load("//mace:mace.bzl", "if_android", "if_neon_enabled")
load("//mace:mace.bzl", "if_android", "if_neon_enabled", "if_openmp_enabled")
cc_binary(
name = "helloworld",
srcs = [
"helloworld.cc",
],
- linkopts = if_neon_enabled(["-fopenmp"]),
+ linkopts = if_openmp_enabled(["-fopenmp"]),
deps = [
"//mace/core",
"//mace/ops",
......@@ -17,7 +17,7 @@ cc_test(
name = "benchmark_example",
testonly = 1,
srcs = ["benchmark_example.cc"],
- linkopts = if_neon_enabled(["-fopenmp"]),
+ linkopts = if_openmp_enabled(["-fopenmp"]),
linkstatic = 1,
deps = [
"//mace/core",
......
......@@ -7,7 +7,7 @@ package(
licenses(["notice"]) # Apache 2.0
load("//mace:mace.bzl", "if_android", "if_neon_enabled")
load("//mace:mace.bzl", "if_android", "if_neon_enabled", "if_openmp_enabled")
cc_library(
name = "kernels",
......@@ -23,7 +23,7 @@ cc_library(
]) + if_neon_enabled(glob([
"neon/*.h",
])),
- copts = if_neon_enabled(["-fopenmp"]),
+ copts = if_openmp_enabled(["-fopenmp"]),
linkopts = if_android(["-lm"]),
deps = [
"//mace/core",
......
......@@ -86,19 +86,18 @@ struct BatchNormFunctor : BatchNormFunctorBase {
}
}
- index_t pos = 0;
- #pragma omp parallel for
+ #pragma omp parallel for collapse(4)
for (index_t n = 0; n < batch; ++n) {
for (index_t h = 0; h < height; ++h) {
for (index_t w = 0; w < width; ++w) {
for (index_t c = 0; c < channels; ++c) {
+ index_t pos = (((n * height) + h) * width + w) * channels + c;
if (folded_constant_) {
output_ptr[pos] = scale_ptr[c] * input_ptr[pos] + offset_ptr[c];
} else {
output_ptr[pos] = new_scale[c] * input_ptr[pos] + new_offset[c];
}
- ++pos;
}
}
}
......
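This indexing change is the core of the OpenMP fixes repeated below in the bias add, convolution, and pooling hunks: a running `pos` counter (`++pos`) assumes iterations execute in order, so under `#pragma omp parallel for collapse(4)` each iteration must derive its flat NHWC offset from its own loop indices. The same reasoning explains the removed `*output_data`/`output_data++` writes later in this diff: pointer bumping is a serial dependency, while indexed stores are race-free because every (n, h, w, c) maps to a unique offset. A minimal standalone sketch of the corrected pattern (hypothetical function name):

```cpp
#include <cstdint>

using index_t = int64_t;

// Each collapsed iteration computes its own NHWC offset, so iterations can
// run on any thread in any order without sharing mutable state.
void ScaleOffsetNHWC(const float *input, const float *scale,
                     const float *offset, float *output,
                     index_t batch, index_t height,
                     index_t width, index_t channels) {
#pragma omp parallel for collapse(4)
  for (index_t n = 0; n < batch; ++n) {
    for (index_t h = 0; h < height; ++h) {
      for (index_t w = 0; w < width; ++w) {
        for (index_t c = 0; c < channels; ++c) {
          const index_t pos = (((n * height) + h) * width + w) * channels + c;
          output[pos] = scale[c] * input[pos] + offset[c];
        }
      }
    }
  }
}
```

Note that `collapse(4)` requires the four loops to be perfectly nested, which they are here and in every functor touched by this commit.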
......@@ -33,14 +33,13 @@ struct BiasAddFunctor {
T *output_ptr = output->mutable_data<T>();
- index_t pos = 0;
- #pragma omp parallel for
+ #pragma omp parallel for collapse(4)
for (index_t n = 0; n < batch; ++n) {
for (index_t h = 0; h < height; ++h) {
for (index_t w = 0; w < width; ++w) {
for (index_t c = 0; c < channels; ++c) {
+ index_t pos = (((n * height) + h) * width + w) * channels + c;
output_ptr[pos] = input_ptr[pos] + bias_ptr[c];
- ++pos;
}
}
}
......
......@@ -94,8 +94,6 @@ struct Conv2dFunctor : Conv2dFunctorBase {
index_t padded_h_stop = input_height + paddings[0] - paddings[0] / 2;
index_t padded_w_stop = input_width + paddings[1] - paddings[1] / 2;
index_t kernel_size = input_channels * kernel_h * kernel_w;
- Tensor::MappingGuard input_mapper(input);
- Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard bias_mapper(bias);
......@@ -105,13 +103,15 @@ struct Conv2dFunctor : Conv2dFunctorBase {
auto bias_data = bias == nullptr ? nullptr : bias->data<T>();
auto output_data = output->mutable_data<T>();
+ #pragma omp parallel for collapse(4)
for (int n = 0; n < batch; ++n) {
for (int h = 0; h < height; ++h) {
for (int w = 0; w < width; ++w) {
for (int c = 0; c < channels; ++c) {
+ const int out_idx = ((n * height + h) * width + w) * channels + c;
T bias_channel = 0.0f;
if (bias) bias_channel = bias_data[c];
- *output_data = bias_channel;
+ output_data[out_idx] = bias_channel;
T sum = 0.0f;
const T *filter_ptr = filter_data + c;
for (int kh = 0; kh < kernel_h; ++kh) {
......@@ -125,8 +125,6 @@ struct Conv2dFunctor : Conv2dFunctorBase {
inw >= padded_w_start && inw < padded_w_stop,
"Out of range read from input: ", inh, ", ",
inw);
- // else padding with 0:
- // sum += 0;
} else {
index_t input_offset =
n * input_height * input_width * input_channels +
......@@ -138,13 +136,11 @@ struct Conv2dFunctor : Conv2dFunctorBase {
}
}
}
- *output_data += sum;
- output_data++;
+ output_data[out_idx] += sum;
}
}
}
}
- output_data = output->mutable_data<T>();
DoActivation(output_data, output_data, output->NumElements(), activation_,
relux_max_limit_, prelu_alpha_);
}
......
......@@ -105,8 +105,6 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase {
index_t padded_h_stop = input_height + paddings[0] - paddings[0] / 2;
index_t padded_w_stop = input_width + paddings[1] - paddings[1] / 2;
const index_t kernel_size = kernel_h * kernel_w;
- Tensor::MappingGuard input_mapper(input);
- Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard bias_mapper(bias);
......@@ -116,7 +114,7 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase {
const T *bias_ptr = bias == nullptr ? nullptr : bias->data<T>();
T *output_ptr = output->mutable_data<T>();
- #pragma omp parallel for collapse(2)
+ #pragma omp parallel for collapse(4)
for (int n = 0; n < batch; ++n) {
for (int h = 0; h < height; ++h) {
for (int w = 0; w < width; ++w) {
......
......@@ -62,17 +62,17 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
const int in_idx = mul24(in_ch_blk, in_width);
int filter_x_part0 = in_ch_blk << 2;
+ int in_hb_idx = height_idx;
for (short hb_idx = 0; hb_idx < 3; ++hb_idx) {
- // TODO (heliangliang) optimize out these muls
- int in_hb_value = height_idx + mul24(hb_idx, dilation_h);
- in_hb_value = select(in_hb_value + batch_idx,
- -1,
- (in_hb_value < 0 || in_hb_value >= in_height));
+ int in_hb_value = select(in_hb_idx + batch_idx,
+ -1,
+ (in_hb_idx < 0 || in_hb_idx >= in_height));
int filter_x_part1 = 0;
+ int in_width_idx = 0;
for (short width_idx = 0; width_idx < 3; ++width_idx) {
int in_width_value;
#define READ_INPUT(i) \
- in_width_value = in_width##i + mul24(width_idx, dilation_w); \
+ in_width_value = in_width##i + in_width_idx; \
in_width_value = select(in_idx + in_width_value, \
-1, \
(in_width_value < 0 || in_width_value >= in_width)); \
......@@ -120,8 +120,10 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
out4 = mad(in4.w, weights3, out4);
filter_x_part1 += rounded_in_ch;
+ in_width_idx += dilation_w;
}
filter_x_part0 += rounded_in_ch_x_3;
+ in_hb_idx += dilation_h;
}
}
......
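The kernel edit above is a strength reduction: the per-iteration `mul24(hb_idx, dilation_h)` and `mul24(width_idx, dilation_w)` are replaced by running indices (`in_hb_idx`, `in_width_idx`) that are advanced by the dilation at the bottom of each loop, which also resolves the old `TODO` comment. The same transformation in plain C++, as an illustrative sketch:

```cpp
#include <cstdio>

int main() {
  const int base = 5, dilation = 2;

  // Before: one multiply per iteration to form base + k * dilation.
  for (int k = 0; k < 3; ++k) {
    std::printf("mul form: %d\n", base + k * dilation);
  }

  // After (as in the kernel): carry a running index and replace the
  // multiply with a single add at the end of each iteration.
  int pos = base;
  for (int k = 0; k < 3; ++k) {
    std::printf("add form: %d\n", pos);
    pos += dilation;
  }
  return 0;
}
```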
......@@ -191,6 +191,10 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
{1, kwg_size / 32, 32, 1},
{1, kwg_size / 64, 64, 1},
{1, kwg_size / 128, 128, 1},
+ {4, kwg_size / 16, 4, 1},
+ {4, kwg_size / 28, 7, 1},
+ {4, kwg_size / 32, 8, 1},
+ {4, kwg_size / 56, 14, 1},
{3, 15, 9, 1},
{7, 15, 9, 1},
{9, 7, 15, 1},
......
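The four added tuning candidates all keep the product of the three local work-group dimensions within the kernel's work-group limit, e.g. 4 × (kwg_size / 16) × 4 = kwg_size and 4 × (kwg_size / 28) × 7 = kwg_size. A small sketch of that invariant, with hypothetical names (the tuner's real candidate handling is not shown in this hunk):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Hypothetical validity check: a 3D local work-group candidate is usable
// only if the product of its dimensions fits the kernel work-group limit.
bool FitsWorkGroup(const std::vector<uint32_t> &lws, uint32_t kwg_size) {
  return lws[0] * lws[1] * lws[2] <= kwg_size;
}

int main() {
  const uint32_t kwg_size = 256;
  // Mirrors the added {4, kwg_size / 16, 4, ...} candidate: the product of
  // the first three dimensions is exactly kwg_size.
  assert(FitsWorkGroup({4, kwg_size / 16, 4}, kwg_size));
  return 0;
}
```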
......@@ -76,7 +76,6 @@ struct PoolingFunctor : PoolingFunctorBase {
index_t height = output_shape[1];
index_t width = output_shape[2];
index_t channels = output_shape[3];
- index_t out_image_size = height * width;
index_t input_height = input_shape[1];
index_t input_width = input_shape[2];
......@@ -97,11 +96,12 @@ struct PoolingFunctor : PoolingFunctorBase {
int padded_w_start = 0 - paddings[1] / 2;
if (pooling_type_ == MAX) {
- #pragma omp parallel for collapse(2)
+ #pragma omp parallel for collapse(4)
for (int b = 0; b < batch; ++b) {
for (int h = 0; h < height; ++h) {
for (int w = 0; w < width; ++w) {
for (int c = 0; c < channels; ++c) {
+ index_t out_offset = (((b * height) + h) * width + w) * channels + c;
index_t in_offset = b * in_image_size * input_channels + c;
T res = std::numeric_limits<T>::lowest();
for (int kh = 0; kh < kernel_h; ++kh) {
......@@ -115,18 +115,18 @@ struct PoolingFunctor : PoolingFunctorBase {
}
}
}
- *output = res;
- output++;
+ output[out_offset] = res;
}
}
}
}
} else if (pooling_type_ == AVG) {
- #pragma omp parallel for collapse(2)
+ #pragma omp parallel for collapse(4)
for (int b = 0; b < batch; ++b) {
for (int h = 0; h < height; ++h) {
for (int w = 0; w < width; ++w) {
for (int c = 0; c < channels; ++c) {
+ index_t out_offset = (((b * height) + h) * width + w) * channels + c;
index_t in_offset = b * in_image_size * input_channels + c;
T sum = 0;
int block_size = 0;
......@@ -142,8 +142,7 @@ struct PoolingFunctor : PoolingFunctorBase {
}
}
}
- *output = sum / block_size;
- output++;
+ output[out_offset] = sum / block_size;
}
}
}
......
......@@ -29,21 +29,20 @@ struct SoftmaxFunctor {
const index_t num_classes = logits_shape.back();
#pragma omp parallel for
for (index_t i = 0; i < batch_size; ++i) {
- T max_value = *logits_ptr;
+ const index_t pos = i * num_classes;
+ T max_value = logits_ptr[pos];
for (index_t c = 1; c < num_classes; ++c) {
- max_value = std::max(max_value, logits_ptr[c]);
+ max_value = std::max(max_value, logits_ptr[pos + c]);
}
// TODO: check overflow?
T sum = 0;
std::vector<T> exp_data(num_classes);
for (index_t c = 0; c < num_classes; ++c) {
- exp_data[c] = ::exp((*logits_ptr - max_value));
+ exp_data[c] = ::exp((logits_ptr[pos + c] - max_value));
sum += exp_data[c];
- logits_ptr++;
}
for (index_t c = 0; c < num_classes; ++c) {
- *output_ptr = exp_data[c] / sum;
- output_ptr++;
+ output_ptr[pos + c] = exp_data[c] / sum;
}
}
}
......
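With `pos = i * num_classes`, every batch row reads and writes only its own `[pos, pos + num_classes)` slice, which is what makes the enclosing `#pragma omp parallel for` safe; the old `logits_ptr++`/`output_ptr++` bumping serialized the rows. Subtracting the row max before `exp` is the usual guard against overflow, addressing the `TODO` in the hunk. A self-contained sketch of the same row-wise computation:

```cpp
#include <algorithm>
#include <cmath>

// Row-wise numerically stable softmax over a [batch_size, num_classes]
// buffer, mirroring the indexing above; rows are independent, so a
// parallel-for over i would be race-free.
void Softmax(const float *logits, float *output,
             int batch_size, int num_classes) {
  for (int i = 0; i < batch_size; ++i) {
    const int pos = i * num_classes;
    const float max_value =
        *std::max_element(logits + pos, logits + pos + num_classes);
    float sum = 0.f;
    for (int c = 0; c < num_classes; ++c) {
      output[pos + c] = std::exp(logits[pos + c] - max_value);
      sum += output[pos + c];
    }
    for (int c = 0; c < num_classes; ++c) {
      output[pos + c] /= sum;
    }
  }
}
```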
......@@ -53,3 +53,9 @@ def if_not_hexagon_enabled(a):
"//mace:hexagon_enabled": [],
"//conditions:default": a,
})
+ def if_openmp_enabled(a):
+     return select({
+         "//mace:openmp_enabled": a,
+         "//conditions:default": [],
+     })
......@@ -7,7 +7,7 @@ package(
licenses(["notice"]) # Apache 2.0
load("//mace:mace.bzl", "if_android", "if_neon_enabled")
load("//mace:mace.bzl", "if_android", "if_neon_enabled", "if_openmp_enabled")
cc_library(
name = "test",
......@@ -34,7 +34,7 @@ cc_library(
["*.h"],
exclude = ["ops_test_util.h"],
),
copts = if_neon_enabled(["-DMACE_ENABLE_NEON"]),
copts = if_openmp_enabled(["-fopenmp"]) + if_neon_enabled(["-DMACE_ENABLE_NEON"]),
deps = [
"//mace/kernels",
],
......
......@@ -52,7 +52,7 @@ inline std::string ObfuscateString(const std::string &src,
const std::string &lookup_table) {
std::string dest;
dest.resize(src.size());
- for (int i = 0; i < src.size(); i++) {
+ for (size_t i = 0; i < src.size(); i++) {
dest[i] = src[i] ^ lookup_table[i % lookup_table.size()];
}
return std::move(dest);
......@@ -73,7 +73,7 @@ inline std::string ObfuscateSymbol(const std::string &src) {
dest[0] = src[0]; // avoid invalid symbol which starts from 0-9
const std::string encode_dict =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_";
- for (int i = 1; i < src.size(); i++) {
+ for (size_t i = 1; i < src.size(); i++) {
char ch = src[i];
int idx;
if (ch >= '0' && ch <= '9') {
......
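Aside from the `int` to `size_t` fix (which matches the return type of `std::string::size()` and silences signed/unsigned comparison warnings), note that `ObfuscateString` is a repeating-key XOR, so applying it twice with the same lookup table returns the original string. A quick round-trip check with a standalone re-implementation of the same idea:

```cpp
#include <cassert>
#include <string>

// Repeating-key XOR, the same scheme as ObfuscateString above.
std::string Obfuscate(const std::string &src, const std::string &table) {
  std::string dest(src.size(), '\0');
  for (size_t i = 0; i < src.size(); ++i) {
    dest[i] = src[i] ^ table[i % table.size()];
  }
  return dest;
}

int main() {
  const std::string key = "mace";
  const std::string text = "hello world";
  // XOR is self-inverse: obfuscating twice restores the input.
  assert(Obfuscate(Obfuscate(text, key), key) == text);
  return 0;
}
```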
......@@ -43,7 +43,8 @@ bazel build -c opt $STRIP --verbose_failures $BAZEL_TARGET \
--copt="-D_GLIBCXX_USE_C99_MATH_TR1" \
--copt="-DMACE_DISABLE_NO_TUNING_WARNING" \
--copt="-Werror=return-type" \
- --define neon=false
+ --define neon=false \
+ --define openmp=true
if [ $? -ne 0 ]; then
exit 1
......
......@@ -71,6 +71,17 @@ build_target()
$DSP_MODE_BUILD_FLAGS || exit 1
}
+ build_local_target()
+ {
+     BAZEL_TARGET=$1
+     bazel build --verbose_failures -c opt --strip always $BAZEL_TARGET \
+         --copt="-std=c++11" \
+         --copt="-D_GLIBCXX_USE_C99_MATH_TR1" \
+         --copt="-Werror=return-type" \
+         --copt="-DMACE_OBFUSCATE_LITERALS" \
+         --define openmp=true || exit -1
+ }
merge_libs()
{
CREATE_LIB_NAME=$1
......@@ -113,10 +124,17 @@ bash mace/tools/git/gen_version_source.sh ${CODEGEN_DIR}/version/version.cc || e
echo "Step 3: Build libmace targets"
bazel clean
- for target in ${all_targets[*]}
- do
-     build_target ${target}
- done
+ if [ x"${RUNTIME}" = x"local" ]; then
+     for target in ${all_targets[*]}
+     do
+         build_local_target ${target}
+     done
+ else
+     for target in ${all_targets[*]}
+     do
+         build_target ${target}
+     done
+ fi
echo "Step 4: Create mri files and generate merged libs"
......