merge with origin/master

8c4f5a18 · yejianwu · f3f3a58f · 56742d51 · 8c4f5a18 · 8c4f5a18
34 changed files
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -9,7 +9,7 @@ cpplint:
    - master
  script:
    - curl -o cpplint.py https://raw.githubusercontent.com/google/styleguide/gh-pages/cpplint/cpplint.py
-    - python cpplint.py --root=mace --linelength=80 --counting=detailed $(find mace -name *.h -or -name *.cc)
+    - python cpplint.py --linelength=80 --counting=detailed $(find mace -name "*.h" -or -name "*.cc")

 ops_test:
  stage: ops_test

--- a/WORKSPACE
+++ b/WORKSPACE
@@ -55,7 +55,7 @@ new_git_repository(
    name = "opencl_clhpp",
    build_file = "mace/third_party/opencl-clhpp.BUILD",
    commit = "4c6f7d56271727e37fb19a9b47649dd175df2b12",
-    remote = "https://github.com/KhronosGroup/OpenCL-CLHPP.git",
+    remote = "http://v9.git.n.xiaomi.com/deep-computing/OpenCL-CLHPP-Mirror.git",
 )

 new_git_repository(

--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -117,9 +117,3 @@ RUN pip install -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com
 # Download tensorflow tools
 RUN wget http://cnbj1-inner-fds.api.xiaomi.net/mace/tool/transform_graph && \
    chmod +x transform_graph
-
-# Install gitlab runner
-RUN curl -L https://packages.gitlab.com/install/repositories/runner/gitlab-ci-multi-runner/script.deb.sh | bash
-RUN apt-get install gitlab-ci-multi-runner
-
-ENTRYPOINT gitlab-runner run
--- a/docker/gitlab-runner/Dockerfile
+++ b/docker/gitlab-runner/Dockerfile
+FROM cr.d.xiaomi.net/mace/mace-dev:latest
+
+# Update source
+# Looks like mirrors.163.com does not work in xiaomi network
+# RUN sed -i 's/http:\/\/archive\.ubuntu\.com\/ubuntu\//http:\/\/mirrors\.163\.com\/ubuntu\//g' /etc/apt/sources.list
+RUN apt-get update -y
+
+# Install gitlab runner (-y: docker build cannot answer apt's confirmation prompt)
+RUN curl -L https://packages.gitlab.com/install/repositories/runner/gitlab-ci-multi-runner/script.deb.sh | bash
+RUN apt-get install -y gitlab-ci-multi-runner
+
+ENTRYPOINT gitlab-runner run
--- a/mace/benchmark/BUILD
+++ b/mace/benchmark/BUILD
@@ -16,7 +16,8 @@ cc_library(
    hdrs = ["stat_summarizer.h"],
    linkstatic = 1,
    deps = [
-        "//mace/core",
+        "//mace/public",
+        "//mace/utils",
    ],
 )


--- a/mace/core/allocator.h
+++ b/mace/core/allocator.h
@@ -7,6 +7,9 @@
 #define MACE_CORE_ALLOCATOR_H_

 #include <malloc.h>
+#include <map>
+#include <limits>
+#include <vector>

 #include "mace/core/registry.h"
 #include "mace/core/types.h"
@@ -81,7 +84,7 @@ class CPUAllocator : public Allocator {
    free(data);
  };
  void *Map(void *buffer, size_t offset, size_t nbytes) const override {
-    return (char *)buffer + offset;
+    return reinterpret_cast<char*>(buffer) + offset;
  }
  void *MapImage(void *buffer,
                 const std::vector<size_t> &image_shape,

--- a/mace/core/arg_helper.cc
+++ b/mace/core/arg_helper.cc
@@ -2,6 +2,9 @@
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //

+#include <string>
+#include <vector>
+
 #include "mace/core/arg_helper.h"
 #include "mace/utils/logging.h"


--- a/mace/core/arg_helper.h
+++ b/mace/core/arg_helper.h
@@ -5,6 +5,8 @@
 #ifndef MACE_CORE_ARG_HELPER_H_
 #define MACE_CORE_ARG_HELPER_H_

+#include <string>
+#include <vector>
 #include <map>

 #include "mace/public/mace.h"

--- a/mace/core/buffer.h
+++ b/mace/core/buffer.h
@@ -6,6 +6,8 @@
 #define MACE_CORE_BUFFER_H_

 #include <vector>
+#include <functional>
+
 #include "mace/core/allocator.h"
 #include "mace/core/types.h"

@@ -14,7 +16,7 @@ namespace mace {
 class BufferBase {
 public:
  BufferBase() : size_(0) {}
-  BufferBase(index_t size) : size_(size) {}
+  explicit BufferBase(index_t size) : size_(size) {}
  virtual ~BufferBase() {}

  virtual void *buffer() = 0;
@@ -39,7 +41,7 @@ class BufferBase {

  virtual bool OnHost() const = 0;

-  virtual index_t offset() const { return 0; };
+  virtual index_t offset() const { return 0; }

  template <typename T>
  const T *data() const {
@@ -59,7 +61,7 @@ class BufferBase {

 class Buffer : public BufferBase {
 public:
-  Buffer(Allocator *allocator)
+  explicit Buffer(Allocator *allocator)
      : BufferBase(0),
        allocator_(allocator),
        buf_(nullptr),
@@ -93,7 +95,7 @@ class Buffer : public BufferBase {
  void *buffer() {
    MACE_CHECK_NOTNULL(buf_);
    return buf_;
-  };
+  }

  const void *raw_data() const {
    if (OnHost()) {
@@ -129,7 +131,7 @@ class Buffer : public BufferBase {
  void Map(std::vector<size_t> *pitch) {
    MACE_CHECK(mapped_buf_ == nullptr, "buf has been already mapped");
    mapped_buf_ = Map(0, size_, pitch);
-  };
+  }

  void UnMap() {
    UnMap(mapped_buf_);
@@ -151,7 +153,7 @@ class Buffer : public BufferBase {
  void Copy(void *src, index_t offset, index_t length) {
    MACE_CHECK_NOTNULL(mapped_buf_);
    MACE_CHECK(length <= size_, "out of buffer");
-    memcpy(mapped_buf_, (char *)src + offset, length);
+    memcpy(mapped_buf_, reinterpret_cast<char*>(src) + offset, length);
  }

  bool OnHost() const { return allocator_->OnHost(); }
@@ -197,7 +199,7 @@ class Image : public BufferBase {
  void *buffer() {
    MACE_CHECK_NOTNULL(buf_);
    return buf_;
-  };
+  }

  const void *raw_data() const {
    MACE_CHECK_NOTNULL(mapped_buf_);
@@ -227,12 +229,12 @@ class Image : public BufferBase {
    MACE_CHECK(mapped_buf_ == nullptr, "buf has been already mapped");
    MACE_CHECK_NOTNULL(pitch);
    mapped_buf_ = allocator_->MapImage(buf_, shape_, pitch);
-  };
+  }

  void UnMap() {
    UnMap(mapped_buf_);
    mapped_buf_ = nullptr;
-  };
+  }

  void Resize(index_t size) { MACE_NOT_IMPLEMENTED; }

@@ -276,12 +278,12 @@ class BufferSlice : public BufferBase {
  void *buffer() {
    MACE_CHECK_NOTNULL(buffer_);
    return buffer_->buffer();
-  };
+  }

  const void *raw_data() const {
    if (OnHost()) {
      MACE_CHECK_NOTNULL(buffer_);
-      return (char *)buffer_->raw_data() + offset_;
+      return reinterpret_cast<const char*>(buffer_->raw_data()) + offset_;
    } else {
      MACE_CHECK_NOTNULL(mapped_buf_);
      return mapped_buf_;
@@ -304,13 +306,13 @@ class BufferSlice : public BufferBase {
    MACE_CHECK_NOTNULL(buffer_);
    MACE_CHECK(mapped_buf_ == nullptr, "mapped buf is not null");
    mapped_buf_ = buffer_->Map(offset_, length_, pitch);
-  };
+  }

  void UnMap() {
    MACE_CHECK_NOTNULL(mapped_buf_);
    buffer_->UnMap(mapped_buf_);
    mapped_buf_ = nullptr;
-  };
+  }

  void Resize(index_t size) { MACE_NOT_IMPLEMENTED; }

@@ -326,6 +328,6 @@ class BufferSlice : public BufferBase {
  index_t offset_;
  index_t length_;
 };
-}
+}  // namespace mace

 #endif  // MACE_CORE_BUFFER_H_
--- a/mace/core/mace.cc
+++ b/mace/core/mace.cc
@@ -459,7 +459,7 @@ MaceEngine::~MaceEngine() {
    MACE_CHECK(hexagon_controller_->TeardownGraph(), "hexagon teardown error");
    MACE_CHECK(hexagon_controller_->Finalize(), "hexagon finalize error");
  }
-};
+}

 bool MaceEngine::Run(const float *input,
                     const std::vector<index_t> &input_shape,
@@ -493,7 +493,6 @@ bool MaceEngine::Run(const float *input,
    auto shape = output_tensor->shape();
    int64_t output_size = std::accumulate(shape.begin(), shape.end(), 1,
                                          std::multiplies<int64_t>());
-    // TODO: check for overflow exception.
    std::memcpy(output, output_tensor->data<float>(),
                output_size * sizeof(float));
    return true;
@@ -530,7 +529,6 @@ bool MaceEngine::Run(const std::vector<MaceInputInfo> &inputs,
      int64_t output_size = std::accumulate(shape.begin(), shape.end(), 1,
                                            std::multiplies<int64_t>());
      MACE_CHECK(!shape.empty()) << "Output's shape must greater than 0";
-      // TODO: check for overflow exception.
      std::memcpy(output.second, output_tensor->data<float>(),
                  output_size * sizeof(float));
    } else {

--- a/mace/core/net.cc
+++ b/mace/core/net.cc
@@ -2,6 +2,8 @@
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //

+#include <utility>
+
 #include "mace/core/net.h"
 #include "mace/utils/memory_logging.h"
 #include "mace/utils/timer.h"

--- a/mace/core/net.h
+++ b/mace/core/net.h
@@ -5,6 +5,10 @@
 #ifndef MACE_CORE_NET_H_
 #define MACE_CORE_NET_H_

+#include <memory>
+#include <string>
+#include <vector>
+
 #include "mace/core/operator.h"
 #include "mace/public/mace.h"


--- a/mace/core/operator.cc
+++ b/mace/core/operator.cc
@@ -3,6 +3,9 @@
 //

 #include <sstream>
+#include <memory>
+#include <string>
+#include <vector>

 #include "mace/core/operator.h"


--- a/mace/core/operator.h
+++ b/mace/core/operator.h
@@ -2,8 +2,13 @@
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //

-#ifndef MACE_CORE_OPERATOR_H
-#define MACE_CORE_OPERATOR_H
+#ifndef MACE_CORE_OPERATOR_H_
+#define MACE_CORE_OPERATOR_H_
+
+#include <memory>
+#include <string>
+#include <vector>
+#include <map>

 #include "mace/core/arg_helper.h"
 #include "mace/core/future.h"
@@ -100,7 +105,7 @@ class Operator : public OperatorBase {
      }
    }
  }
-  virtual bool Run(StatsFuture *future) override = 0;
+  bool Run(StatsFuture *future) override = 0;
  ~Operator() noexcept override {}
 };

@@ -150,7 +155,7 @@ class OperatorRegistry {
      RegistryType;
  OperatorRegistry();
  ~OperatorRegistry() = default;
-  RegistryType *registry() { return &registry_; };
+  RegistryType *registry() { return &registry_; }
  std::unique_ptr<OperatorBase> CreateOperator(const OperatorDef &operator_def,
                                               Workspace *ws,
                                               DeviceType type,
@@ -171,4 +176,4 @@ MACE_DECLARE_REGISTRY(OpRegistry,

 }  // namespace mace

-#endif  // MACE_CORE_OPERATOR_H
+#endif  // MACE_CORE_OPERATOR_H_
--- a/mace/core/preallocated_pooled_allocator.h
+++ b/mace/core/preallocated_pooled_allocator.h
@@ -5,7 +5,10 @@
 #ifndef MACE_CORE_PREALLOCATED_POOLED_ALLOCATOR_H_
 #define MACE_CORE_PREALLOCATED_POOLED_ALLOCATOR_H_

+#include <memory>
+#include <utility>
 #include <unordered_map>
+
 #include "mace/core/allocator.h"
 #include "mace/core/buffer.h"


--- a/mace/core/registry.h
+++ b/mace/core/registry.h
@@ -7,7 +7,7 @@

 #include <map>
 #include <memory>
-#include <mutex>
+#include <mutex>  // NOLINT(build/c++11)
 #include <string>
 #include <vector>


--- a/mace/core/runtime/cpu/cpu_runtime.cc
+++ b/mace/core/runtime/cpu/cpu_runtime.cc
@@ -2,19 +2,21 @@
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //

-#include "mace/public/mace.h"
-#include "mace/utils/logging.h"
 #include <omp.h>
 #include <sys/syscall.h>
 #include <unistd.h>
+#include <vector>

+#include "mace/core/runtime/cpu/cpu_runtime.h"
+#include "mace/public/mace.h"
+#include "mace/utils/logging.h"
 namespace mace {

 namespace {

-static int GetCPUMaxFreq(int cpu_id) {
+int GetCPUMaxFreq(int cpu_id) {
  char path[64];
-  sprintf(path,
+  snprintf(path, sizeof(path),
          "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq",
          cpu_id);
  FILE *fp = fopen(path, "rb");
@@ -26,24 +28,25 @@ static int GetCPUMaxFreq(int cpu_id) {
  return freq;
 }

-static void SortCPUIdsByMaxFreqAsc(std::vector<int> &cpu_ids) {
-  int cpu_count = cpu_ids.size();
+void SortCPUIdsByMaxFreqAsc(std::vector<int> *cpu_ids) {
+  MACE_CHECK_NOTNULL(cpu_ids);
+  int cpu_count = cpu_ids->size();
  std::vector<int> cpu_max_freq;
  cpu_max_freq.resize(cpu_count);

  // set cpu max frequency
  for (int i = 0; i < cpu_count; ++i) {
    cpu_max_freq[i] = GetCPUMaxFreq(i);
-    cpu_ids[i] = i;
+    (*cpu_ids)[i] = i;
  }

  // sort cpu ids by max frequency asc, bubble sort
  for (int i = 0; i < cpu_count - 1; ++i) {
    for (int j = i + 1; j < cpu_count; ++j) {
      if (cpu_max_freq[i] > cpu_max_freq[j]) {
-        int tmp = cpu_ids[i];
-        cpu_ids[i] = cpu_ids[j];
-        cpu_ids[j] = tmp;
+        int tmp = (*cpu_ids)[i];
+        (*cpu_ids)[i] = (*cpu_ids)[j];
+        (*cpu_ids)[j] = tmp;

        tmp = cpu_max_freq[i];
        cpu_max_freq[i] = cpu_max_freq[j];
@@ -53,11 +56,12 @@ static void SortCPUIdsByMaxFreqAsc(std::vector<int> &cpu_ids) {
  }
 }

-static void SetThreadAffinity(cpu_set_t mask) {
+void SetThreadAffinity(cpu_set_t mask) {
  int sys_call_res;
  pid_t pid = gettid();

-  // TODO: when set omp num threads to 1, sometiomes return EINVAL(22) error
+  // TODO(chenghui): when omp num threads is set to 1,
+  // the syscall sometimes returns an EINVAL(22) error.
  // https://linux.die.net/man/2/sched_setaffinity
  sys_call_res = syscall(__NR_sched_setaffinity, pid, sizeof(mask), &mask);
  if (sys_call_res != 0) {
@@ -68,12 +72,11 @@ static void SetThreadAffinity(cpu_set_t mask) {
 }  // namespace

 void SetCPURuntime(int omp_num_threads, CPUPowerOption power_option) {
-
  int cpu_count = omp_get_num_procs();
  LOG(INFO) << "cpu_count: " << cpu_count;
  std::vector<int> sorted_cpu_ids;
  sorted_cpu_ids.resize(cpu_count);
-  SortCPUIdsByMaxFreqAsc(sorted_cpu_ids);
+  SortCPUIdsByMaxFreqAsc(&sorted_cpu_ids);

  std::vector<int> use_cpu_ids;
  if (power_option == CPUPowerOption::DEFAULT || omp_num_threads >= cpu_count) {
@@ -92,7 +95,7 @@ void SetCPURuntime(int omp_num_threads, CPUPowerOption power_option) {
  // compute mask
  cpu_set_t mask;
  CPU_ZERO(&mask);
-  for (auto cpu_id: use_cpu_ids) {
+  for (auto cpu_id : use_cpu_ids) {
    CPU_SET(cpu_id, &mask);
  }
  LOG(INFO) << "use cpus mask: " << mask.__bits[0];

--- a/mace/core/runtime/cpu/cpu_runtime.h
+++ b/mace/core/runtime/cpu/cpu_runtime.h
@@ -3,8 +3,8 @@
 //


-#ifndef MACE_CORE_RUNTIME_CPU_CPU_RUNTIME_H
-#define MACE_CORE_RUNTIME_CPU_CPU_RUNTIME_H
+#ifndef MACE_CORE_RUNTIME_CPU_CPU_RUNTIME_H_
+#define MACE_CORE_RUNTIME_CPU_CPU_RUNTIME_H_

 #include "mace/public/mace.h"

@@ -14,4 +14,4 @@ void SetCPURuntime(int omp_num_threads, CPUPowerOption power_option);

 }

-#endif //MACE_CORE_RUNTIME_CPU_CPU_RUNTIME_H
+#endif  // MACE_CORE_RUNTIME_CPU_CPU_RUNTIME_H_
--- a/mace/core/runtime/hexagon/hexagon_control_wrapper.cc
+++ b/mace/core/runtime/hexagon/hexagon_control_wrapper.cc
@@ -3,8 +3,11 @@
 //

 #include <sys/time.h>
-#include <thread>
+#include <thread>  // NOLINT(build/c++11)
 #include <vector>
+#include <unordered_map>
+#include <string>
+#include <utility>

 #include "mace/core/runtime/hexagon/hexagon_control_wrapper.h"
 #include "mace/core/runtime/hexagon/hexagon_nn_ops.h"
@@ -324,7 +327,7 @@ bool HexagonControlWrapper::ExecuteGraph(const Tensor &input_tensor,
  MACE_ASSERT(output_bytes == output_tensor->raw_size(),
              "wrong output bytes inferred.");
  return res == 0;
-};
+}

 bool HexagonControlWrapper::ExecuteGraphNew(
    const std::vector<Tensor> &input_tensors,
@@ -374,7 +377,7 @@ bool HexagonControlWrapper::ExecuteGraphNew(
  delete[] inputs;
  delete[] outputs;
  return res == 0;
-};
+}

 bool HexagonControlWrapper::ExecuteGraphPreQuantize(const Tensor &input_tensor,
                                                    Tensor *output_tensor) {

--- a/mace/core/runtime/hexagon/hexagon_control_wrapper.h
+++ b/mace/core/runtime/hexagon/hexagon_control_wrapper.h
@@ -2,8 +2,8 @@
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //

-#ifndef MACE_DSP_HEXAGON_CONTROL_WRAPPER_H_
-#define MACE_DSP_HEXAGON_CONTROL_WRAPPER_H_
+#ifndef MACE_CORE_RUNTIME_HEXAGON_HEXAGON_CONTROL_WRAPPER_H_
+#define MACE_CORE_RUNTIME_HEXAGON_HEXAGON_CONTROL_WRAPPER_H_

 #include <vector>

@@ -16,7 +16,7 @@ namespace mace {

 class HexagonControlWrapper {
 public:
-  HexagonControlWrapper(){};
+  HexagonControlWrapper() {}
  int GetVersion();
  bool Config();
  bool Init();
@@ -53,6 +53,6 @@ class HexagonControlWrapper {

  DISABLE_COPY_AND_ASSIGN(HexagonControlWrapper);
 };
-}
+}  // namespace mace

-#endif  // MACE_DSP_HEXAGON_CONTROL_WRAPPER_H_
+#endif  // MACE_CORE_RUNTIME_HEXAGON_HEXAGON_CONTROL_WRAPPER_H_
--- a/mace/core/runtime/hexagon/hexagon_controller.h
+++ b/mace/core/runtime/hexagon/hexagon_controller.h
-#ifndef MACE_DSP_HEXAGON_DSP_CONTROLLER_H_
-#define MACE_DSP_HEXAGON_DSP_CONTROLLER_H_
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#ifndef MACE_CORE_RUNTIME_HEXAGON_HEXAGON_CONTROLLER_H_
+#define MACE_CORE_RUNTIME_HEXAGON_HEXAGON_CONTROLLER_H_

 #include "mace/core/runtime/hexagon/hexagon_nn.h"

@@ -18,4 +22,5 @@ int hexagon_controller_DeInitHexagon();
 }
 #endif  // __cplusplus

-#endif  // MACE_DSP_HEXAGON_DSP_CONTROLLER_H_
\ No newline at end of file
+#endif  // MACE_CORE_RUNTIME_HEXAGON_HEXAGON_CONTROLLER_H_
+
--- a/mace/core/runtime/hexagon/hexagon_nn.h
+++ b/mace/core/runtime/hexagon/hexagon_nn.h
-#ifndef _HEXAGON_NN_H
-#define _HEXAGON_NN_H
+/*
+ * Copyright (c) 2016-2017, The Linux Foundation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted (subject to the limitations in the
+ * disclaimer below) provided that the following conditions are met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *    * Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *    * Neither the name of The Linux Foundation nor the names of its
+ *      contributors may be used to endorse or promote products derived
+ *      from this software without specific prior written permission.
+ *
+ * NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+ * GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT
+ * HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+ * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+ * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef MACE_CORE_RUNTIME_HEXAGON_HEXAGON_NN_H_
+#define MACE_CORE_RUNTIME_HEXAGON_HEXAGON_NN_H_
 #ifndef __QAIC_HEADER
 #define __QAIC_HEADER(ff) ff
-#endif  //__QAIC_HEADER
+#endif  // __QAIC_HEADER

 #ifndef __QAIC_HEADER_EXPORT
 #define __QAIC_HEADER_EXPORT
@@ -14,7 +49,7 @@

 #ifndef __QAIC_IMPL
 #define __QAIC_IMPL(ff) ff
-#endif  //__QAIC_IMPL
+#endif  // __QAIC_IMPL

 #ifndef __QAIC_IMPL_EXPORT
 #define __QAIC_IMPL_EXPORT
@@ -186,4 +221,4 @@ __QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_execute_new)(
 #ifdef __cplusplus
 }
 #endif
-#endif  //_HEXAGON_NN_H
+#endif  // MACE_CORE_RUNTIME_HEXAGON_HEXAGON_NN_H_
--- a/mace/core/runtime/hexagon/hexagon_nn_ops.h
+++ b/mace/core/runtime/hexagon/hexagon_nn_ops.h
@@ -2,10 +2,12 @@
 // Copyright (c) 2018 XiaoMi All rights reserved.
 //

-#ifndef LIBMACE_HEXAGON_NN_OPS_H
-#define LIBMACE_HEXAGON_NN_OPS_H
+#ifndef MACE_CORE_RUNTIME_HEXAGON_HEXAGON_NN_OPS_H_
+#define MACE_CORE_RUNTIME_HEXAGON_HEXAGON_NN_OPS_H_

+#include <string>
 #include <unordered_map>
+
 #include "mace/utils/logging.h"

 namespace mace {
@@ -15,7 +17,7 @@ namespace mace {
 typedef enum op_type_enum {
 #define DEF_OP(NAME, ...) OP_##NAME,

-#include "mace/core/runtime/hexagon/ops.h"
+#include "mace/core/runtime/hexagon/ops.h"  // NOLINT(build/include)
  NN_OPS_MAX

 #undef DEF_OP
@@ -26,7 +28,7 @@ class OpMap {
  void Init() {
 #define DEF_OP(NAME) op_map_[#NAME] = OP_##NAME;

-#include "mace/core/runtime/hexagon/ops.h"
+#include "mace/core/runtime/hexagon/ops.h"  // NOLINT(build/include)

 #undef DEF_OP
  }
@@ -45,4 +47,4 @@ class OpMap {
 };
 }  // namespace mace

-#endif  // LIBMACE_HEXAGON_NN_OPS_H
+#endif  // MACE_CORE_RUNTIME_HEXAGON_HEXAGON_NN_OPS_H_
--- a/mace/core/runtime/hexagon/ops.h
+++ b/mace/core/runtime/hexagon/ops.h
+/*
+ * Copyright (c) 2016-2017, The Linux Foundation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted (subject to the limitations in the
+ * disclaimer below) provided that the following conditions are met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *    * Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *    * Neither the name of The Linux Foundation nor the names of its
+ *      contributors may be used to endorse or promote products derived
+ *      from this software without specific prior written permission.
+ *
+ * NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+ * GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT
+ * HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+ * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+ * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
 /*
 * You probably want to
 *
@@ -42,6 +77,8 @@
 *
 * otherwise the interface becomes incompatible.
 */
+// NOLINT(build/header_guard)
+
 DEF_OP(INPUT)
 DEF_OP(OUTPUT)
 DEF_OP(Nop)

--- a/mace/core/runtime/hexagon/quantize.cc
+++ b/mace/core/runtime/hexagon/quantize.cc
@@ -2,6 +2,8 @@
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //

+#include <algorithm>
+
 #include "mace/core/runtime/hexagon/quantize.h"

 namespace mace {
@@ -93,4 +95,5 @@ void Quantizer::DeQuantize(const Tensor &in_tensor,
  }
 }

-}  // namespace mace
\ No newline at end of file
+
+}  // namespace mace
--- a/mace/core/runtime/hexagon/quantize.h
+++ b/mace/core/runtime/hexagon/quantize.h
@@ -2,8 +2,8 @@
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //

-#ifndef MACE_DSP_UTIL_QUANTIZE_H_
-#define MACE_DSP_UTIL_QUANTIZE_H_
+#ifndef MACE_CORE_RUNTIME_HEXAGON_QUANTIZE_H_
+#define MACE_CORE_RUNTIME_HEXAGON_QUANTIZE_H_

 #include "mace/core/tensor.h"

@@ -40,6 +40,6 @@ class Quantizer {
  DISABLE_COPY_AND_ASSIGN(Quantizer);
 };

-}  // mace
+}  // namespace mace

-#endif  // MACE_DSP_UTIL_QUANTIZE_H_
+#endif  // MACE_CORE_RUNTIME_HEXAGON_QUANTIZE_H_
--- a/mace/core/tensor.h
+++ b/mace/core/tensor.h
@@ -5,6 +5,10 @@
 #ifndef MACE_CORE_TENSOR_H_
 #define MACE_CORE_TENSOR_H_

+#include <string>
+#include <vector>
+#include <functional>
+
 #include "mace/core/buffer.h"
 #include "mace/core/preallocated_pooled_allocator.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
@@ -60,7 +64,7 @@ inline std::ostream &operator<<(std::ostream &os, signed char c) {
 inline std::ostream &operator<<(std::ostream &os, unsigned char c) {
  return os << static_cast<unsigned int>(c);
 }
-}
+}  // namespace numerical_chars

 class Tensor {
 public:
@@ -69,7 +73,7 @@ class Tensor {
        dtype_(type),
        buffer_(nullptr),
        is_buffer_owner_(true),
-        name_(""){};
+        name_("") {}

  Tensor(BufferBase *buffer, DataType dtype)
    : dtype_(dtype),
@@ -240,7 +244,7 @@ class Tensor {
  inline void SetSourceOpName(const std::string name) { name_ = name; }

  inline void DebugPrint() const {
-    using namespace numerical_chars;
+    using namespace numerical_chars;  // NOLINT(build/namespaces)
    std::stringstream os;
    for (index_t i : shape_) {
      os << i << ", ";
@@ -262,7 +266,7 @@ class Tensor {

  class MappingGuard {
   public:
-    MappingGuard(const Tensor *tensor) : tensor_(tensor) {
+    explicit MappingGuard(const Tensor *tensor) : tensor_(tensor) {
      if (tensor_ != nullptr) {
        tensor_->buffer_->Map(&mapped_image_pitch_);
      }
@@ -301,6 +305,6 @@ class Tensor {
  DISABLE_COPY_AND_ASSIGN(Tensor);
 };

-}  // namespace tensor
+}  // namespace mace

 #endif  // MACE_CORE_TENSOR_H_
--- a/mace/core/testing/test_benchmark.cc
+++ b/mace/core/testing/test_benchmark.cc
@@ -6,7 +6,7 @@
 #include <cstdlib>

 #include <algorithm>
-#include <regex>
+#include <regex>  // NOLINT(build/c++11)
 #include <vector>

 #include "mace/core/testing/test_benchmark.h"

--- a/mace/core/testing/test_benchmark_main.cc
+++ b/mace/core/testing/test_benchmark_main.cc
@@ -14,7 +14,6 @@ int main(int argc, char **argv) {
  mace::ConfigOpenCLRuntime(mace::GPUType::ADRENO, mace::GPUPerfHint::PERF_HIGH,
                            mace::GPUPriorityHint::PRIORITY_HIGH);

-  // TODO Use gflags
  if (argc == 2) {
    mace::testing::Benchmark::Run(argv[1]);
  } else {

--- a/mace/core/types.h
+++ b/mace/core/types.h
@@ -6,6 +6,7 @@
 #define MACE_CORE_TYPES_H_

 #include <cstdint>
+#include <string>

 #include "mace/public/mace.h"
 #include "include/half.hpp"

--- a/mace/core/workspace.cc
+++ b/mace/core/workspace.cc
@@ -4,6 +4,7 @@

 #include <string>
 #include <vector>
+#include <utility>

 #include "mace/core/arg_helper.h"
 #include "mace/core/workspace.h"
@@ -52,16 +53,16 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
  unsigned char *model_data_ptr = nullptr;
  for (auto &const_tensor : net_def.tensors()) {
    if (model_data_ptr == nullptr ||
-        reinterpret_cast<long long>(const_tensor.data()) <
-            reinterpret_cast<long long>(model_data_ptr)) {
+        reinterpret_cast<int64_t>(const_tensor.data()) <
+            reinterpret_cast<int64_t>(model_data_ptr)) {
      model_data_ptr = const_cast<unsigned char *>(const_tensor.data());
    }
  }
  for (auto &const_tensor : net_def.tensors()) {
    model_data_size = std::max(
        model_data_size,
-        static_cast<index_t>((reinterpret_cast<long long>(const_tensor.data()) -
-                              reinterpret_cast<long long>(model_data_ptr)) +
+        static_cast<index_t>((reinterpret_cast<int64_t>(const_tensor.data()) -
+                              reinterpret_cast<int64_t>(model_data_ptr)) +
                             const_tensor.data_size() *
                                 GetEnumTypeSize(const_tensor.data_type())));
  }
@@ -89,7 +90,8 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
      dims.push_back(d);
    }

-    index_t offset = (long long)const_tensor.data() - (long long)model_data_ptr;
+    index_t offset = reinterpret_cast<int64_t>(const_tensor.data())
+        - reinterpret_cast<int64_t>(model_data_ptr);
    std::unique_ptr<Tensor> tensor(
        new Tensor(BufferSlice(tensor_buffer_.get(), offset,
                               const_tensor.data_size() *
@@ -116,7 +118,7 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) {
  // As DSP may have different data output type for each op,
  // we stick to the same concept.
  for (auto &op : net_def.op()) {
-    if (! op.mem_id().empty()){
+    if (!op.mem_id().empty()) {
      const DataType op_dtype = static_cast<DataType>(
          ArgumentHelper::GetSingleArgument<OperatorDef, int>(
              op, "T", static_cast<int>(DT_FLOAT)));
@@ -142,11 +144,14 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) {
        std::unique_ptr<Tensor> tensor
            (new Tensor(preallocated_allocator_.GetBuffer(mem_ids[i]), dtype));
        tensor->SetSourceOpName(op.name());
-        VLOG(3) << "Tensor: " << op.name() << "(" << op.type() << ")" << "; Mem: "
-                << mem_ids[i] << "; Image shape: "
-                << dynamic_cast<Image *>(tensor->UnderlyingBuffer())->image_shape()[0]
+        VLOG(3) << "Tensor: " << op.name() << "(" << op.type() << ")"
+                << " Mem: "  << mem_ids[i]
+                << " Image shape: "
+                << dynamic_cast<Image *>(tensor->UnderlyingBuffer())
+                    ->image_shape()[0]
                << ", "
-                << dynamic_cast<Image *>(tensor->UnderlyingBuffer())->image_shape()[1];
+                << dynamic_cast<Image *>(tensor->UnderlyingBuffer())
+                    ->image_shape()[1];
        tensor_map_[op.output(i)] = std::move(tensor);
      }
    }

--- a/mace/core/workspace.h
+++ b/mace/core/workspace.h
@@ -5,6 +5,11 @@
 #ifndef MACE_CORE_WORKSPACE_H_
 #define MACE_CORE_WORKSPACE_H_

+#include <map>
+#include <string>
+#include <vector>
+#include <memory>
+
 #include "mace/core/preallocated_pooled_allocator.h"
 #include "mace/core/tensor.h"
 #include "mace/public/mace.h"

--- a/mace/python/tools/memory_optimizer.py
+++ b/mace/python/tools/memory_optimizer.py
@@ -43,6 +43,9 @@ class MemoryOptimizer(object):
      mem_size[1] = output_shape[0] * output_shape[1]
    return mem_size

+  def mem_area(self, memory_size):
+    return memory_size[0] * memory_size[1]
+
  def optimize(self):
    for op in self.net_def.op:
      if self.is_buffer_image_op(op):
@@ -54,22 +57,34 @@ class MemoryOptimizer(object):
        print('WARNING: the number of output shape is not equal to the number of output.')
        return
      for i in range(len(op.output)):
-        if len(self.idle_mem) == 0:
-          # allocate new mem
+        op_mem_size = self.get_mem_size(op.type, op.output_shape[i].dims)
+        mem_id = -1
+        if len(self.idle_mem) > 0:
+          best_mem_candidate_id = -1
+          best_mem_candidate_delta_area = sys.maxint
+          best_mem_candidate_shape = []
+          for mid in self.idle_mem:
+            reuse_mem_size = self.mem_block[mid]
+            resize_mem_size = [max(reuse_mem_size[0], op_mem_size[0]), max(reuse_mem_size[1], op_mem_size[1])]
+            delta_mem_area = self.mem_area(resize_mem_size) - self.mem_area(reuse_mem_size)
+            if delta_mem_area < best_mem_candidate_delta_area:
+              best_mem_candidate_id = mid
+              best_mem_candidate_delta_area = delta_mem_area
+              best_mem_candidate_shape = resize_mem_size
+
+          if best_mem_candidate_delta_area <= self.mem_area(op_mem_size):
+            # reuse
+            self.mem_block[best_mem_candidate_id] = best_mem_candidate_shape
+            mem_id = best_mem_candidate_id
+            self.idle_mem.remove(mem_id)
+
+        if mem_id == -1:
          mem_id = self.total_mem_count
          self.total_mem_count += 1
-        else:
-          # reuse mem
-          mem_id = self.idle_mem.pop()
+          self.mem_block[mem_id] = op_mem_size

        op.mem_id.extend([mem_id])
        self.op_mem[op.output[i]] = mem_id
-        if mem_id not in self.mem_block:
-          self.mem_block[mem_id] = [0, 0]
-        mem_size = self.mem_block[mem_id]
-        op_mem_size = self.get_mem_size(op.type, op.output_shape[i].dims)
-        mem_size[0] = max(mem_size[0], op_mem_size[0])
-        mem_size[1] = max(mem_size[1], op_mem_size[1])

      # de-ref input tensor mem
      for ipt in op.input:

--- a/tools/benchmark.sh
+++ b/tools/benchmark.sh
@@ -29,7 +29,8 @@ if [ "$EMBED_MODEL_DATA" = 0 ]; then
 fi

 if [ x"$TARGET_ABI" == x"host" ]; then
-  bazel build --verbose_failures -c opt --strip always //mace/benchmark:benchmark_model \
+  bazel build --verbose_failures -c opt --strip always \
+    //mace/benchmark:benchmark_model \
    --copt="-std=c++11" \
    --copt="-D_GLIBCXX_USE_C99_MATH_TR1" \
    --copt="-Werror=return-type" \
@@ -52,16 +53,18 @@ if [ x"$TARGET_ABI" == x"host" ]; then
      $OPTION_ARGS || exit 1

 else
-  bazel build --verbose_failures -c opt --strip always //mace/benchmark:benchmark_model \
+  bazel build --verbose_failures -c opt --strip always \
+    //mace/benchmark:benchmark_model \
    --crosstool_top=//external:android/crosstool \
    --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
    --cpu=${TARGET_ABI} \
    --copt="-std=c++11" \
    --copt="-D_GLIBCXX_USE_C99_MATH_TR1" \
    --copt="-Werror=return-type" \
+    --copt="-DMACE_OBFUSCATE_LITERALS" \
    --copt="-DMACE_MODEL_TAG=${MODEL_TAG}" \
-    --copt="-O3" \
    --define openmp=true \
+    --copt="-O3" \
    --define production=true || exit 1

  cp bazel-bin/mace/benchmark/benchmark_model $MODEL_OUTPUT_DIR
@@ -70,11 +73,14 @@ else
  IFS=',' read -r -a INPUT_NAMES <<< "${INPUT_NODES}"
  for NAME in "${INPUT_NAMES[@]}";do
    FORMATTED_NAME=$(sed s/[^[:alnum:]]/_/g <<< ${NAME})
-    adb -s $DEVICE_ID push ${MODEL_OUTPUT_DIR}/${INPUT_FILE_NAME}_${FORMATTED_NAME} ${PHONE_DATA_DIR} || exit 1
+    adb -s $DEVICE_ID push ${MODEL_OUTPUT_DIR}/${INPUT_FILE_NAME}_${FORMATTED_NAME} \
+        ${PHONE_DATA_DIR} > /dev/null || exit 1
  done
-  adb -s $DEVICE_ID push ${MODEL_OUTPUT_DIR}/benchmark_model ${PHONE_DATA_DIR} || exit 1
+  adb -s $DEVICE_ID push ${MODEL_OUTPUT_DIR}/benchmark_model \
+      ${PHONE_DATA_DIR} > /dev/null || exit 1
  if [ "$EMBED_MODEL_DATA" = 0 ]; then
-    adb -s $DEVICE_ID push ${MODEL_OUTPUT_DIR}/${MODEL_TAG}.data ${PHONE_DATA_DIR} || exit 1
+    adb -s $DEVICE_ID push ${MODEL_OUTPUT_DIR}/${MODEL_TAG}.data \
+        ${PHONE_DATA_DIR} > /dev/null || exit 1
  fi

  adb -s $DEVICE_ID </dev/null shell \