Commit 03a12d1e authored by Liangliang He

Merge branch 'fix_warnings' into 'master'

fix warnings

See merge request !464
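Nearly every hunk below silences a -Wunused-parameter or -Wunused-variable warning (the build now runs with -Werror and -Wextra, per the tools change at the bottom) by passing the offending name through MACE_UNUSED. A minimal sketch of the idiom, assuming the conventional definition (mace/core/macros.h itself is not shown in this diff):

    #include <cstddef>
    #include <cstdio>

    // Conventional no-op expansion: evaluating the name counts as a "use",
    // so the compiler stops warning, and no code is generated.
    #define MACE_UNUSED(var) (void)(var)

    void PrintfCallback(const char *buffer, size_t length, void *user_data) {
      MACE_UNUSED(user_data);  // part of the callback signature, unused here
      fwrite(buffer, 1, length, stdout);
    }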
......@@ -5,11 +5,11 @@ workspace(name = "mace")
# This statement defines the @com_google_protobuf repo.
http_archive(
name = "com_google_protobuf",
sha256 = "40d39d97a7b514b3e34daef732f822eca0081960b269863f5b573db5548cb237",
strip_prefix = "protobuf-3.4.0rc3",
sha256 = "542703acadc3f690d998f4641e1b988f15ba57ebca05fdfb1cd9095bec007948",
strip_prefix = "protobuf-3.4.0",
urls = [
"https://cnbj1.fds.api.xiaomi.com/mace/third-party/protobuf/protobuf-3.4.0rc3.zip",
"https://github.com/google/protobuf/archive/v3.4.0rc3.zip"
"https://cnbj1.fds.api.xiaomi.com/mace/third-party/protobuf/protobuf-3.4.0.zip",
"https://github.com/google/protobuf/archive/v3.4.0.zip"
],
)
......@@ -38,21 +38,22 @@ new_http_archive(
new_http_archive(
name = "opencl_clhpp",
build_file = "mace/third_party/opencl-clhpp/opencl-clhpp.BUILD",
sha256 = "d4eb63372ad31f7efcae626852f75f7929ff28d1cabb5f50ef11035963a69b46",
strip_prefix = "OpenCL-CLHPP-2.0.10",
sha256 = "dab6f1834ec6e3843438cc0f97d63817902aadd04566418c1fcc7fb78987d4e7",
strip_prefix = "OpenCL-CLHPP-4c6f7d56271727e37fb19a9b47649dd175df2b12",
urls = [
"https://cnbj1.fds.api.xiaomi.com/mace/third-party/OpenCL-CLHPP/OpenCL-CLHPP-2.0.10.zip",
"https://github.com/KhronosGroup/OpenCL-CLHPP/archive/v2.0.10.zip"
"https://cnbj1.fds.api.xiaomi.com/mace/third-party/OpenCL-CLHPP/OpenCL-CLHPP-4c6f7d56271727e37fb19a9b47649dd175df2b12.zip",
"https://github.com/KhronosGroup/OpenCL-CLHPP/archive/4c6f7d56271727e37fb19a9b47649dd175df2b12.zip"
],
)
new_http_archive(
name = "half",
build_file = "mace/third_party/half/half.BUILD",
sha256 = "cdd70d3bf3fe091b688e7ab3f48471c881a197d2c186c95cca8bf156961fb41c",
sha256 = "0f514a1e877932b21dc5edc26a148ddc700b6af2facfed4c030ca72f74d0219e",
strip_prefix = "half-code-356-trunk",
urls = [
"https://cnbj1.fds.api.xiaomi.com/mace/third-party/half/half-1.12.0.zip",
"https://jaist.dl.sourceforge.net/project/half/half/1.12.0/half-1.12.0.zip"
"https://cnbj1.fds.api.xiaomi.com/mace/third-party/half/half-code-356-trunk.zip",
"https://sourceforge.net/code-snapshots/svn/h/ha/half/code/half-code-356-trunk.zip"
],
)
......
......@@ -58,9 +58,9 @@ std::string ShapeToString(const std::vector<OutputShape> &output_shape) {
std::stringstream stream;
stream << "[";
for (int i = 0; i < output_shape.size(); ++i) {
for (size_t i = 0; i < output_shape.size(); ++i) {
const std::vector<index_t> &dims = output_shape[i].dims();
for (int j = 0; j < dims.size(); ++j) {
for (size_t j = 0; j < dims.size(); ++j) {
stream << dims[j];
if (j != dims.size() - 1) {
stream << ",";
......@@ -83,7 +83,7 @@ std::string VectorToString(const std::vector<T> &vec) {
std::stringstream stream;
stream << "[";
for (int i = 0; i < vec.size(); ++i) {
for (size_t i = 0; i < vec.size(); ++i) {
stream << vec[i];
if (i != vec.size() - 1) {
stream << ",";
......
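The int to size_t loop-index changes in this file (and many below) fix -Wsign-compare: std::vector::size() returns size_t, and comparing it against a signed int warns under -Wextra. A minimal sketch:

    #include <cstddef>
    #include <vector>

    int CountNonZero(const std::vector<int> &v) {
      int n = 0;
      // for (int i = 0; i < v.size(); ++i)    // warns: signed/unsigned compare
      for (size_t i = 0; i < v.size(); ++i) {  // index type matches size()'s
        if (v[i] != 0) ++n;
      }
      return n;
    }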
......@@ -54,24 +54,24 @@ class TimeInfo {
sum_(0), square_sum(0)
{}
const int64_t round() const {
int64_t round() const {
return round_;
}
const T first() const {
T first() const {
return first_;
}
const T sum() const {
T sum() const {
return sum_;
}
const double avg() const {
double avg() const {
return round_ == 0 ? std::numeric_limits<double>::quiet_NaN() :
sum_ * 1.0f / round_;
}
const double std_deviation() const {
double std_deviation() const {
if (round_ == 0 || min_ == max_) {
return 0;
}
......@@ -111,12 +111,12 @@ class TimeInfo {
}
private:
int64_t round_;
T first_;
T curr_;
T min_;
T max_;
T sum_;
int64_t round_;
double square_sum;
};
......
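Two warning classes are fixed in TimeInfo: a const qualifier on a by-value return is meaningless to callers and trips -Wignored-qualifiers, and the int64_t round_ member is moved so the declaration order matches the constructor's initializer list, which silences -Wreorder (an assumption; the full initializer list is not shown here). The return-type fix, sketched:

    class Counter {
     public:
      // const int value() const;  // leading const is ignored for a by-value
      //                           // return and warns under -Wextra
      int value() const { return value_; }  // equivalent for every caller
     private:
      int value_ = 0;
    };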
......@@ -21,6 +21,7 @@
#include <vector>
#include <cstring>
#include "mace/core/macros.h"
#include "mace/core/registry.h"
#include "mace/core/types.h"
#include "mace/public/mace.h"
......@@ -83,6 +84,8 @@ class CPUAllocator : public Allocator {
void *NewImage(const std::vector<size_t> &shape,
const DataType dt) const override {
MACE_UNUSED(shape);
MACE_UNUSED(dt);
LOG(FATAL) << "Allocate CPU image";
return nullptr;
}
......@@ -96,14 +99,20 @@ class CPUAllocator : public Allocator {
free(data);
};
void *Map(void *buffer, size_t offset, size_t nbytes) const override {
MACE_UNUSED(nbytes);
return reinterpret_cast<char*>(buffer) + offset;
}
void *MapImage(void *buffer,
const std::vector<size_t> &image_shape,
std::vector<size_t> *mapped_image_pitch) const override {
MACE_UNUSED(image_shape);
MACE_UNUSED(mapped_image_pitch);
return buffer;
}
void Unmap(void *buffer, void *mapper_ptr) const override {}
void Unmap(void *buffer, void *mapper_ptr) const override {
MACE_UNUSED(buffer);
MACE_UNUSED(mapper_ptr);
}
bool OnHost() const override { return true; }
};
......
......@@ -21,6 +21,7 @@
#include <functional>
#include "mace/core/allocator.h"
#include "mace/core/macros.h"
#include "mace/core/types.h"
namespace mace {
......@@ -133,6 +134,7 @@ class Buffer : public BufferBase {
void *Map(index_t offset, index_t length, std::vector<size_t> *pitch) const {
MACE_CHECK_NOTNULL(buf_);
MACE_UNUSED(pitch);
return allocator_->Map(buf_, offset, length);
}
......@@ -232,6 +234,9 @@ class Image : public BufferBase {
std::vector<size_t> image_shape() const { return shape_; }
void *Map(index_t offset, index_t length, std::vector<size_t> *pitch) const {
MACE_UNUSED(offset);
MACE_UNUSED(length);
MACE_UNUSED(pitch);
MACE_NOT_IMPLEMENTED;
return nullptr;
}
......@@ -254,9 +259,17 @@ class Image : public BufferBase {
mapped_buf_ = nullptr;
}
void Resize(index_t size) { MACE_NOT_IMPLEMENTED; }
void Resize(index_t size) {
MACE_UNUSED(size);
MACE_NOT_IMPLEMENTED;
}
void Copy(void *src, index_t offset, index_t length) { MACE_NOT_IMPLEMENTED; }
void Copy(void *src, index_t offset, index_t length) {
MACE_UNUSED(src);
MACE_UNUSED(offset);
MACE_UNUSED(length);
MACE_NOT_IMPLEMENTED;
}
bool OnHost() const { return allocator_->OnHost(); }
......@@ -327,11 +340,17 @@ class BufferSlice : public BufferBase {
}
void *Map(index_t offset, index_t length, std::vector<size_t> *pitch) const {
MACE_UNUSED(offset);
MACE_UNUSED(length);
MACE_UNUSED(pitch);
MACE_NOT_IMPLEMENTED;
return nullptr;
}
void UnMap(void *mapped_ptr) const { MACE_NOT_IMPLEMENTED; }
void UnMap(void *mapped_ptr) const {
MACE_UNUSED(mapped_ptr);
MACE_NOT_IMPLEMENTED;
}
void Map(std::vector<size_t> *pitch) {
MACE_CHECK_NOTNULL(buffer_);
......@@ -350,7 +369,12 @@ class BufferSlice : public BufferBase {
" to ", size, " is illegal");
}
void Copy(void *src, index_t offset, index_t length) { MACE_NOT_IMPLEMENTED; }
void Copy(void *src, index_t offset, index_t length) {
MACE_UNUSED(src);
MACE_UNUSED(offset);
MACE_UNUSED(length);
MACE_NOT_IMPLEMENTED;
}
index_t offset() const { return offset_; }
......
......@@ -14,6 +14,7 @@
#include <utility>
#include "mace/core/macros.h"
#include "mace/core/net.h"
#include "mace/utils/memory_logging.h"
#include "mace/utils/timer.h"
......@@ -25,7 +26,10 @@ NetBase::NetBase(const std::shared_ptr<const OperatorRegistry> op_registry,
const std::shared_ptr<const NetDef> net_def,
Workspace *ws,
DeviceType type)
: name_(net_def->name()), op_registry_(op_registry) {}
: name_(net_def->name()), op_registry_(op_registry) {
MACE_UNUSED(ws);
MACE_UNUSED(type);
}
SerialNet::SerialNet(const std::shared_ptr<const OperatorRegistry> op_registry,
const std::shared_ptr<const NetDef> net_def,
......
......@@ -33,6 +33,7 @@ namespace mace {
namespace {
#ifndef MACE_ENABLE_OPENMP
int GetCPUCount() {
char path[32];
int cpu_count = 0;
......@@ -50,12 +51,14 @@ int GetCPUCount() {
cpu_count++;
}
}
#endif
int GetCPUMaxFreq(int cpu_id) {
char path[64];
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq",
cpu_id);
FILE *fp = fopen(path, "rb");
if (!fp) {
LOG(WARNING) << "File: " << path << " not exists.";
......@@ -63,47 +66,14 @@ int GetCPUMaxFreq(int cpu_id) {
}
int freq = 0;
fscanf(fp, "%d", &freq);
int items_read = fscanf(fp, "%d", &freq);
if (items_read != 1) {
LOG(WARNING) << "Read file: " << path << " failed.";
}
fclose(fp);
return freq;
}
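fscanf is declared warn_unused_result in common libc builds, so with -Werror the previously ignored return value became a hard error; checking items_read resolves it. The same read-one-int-from-sysfs pattern as a self-contained sketch (ReadSysfsInt is a hypothetical helper, not part of this diff):

    #include <cstdio>

    static int ReadSysfsInt(const char *path) {
      FILE *fp = fopen(path, "rb");
      if (fp == nullptr) return -1;          // node missing or unreadable
      int value = -1;
      if (fscanf(fp, "%d", &value) != 1) {   // short read: report failure
        value = -1;
      }
      fclose(fp);
      return value;
    }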
void SortCPUIdsByMaxFreqAsc(std::vector<int> *cpu_ids, int *big_core_offset) {
MACE_CHECK_NOTNULL(cpu_ids);
int cpu_count = cpu_ids->size();
std::vector<int> cpu_max_freq;
cpu_max_freq.resize(cpu_count);
// set cpu max frequency
for (int i = 0; i < cpu_count; ++i) {
cpu_max_freq[i] = GetCPUMaxFreq(i);
(*cpu_ids)[i] = i;
}
// sort cpu ids by max frequency asc, bubble sort
for (int i = 0; i < cpu_count - 1; ++i) {
for (int j = i + 1; j < cpu_count; ++j) {
if (cpu_max_freq[i] > cpu_max_freq[j]) {
int tmp = (*cpu_ids)[i];
(*cpu_ids)[i] = (*cpu_ids)[j];
(*cpu_ids)[j] = tmp;
tmp = cpu_max_freq[i];
cpu_max_freq[i] = cpu_max_freq[j];
cpu_max_freq[j] = tmp;
}
}
}
*big_core_offset = 0;
for (int i = 1; i < cpu_count; ++i) {
if (cpu_max_freq[i] > cpu_max_freq[i - 1]) {
*big_core_offset = i;
break;
}
}
}
void SetThreadAffinity(cpu_set_t mask) {
#if defined(__ANDROID__)
pid_t pid = gettid();
......
......@@ -23,6 +23,7 @@
#include <utility>
#include "mace/public/mace_runtime.h"
#include "mace/core/macros.h"
#include "mace/core/file_storage.h"
#include "mace/core/runtime/opencl/opencl_extension.h"
#include "mace/public/mace.h"
......@@ -176,6 +177,8 @@ void OpenCLPrintfCallback(const char *buffer,
size_t length,
size_t final,
void *user_data) {
MACE_UNUSED(final);
MACE_UNUSED(user_data);
fwrite(buffer, 1, length, stdout);
}
......@@ -218,6 +221,22 @@ void GetAdrenoContextProperties(std::vector<cl_context_properties> *properties,
// The properties list should be terminated with 0
properties->push_back(0);
}
GPUType ParseGPUType(const std::string &device_name) {
constexpr const char *kQualcommAdrenoGPUStr = "QUALCOMM Adreno(TM)";
constexpr const char *kMaliGPUStr = "Mali";
constexpr const char *kPowerVRGPUStr = "PowerVR";
if (device_name == kQualcommAdrenoGPUStr) {
return GPUType::QUALCOMM_ADRENO;
} else if (device_name.find(kMaliGPUStr) != std::string::npos) {
return GPUType::MALI;
} else if (device_name.find(kPowerVRGPUStr) != std::string::npos) {
return GPUType::PowerVR;
} else {
return GPUType::UNKNOWN;
}
}
} // namespace
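ParseGPUType reads no object state, so the commit moves it from an OpenCLRuntime member (deleted further down, along with its header declaration) into this file-local anonymous namespace, giving the helper internal linkage. The pattern, sketched:

    #include <string>

    namespace {
    // Visible only inside this translation unit; the header no longer
    // needs to declare it.
    bool StartsWith(const std::string &s, const std::string &prefix) {
      return s.compare(0, prefix.size(), prefix) == 0;
    }
    }  // namespace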
void OpenCLProfilingTimer::StartTiming() {}
......@@ -389,11 +408,11 @@ cl::Device &OpenCLRuntime::device() { return *device_; }
cl::CommandQueue &OpenCLRuntime::command_queue() { return *command_queue_; }
const uint64_t OpenCLRuntime::device_global_mem_cache_size() const {
uint64_t OpenCLRuntime::device_global_mem_cache_size() const {
return device_gloabl_mem_cache_size_;
}
const uint32_t OpenCLRuntime::device_compute_units() const {
uint32_t OpenCLRuntime::device_compute_units() const {
return device_compute_units_;
}
......@@ -597,12 +616,12 @@ uint64_t OpenCLRuntime::GetKernelWaveSize(const cl::Kernel &kernel) {
return size;
}
const bool OpenCLRuntime::IsNonUniformWorkgroupsSupported() {
bool OpenCLRuntime::IsNonUniformWorkgroupsSupported() const {
return (gpu_type_ == GPUType::QUALCOMM_ADRENO &&
opencl_version_ == "2.0");
}
const GPUType OpenCLRuntime::gpu_type() const {
GPUType OpenCLRuntime::gpu_type() const {
return gpu_type_;
}
......@@ -610,36 +629,20 @@ const std::string OpenCLRuntime::platform_info() const {
return platform_info_;
}
const GPUType OpenCLRuntime::ParseGPUType(
const std::string &device_name) {
constexpr const char *kQualcommAdrenoGPUStr = "QUALCOMM Adreno(TM)";
constexpr const char *kMaliGPUStr = "Mali";
constexpr const char *kPowerVRGPUStr = "PowerVR";
if (device_name == kQualcommAdrenoGPUStr) {
return GPUType::QUALCOMM_ADRENO;
} else if (device_name.find(kMaliGPUStr) != std::string::npos) {
return GPUType::MALI;
} else if (device_name.find(kPowerVRGPUStr) != std::string::npos) {
return GPUType::PowerVR;
} else {
return GPUType::UNKNOWN;
}
}
const std::string OpenCLRuntime::ParseDeviceVersion(
const std::string &device_version) {
// OpenCL Device version string format:
// OpenCL<space><major_version.minor_version><space>\
// OpenCL<space><major_version.minor_version><space>
// <vendor-specific information>
auto words = Split(device_version, ' ');
return words[1];
}
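Given the documented format, Split(device_version, ' ') leaves the numeric version at index 1. For example, assuming Split tokenizes on the given delimiter as elsewhere in this codebase:

    // ParseDeviceVersion("OpenCL 2.0 Adreno(TM) ...") returns "2.0"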
const bool OpenCLRuntime::IsOutOfRangeCheckEnabled() const {
bool OpenCLRuntime::IsOutOfRangeCheckEnabled() const {
return out_of_range_check_;
}
const bool OpenCLRuntime::is_profiling_enabled() const {
bool OpenCLRuntime::is_profiling_enabled() const {
return is_profiling_enabled_;
}
......
......@@ -70,18 +70,18 @@ class OpenCLRuntime {
cl::Context &context();
cl::Device &device();
cl::CommandQueue &command_queue();
const GPUType gpu_type() const;
GPUType gpu_type() const;
const std::string platform_info() const;
const uint64_t device_global_mem_cache_size() const;
const uint32_t device_compute_units() const;
uint64_t device_global_mem_cache_size() const;
uint32_t device_compute_units() const;
void GetCallStats(const cl::Event &event, CallStats *stats);
uint64_t GetDeviceMaxWorkGroupSize();
uint64_t GetKernelMaxWorkGroupSize(const cl::Kernel &kernel);
uint64_t GetKernelWaveSize(const cl::Kernel &kernel);
const bool IsNonUniformWorkgroupsSupported();
const bool IsOutOfRangeCheckEnabled() const;
const bool is_profiling_enabled() const;
bool IsNonUniformWorkgroupsSupported() const;
bool IsOutOfRangeCheckEnabled() const;
bool is_profiling_enabled() const;
cl::Kernel BuildKernel(const std::string &program_name,
const std::string &kernel_name,
......@@ -112,7 +112,6 @@ class OpenCLRuntime {
const std::string &built_program_key,
const std::string &build_options_str,
cl::Program *program);
const GPUType ParseGPUType(const std::string &device_name);
const std::string ParseDeviceVersion(const std::string &device_version);
private:
......
......@@ -136,6 +136,7 @@ class ActivationFunctor<DeviceType::CPU, float> {
const Tensor *alpha,
Tensor *output,
StatsFuture *future) {
MACE_UNUSED(future);
const float *input_ptr = input->data<float>();
float *output_ptr = output->mutable_data<float>();
if (activation_ == PRELU) {
......
......@@ -39,6 +39,7 @@ struct AddNFunctor {
void operator()(const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor,
StatsFuture *future) {
MACE_UNUSED(future);
output_tensor->ResizeLike(input_tensors[0]);
index_t size = output_tensor->size();
Tensor::MappingGuard output_map(output_tensor);
......
......@@ -42,22 +42,25 @@ void Conv2dNeonK3x3S1(const float *input,
for (index_t m = 0; m < out_channels; m += 2) {
if (m + 1 < out_channels) {
float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
#if defined(MACE_ENABLE_NEON)
float *out_ptr1_base =
output + b * out_batch_size + (m + 1) * out_image_size;
#endif
for (index_t c = 0; c < in_channels; ++c) {
float *out_ptr0 = out_ptr0_base;
float *out_ptr1 = out_ptr1_base;
const float *in_ptr0 = input + b * in_batch_size + c * in_image_size;
const float *filter_ptr0 = filter + m * in_channels * 9 + c * 9;
#if defined(MACE_ENABLE_NEON)
float *out_ptr1 = out_ptr1_base;
const float *in_ptr1 =
input + b * in_batch_size + c * in_image_size + 1 * in_width;
const float *in_ptr2 =
input + b * in_batch_size + c * in_image_size + 2 * in_width;
const float *in_ptr3 =
input + b * in_batch_size + c * in_image_size + 3 * in_width;
const float *filter_ptr0 = filter + m * in_channels * 9 + c * 9;
const float *filter_ptr1 = filter + (m + 1) * in_channels * 9 + c * 9;
#endif
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
// load filter (2 outch x 3 height x 3 width): vf_outch_height
float32x4_t vf00, vf01, vf02;
......@@ -321,12 +324,14 @@ void Conv2dNeonK3x3S1(const float *input,
const float
*in_ptr0 = input + b * in_batch_size + c * in_image_size;
#if defined(MACE_ENABLE_NEON)
const float *in_ptr1 =
input + b * in_batch_size + c * in_image_size + 1 * in_width;
const float *in_ptr2 =
input + b * in_batch_size + c * in_image_size + 2 * in_width;
const float *in_ptr3 =
input + b * in_batch_size + c * in_image_size + 3 * in_width;
#endif
const float *filter_ptr0 = filter + mm * in_channels * 9 + c * 9;
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
......
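This hunk and the similar convolution hunks below share one fix: pointers that only the NEON path dereferences are fenced with the matching #if, so the scalar fallback build no longer defines unused variables (-Wunused-variable, escalated by -Werror). Schematically, as a compilable sketch:

    #include <cstddef>

    void Compute(float *base, size_t image_size) {
      float *out_ptr0_base = base;               // both code paths use this
    #if defined(MACE_ENABLE_NEON)
      float *out_ptr1_base = base + image_size;  // vectorized path only
      // ... NEON loop consuming out_ptr0_base and out_ptr1_base ...
      (void)out_ptr1_base;
    #endif
      // the scalar fallback touches only out_ptr0_base
      (void)out_ptr0_base;
    }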
......@@ -121,23 +121,25 @@ void Conv2dNeonK5x5S1(const float *input,
for (index_t m = 0; m < out_channels; m += 4) {
if (m + 3 < out_channels) {
float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__)
float *out_ptr1_base =
output + b * out_batch_size + (m + 1) * out_image_size;
float *out_ptr2_base =
output + b * out_batch_size + (m + 2) * out_image_size;
float *out_ptr3_base =
output + b * out_batch_size + (m + 3) * out_image_size;
#endif
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr_base =
input + b * in_batch_size + c * in_image_size;
const float *filter_ptr0 = filter + m * in_channels * 25 + c * 25;
#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__)
const float *filter_ptr1 =
filter + (m + 1) * in_channels * 25 + c * 25;
const float *filter_ptr2 =
filter + (m + 2) * in_channels * 25 + c * 25;
const float *filter_ptr3 =
filter + (m + 3) * in_channels * 25 + c * 25;
#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__)
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
// input offset
......
......@@ -198,23 +198,25 @@ void Conv2dNeonK7x7S1(const float *input,
for (index_t m = 0; m < out_channels; m += 4) {
if (m + 3 < out_channels) {
float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
#if defined(MACE_ENABLE_NEON)
float *out_ptr1_base =
output + b * out_batch_size + (m + 1) * out_image_size;
float *out_ptr2_base =
output + b * out_batch_size + (m + 2) * out_image_size;
float *out_ptr3_base =
output + b * out_batch_size + (m + 3) * out_image_size;
#endif
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr_base =
input + b * in_batch_size + c * in_image_size;
const float *filter_ptr0 = filter + m * in_channels * 49 + c * 49;
#if defined(MACE_ENABLE_NEON)
const float *filter_ptr1 =
filter + (m + 1) * in_channels * 49 + c * 49;
const float *filter_ptr2 =
filter + (m + 2) * in_channels * 49 + c * 49;
const float *filter_ptr3 =
filter + (m + 3) * in_channels * 49 + c * 49;
#if defined(MACE_ENABLE_NEON)
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
// input offset
......@@ -352,23 +354,25 @@ void Conv2dNeonK7x7S2(const float *input,
for (index_t m = 0; m < out_channels; m += 4) {
if (m + 3 < out_channels) {
float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
#if defined(MACE_ENABLE_NEON)
float *out_ptr1_base =
output + b * out_batch_size + (m + 1) * out_image_size;
float *out_ptr2_base =
output + b * out_batch_size + (m + 2) * out_image_size;
float *out_ptr3_base =
output + b * out_batch_size + (m + 3) * out_image_size;
#endif
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr_base =
input + b * in_batch_size + c * in_image_size;
const float *filter_ptr0 = filter + m * in_channels * 49 + c * 49;
#if defined(MACE_ENABLE_NEON)
const float *filter_ptr1 =
filter + (m + 1) * in_channels * 49 + c * 49;
const float *filter_ptr2 =
filter + (m + 2) * in_channels * 49 + c * 49;
const float *filter_ptr3 =
filter + (m + 3) * in_channels * 49 + c * 49;
#if defined(MACE_ENABLE_NEON)
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
// input offset
......@@ -516,23 +520,25 @@ void Conv2dNeonK7x7S3(const float *input,
for (index_t m = 0; m < out_channels; m += 4) {
if (m + 3 < out_channels) {
float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
#if defined(MACE_ENABLE_NEON)
float *out_ptr1_base =
output + b * out_batch_size + (m + 1) * out_image_size;
float *out_ptr2_base =
output + b * out_batch_size + (m + 2) * out_image_size;
float *out_ptr3_base =
output + b * out_batch_size + (m + 3) * out_image_size;
#endif
for (index_t c = 0; c < in_channels; ++c) {
const float *in_ptr_base =
input + b * in_batch_size + c * in_image_size;
const float *filter_ptr0 = filter + m * in_channels * 49 + c * 49;
#if defined(MACE_ENABLE_NEON)
const float *filter_ptr1 =
filter + (m + 1) * in_channels * 49 + c * 49;
const float *filter_ptr2 =
filter + (m + 2) * in_channels * 49 + c * 49;
const float *filter_ptr3 =
filter + (m + 3) * in_channels * 49 + c * 49;
#if defined(MACE_ENABLE_NEON)
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) {
// input offset
......
......@@ -17,6 +17,7 @@
#endif
#include "mace/kernels/arm/depthwise_conv2d_neon.h"
#include "mace/core/macros.h"
namespace mace {
namespace kernels {
......@@ -65,6 +66,10 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
const index_t valid_w_start,
const index_t valid_w_stop,
float *output) {
#if !defined(MACE_ENABLE_NEON)
MACE_UNUSED(valid_w_start);
MACE_UNUSED(valid_w_stop);
#endif
const index_t multiplier = out_channels / in_channels;
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
......@@ -305,6 +310,10 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
const index_t valid_w_start,
const index_t valid_w_stop,
float *output) {
#if !defined(MACE_ENABLE_NEON)
MACE_UNUSED(valid_w_start);
MACE_UNUSED(valid_w_stop);
#endif
const index_t multiplier = out_channels / in_channels;
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
......
......@@ -64,6 +64,7 @@ struct BatchNormFunctor<DeviceType::CPU, float> : BatchNormFunctorBase {
const float epsilon,
Tensor *output,
StatsFuture *future) {
MACE_UNUSED(future);
// Batch normalization in the paper https://arxiv.org/abs/1502.03167 .
// The calculation formula for inference is
// Y = scale / sqrt(var + epsilon) * X + (offset - scale * mean / sqrt(var + epsilon))
......
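Since scale / sqrt(var + epsilon) is constant per channel, inference folds the whole formula into one multiply-add per element. A standalone sketch mirroring the comment's formula:

    #include <cmath>
    #include <cstddef>
    #include <vector>

    void BatchNormInference(const std::vector<float> &x,
                            float scale, float offset, float mean,
                            float var, float epsilon,
                            std::vector<float> *y) {
      // Fold Y = scale/sqrt(var+eps) * X + (offset - scale*mean/sqrt(var+eps))
      // into Y = a * X + b, computed once per channel.
      const float a = scale / std::sqrt(var + epsilon);
      const float b = offset - a * mean;
      y->resize(x.size());
      for (size_t i = 0; i < x.size(); ++i) {
        (*y)[i] = a * x[i] + b;
      }
    }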
......@@ -38,6 +38,7 @@ struct BiasAddFunctor<DeviceType::CPU, float> {
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
MACE_UNUSED(future);
const index_t batch = input->dim(0);
const index_t channels = input->dim(1);
const index_t height = input->dim(2);
......
......@@ -39,6 +39,10 @@ struct BufferToImageFunctor : BufferToImageFunctorBase {
const BufferType type,
Tensor *output,
StatsFuture *future) {
MACE_UNUSED(input);
MACE_UNUSED(type);
MACE_UNUSED(output);
MACE_UNUSED(future);
MACE_NOT_IMPLEMENTED;
}
};
......
......@@ -31,6 +31,7 @@ struct ChannelShuffleFunctor {
void operator()(const Tensor *input,
Tensor *output,
StatsFuture *future) {
MACE_UNUSED(future);
output->ResizeLike(input);
Tensor::MappingGuard logits_guard(input);
......@@ -56,7 +57,7 @@ struct ChannelShuffleFunctor {
index_t idx = c / groups_;
for (index_t hw = 0; hw < height * width; ++hw) {
output_base[c * image_size + hw] = input_base[
(c % groups_ * channels_per_group + c / groups_) * image_size + hw];
(g * channels_per_group + idx) * image_size + hw];
}
}
}
......
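The rewritten index is behavior-preserving: operator precedence already grouped the old expression as (c % groups_) * channels_per_group + c / groups_, and naming g (presumably c % groups_, defined just above the shown context) and idx simply makes the shuffle explicit. For channels = 6, groups = 2, the mapping reads:

    #include <cstdio>

    int main() {
      const int channels = 6, groups = 2;
      const int channels_per_group = channels / groups;
      for (int c = 0; c < channels; ++c) {
        const int g = c % groups, idx = c / groups;
        // prints out<-in pairs: 0<-0, 1<-3, 2<-1, 3<-4, 4<-2, 5<-5
        printf("out %d <- in %d\n", c, g * channels_per_group + idx);
      }
      return 0;
    }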
......@@ -43,8 +43,9 @@ struct ConcatFunctor : ConcatFunctorBase {
void operator()(const std::vector<const Tensor *> &input_list,
Tensor *output,
StatsFuture *future) {
MACE_UNUSED(future);
const Tensor *input0 = input_list.front();
const int inputs_count = input_list.size();
const size_t inputs_count = input_list.size();
std::vector<index_t> output_shape(input0->shape());
index_t inner_size = 1;
......@@ -53,7 +54,7 @@ struct ConcatFunctor : ConcatFunctorBase {
}
std::vector<index_t> outer_sizes(inputs_count, 0);
outer_sizes[0] = input0->size() / inner_size;
for (int i = 1; i < inputs_count; ++i) {
for (size_t i = 1; i < inputs_count; ++i) {
const Tensor *input = input_list[i];
MACE_CHECK(input->dim_size() == input0->dim_size(),
"Ranks of all input tensors must be same.");
......
......@@ -103,8 +103,6 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
const index_t filter_size = filter_height * filter_width;
const index_t in_tile_size =
3 * stride_w + (filter_width - 1) * dilation_w + 1;
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
......@@ -267,6 +265,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
MACE_UNUSED(future);
MACE_CHECK_NOTNULL(input);
MACE_CHECK_NOTNULL(filter);
MACE_CHECK_NOTNULL(output);
......@@ -345,7 +344,6 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
Tensor::MappingGuard bias_guard(bias);
Tensor::MappingGuard output_guard(output);
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto bias_data = bias == nullptr ? nullptr : bias->data<float>();
auto output_data = output->mutable_data<float>();
......@@ -719,7 +717,10 @@ struct Conv2dFunctor<DeviceType::GPU, T> : Conv2dFunctorBase {
paddings,
dilations,
activation,
relux_max_limit) {}
relux_max_limit) {
MACE_UNUSED(is_filter_transformed);
MACE_UNUSED(scratch);
}
void operator()(const Tensor *input,
const Tensor *filter,
......
......@@ -33,6 +33,7 @@ struct DepthToSpaceOpFunctor {
explicit DepthToSpaceOpFunctor(const int block_size, bool d2s)
: block_size_(block_size), d2s_(d2s) {}
void operator()(const Tensor *input, Tensor *output, StatsFuture *future) {
MACE_UNUSED(future);
const int batch_size = input->dim(0);
const int input_depth = input->dim(1);
const int input_height = input->dim(2);
......
......@@ -133,6 +133,7 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
MACE_UNUSED(future);
MACE_CHECK_NOTNULL(input);
MACE_CHECK_NOTNULL(filter);
MACE_CHECK_NOTNULL(output);
......
......@@ -470,6 +470,7 @@ struct EltwiseFunctor<DeviceType::CPU, float>: EltwiseFunctorBase {
const Tensor *input1,
Tensor *output,
StatsFuture *future) {
MACE_UNUSED(future);
bool swapped = false;
if (input1 != nullptr) {
MACE_CHECK(input0->dim_size() == input1->dim_size()
......
......@@ -59,6 +59,7 @@ struct FullyConnectedFunctor<DeviceType::CPU, float>: FullyConnectedBase {
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
MACE_UNUSED(future);
std::vector<index_t> output_shape = {input->dim(0), weight->dim(0), 1, 1};
output->Resize(output_shape);
const index_t N = output->dim(0);
......
......@@ -135,7 +135,9 @@ inline void GemmTile(const float *A,
const index_t stride_k,
const index_t stride_w,
float *C) {
#if defined(MACE_ENABLE_NEON)
index_t h, w, k;
#endif
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
for (h = 0; h + 7 < height; h += 8) {
......@@ -443,6 +445,7 @@ inline void GemmTile(const float *A,
#else
#if defined(MACE_ENABLE_NEON) // armv7
w = (width >> 2) << 2;
for (h = 0; h + 3 < height; h += 4) {
for (k = 0; k + 3 < K; k += 4) {
const float *a_ptr = A + (h * stride_k + k);
......@@ -523,8 +526,6 @@ inline void GemmTile(const float *A,
c_ptr2 += 4;
c_ptr3 += 4;
}
w = (width >> 2) << 2;
}
if (w < width) {
const float *a_ptr = A + (h * stride_k + k);
......
......@@ -42,6 +42,7 @@ struct LocalResponseNormFunctor<DeviceType::CPU, float> {
float beta,
Tensor *output,
StatsFuture *future) {
MACE_UNUSED(future);
const index_t batch = input->dim(0);
const index_t channels = input->dim(1);
const index_t height = input->dim(2);
......
......@@ -42,6 +42,7 @@ struct MatMulFunctor {
const Tensor *B,
Tensor *C,
StatsFuture *future) {
MACE_UNUSED(future);
std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1};
C->Resize(c_shape);
......@@ -59,14 +60,6 @@ struct MatMulFunctor {
// It is better to use large block size if it fits for fast cache.
// Assume l1 cache size is 32k, we load three blocks at a time (A, B, C),
// the block size should be sqrt(32k / sizeof(T) / 3).
const index_t block_size = 48;
const index_t block_tile_height = RoundUpDiv(height, block_size);
const index_t block_tile_width = RoundUpDiv(width, block_size);
const index_t block_tile_k = RoundUpDiv(K, block_size);
const index_t remain_height = height % block_size;
const index_t remain_width = width % block_size;
const index_t remain_k = K % block_size;
constexpr index_t register_tile_size = 4;
memset(c_ptr_base, 0, batch * height * width * sizeof(T));
Gemm(a_ptr_base, b_ptr_base, batch, height, K, width, c_ptr_base);
......
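The deleted block-size computations were dead stores (never read after the tiling moved into Gemm), but the comment's arithmetic still checks out: with a 32 KB L1 cache and T = float, sqrt(32768 / 4 / 3) ≈ 52, and the former block_size of 48 was that bound rounded down to a multiple of the register tile size 4.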
......@@ -36,7 +36,7 @@ void AddNFunctor<DeviceType::GPU, T>::operator()(
auto runtime = OpenCLRuntime::Global();
for (int i = 1; i < size; ++i) {
for (size_t i = 1; i < size; ++i) {
MACE_CHECK_NOTNULL(input_tensors[i]);
MACE_CHECK(batch == input_tensors[i]->dim(0));
MACE_CHECK(height == input_tensors[i]->dim(1));
......
......@@ -136,7 +136,6 @@ static void ConcatN(cl::Kernel *kernel,
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
const index_t channel = output->dim(3);
auto runtime = OpenCLRuntime::Global();
......
......@@ -72,6 +72,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
StatsFuture *future,
uint32_t *kwg_size,
std::unique_ptr<BufferBase> *kernel_error) {
MACE_UNUSED(padding);
MACE_UNUSED(dilations);
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
......
......@@ -135,7 +135,6 @@ static void DepthwiseConv2d(cl::Kernel *kernel,
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
}
if (!IsVecEqual(*prev_input_shape, input->shape())) {
const index_t input_batch = input->dim(0);
const index_t input_height = input->dim(1);
const index_t input_width = input->dim(2);
......
......@@ -25,6 +25,7 @@ void EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
const Tensor *input1,
Tensor *output,
StatsFuture *future) {
MACE_UNUSED(future);
bool swapped = false;
if (input1 != nullptr) {
MACE_CHECK(input0->dim_size() == input1->dim_size()
......
......@@ -206,17 +206,6 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) {
}
}
std::vector<uint32_t> Default2DLocalWS(const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(3, 0);
uint64_t cache_size =
OpenCLRuntime::Global()->device_global_mem_cache_size();
uint32_t base = cache_size / kBaseGPUMemCacheSize;
lws[0] = std::min<uint32_t>(base, kwg_size);
lws[1] = kwg_size / lws[1];
return lws;
}
std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws,
const uint32_t kwg_size) {
std::vector<uint32_t> lws(4, 0);
......
......@@ -19,6 +19,7 @@
#include <vector>
#include "mace/core/future.h"
#include "mace/core/macros.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/types.h"
......@@ -95,6 +96,7 @@ bool IsVecEqual(const std::vector<T> &input0,
template <typename T>
void AppendToStream(std::stringstream *ss, const std::string &delimiter, T v) {
MACE_UNUSED(delimiter);
(*ss) << v;
}
......@@ -114,8 +116,6 @@ std::string Concat(Args... args) {
return ss.str();
}
std::vector<uint32_t> Default2DLocalWS(const uint32_t *gws,
const uint32_t kwg_size);
std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws,
const uint32_t kwg_size);
} // namespace kernels
......
......@@ -25,6 +25,7 @@ void MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
const Tensor *B,
Tensor *C,
StatsFuture *future) {
MACE_UNUSED(future);
std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1};
std::vector<size_t> c_image_shape;
CalImage2DShape(c_shape, BufferType::IN_OUT_HEIGHT, &c_image_shape);
......
......@@ -25,9 +25,9 @@ namespace mace {
namespace kernels {
namespace {
const bool BufferToImageOpImpl(Tensor *buffer,
Tensor *image,
const std::vector<size_t> &image_shape) {
bool BufferToImageOpImpl(Tensor *buffer,
Tensor *image,
const std::vector<size_t> &image_shape) {
std::unique_ptr<BufferBase> kernel_error;
uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
static_cast<uint32_t>(image_shape[1])};
......@@ -150,7 +150,7 @@ TEST(OutOfRangeCheckTest, RandomTest) {
ASSERT_FALSE(BufferToImageOpImpl(buffer, image, image_shape));
std::vector<size_t> overflow_image_shape = image_shape;
for (int i = 0; i < overflow_image_shape.size(); ++i) {
for (size_t i = 0; i < overflow_image_shape.size(); ++i) {
overflow_image_shape[i] += 1;
}
ASSERT_TRUE(BufferToImageOpImpl(buffer, image, overflow_image_shape));
......
......@@ -25,7 +25,8 @@ void PadFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input,
Tensor *output,
StatsFuture *future) {
MACE_CHECK(this->paddings_.size() == (input->dim_size() * 2));
MACE_CHECK(
this->paddings_.size() == static_cast<size_t>((input->dim_size() * 2)));
MACE_CHECK((this->paddings_[0] == 0) && (this->paddings_[1] == 0)
&& (this->paddings_[6] == 0) && (this->paddings_[7] == 0))
<< "Mace only support height/width dimension now";
......
......@@ -75,7 +75,7 @@ void SliceFunctor<DeviceType::GPU, T>::operator()(
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
cl::Event event;
CallStats call_stats{INT64_MAX, 0};
for (int i = 0; i < outputs_count; ++i) {
for (size_t i = 0; i < outputs_count; ++i) {
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_.setArg(idx++,
......
......@@ -47,7 +47,9 @@ struct PadFunctor : public PadFunctorBase {
void operator()(const Tensor *input,
Tensor *output,
StatsFuture *future) {
MACE_CHECK(this->paddings_.size() == (input->dim_size() * 2));
MACE_UNUSED(future);
MACE_CHECK(
this->paddings_.size() == static_cast<size_t>(input->dim_size()) * 2);
auto input_shape = input->shape();
output->Resize({input_shape[0] + this->paddings_[0] + this->paddings_[1],
input_shape[1] + this->paddings_[2] + this->paddings_[3],
......
......@@ -173,6 +173,7 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {
void operator()(const Tensor *input_tensor,
Tensor *output_tensor,
StatsFuture *future) {
MACE_UNUSED(future);
std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {
input_tensor->dim(1), input_tensor->dim(1), kernels_[0], kernels_[1]};
......
......@@ -92,7 +92,7 @@ inline std::vector<int> nms(const float *bboxes_ptr,
for (int i = 0; i < num_bboxes; ++i) {
if (suppressed[i] == 1) continue;
keep.push_back(i);
if (keep.size() >= post_nms_top_n) break;
if (keep.size() >= static_cast<size_t>(post_nms_top_n)) break;
int coord_idx = i << 2;
const float x1 = bboxes_ptr[coord_idx];
const float y1 = bboxes_ptr[coord_idx + 1];
......@@ -141,10 +141,11 @@ struct ProposalFunctor {
const Tensor *img_info_tensor,
Tensor *output,
StatsFuture *future) {
MACE_UNUSED(future);
MACE_CHECK(rpn_cls_prob->dim(1) == rpn_bbox_pred->dim(1) &&
rpn_cls_prob->dim(2) == rpn_bbox_pred->dim(2));
MACE_CHECK((rpn_cls_prob->dim(3) / 2 == rpn_bbox_pred->dim(3) / 4) &&
(rpn_cls_prob->dim(3) / 2 == anchors_.size()));
(static_cast<size_t>(rpn_cls_prob->dim(3) / 2) == anchors_.size()));
const float *img_info = img_info_tensor->data<float>();
const int im_height = static_cast<int>(img_info[0] - 1);
const int im_width = static_cast<int>(img_info[1] - 1);
......
......@@ -38,6 +38,7 @@ struct PSROIAlignFunctor {
const Tensor *rois,
Tensor *output,
StatsFuture *future) {
MACE_UNUSED(future);
const int height = static_cast<int>(input->dim(1));
const int width = static_cast<int>(input->dim(2));
const int channels = static_cast<int>(input->dim(3));
......
......@@ -81,6 +81,7 @@ struct QuantizeFunctor<CPU, uint8_t> {
Tensor *out_min,
Tensor *out_max,
StatsFuture *future) {
MACE_UNUSED(future);
const float *input_data = input->data<float>();
const float in_min_data = in_min->data<float>()[0];
const float in_max_data = in_max->data<float>()[0];
......@@ -109,6 +110,7 @@ struct DequantizeFunctor<CPU, uint8_t> {
const Tensor *in_max,
Tensor *output,
StatsFuture *future) {
MACE_UNUSED(future);
const uint8_t *input_data = input->data<uint8_t>();
const float in_min_data = in_min->data<float>()[0];
const float in_max_data = in_max->data<float>()[0];
......@@ -137,6 +139,7 @@ struct RequantizeFunctor<CPU, uint8_t> {
Tensor *out_min,
Tensor *out_max,
StatsFuture *future) {
MACE_UNUSED(future);
const int *input_data = input->data<int>();
const float in_min_data = in_min->data<float>()[0];
const float in_max_data = in_max->data<float>()[0];
......
......@@ -35,6 +35,7 @@ struct ReshapeFunctor {
const std::vector<index_t> &out_shape,
Tensor *output,
StatsFuture *future) {
MACE_UNUSED(future);
output->ResizeWithBuffer(out_shape, input->UnderlyingBuffer());
}
};
......
......@@ -137,6 +137,7 @@ struct ResizeBilinearFunctor<DeviceType::CPU, float>
: ResizeBilinearFunctorBase(size, align_corners) {}
void operator()(const Tensor *input, Tensor *output, StatsFuture *future) {
MACE_UNUSED(future);
const index_t batch = input->dim(0);
const index_t channels = input->dim(1);
const index_t in_height = input->dim(2);
......
......@@ -44,6 +44,7 @@ struct SliceFunctor : SliceFunctorBase {
void operator()(const Tensor *input,
const std::vector<Tensor *> &output_list,
StatsFuture *future) {
MACE_UNUSED(future);
const index_t input_channels = input->dim(axis_);
const size_t outputs_count = output_list.size();
const index_t output_channels = input_channels / outputs_count;
......
......@@ -39,6 +39,7 @@ struct SoftmaxFunctor;
template<>
struct SoftmaxFunctor<DeviceType::CPU, float> {
void operator()(const Tensor *input, Tensor *output, StatsFuture *future) {
MACE_UNUSED(future);
const index_t batch = input->dim(0);
const index_t class_count = input->dim(1);
const index_t class_size = input->dim(2) * input->dim(3);
......
......@@ -53,6 +53,10 @@ struct SpaceToBatchFunctor : SpaceToBatchFunctorBase {
const std::vector<index_t> &output_shape,
Tensor *batch_tensor,
StatsFuture *future) {
MACE_UNUSED(space_tensor);
MACE_UNUSED(output_shape);
MACE_UNUSED(batch_tensor);
MACE_UNUSED(future);
MACE_NOT_IMPLEMENTED;
}
};
......
......@@ -30,6 +30,7 @@ struct TransposeFunctor {
explicit TransposeFunctor(const std::vector<int> &dims) : dims_(dims) {}
void operator()(const Tensor *input, Tensor *output, StatsFuture *future) {
MACE_UNUSED(future);
Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard output_guard(output);
const std::vector<index_t> &input_shape = input->shape();
......
......@@ -51,6 +51,9 @@ struct WinogradTransformFunctor : WinogradTransformFunctorBase {
: WinogradTransformFunctorBase(padding_type, paddings) {}
void operator()(const Tensor *input, Tensor *output, StatsFuture *future) {
MACE_UNUSED(input);
MACE_UNUSED(output);
MACE_UNUSED(future);
MACE_NOT_IMPLEMENTED;
}
};
......@@ -105,6 +108,9 @@ struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
MACE_UNUSED(input);
MACE_UNUSED(bias);
MACE_UNUSED(output);
MACE_NOT_IMPLEMENTED;
}
};
......
......@@ -29,6 +29,8 @@ void Register_BatchToSpaceND(OperatorRegistry *op_registry) {
.TypeConstraint<half>("T")
.Build(),
BatchToSpaceNDOp<DeviceType::GPU, half>);
#else
MACE_UNUSED(op_registry);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -38,7 +38,7 @@ class BatchToSpaceNDOp : public Operator<D, T> {
Tensor *space_tensor = this->Output(OUTPUT);
std::vector<index_t> output_shape(4, 0);
CalculateOutputShape(batch_tensor, space_tensor, output_shape.data());
CalculateOutputShape(batch_tensor, output_shape.data());
functor_(space_tensor, output_shape, const_cast<Tensor *>(batch_tensor),
future);
return true;
......@@ -46,7 +46,6 @@ class BatchToSpaceNDOp : public Operator<D, T> {
private:
inline void CalculateOutputShape(const Tensor *input_tensor,
Tensor *output,
index_t *output_shape) {
auto crops = OperatorBase::GetRepeatedArgument<int>("crops", {0, 0, 0, 0});
auto block_shape =
......
......@@ -45,7 +45,6 @@ class ChannelShuffleOp : public Operator<D, T> {
MACE_CHECK(channels % group_ == 0,
"input channels must be an integral multiple of group. ",
input->dim(3));
int channels_per_group = channels / group_;
functor_(input, output, future);
return true;
......
......@@ -30,7 +30,7 @@ void CalculateScaleOffset(const std::vector<float> &gamma,
std::vector<float> *scale,
std::vector<float> *offset) {
size_t size = gamma.size();
for (int i = 0; i < size; ++i) {
for (size_t i = 0; i < size; ++i) {
(*scale)[i] = gamma[i] / std::sqrt(var[i] + epsilon);
(*offset)[i] = beta[i] - mean[i] * (*scale)[i];
}
......
......@@ -45,7 +45,7 @@ TEST_F(ProposalOpTest, CPUSimple) {
.Finalize(net.NewOperatorDef());
std::vector<float> scores(height * width * 18);
for (int i = 0 ; i < scores.size(); ++i) {
for (size_t i = 0 ; i < scores.size(); ++i) {
scores[i] = i;
}
......
......@@ -95,8 +95,6 @@ TEST_F(QuantizeTest, TestQuantizeTrend) {
net.RunOp();
auto output = net.GetTensor("Output");
auto output_min = net.GetTensor("OutputMin");
auto output_max = net.GetTensor("OutputMax");
const uint8_t *output_data = net.GetTensor("Output")->data<uint8_t>();
for (int i = 1; i < output->size(); ++i) {
......
......@@ -30,6 +30,8 @@ void Register_SpaceToBatchND(OperatorRegistry *op_registry) {
.TypeConstraint<half>("T")
.Build(),
SpaceToBatchNDOp<DeviceType::GPU, half>);
#else
MACE_UNUSED(op_registry);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -39,7 +39,7 @@ class SpaceToBatchNDOp : public Operator<D, T> {
Tensor *batch_tensor = this->Output(OUTPUT);
std::vector<index_t> output_shape(4, 0);
CalculateOutputShape(space_tensor, batch_tensor, output_shape.data());
CalculateOutputShape(space_tensor, output_shape.data());
functor_(const_cast<Tensor *>(space_tensor), output_shape, batch_tensor,
future);
return true;
......@@ -47,7 +47,6 @@ class SpaceToBatchNDOp : public Operator<D, T> {
private:
inline void CalculateOutputShape(const Tensor *input_tensor,
Tensor *output,
index_t *output_shape) {
auto paddings =
OperatorBase::GetRepeatedArgument<int>("paddings", {0, 0, 0, 0});
......
......@@ -35,11 +35,11 @@ class TransposeOp : public Operator<D, T> {
const Tensor *input = this->Input(INPUT);
Tensor *output = this->Output(OUTPUT);
const std::vector<index_t> &input_shape = input->shape();
MACE_CHECK(input_shape.size() == 4 && dims_.size() == 4
|| input_shape.size() == 2 && dims_.size() == 2,
MACE_CHECK((input_shape.size() == 4 && dims_.size() == 4)
|| (input_shape.size() == 2 && dims_.size() == 2),
"rank should be 2 or 4");
std::vector<index_t> output_shape;
for (int i = 0; i < dims_.size(); ++i) {
for (size_t i = 0; i < dims_.size(); ++i) {
output_shape.push_back(input_shape[dims_[i]]);
}
output->Resize(output_shape);
......
......@@ -30,6 +30,8 @@ void Register_WinogradInverseTransform(OperatorRegistry *op_registry) {
.TypeConstraint<half>("T")
.Build(),
WinogradInverseTransformOp<DeviceType::GPU, half>);
#else
MACE_UNUSED(op_registry);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -30,6 +30,8 @@ void Register_WinogradTransform(OperatorRegistry *op_registry) {
.TypeConstraint<half>("T")
.Build(),
WinogradTransformOp<DeviceType::GPU, half>);
#else
MACE_UNUSED(op_registry);
#endif // MACE_ENABLE_OPENCL
}
......
......@@ -4,13 +4,15 @@ load("//mace:mace.bzl", "if_openmp_enabled", "if_android")
cc_binary(
name = "mace_run",
srcs = ["mace_run.cc"],
copts = if_openmp_enabled(["-fopenmp"]) + if_android([
"-DMACE_ENABLE_OPENCL",
]),
linkopts = if_openmp_enabled(["-fopenmp"]),
linkstatic = 1,
copts = if_android(["-DMACE_ENABLE_OPENCL"]),
deps = [
"//external:gflags_nothreads",
"//mace/codegen:generated_models",
"//mace/codegen:generated_mace_engine_factory",
"//mace/core:core",
"//mace/codegen:generated_models",
"//mace/core",
],
)
......@@ -385,7 +385,7 @@ int Main(int argc, char **argv) {
ParseShape(output_shapes[i], &output_shape_vec[i]);
}
bool ret;
bool ret = false;
#pragma omp parallel for
for (int i = 0; i < FLAGS_restart_round; ++i) {
VLOG(0) << "restart round " << i;
......@@ -395,9 +395,8 @@ int Main(int argc, char **argv) {
}
if (ret) {
return 0;
} else {
return -1;
}
return -1;
}
} // namespace validation
......
......@@ -12,20 +12,18 @@ load("//mace:mace.bzl", "if_android")
cc_library(
name = "utils",
srcs = [
"command_line_flags.cc",
"logging.cc",
"string_util.cc",
],
hdrs = [
"command_line_flags.h",
"env_time.h",
"logging.h",
"memory_logging.h",
"rwlock.h",
"string_util.h",
"timer.h",
"tuner.h",
"utils.h",
"rwlock.h",
],
linkopts = if_android([
"-llog",
......@@ -35,24 +33,6 @@ cc_library(
],
)
cc_test(
name = "utils_test",
testonly = 1,
srcs = [
"utils_test.cc",
],
linkopts = if_android([
"-pie",
"-lm",
]),
linkstatic = 1,
deps = [
":utils",
"@gtest//:gtest",
"@gtest//:gtest_main",
],
)
cc_library(
name = "utils_dev",
srcs = [
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/utils/command_line_flags.h"
#include <cstring>
#include <iomanip>
#include "mace/utils/logging.h"
namespace mace {
namespace utils {
bool StringConsume(const std::string &x, std::string *arg) {
MACE_CHECK_NOTNULL(arg);
if ((arg->size() >= x.size()) &&
(memcmp(arg->data(), x.data(), x.size()) == 0)) {
*arg = arg->substr(x.size());
return true;
}
return false;
}
bool ParseStringFlag(std::string arg,
std::string flag,
std::string *dst,
bool *value_parsing_ok) {
*value_parsing_ok = true;
if (StringConsume("--", &arg) && StringConsume(flag, &arg) &&
StringConsume("=", &arg)) {
*dst = arg;
return true;
}
return false;
}
bool ParseInt32Flag(std::string arg,
std::string flag,
int32_t *dst,
bool *value_parsing_ok) {
*value_parsing_ok = true;
if (StringConsume("--", &arg) && StringConsume(flag, &arg) &&
StringConsume("=", &arg)) {
char extra;
if (sscanf(arg.data(), "%d%c", dst, &extra) != 1) {
LOG(ERROR) << "Couldn't interpret value " << arg << " for flag " << flag
<< ".";
*value_parsing_ok = false;
}
return true;
}
return false;
}
bool ParseInt64Flag(std::string arg,
std::string flag,
int64_t *dst,
bool *value_parsing_ok) {
*value_parsing_ok = true;
if (StringConsume("--", &arg) && StringConsume(flag, &arg) &&
StringConsume("=", &arg)) {
char extra;
if (sscanf(arg.data(), "%lld%c", dst, &extra) != 1) {
LOG(ERROR) << "Couldn't interpret value " << arg << " for flag " << flag
<< ".";
*value_parsing_ok = false;
}
return true;
}
return false;
}
bool ParseBoolFlag(std::string arg, std::string flag,
bool *dst, bool *value_parsing_ok) {
*value_parsing_ok = true;
if (StringConsume("--", &arg) && StringConsume(flag, &arg)) {
if (arg.empty()) {
*dst = true;
return true;
}
if (arg == "=true") {
*dst = true;
return true;
} else if (arg == "=false") {
*dst = false;
return true;
} else {
LOG(ERROR) << "Couldn't interpret value " << arg << " for flag " << flag
<< ".";
*value_parsing_ok = false;
return true;
}
}
return false;
}
bool ParseFloatFlag(std::string arg,
std::string flag,
float *dst,
bool *value_parsing_ok) {
*value_parsing_ok = true;
if (StringConsume("--", &arg) && StringConsume(flag, &arg) &&
StringConsume("=", &arg)) {
char extra;
if (sscanf(arg.data(), "%f%c", dst, &extra) != 1) {
LOG(ERROR) << "Couldn't interpret value " << arg << " for flag " << flag
<< ".";
*value_parsing_ok = false;
}
return true;
}
return false;
}
} // namespace utils
Flag::Flag(const char *name, int *dst, const std::string &usage_text)
: name_(name), type_(TYPE_INT), int_value_(dst), usage_text_(usage_text) {}
Flag::Flag(const char *name, int64_t *dst, const std::string &usage_text)
: name_(name),
type_(TYPE_INT64),
int64_value_(dst),
usage_text_(usage_text) {}
Flag::Flag(const char *name, bool *dst, const std::string &usage_text)
: name_(name),
type_(TYPE_BOOL),
bool_value_(dst),
usage_text_(usage_text) {}
Flag::Flag(const char *name, std::string *dst, const std::string &usage_text)
: name_(name),
type_(TYPE_STRING),
string_value_(dst),
usage_text_(usage_text) {}
Flag::Flag(const char *name, float *dst, const std::string &usage_text)
: name_(name),
type_(TYPE_FLOAT),
float_value_(dst),
usage_text_(usage_text) {}
bool Flag::Parse(std::string arg, bool *value_parsing_ok) const {
bool result = false;
if (type_ == TYPE_INT) {
result = utils::ParseInt32Flag(arg, name_, int_value_, value_parsing_ok);
} else if (type_ == TYPE_INT64) {
result = utils::ParseInt64Flag(arg, name_, int64_value_, value_parsing_ok);
} else if (type_ == TYPE_BOOL) {
result = utils::ParseBoolFlag(arg, name_, bool_value_, value_parsing_ok);
} else if (type_ == TYPE_STRING) {
result = utils::ParseStringFlag(arg, name_,
string_value_, value_parsing_ok);
} else if (type_ == TYPE_FLOAT) {
result = utils::ParseFloatFlag(arg, name_, float_value_, value_parsing_ok);
}
return result;
}
/*static*/ bool Flags::Parse(int *argc,
char **argv,
const std::vector<Flag> &flag_list) {
bool result = true;
std::vector<char *> unknown_flags;
for (int i = 1; i < *argc; ++i) {
if (std::string(argv[i]) == "--") {
while (i < *argc) {
unknown_flags.push_back(argv[i]);
++i;
}
break;
}
bool was_found = false;
for (const Flag &flag : flag_list) {
bool value_parsing_ok;
was_found = flag.Parse(argv[i], &value_parsing_ok);
if (!value_parsing_ok) {
result = false;
}
if (was_found) {
break;
}
}
if (!was_found) {
unknown_flags.push_back(argv[i]);
}
}
// Passthrough any extra flags.
int dst = 1; // Skip argv[0]
for (char *f : unknown_flags) {
argv[dst++] = f;
}
argv[dst++] = nullptr;
*argc = unknown_flags.size() + 1;
return result && (*argc < 2 || strcmp(argv[1], "--help") != 0);
}
std::string Flags::Usage(const std::string &cmdline,
const std::vector<Flag> &flag_list) {
std::stringstream usage_text;
usage_text << "usage: " << cmdline << std::endl;
if (!flag_list.empty()) {
usage_text << "Flags: " << std::endl;
}
for (const Flag &flag : flag_list) {
usage_text << "\t" << std::left << std::setw(30) << flag.name_;
usage_text << flag.usage_text_ << std::endl;
}
return usage_text.str();
}
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_UTILS_COMMAND_LINE_FLAGS_H_
#define MACE_UTILS_COMMAND_LINE_FLAGS_H_
#include <string>
#include <vector>
namespace mace {
class Flag {
public:
Flag(const char *name, int *dst1, const std::string &usage_text);
Flag(const char *name, int64_t *dst1, const std::string &usage_text);
Flag(const char *name, bool *dst, const std::string &usage_text);
Flag(const char *name, std::string *dst, const std::string &usage_text);
Flag(const char *name, float *dst, const std::string &usage_text);
private:
friend class Flags;
bool Parse(std::string arg, bool *value_parsing_ok) const;
std::string name_;
enum { TYPE_INT, TYPE_INT64, TYPE_BOOL, TYPE_STRING, TYPE_FLOAT } type_;
int *int_value_;
int64_t *int64_value_;
bool *bool_value_;
std::string *string_value_;
float *float_value_;
std::string usage_text_;
};
class Flags {
public:
// Parse the command line represented by argv[0, ..., (*argc)-1] to find flag
// instances matching flags in flaglist[]. Update the variables associated
// with matching flags, and remove the matching arguments from (*argc, argv).
// Return true iff all recognized flag values were parsed correctly, and the
// first remaining argument is not "--help".
static bool Parse(int *argc, char **argv, const std::vector<Flag> &flag_list);
// Return a usage message with command line cmdline, and the
// usage_text strings in flag_list[].
static std::string Usage(const std::string &cmdline,
const std::vector<Flag> &flag_list);
};
} // namespace mace
#endif // MACE_UTILS_COMMAND_LINE_FLAGS_H_
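For reference, a minimal usage sketch of this flag API (flag names and defaults are illustrative, not taken from this diff):

    #include <cstdio>
    #include <string>
    #include <vector>
    // #include "mace/utils/command_line_flags.h"

    int main(int argc, char **argv) {
      int round = 1;
      std::string model_file;
      std::vector<mace::Flag> flag_list = {
          mace::Flag("round", &round, "number of rounds to run"),
          mace::Flag("model_file", &model_file, "path to the model"),
      };
      if (!mace::Flags::Parse(&argc, argv, flag_list)) {
        // Bad flag value or --help: print usage and exit non-zero.
        printf("%s", mace::Flags::Usage(argv[0], flag_list).c_str());
        return -1;
      }
      return 0;
    }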
......@@ -58,7 +58,7 @@ std::string MakeString(const std::vector<T> &args) {
std::stringstream ss;
ss << "[";
const size_t size = args.size();
for (int i = 0; i < size; ++i) {
for (size_t i = 0; i < size; ++i) {
ss << args[i];
if (i < size - 1) {
ss << ", ";
......
......@@ -146,7 +146,7 @@ class Tuner {
for (iter = 0; iter < num_runs; ++iter) {
res = func(params, timer, tuning_result);
total_time_us += timer->AccumulatedMicros();
if (iter >= 1 && total_time_us > 100000 || total_time_us > 200000) {
if ((iter >= 1 && total_time_us > 100000) || total_time_us > 200000) {
++iter;
break;
}
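The added parentheses are behavior-preserving, since && already binds tighter than ||; they exist only to satisfy -Wparentheses, which -Werror promotes to an error. As a one-line sanity check:

    static_assert((true && false || true) == ((true && false) || true),
                  "&& binds tighter than ||");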
......@@ -165,7 +165,7 @@ class Tuner {
std::vector<param_type> *)> &func,
Timer *timer,
std::vector<param_type> *opt_params) {
RetType res;
RetType res = 0;
double opt_time = std::numeric_limits<double>::max();
auto params = param_generator();
std::vector<param_type> tuning_result;
......
......@@ -22,6 +22,7 @@ namespace mace {
bool GetTuningParams(
const char *path,
std::unordered_map<std::string, std::vector<unsigned int>> *param_table) {
(void)(path);
extern const std::map<std::string, std::vector<unsigned int>>
kTuningParamsData;
for (auto it = kTuningParamsData.begin(); it != kTuningParamsData.end();
......
......@@ -54,7 +54,7 @@ TEST_F(TunerTest, SimpleRun) {
}
TEST_F(TunerTest, SimpleTune) {
int expect = 3;
unsigned int expect = 3;
auto TunerFunc = [&](const std::vector<unsigned int> &params, Timer *timer,
std::vector<uint32_t> *tuning_result) -> int {
int res = 0;
......
......@@ -277,7 +277,7 @@ def bazel_build(target,
stdout_buff = []
process_output = make_output_processor(stdout_buff)
if abi == "host":
p = sh.bazel(
bazel_args = (
"build",
"-c",
"opt",
......@@ -287,12 +287,17 @@ def bazel_build(target,
target,
"--copt=-std=c++11",
"--copt=-D_GLIBCXX_USE_C99_MATH_TR1",
"--copt=-Werror=return-type",
"--copt=-Werror",
"--copt=-Wextra",
"--copt=-Wno-missing-field-initializers",
"--copt=-O3",
"--define",
"openmp=%s" % str(enable_openmp).lower(),
"--define",
"production=%s" % str(production_mode).lower(),
)
p = sh.bazel(
*bazel_args,
_out=process_output,
_bg=True,
_err_to_out=True)
......@@ -311,7 +316,9 @@ def bazel_build(target,
"--cpu=%s" % abi,
"--copt=-std=c++11",
"--copt=-D_GLIBCXX_USE_C99_MATH_TR1",
"--copt=-Werror=return-type",
"--copt=-Werror",
"--copt=-Wextra",
"--copt=-Wno-missing-field-initializers",
"--copt=-DMACE_OBFUSCATE_LITERALS",
"--copt=-O3",
"--define",
......