diff --git a/WORKSPACE b/WORKSPACE
index 6f4f4d3be22b5462d680b9fedb7fd15fad9978ba..a1df23e64d49423a409440c08b0c9b8226ded3c7 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -5,11 +5,11 @@ workspace(name = "mace")
 # This statement defines the @com_google_protobuf repo.
 http_archive(
     name = "com_google_protobuf",
-    sha256 = "40d39d97a7b514b3e34daef732f822eca0081960b269863f5b573db5548cb237",
-    strip_prefix = "protobuf-3.4.0rc3",
+    sha256 = "542703acadc3f690d998f4641e1b988f15ba57ebca05fdfb1cd9095bec007948",  # expected checksum of the downloaded zip
+    strip_prefix = "protobuf-3.4.0",  # top-level directory stripped from the extracted archive
     urls = [
-        "https://cnbj1.fds.api.xiaomi.com/mace/third-party/protobuf/protobuf-3.4.0rc3.zip",
-        "https://github.com/google/protobuf/archive/v3.4.0rc3.zip"
+        "https://cnbj1.fds.api.xiaomi.com/mace/third-party/protobuf/protobuf-3.4.0.zip",
+        "https://github.com/google/protobuf/archive/v3.4.0.zip"
     ],
 )
 
@@ -38,21 +38,22 @@ new_http_archive(
 new_http_archive(
     name = "opencl_clhpp",
     build_file = "mace/third_party/opencl-clhpp/opencl-clhpp.BUILD",
-    sha256 = "d4eb63372ad31f7efcae626852f75f7929ff28d1cabb5f50ef11035963a69b46",
-    strip_prefix = "OpenCL-CLHPP-2.0.10",
+    sha256 = "dab6f1834ec6e3843438cc0f97d63817902aadd04566418c1fcc7fb78987d4e7",  # expected checksum of the downloaded zip
+    strip_prefix = "OpenCL-CLHPP-4c6f7d56271727e37fb19a9b47649dd175df2b12",  # suffix looks like a commit hash, i.e. an untagged snapshot
     urls = [
-        "https://cnbj1.fds.api.xiaomi.com/mace/third-party/OpenCL-CLHPP/OpenCL-CLHPP-2.0.10.zip",
-        "https://github.com/KhronosGroup/OpenCL-CLHPP/archive/v2.0.10.zip"
+        "https://cnbj1.fds.api.xiaomi.com/mace/third-party/OpenCL-CLHPP/OpenCL-CLHPP-4c6f7d56271727e37fb19a9b47649dd175df2b12.zip",
+        "https://github.com/KhronosGroup/OpenCL-CLHPP/archive/4c6f7d56271727e37fb19a9b47649dd175df2b12.zip"
     ],
 )
 
 new_http_archive(
     name = "half",
     build_file = "mace/third_party/half/half.BUILD",
-    sha256 = "cdd70d3bf3fe091b688e7ab3f48471c881a197d2c186c95cca8bf156961fb41c",
+    sha256 = "0f514a1e877932b21dc5edc26a148ddc700b6af2facfed4c030ca72f74d0219e",  # expected checksum of the downloaded zip
+    strip_prefix = "half-code-356-trunk",  # top-level directory stripped from the extracted archive
     urls = [
-        "https://cnbj1.fds.api.xiaomi.com/mace/third-party/half/half-1.12.0.zip",
-        "https://jaist.dl.sourceforge.net/project/half/half/1.12.0/half-1.12.0.zip"
+        "https://cnbj1.fds.api.xiaomi.com/mace/third-party/half/half-code-356-trunk.zip",
+        "https://sourceforge.net/code-snapshots/svn/h/ha/half/code/half-code-356-trunk.zip"  # NOTE(review): confirm this code-snapshot URL is permanent
     ],
 )
 
diff --git a/mace/benchmark/statistics.cc b/mace/benchmark/statistics.cc
index 60ca2fd542f6d8651ea086edc22c951ed9f055ee..278c88d7cd3a231a1ef3f6093c5c843e77c94526 100644
--- a/mace/benchmark/statistics.cc
+++ b/mace/benchmark/statistics.cc
@@ -58,9 +58,9 @@ std::string ShapeToString(const std::vector<OutputShape> &output_shape) {
 
   std::stringstream stream;
   stream << "[";
-  for (int i = 0; i < output_shape.size(); ++i) {
+  for (size_t i = 0; i < output_shape.size(); ++i) {
     const std::vector<index_t> &dims = output_shape[i].dims();
-    for (int j = 0; j < dims.size(); ++j) {
+    for (size_t j = 0; j < dims.size(); ++j) {
       stream << dims[j];
       if (j != dims.size() - 1) {
         stream << ",";
@@ -83,7 +83,7 @@ std::string VectorToString(const std::vector<T> &vec) {
 
   std::stringstream stream;
   stream << "[";
-  for (int i = 0; i < vec.size(); ++i) {
+  for (size_t i = 0; i < vec.size(); ++i) {
     stream << vec[i];
     if (i != vec.size() - 1) {
       stream << ",";
diff --git a/mace/benchmark/statistics.h b/mace/benchmark/statistics.h
index 056df9f4cdf371063937fc3acce6eaa92f555985..b4d69f275aa7b0edd933897b26929d739f6f4c58 100644
--- a/mace/benchmark/statistics.h
+++ b/mace/benchmark/statistics.h
@@ -54,24 +54,24 @@ class TimeInfo {
              sum_(0), square_sum(0)
   {}
 
-  const int64_t round() const {
+  int64_t round() const {
     return round_;
   }
 
-  const T first() const {
+  T first() const {
     return first_;
   }
 
-  const T sum() const {
+  T sum() const {
     return sum_;
   }
 
-  const double avg() const {
+  double avg() const {  // quiet NaN when round_ == 0, else sum_ / round_
     return round_ == 0 ? std::numeric_limits<double>::quiet_NaN() :
            sum_ * 1.0f / round_;
   }
 
-  const double std_deviation() const {
+  double std_deviation() const {
     if (round_ == 0 || min_ == max_) {
       return 0;
     }
@@ -111,12 +111,12 @@ class TimeInfo {
   }
 
  private:
+  int64_t round_;
   T first_;
   T curr_;
   T min_;
   T max_;
   T sum_;
-  int64_t round_;
   double square_sum;
 };
 
diff --git a/mace/core/allocator.h b/mace/core/allocator.h
index c22ea47c738d551ce586c3fecb169f1f68a18e4d..8c73025b4923cd860f3a47f3109cc4325728b259 100644
--- a/mace/core/allocator.h
+++ b/mace/core/allocator.h
@@ -21,6 +21,7 @@
 #include <vector>
 #include <cstring>
 
+#include "mace/core/macros.h"
 #include "mace/core/registry.h"
 #include "mace/core/types.h"
 #include "mace/public/mace.h"
@@ -83,6 +84,8 @@ class CPUAllocator : public Allocator {
 
   void *NewImage(const std::vector<size_t> &shape,
                  const DataType dt) const override {
+    MACE_UNUSED(shape);  // unused: this override only logs FATAL
+    MACE_UNUSED(dt);
     LOG(FATAL) << "Allocate CPU image";
     return nullptr;
   }
@@ -96,14 +99,20 @@ class CPUAllocator : public Allocator {
     free(data);
   };
   void *Map(void *buffer, size_t offset, size_t nbytes) const override {
+    MACE_UNUSED(nbytes);  // unused: the result is just buffer + offset
     return reinterpret_cast<char*>(buffer) + offset;
   }
   void *MapImage(void *buffer,
                  const std::vector<size_t> &image_shape,
                  std::vector<size_t> *mapped_image_pitch) const override {
+    MACE_UNUSED(image_shape);  // unused: buffer is returned unchanged
+    MACE_UNUSED(mapped_image_pitch);  // never written by this override
     return buffer;
   }
-  void Unmap(void *buffer, void *mapper_ptr) const override {}
+  void Unmap(void *buffer, void *mapper_ptr) const override {
+    MACE_UNUSED(buffer);  // both arguments are ignored by this override
+    MACE_UNUSED(mapper_ptr);
+  }
   bool OnHost() const override { return true; }
 };
 
diff --git a/mace/core/buffer.h b/mace/core/buffer.h
index d822c90a5de359cbe7844687e30f19926956710e..f4b252a776296b1e065816c3a9b6288d13d03837 100644
--- a/mace/core/buffer.h
+++ b/mace/core/buffer.h
@@ -21,6 +21,7 @@
 #include <functional>
 
 #include "mace/core/allocator.h"
+#include "mace/core/macros.h"
 #include "mace/core/types.h"
 
 namespace mace {
@@ -133,6 +134,7 @@ class Buffer : public BufferBase {
 
   void *Map(index_t offset, index_t length, std::vector<size_t> *pitch) const {
     MACE_CHECK_NOTNULL(buf_);
+    MACE_UNUSED(pitch);  // pitch is not passed on to allocator_->Map()
     return allocator_->Map(buf_, offset, length);
   }
 
@@ -232,6 +234,9 @@ class Image : public BufferBase {
   std::vector<size_t> image_shape() const { return shape_; }
 
   void *Map(index_t offset, index_t length, std::vector<size_t> *pitch) const {
+    MACE_UNUSED(offset);  // all three arguments are ignored here
+    MACE_UNUSED(length);
+    MACE_UNUSED(pitch);
     MACE_NOT_IMPLEMENTED;
     return nullptr;
   }
@@ -254,9 +259,17 @@ class Image : public BufferBase {
     mapped_buf_ = nullptr;
   }
 
-  void Resize(index_t size) { MACE_NOT_IMPLEMENTED; }
+  void Resize(index_t size) {
+    MACE_UNUSED(size);  // ignored: body is only MACE_NOT_IMPLEMENTED
+    MACE_NOT_IMPLEMENTED;
+  }
 
-  void Copy(void *src, index_t offset, index_t length) { MACE_NOT_IMPLEMENTED; }
+  void Copy(void *src, index_t offset, index_t length) {
+    MACE_UNUSED(src);  // ignored: body is only MACE_NOT_IMPLEMENTED
+    MACE_UNUSED(offset);
+    MACE_UNUSED(length);
+    MACE_NOT_IMPLEMENTED;
+  }
 
   bool OnHost() const { return allocator_->OnHost(); }
 
@@ -327,11 +340,17 @@ class BufferSlice : public BufferBase {
   }
 
   void *Map(index_t offset, index_t length, std::vector<size_t> *pitch) const {
+    MACE_UNUSED(offset);  // all three arguments are ignored here
+    MACE_UNUSED(length);
+    MACE_UNUSED(pitch);
     MACE_NOT_IMPLEMENTED;
     return nullptr;
   }
 
-  void UnMap(void *mapped_ptr) const { MACE_NOT_IMPLEMENTED; }
+  void UnMap(void *mapped_ptr) const {
+    MACE_UNUSED(mapped_ptr);  // ignored: body is only MACE_NOT_IMPLEMENTED
+    MACE_NOT_IMPLEMENTED;
+  }
 
   void Map(std::vector<size_t> *pitch) {
     MACE_CHECK_NOTNULL(buffer_);
@@ -350,7 +369,12 @@ class BufferSlice : public BufferBase {
       " to ", size, " is illegal");
   }
 
-  void Copy(void *src, index_t offset, index_t length) { MACE_NOT_IMPLEMENTED; }
+  void Copy(void *src, index_t offset, index_t length) {
+    MACE_UNUSED(src);  // ignored: body is only MACE_NOT_IMPLEMENTED
+    MACE_UNUSED(offset);
+    MACE_UNUSED(length);
+    MACE_NOT_IMPLEMENTED;
+  }
 
   index_t offset() const { return offset_; }
 
diff --git a/mace/core/net.cc b/mace/core/net.cc
index 464f8a1ace32c9a7868c053c38f4990e953c275f..9fad47b6c767e94e3546c49a20b07986fdd5f7df 100644
--- a/mace/core/net.cc
+++ b/mace/core/net.cc
@@ -14,6 +14,7 @@
 
 #include <utility>
 
+#include "mace/core/macros.h"
 #include "mace/core/net.h"
 #include "mace/utils/memory_logging.h"
 #include "mace/utils/timer.h"
@@ -25,7 +26,10 @@ NetBase::NetBase(const std::shared_ptr<const OperatorRegistry> op_registry,
                  const std::shared_ptr<const NetDef> net_def,
                  Workspace *ws,
                  DeviceType type)
-    : name_(net_def->name()), op_registry_(op_registry) {}
+    : name_(net_def->name()), op_registry_(op_registry) {
+  MACE_UNUSED(ws);
+  MACE_UNUSED(type);
+}
 
 SerialNet::SerialNet(const std::shared_ptr<const OperatorRegistry> op_registry,
                      const std::shared_ptr<const NetDef> net_def,
diff --git a/mace/core/runtime/cpu/cpu_runtime.cc b/mace/core/runtime/cpu/cpu_runtime.cc
index c0c97c35a0aad5e10a5d48ed5ce4a4444be9f2d1..09891e8c4cfaab9afe9757dce75b13793e884e3f 100644
--- a/mace/core/runtime/cpu/cpu_runtime.cc
+++ b/mace/core/runtime/cpu/cpu_runtime.cc
@@ -33,6 +33,7 @@ namespace mace {
 
 namespace {
 
+#ifndef MACE_ENABLE_OPENMP
 int GetCPUCount() {
   char path[32];
   int cpu_count = 0;
@@ -50,12 +51,14 @@ int GetCPUCount() {
     cpu_count++;
   }
 }
+#endif
 
 int GetCPUMaxFreq(int cpu_id) {
   char path[64];
   snprintf(path, sizeof(path),
           "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq",
           cpu_id);
+
   FILE *fp = fopen(path, "rb");
   if (!fp) {
     LOG(WARNING) << "File: " << path << " not exists.";
@@ -63,47 +66,14 @@ int GetCPUMaxFreq(int cpu_id) {
   }
 
   int freq = 0;
-  fscanf(fp, "%d", &freq);
+  int items_read = fscanf(fp, "%d", &freq);  // 1 when freq was parsed
+  if (items_read != 1) {  // not fatal: freq keeps its initial value, 0
+    LOG(WARNING) << "Read file: " << path << " failed.";
+  }
   fclose(fp);
   return freq;
 }
 
-void SortCPUIdsByMaxFreqAsc(std::vector<int> *cpu_ids, int *big_core_offset) {
-  MACE_CHECK_NOTNULL(cpu_ids);
-  int cpu_count = cpu_ids->size();
-  std::vector<int> cpu_max_freq;
-  cpu_max_freq.resize(cpu_count);
-
-  // set cpu max frequency
-  for (int i = 0; i < cpu_count; ++i) {
-    cpu_max_freq[i] = GetCPUMaxFreq(i);
-    (*cpu_ids)[i] = i;
-  }
-
-  // sort cpu ids by max frequency asc, bubble sort
-  for (int i = 0; i < cpu_count - 1; ++i) {
-    for (int j = i + 1; j < cpu_count; ++j) {
-      if (cpu_max_freq[i] > cpu_max_freq[j]) {
-        int tmp = (*cpu_ids)[i];
-        (*cpu_ids)[i] = (*cpu_ids)[j];
-        (*cpu_ids)[j] = tmp;
-
-        tmp = cpu_max_freq[i];
-        cpu_max_freq[i] = cpu_max_freq[j];
-        cpu_max_freq[j] = tmp;
-      }
-    }
-  }
-
-  *big_core_offset = 0;
-  for (int i = 1; i < cpu_count; ++i) {
-    if (cpu_max_freq[i] > cpu_max_freq[i - 1]) {
-      *big_core_offset = i;
-      break;
-    }
-  }
-}
-
 void SetThreadAffinity(cpu_set_t mask) {
 #if defined(__ANDROID__)
   pid_t pid = gettid();
diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc
index cf114f4d5b15c20ab2e522cbc18d7a49ea8606d3..6d068cf74a20d06919a9d8adf8db21d8d3ef5813 100644
--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -23,6 +23,7 @@
 #include <utility>
 
 #include "mace/public/mace_runtime.h"
+#include "mace/core/macros.h"
 #include "mace/core/file_storage.h"
 #include "mace/core/runtime/opencl/opencl_extension.h"
 #include "mace/public/mace.h"
@@ -176,6 +177,8 @@ void OpenCLPrintfCallback(const char *buffer,
                           size_t length,
                           size_t final,
                           void *user_data) {
+  MACE_UNUSED(final);
+  MACE_UNUSED(user_data);
   fwrite(buffer, 1, length, stdout);
 }
 
@@ -218,6 +221,22 @@ void GetAdrenoContextProperties(std::vector<cl_context_properties> *properties,
   // The properties list should be terminated with 0
   properties->push_back(0);
 }
+
+// Maps a device name string to the matching GPUType.
+// The Adreno name must match exactly; Mali and PowerVR are recognized by
+// substring. Any other name yields GPUType::UNKNOWN.
+GPUType ParseGPUType(const std::string &device_name) {
+  if (device_name == "QUALCOMM Adreno(TM)") {
+    return GPUType::QUALCOMM_ADRENO;
+  }
+  if (device_name.find("Mali") != std::string::npos) {
+    return GPUType::MALI;
+  }
+  if (device_name.find("PowerVR") != std::string::npos) {
+    return GPUType::PowerVR;
+  }
+  return GPUType::UNKNOWN;
+}
 }  // namespace
 
 void OpenCLProfilingTimer::StartTiming() {}
@@ -389,11 +408,11 @@ cl::Device &OpenCLRuntime::device() { return *device_; }
 
 cl::CommandQueue &OpenCLRuntime::command_queue() { return *command_queue_; }
 
-const uint64_t OpenCLRuntime::device_global_mem_cache_size() const {
+uint64_t OpenCLRuntime::device_global_mem_cache_size() const {
   return device_gloabl_mem_cache_size_;
 }
 
-const uint32_t OpenCLRuntime::device_compute_units() const {
+uint32_t OpenCLRuntime::device_compute_units() const {
   return device_compute_units_;
 }
 
@@ -597,12 +616,12 @@ uint64_t OpenCLRuntime::GetKernelWaveSize(const cl::Kernel &kernel) {
   return size;
 }
 
-const bool OpenCLRuntime::IsNonUniformWorkgroupsSupported() {
+bool OpenCLRuntime::IsNonUniformWorkgroupsSupported() const {
   return (gpu_type_ == GPUType::QUALCOMM_ADRENO &&
       opencl_version_ == "2.0");
 }
 
-const GPUType OpenCLRuntime::gpu_type() const {
+GPUType OpenCLRuntime::gpu_type() const {
   return gpu_type_;
 }
 
@@ -610,36 +629,20 @@ const std::string OpenCLRuntime::platform_info() const {
   return platform_info_;
 }
 
-const GPUType OpenCLRuntime::ParseGPUType(
-    const std::string &device_name) {
-  constexpr const char *kQualcommAdrenoGPUStr = "QUALCOMM Adreno(TM)";
-  constexpr const char *kMaliGPUStr = "Mali";
-  constexpr const char *kPowerVRGPUStr = "PowerVR";
-
-  if (device_name == kQualcommAdrenoGPUStr) {
-    return GPUType::QUALCOMM_ADRENO;
-  } else if (device_name.find(kMaliGPUStr) != std::string::npos) {
-    return GPUType::MALI;
-  } else if (device_name.find(kPowerVRGPUStr) != std::string::npos) {
-    return GPUType::PowerVR;
-  } else {
-    return GPUType::UNKNOWN;
-  }
-}
 const std::string OpenCLRuntime::ParseDeviceVersion(
     const std::string &device_version) {
   // OpenCL Device version string format:
-  // OpenCL<space><major_version.minor_version><space>\
+  // OpenCL<space><major_version.minor_version><space>
   // <vendor-specific information>
   auto words = Split(device_version, ' ');
   return words[1];
 }
 
-const bool OpenCLRuntime::IsOutOfRangeCheckEnabled() const {
+bool OpenCLRuntime::IsOutOfRangeCheckEnabled() const {
   return out_of_range_check_;
 }
 
-const bool OpenCLRuntime::is_profiling_enabled() const {
+bool OpenCLRuntime::is_profiling_enabled() const {
   return is_profiling_enabled_;
 }
 
diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h
index 9588a3c6facaf78a0da1892632e53ef076dfbdb2..f7fab7471f1a75cd747204422b2670f6c282e035 100644
--- a/mace/core/runtime/opencl/opencl_runtime.h
+++ b/mace/core/runtime/opencl/opencl_runtime.h
@@ -70,18 +70,18 @@ class OpenCLRuntime {
   cl::Context &context();
   cl::Device &device();
   cl::CommandQueue &command_queue();
-  const GPUType gpu_type() const;
+  GPUType gpu_type() const;
   const std::string platform_info() const;
-  const uint64_t device_global_mem_cache_size() const;
-  const uint32_t device_compute_units() const;
+  uint64_t device_global_mem_cache_size() const;
+  uint32_t device_compute_units() const;
 
   void GetCallStats(const cl::Event &event, CallStats *stats);
   uint64_t GetDeviceMaxWorkGroupSize();
   uint64_t GetKernelMaxWorkGroupSize(const cl::Kernel &kernel);
   uint64_t GetKernelWaveSize(const cl::Kernel &kernel);
-  const bool IsNonUniformWorkgroupsSupported();
-  const bool IsOutOfRangeCheckEnabled() const;
-  const bool is_profiling_enabled() const;
+  bool IsNonUniformWorkgroupsSupported() const;
+  bool IsOutOfRangeCheckEnabled() const;
+  bool is_profiling_enabled() const;
 
   cl::Kernel BuildKernel(const std::string &program_name,
                          const std::string &kernel_name,
@@ -112,7 +112,6 @@ class OpenCLRuntime {
       const std::string &built_program_key,
       const std::string &build_options_str,
       cl::Program *program);
-  const GPUType ParseGPUType(const std::string &device_name);
   const std::string ParseDeviceVersion(const std::string &device_version);
 
  private:
diff --git a/mace/kernels/activation.h b/mace/kernels/activation.h
index ca69aa3e516a9e4a2c549fbd97a593a62874aca2..961d48848c3f182ae91c3395ccd2960bc7dae956 100644
--- a/mace/kernels/activation.h
+++ b/mace/kernels/activation.h
@@ -136,6 +136,7 @@ class ActivationFunctor<DeviceType::CPU, float> {
                   const Tensor *alpha,
                   Tensor *output,
                   StatsFuture *future) {
+    MACE_UNUSED(future);
     const float *input_ptr = input->data<float>();
     float *output_ptr = output->mutable_data<float>();
     if (activation_ == PRELU) {
diff --git a/mace/kernels/addn.h b/mace/kernels/addn.h
index abc7efd837db5bdcfc9d5e41bd8704549635e901..c61be7d21bfd1108d3857a457a3e5479ec89b39d 100644
--- a/mace/kernels/addn.h
+++ b/mace/kernels/addn.h
@@ -39,6 +39,7 @@ struct AddNFunctor {
   void operator()(const std::vector<const Tensor *> &input_tensors,
                   Tensor *output_tensor,
                   StatsFuture *future) {
+    MACE_UNUSED(future);
     output_tensor->ResizeLike(input_tensors[0]);
     index_t size = output_tensor->size();
     Tensor::MappingGuard output_map(output_tensor);
diff --git a/mace/kernels/arm/conv_2d_neon_3x3.cc b/mace/kernels/arm/conv_2d_neon_3x3.cc
index fba0a7e2195e0ae6e673cae0190518cad510aeb5..0e4ac0eb8c1dcc4ac0c3686d45d80c1f7f3ea266 100644
--- a/mace/kernels/arm/conv_2d_neon_3x3.cc
+++ b/mace/kernels/arm/conv_2d_neon_3x3.cc
@@ -42,22 +42,25 @@ void Conv2dNeonK3x3S1(const float *input,
     for (index_t m = 0; m < out_channels; m += 2) {
       if (m + 1 < out_channels) {
         float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
+#if defined(MACE_ENABLE_NEON)
         float *out_ptr1_base =
           output + b * out_batch_size + (m + 1) * out_image_size;
+#endif
         for (index_t c = 0; c < in_channels; ++c) {
           float *out_ptr0 = out_ptr0_base;
-          float *out_ptr1 = out_ptr1_base;
-
           const float *in_ptr0 = input + b * in_batch_size + c * in_image_size;
+          const float *filter_ptr0 = filter + m * in_channels * 9 + c * 9;
+
+#if defined(MACE_ENABLE_NEON)
+          float *out_ptr1 = out_ptr1_base;
           const float *in_ptr1 =
             input + b * in_batch_size + c * in_image_size + 1 * in_width;
           const float *in_ptr2 =
             input + b * in_batch_size + c * in_image_size + 2 * in_width;
           const float *in_ptr3 =
             input + b * in_batch_size + c * in_image_size + 3 * in_width;
-          const float *filter_ptr0 = filter + m * in_channels * 9 + c * 9;
           const float *filter_ptr1 = filter + (m + 1) * in_channels * 9 + c * 9;
-
+#endif
 #if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
           // load filter (2 outch x 3 height x 3 width): vf_outch_height
           float32x4_t vf00, vf01, vf02;
@@ -321,12 +324,14 @@ void Conv2dNeonK3x3S1(const float *input,
 
             const float
               *in_ptr0 = input + b * in_batch_size + c * in_image_size;
+#if defined(MACE_ENABLE_NEON)
             const float *in_ptr1 =
               input + b * in_batch_size + c * in_image_size + 1 * in_width;
             const float *in_ptr2 =
               input + b * in_batch_size + c * in_image_size + 2 * in_width;
             const float *in_ptr3 =
               input + b * in_batch_size + c * in_image_size + 3 * in_width;
+#endif
             const float *filter_ptr0 = filter + mm * in_channels * 9 + c * 9;
 
 #if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
diff --git a/mace/kernels/arm/conv_2d_neon_5x5.cc b/mace/kernels/arm/conv_2d_neon_5x5.cc
index 58dbed34690c9307cf36fc3db84f2fa0bd7d4cb6..f4fe7ce7ccbaa3ca69de88da6234a17b66097470 100644
--- a/mace/kernels/arm/conv_2d_neon_5x5.cc
+++ b/mace/kernels/arm/conv_2d_neon_5x5.cc
@@ -121,23 +121,25 @@ void Conv2dNeonK5x5S1(const float *input,
     for (index_t m = 0; m < out_channels; m += 4) {
       if (m + 3 < out_channels) {
         float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
+#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__)
         float *out_ptr1_base =
           output + b * out_batch_size + (m + 1) * out_image_size;
         float *out_ptr2_base =
             output + b * out_batch_size + (m + 2) * out_image_size;
         float *out_ptr3_base =
             output + b * out_batch_size + (m + 3) * out_image_size;
+#endif
         for (index_t c = 0; c < in_channels; ++c) {
           const float *in_ptr_base =
               input + b * in_batch_size + c * in_image_size;
           const float *filter_ptr0 = filter + m * in_channels * 25 + c * 25;
+#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__)
           const float *filter_ptr1 =
               filter + (m + 1) * in_channels * 25 + c * 25;
           const float *filter_ptr2 =
               filter + (m + 2) * in_channels * 25 + c * 25;
           const float *filter_ptr3 =
               filter + (m + 3) * in_channels * 25 + c * 25;
-#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__)
           for (index_t h = 0; h < out_height; ++h) {
             for (index_t w = 0; w + 3 < out_width; w += 4) {
                // input offset
diff --git a/mace/kernels/arm/conv_2d_neon_7x7.cc b/mace/kernels/arm/conv_2d_neon_7x7.cc
index d1bbecbb710ad65d0785d5a541dcbe4bddd854a8..057b93138cebb226fab6008798ccc273a29e574d 100644
--- a/mace/kernels/arm/conv_2d_neon_7x7.cc
+++ b/mace/kernels/arm/conv_2d_neon_7x7.cc
@@ -198,23 +198,25 @@ void Conv2dNeonK7x7S1(const float *input,
     for (index_t m = 0; m < out_channels; m += 4) {
       if (m + 3 < out_channels) {
         float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
+#if defined(MACE_ENABLE_NEON)
         float *out_ptr1_base =
           output + b * out_batch_size + (m + 1) * out_image_size;
         float *out_ptr2_base =
             output + b * out_batch_size + (m + 2) * out_image_size;
         float *out_ptr3_base =
             output + b * out_batch_size + (m + 3) * out_image_size;
+#endif
         for (index_t c = 0; c < in_channels; ++c) {
           const float *in_ptr_base =
               input + b * in_batch_size + c * in_image_size;
           const float *filter_ptr0 = filter + m * in_channels * 49 + c * 49;
+#if defined(MACE_ENABLE_NEON)
           const float *filter_ptr1 =
               filter + (m + 1) * in_channels * 49 + c * 49;
           const float *filter_ptr2 =
               filter + (m + 2) * in_channels * 49 + c * 49;
           const float *filter_ptr3 =
               filter + (m + 3) * in_channels * 49 + c * 49;
-#if defined(MACE_ENABLE_NEON)
           for (index_t h = 0; h < out_height; ++h) {
             for (index_t w = 0; w + 3 < out_width; w += 4) {
                // input offset
@@ -352,23 +354,25 @@ void Conv2dNeonK7x7S2(const float *input,
     for (index_t m = 0; m < out_channels; m += 4) {
       if (m + 3 < out_channels) {
         float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
+#if defined(MACE_ENABLE_NEON)
         float *out_ptr1_base =
             output + b * out_batch_size + (m + 1) * out_image_size;
         float *out_ptr2_base =
             output + b * out_batch_size + (m + 2) * out_image_size;
         float *out_ptr3_base =
             output + b * out_batch_size + (m + 3) * out_image_size;
+#endif
         for (index_t c = 0; c < in_channels; ++c) {
           const float *in_ptr_base =
               input + b * in_batch_size + c * in_image_size;
           const float *filter_ptr0 = filter + m * in_channels * 49 + c * 49;
+#if defined(MACE_ENABLE_NEON)
           const float *filter_ptr1 =
               filter + (m + 1) * in_channels * 49 + c * 49;
           const float *filter_ptr2 =
               filter + (m + 2) * in_channels * 49 + c * 49;
           const float *filter_ptr3 =
               filter + (m + 3) * in_channels * 49 + c * 49;
-#if defined(MACE_ENABLE_NEON)
           for (index_t h = 0; h < out_height; ++h) {
             for (index_t w = 0; w + 3 < out_width; w += 4) {
               // input offset
@@ -516,23 +520,25 @@ void Conv2dNeonK7x7S3(const float *input,
     for (index_t m = 0; m < out_channels; m += 4) {
       if (m + 3 < out_channels) {
         float *out_ptr0_base = output + b * out_batch_size + m * out_image_size;
+#if defined(MACE_ENABLE_NEON)
         float *out_ptr1_base =
             output + b * out_batch_size + (m + 1) * out_image_size;
         float *out_ptr2_base =
             output + b * out_batch_size + (m + 2) * out_image_size;
         float *out_ptr3_base =
             output + b * out_batch_size + (m + 3) * out_image_size;
+#endif
         for (index_t c = 0; c < in_channels; ++c) {
           const float *in_ptr_base =
               input + b * in_batch_size + c * in_image_size;
           const float *filter_ptr0 = filter + m * in_channels * 49 + c * 49;
+#if defined(MACE_ENABLE_NEON)
           const float *filter_ptr1 =
               filter + (m + 1) * in_channels * 49 + c * 49;
           const float *filter_ptr2 =
               filter + (m + 2) * in_channels * 49 + c * 49;
           const float *filter_ptr3 =
               filter + (m + 3) * in_channels * 49 + c * 49;
-#if defined(MACE_ENABLE_NEON)
           for (index_t h = 0; h < out_height; ++h) {
             for (index_t w = 0; w + 3 < out_width; w += 4) {
               // input offset
diff --git a/mace/kernels/arm/depthwise_conv2d_neon_3x3.cc b/mace/kernels/arm/depthwise_conv2d_neon_3x3.cc
index fb0f3933fe6dd70aab36520fe628a99aa261994f..fb36bdaded33d5217f1ccb9ae1d9427204433cc5 100644
--- a/mace/kernels/arm/depthwise_conv2d_neon_3x3.cc
+++ b/mace/kernels/arm/depthwise_conv2d_neon_3x3.cc
@@ -17,6 +17,7 @@
 #endif
 
 #include "mace/kernels/arm/depthwise_conv2d_neon.h"
+#include "mace/core/macros.h"
 
 namespace mace {
 namespace kernels {
@@ -65,6 +66,10 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
                                const index_t valid_w_start,
                                const index_t valid_w_stop,
                                float *output) {
+#if !defined(MACE_ENABLE_NEON)
+  MACE_UNUSED(valid_w_start);
+  MACE_UNUSED(valid_w_stop);
+#endif
   const index_t multiplier = out_channels / in_channels;
   const index_t in_image_size = in_height * in_width;
   const index_t out_image_size = out_height * out_width;
@@ -305,6 +310,10 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
                                const index_t valid_w_start,
                                const index_t valid_w_stop,
                                float *output) {
+#if !defined(MACE_ENABLE_NEON)
+  MACE_UNUSED(valid_w_start);
+  MACE_UNUSED(valid_w_stop);
+#endif
   const index_t multiplier = out_channels / in_channels;
   const index_t in_image_size = in_height * in_width;
   const index_t out_image_size = out_height * out_width;
diff --git a/mace/kernels/batch_norm.h b/mace/kernels/batch_norm.h
index 48abc908a35f2c7b883115a18bc137c4cb52211e..0b6fddb532f7d77afc8adaf1d19fcde85f896a37 100644
--- a/mace/kernels/batch_norm.h
+++ b/mace/kernels/batch_norm.h
@@ -64,6 +64,7 @@ struct BatchNormFunctor<DeviceType::CPU, float> : BatchNormFunctorBase {
                   const float epsilon,
                   Tensor *output,
                   StatsFuture *future) {
+    MACE_UNUSED(future);
     // Batch normalization in the paper https://arxiv.org/abs/1502.03167 .
     // The calculation formula for inference is
     // Y = \frac{ \scale } { \sqrt{var+\variance_epsilon} } * X +
diff --git a/mace/kernels/bias_add.h b/mace/kernels/bias_add.h
index a6df23501666c1bb2eca1edfbcb2191a08217fca..2ce7904d928cb71d39b0068d6b9f994d650dd7e8 100644
--- a/mace/kernels/bias_add.h
+++ b/mace/kernels/bias_add.h
@@ -38,6 +38,7 @@ struct BiasAddFunctor<DeviceType::CPU, float> {
                   const Tensor *bias,
                   Tensor *output,
                   StatsFuture *future) {
+    MACE_UNUSED(future);
     const index_t batch = input->dim(0);
     const index_t channels = input->dim(1);
     const index_t height = input->dim(2);
diff --git a/mace/kernels/buffer_to_image.h b/mace/kernels/buffer_to_image.h
index b2d9822d99b8e22e6afd47526afab4fe9597f13a..fadc1b0dd0d306ce0d5c5a00d0e4b7bf989050c8 100644
--- a/mace/kernels/buffer_to_image.h
+++ b/mace/kernels/buffer_to_image.h
@@ -39,6 +39,10 @@ struct BufferToImageFunctor : BufferToImageFunctorBase {
                   const BufferType type,
                   Tensor *output,
                   StatsFuture *future) {
+    MACE_UNUSED(input);
+    MACE_UNUSED(type);
+    MACE_UNUSED(output);
+    MACE_UNUSED(future);
     MACE_NOT_IMPLEMENTED;
   }
 };
diff --git a/mace/kernels/channel_shuffle.h b/mace/kernels/channel_shuffle.h
index 2aa3d600137c46d28b1c46d15c4917efd17376b0..4b89764ef02d39def9c1e9884ccac14c0cb67571 100644
--- a/mace/kernels/channel_shuffle.h
+++ b/mace/kernels/channel_shuffle.h
@@ -31,6 +31,7 @@ struct ChannelShuffleFunctor {
   void operator()(const Tensor *input,
                   Tensor *output,
                   StatsFuture *future) {
+    MACE_UNUSED(future);
     output->ResizeLike(input);
 
     Tensor::MappingGuard logits_guard(input);
@@ -56,7 +57,7 @@ struct ChannelShuffleFunctor {
         index_t idx = c / groups_;
         for (index_t hw = 0; hw < height * width; ++hw) {
           output_base[c * image_size + hw] = input_base[
-            (c % groups_ * channels_per_group + c / groups_) * image_size + hw];
+            (g * channels_per_group + idx) * image_size + hw];
         }
       }
     }
diff --git a/mace/kernels/concat.h b/mace/kernels/concat.h
index 69ee79729f845e00870a705875247d1b390a4925..f3139b581267c1eeea43277e46fa8eb17c5101d2 100644
--- a/mace/kernels/concat.h
+++ b/mace/kernels/concat.h
@@ -43,8 +43,9 @@ struct ConcatFunctor : ConcatFunctorBase {
   void operator()(const std::vector<const Tensor *> &input_list,
                   Tensor *output,
                   StatsFuture *future) {
+    MACE_UNUSED(future);
     const Tensor *input0 = input_list.front();
-    const int inputs_count = input_list.size();
+    const size_t inputs_count = input_list.size();
 
     std::vector<index_t> output_shape(input0->shape());
     index_t inner_size = 1;
@@ -53,7 +54,7 @@ struct ConcatFunctor : ConcatFunctorBase {
     }
     std::vector<index_t> outer_sizes(inputs_count, 0);
     outer_sizes[0] = input0->size() / inner_size;
-    for (int i = 1; i < inputs_count; ++i) {
+    for (size_t i = 1; i < inputs_count; ++i) {
       const Tensor *input = input_list[i];
       MACE_CHECK(input->dim_size() == input0->dim_size(),
                  "Ranks of all input tensors must be same.");
diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h
index c9a859ff84ec7d3bb83673632d473b836b97a5d2..dfe5540d16e60bf250685e6a725cede177a2ff97 100644
--- a/mace/kernels/conv_2d.h
+++ b/mace/kernels/conv_2d.h
@@ -103,8 +103,6 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
     const index_t in_batch_size = in_channels * in_image_size;
     const index_t out_batch_size = out_channels * out_image_size;
     const index_t filter_size = filter_height * filter_width;
-    const index_t in_tile_size =
-        3 * stride_w + (filter_width - 1) * dilation_w + 1;
 
 #pragma omp parallel for collapse(2)
     for (index_t b = 0; b < batch; ++b) {
@@ -267,6 +265,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
                   const Tensor *bias,
                   Tensor *output,
                   StatsFuture *future) {
+    MACE_UNUSED(future);
     MACE_CHECK_NOTNULL(input);
     MACE_CHECK_NOTNULL(filter);
     MACE_CHECK_NOTNULL(output);
@@ -345,7 +344,6 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
     Tensor::MappingGuard bias_guard(bias);
     Tensor::MappingGuard output_guard(output);
 
-    auto input_data = input->data<float>();
     auto filter_data = filter->data<float>();
     auto bias_data = bias == nullptr ? nullptr : bias->data<float>();
     auto output_data = output->mutable_data<float>();
@@ -719,7 +717,10 @@ struct Conv2dFunctor<DeviceType::GPU, T> : Conv2dFunctorBase {
                         paddings,
                         dilations,
                         activation,
-                        relux_max_limit) {}
+                        relux_max_limit) {
+    MACE_UNUSED(is_filter_transformed);
+    MACE_UNUSED(scratch);
+  }
 
   void operator()(const Tensor *input,
                   const Tensor *filter,
diff --git a/mace/kernels/depth_to_space.h b/mace/kernels/depth_to_space.h
index 193dac61cfc26cc172b2353628c71d0167c48aa9..733591a5ef8fd906945736848ef3bed6f75c1b10 100644
--- a/mace/kernels/depth_to_space.h
+++ b/mace/kernels/depth_to_space.h
@@ -33,6 +33,7 @@ struct DepthToSpaceOpFunctor {
   explicit DepthToSpaceOpFunctor(const int block_size, bool d2s)
     : block_size_(block_size), d2s_(d2s) {}
   void operator()(const Tensor *input, Tensor *output, StatsFuture *future) {
+    MACE_UNUSED(future);
     const int batch_size = input->dim(0);
     const int input_depth = input->dim(1);
     const int input_height = input->dim(2);
diff --git a/mace/kernels/depthwise_conv2d.h b/mace/kernels/depthwise_conv2d.h
index a0f0b1e3b4041bbe7f56c3d8b3bf6a1ccfadbd0b..ce3c1e48551a66915fcf1888dd21d5ced19edeea 100644
--- a/mace/kernels/depthwise_conv2d.h
+++ b/mace/kernels/depthwise_conv2d.h
@@ -133,6 +133,7 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
                   const Tensor *bias,
                   Tensor *output,
                   StatsFuture *future) {
+    MACE_UNUSED(future);
     MACE_CHECK_NOTNULL(input);
     MACE_CHECK_NOTNULL(filter);
     MACE_CHECK_NOTNULL(output);
diff --git a/mace/kernels/eltwise.h b/mace/kernels/eltwise.h
index cd45614b88aed97331b253cca04d64b0a899d739..2e7bb7693b42635850c3d2ec9e9718fbc42bb71e 100644
--- a/mace/kernels/eltwise.h
+++ b/mace/kernels/eltwise.h
@@ -470,6 +470,7 @@ struct EltwiseFunctor<DeviceType::CPU, float>: EltwiseFunctorBase {
                   const Tensor *input1,
                   Tensor *output,
                   StatsFuture *future) {
+    MACE_UNUSED(future);
     bool swapped = false;
     if (input1 != nullptr) {
       MACE_CHECK(input0->dim_size() == input1->dim_size()
diff --git a/mace/kernels/fully_connected.h b/mace/kernels/fully_connected.h
index 0ac9ad5518f7cb34ae5c16a7ca5e9396c6aa3770..8ef384f5e7d749265aef027e80f8aceaeb4d7db2 100644
--- a/mace/kernels/fully_connected.h
+++ b/mace/kernels/fully_connected.h
@@ -59,6 +59,7 @@ struct FullyConnectedFunctor<DeviceType::CPU, float>: FullyConnectedBase {
                   const Tensor *bias,
                   Tensor *output,
                   StatsFuture *future) {
+    MACE_UNUSED(future);
     std::vector<index_t> output_shape = {input->dim(0), weight->dim(0), 1, 1};
     output->Resize(output_shape);
     const index_t N = output->dim(0);
diff --git a/mace/kernels/gemm.cc b/mace/kernels/gemm.cc
index 3ba7901e3f4ac45d4d1a5bfaa87e54432890499c..0fae44de1fa7c1c77195cfd6c93140c6e60c1d05 100644
--- a/mace/kernels/gemm.cc
+++ b/mace/kernels/gemm.cc
@@ -135,7 +135,9 @@ inline void GemmTile(const float *A,
                      const index_t stride_k,
                      const index_t stride_w,
                      float *C) {
+#if defined(MACE_ENABLE_NEON)
   index_t h, w, k;
+#endif
 
 #if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
   for (h = 0; h + 7 < height; h += 8) {
@@ -443,6 +445,7 @@ inline void GemmTile(const float *A,
 #else
 
 #if defined(MACE_ENABLE_NEON)  // armv7
+  w = (width >> 2) << 2;
   for (h = 0; h + 3 < height; h += 4) {
     for (k = 0; k + 3 < K; k += 4) {
       const float *a_ptr = A + (h * stride_k + k);
@@ -523,8 +526,6 @@ inline void GemmTile(const float *A,
           c_ptr2 += 4;
           c_ptr3 += 4;
         }
-
-        w = (width >> 2) << 2;
       }
       if (w < width) {
         const float *a_ptr = A + (h * stride_k + k);
diff --git a/mace/kernels/local_response_norm.h b/mace/kernels/local_response_norm.h
index 2a07df310fe7e1885df373aad3fe610dad015e2d..df1560422fbea43cb0d3a5ee006296f564898047 100644
--- a/mace/kernels/local_response_norm.h
+++ b/mace/kernels/local_response_norm.h
@@ -42,6 +42,7 @@ struct LocalResponseNormFunctor<DeviceType::CPU, float> {
                   float beta,
                   Tensor *output,
                   StatsFuture *future) {
+    MACE_UNUSED(future);
     const index_t batch = input->dim(0);
     const index_t channels = input->dim(1);
     const index_t height = input->dim(2);
diff --git a/mace/kernels/matmul.h b/mace/kernels/matmul.h
index 0d94d2c571afcf11e2206654d069b4656efc727d..3b189261b6aa245ef9b9ac1e2b8803268c6acc91 100644
--- a/mace/kernels/matmul.h
+++ b/mace/kernels/matmul.h
@@ -42,6 +42,7 @@ struct MatMulFunctor {
                   const Tensor *B,
                   Tensor *C,
                   StatsFuture *future) {
+    MACE_UNUSED(future);
     std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1};
     C->Resize(c_shape);
 
@@ -59,14 +60,6 @@ struct MatMulFunctor {
     // It is better to use large block size if it fits for fast cache.
     // Assume l1 cache size is 32k, we load three blocks at a time (A, B, C),
     // the block size should be sqrt(32k / sizeof(T) / 3).
-    const index_t block_size = 48;
-    const index_t block_tile_height = RoundUpDiv(height, block_size);
-    const index_t block_tile_width = RoundUpDiv(width, block_size);
-    const index_t block_tile_k = RoundUpDiv(K, block_size);
-    const index_t remain_height = height % block_size;
-    const index_t remain_width = width % block_size;
-    const index_t remain_k = K % block_size;
-    constexpr index_t register_tile_size = 4;
     memset(c_ptr_base, 0, batch * height * width * sizeof(T));
 
     Gemm(a_ptr_base, b_ptr_base, batch, height, K, width, c_ptr_base);
diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc
index 105435d5a48e912bf2c147d628d9f12581ebeea1..4587a2cb7b2c8a5cebe0470533a1457bb6937e1a 100644
--- a/mace/kernels/opencl/addn.cc
+++ b/mace/kernels/opencl/addn.cc
@@ -36,7 +36,7 @@ void AddNFunctor<DeviceType::GPU, T>::operator()(
 
   auto runtime = OpenCLRuntime::Global();
 
-  for (int i = 1; i < size; ++i) {
+  for (size_t i = 1; i < size; ++i) {
     MACE_CHECK_NOTNULL(input_tensors[i]);
     MACE_CHECK(batch == input_tensors[i]->dim(0));
     MACE_CHECK(height == input_tensors[i]->dim(1));
diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc
index f2fda4c497acd1997eea3553182d03a680fb1140..96c15fd8adfe4369d483cc6be424d341fe59b743 100644
--- a/mace/kernels/opencl/concat.cc
+++ b/mace/kernels/opencl/concat.cc
@@ -136,7 +136,6 @@ static void ConcatN(cl::Kernel *kernel,
   const index_t batch = output->dim(0);
   const index_t height = output->dim(1);
   const index_t width = output->dim(2);
-  const index_t channel = output->dim(3);
 
   auto runtime = OpenCLRuntime::Global();
 
diff --git a/mace/kernels/opencl/conv_2d_1x1.cc b/mace/kernels/opencl/conv_2d_1x1.cc
index 264dce62c533b07ea1af03925e6f3721a4d8b7a4..52ed0368fc6fbe824ee2f254394ecfbc6324071b 100644
--- a/mace/kernels/opencl/conv_2d_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_1x1.cc
@@ -72,6 +72,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
                              StatsFuture *future,
                              uint32_t *kwg_size,
                              std::unique_ptr<BufferBase> *kernel_error) {
+  MACE_UNUSED(padding);
+  MACE_UNUSED(dilations);
   const index_t batch = output->dim(0);
   const index_t height = output->dim(1);
   const index_t width = output->dim(2);
diff --git a/mace/kernels/opencl/depthwise_conv.cc b/mace/kernels/opencl/depthwise_conv.cc
index fa8383d803509a9ce4966a90008201fc17473a50..78337bc591a2e25d81ab443586b8652ad92593c5 100644
--- a/mace/kernels/opencl/depthwise_conv.cc
+++ b/mace/kernels/opencl/depthwise_conv.cc
@@ -135,7 +135,6 @@ static void DepthwiseConv2d(cl::Kernel *kernel,
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
   }
   if (!IsVecEqual(*prev_input_shape, input->shape())) {
-    const index_t input_batch = input->dim(0);
     const index_t input_height = input->dim(1);
     const index_t input_width = input->dim(2);
 
diff --git a/mace/kernels/opencl/eltwise.cc b/mace/kernels/opencl/eltwise.cc
index 3557032c7fdad9c6f4f1755cd4a05dad83a4809f..94b4c322bc625fa82f9d2e482c99e5b95fdd41d3 100644
--- a/mace/kernels/opencl/eltwise.cc
+++ b/mace/kernels/opencl/eltwise.cc
@@ -25,6 +25,7 @@ void EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
                                                        const Tensor *input1,
                                                        Tensor *output,
                                                        StatsFuture *future) {
+  MACE_UNUSED(future);
   bool swapped = false;
   if (input1 != nullptr) {
     MACE_CHECK(input0->dim_size() == input1->dim_size()
diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc
index 96b99d176a0b0b9a54a3436866ff77dca071a311..716d2af41a5eaf85aef3fb64d4332cbe491f6eea 100644
--- a/mace/kernels/opencl/helper.cc
+++ b/mace/kernels/opencl/helper.cc
@@ -206,17 +206,6 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) {
   }
 }
 
-std::vector<uint32_t> Default2DLocalWS(const uint32_t *gws,
-                                       const uint32_t kwg_size) {
-  std::vector<uint32_t> lws(3, 0);
-  uint64_t cache_size =
-      OpenCLRuntime::Global()->device_global_mem_cache_size();
-  uint32_t base = cache_size / kBaseGPUMemCacheSize;
-  lws[0] = std::min<uint32_t>(base, kwg_size);
-  lws[1] = kwg_size / lws[1];
-  return lws;
-}
-
 std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws,
                                        const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
diff --git a/mace/kernels/opencl/helper.h b/mace/kernels/opencl/helper.h
index 2f3a44dcf235cabeef08862fa0099bf0bd0e9187..ae3f06cfe0f2f00000a39ca7ec5feae285e9cb61 100644
--- a/mace/kernels/opencl/helper.h
+++ b/mace/kernels/opencl/helper.h
@@ -19,6 +19,7 @@
 #include <vector>
 
 #include "mace/core/future.h"
+#include "mace/core/macros.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/core/types.h"
@@ -95,6 +96,7 @@ bool IsVecEqual(const std::vector<T> &input0,
 
 template <typename T>
 void AppendToStream(std::stringstream *ss, const std::string &delimiter, T v) {
+  MACE_UNUSED(delimiter);
   (*ss) << v;
 }
 
@@ -114,8 +116,6 @@ std::string Concat(Args... args) {
   return ss.str();
 }
 
-std::vector<uint32_t> Default2DLocalWS(const uint32_t *gws,
-                                       const uint32_t kwg_size);
 std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws,
                                        const uint32_t kwg_size);
 }  // namespace kernels
diff --git a/mace/kernels/opencl/matmul.cc b/mace/kernels/opencl/matmul.cc
index 93f2bfb50e5ac45735477b95f97ade540922d224..9a16694a0284f1b6583ee633487b4725283bafea 100644
--- a/mace/kernels/opencl/matmul.cc
+++ b/mace/kernels/opencl/matmul.cc
@@ -25,6 +25,7 @@ void MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
                                                       const Tensor *B,
                                                       Tensor *C,
                                                       StatsFuture *future) {
+  MACE_UNUSED(future);
   std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1};
   std::vector<size_t> c_image_shape;
   CalImage2DShape(c_shape, BufferType::IN_OUT_HEIGHT, &c_image_shape);
diff --git a/mace/kernels/opencl/out_of_range_check_test.cc b/mace/kernels/opencl/out_of_range_check_test.cc
index 8dc0a372fb8852e8f0d43e5e28db537ba7a875e5..012edd70cd9a8c73a409886d37fad0b29ef8411b 100644
--- a/mace/kernels/opencl/out_of_range_check_test.cc
+++ b/mace/kernels/opencl/out_of_range_check_test.cc
@@ -25,9 +25,9 @@ namespace mace {
 namespace kernels {
 namespace {
 
-const bool BufferToImageOpImpl(Tensor *buffer,
-                               Tensor *image,
-                               const std::vector<size_t> &image_shape) {
+bool BufferToImageOpImpl(Tensor *buffer,
+                         Tensor *image,
+                         const std::vector<size_t> &image_shape) {
   std::unique_ptr<BufferBase> kernel_error;
   uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
                      static_cast<uint32_t>(image_shape[1])};
@@ -150,7 +150,7 @@ TEST(OutOfRangeCheckTest, RandomTest) {
   ASSERT_FALSE(BufferToImageOpImpl(buffer, image, image_shape));
 
   std::vector<size_t> overflow_image_shape = image_shape;
-  for (int i = 0; i < overflow_image_shape.size(); ++i) {
+  for (size_t i = 0; i < overflow_image_shape.size(); ++i) {
     overflow_image_shape[i] += 1;
   }
   ASSERT_TRUE(BufferToImageOpImpl(buffer, image, overflow_image_shape));
diff --git a/mace/kernels/opencl/pad.cc b/mace/kernels/opencl/pad.cc
index c3c90944aef80f6e971c8bbfa76045381d399786..bc093c16e3f2b66017fe368436b5f172bb9b3d5f 100644
--- a/mace/kernels/opencl/pad.cc
+++ b/mace/kernels/opencl/pad.cc
@@ -25,7 +25,8 @@ void PadFunctor<DeviceType::GPU, T>::operator()(
     const Tensor *input,
     Tensor *output,
     StatsFuture *future) {
-  MACE_CHECK(this->paddings_.size() == (input->dim_size() * 2));
+  MACE_CHECK(
+      this->paddings_.size() == static_cast<size_t>(input->dim_size()) * 2);
   MACE_CHECK((this->paddings_[0] == 0) && (this->paddings_[1] == 0)
                  && (this->paddings_[6] == 0) && (this->paddings_[7] == 0))
     << "Mace only support height/width dimension now";
diff --git a/mace/kernels/opencl/slice.cc b/mace/kernels/opencl/slice.cc
index b7f6086814fb2f63a9a24ccbde5baca0ba904f56..7944ee88b9845f93438d363069e9afbc4065f873 100644
--- a/mace/kernels/opencl/slice.cc
+++ b/mace/kernels/opencl/slice.cc
@@ -75,7 +75,7 @@ void SliceFunctor<DeviceType::GPU, T>::operator()(
   const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
   cl::Event event;
   CallStats call_stats{INT64_MAX, 0};
-  for (int i = 0; i < outputs_count; ++i) {
+  for (size_t i = 0; i < outputs_count; ++i) {
     uint32_t idx = 0;
     if (runtime->IsOutOfRangeCheckEnabled()) {
       kernel_.setArg(idx++,
diff --git a/mace/kernels/pad.h b/mace/kernels/pad.h
index bd61003205bc8d75dae104e8142d2ec4fff4e767..1e0e5ba4479754277cc1499e0517890f7bdb6613 100644
--- a/mace/kernels/pad.h
+++ b/mace/kernels/pad.h
@@ -47,7 +47,9 @@ struct PadFunctor : public PadFunctorBase {
   void operator()(const Tensor *input,
                   Tensor *output,
                   StatsFuture *future) {
-    MACE_CHECK(this->paddings_.size() == (input->dim_size() * 2));
+    MACE_UNUSED(future);
+    MACE_CHECK(
+        this->paddings_.size() == static_cast<size_t>(input->dim_size()) * 2);
     auto input_shape = input->shape();
     output->Resize({input_shape[0] + this->paddings_[0] + this->paddings_[1],
                     input_shape[1] + this->paddings_[2] + this->paddings_[3],
diff --git a/mace/kernels/pooling.h b/mace/kernels/pooling.h
index 384ce834e752aa2233c247638b5c24eb36b08f78..97a65f1eae53b26d68c7c43e07833d4f01a9c33f 100644
--- a/mace/kernels/pooling.h
+++ b/mace/kernels/pooling.h
@@ -173,6 +173,7 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {
   void operator()(const Tensor *input_tensor,
                   Tensor *output_tensor,
                   StatsFuture *future) {
+    MACE_UNUSED(future);
     std::vector<index_t> output_shape(4);
     std::vector<index_t> filter_shape = {
       input_tensor->dim(1), input_tensor->dim(1), kernels_[0], kernels_[1]};
diff --git a/mace/kernels/proposal.h b/mace/kernels/proposal.h
index 22d930d80b3f0a4d21bfe2c9cd14ea422e7190fb..273d17b80d480138a8036cc0ebf8a5fef9b8fa89 100644
--- a/mace/kernels/proposal.h
+++ b/mace/kernels/proposal.h
@@ -92,7 +92,7 @@ inline std::vector<int> nms(const float *bboxes_ptr,
   for (int i = 0; i < num_bboxes; ++i) {
     if (suppressed[i] == 1) continue;
     keep.push_back(i);
-    if (keep.size() >= post_nms_top_n) break;
+    if (keep.size() >= static_cast<size_t>(post_nms_top_n)) break;
     int coord_idx = i << 2;
     const float x1 = bboxes_ptr[coord_idx];
     const float y1 = bboxes_ptr[coord_idx + 1];
@@ -141,10 +141,11 @@ struct ProposalFunctor {
                   const Tensor *img_info_tensor,
                   Tensor *output,
                   StatsFuture *future) {
+    MACE_UNUSED(future);
     MACE_CHECK(rpn_cls_prob->dim(1) == rpn_bbox_pred->dim(1) &&
         rpn_cls_prob->dim(2) == rpn_bbox_pred->dim(2));
     MACE_CHECK((rpn_cls_prob->dim(3) / 2 == rpn_bbox_pred->dim(3) / 4) &&
-        (rpn_cls_prob->dim(3) / 2 == anchors_.size()));
+        (static_cast<size_t>(rpn_cls_prob->dim(3) / 2) == anchors_.size()));
     const float *img_info = img_info_tensor->data<float>();
     const int im_height = static_cast<int>(img_info[0] - 1);
     const int im_width = static_cast<int>(img_info[1] - 1);
diff --git a/mace/kernels/psroi_align.h b/mace/kernels/psroi_align.h
index f5e8b28b2f13c554ee20d2075219b5a65dd62a7c..4417fb1a98341a40d727886563866d4b5d8e5ad7 100644
--- a/mace/kernels/psroi_align.h
+++ b/mace/kernels/psroi_align.h
@@ -38,6 +38,7 @@ struct PSROIAlignFunctor {
                   const Tensor *rois,
                   Tensor *output,
                   StatsFuture *future) {
+    MACE_UNUSED(future);
     const int height = static_cast<int>(input->dim(1));
     const int width = static_cast<int>(input->dim(2));
     const int channels = static_cast<int>(input->dim(3));
diff --git a/mace/kernels/quantize.h b/mace/kernels/quantize.h
index 1ffab4880df718266dc0fdc7cea7dd64fdc362da..5483d06745c03157127930c66d0e008a225af140 100644
--- a/mace/kernels/quantize.h
+++ b/mace/kernels/quantize.h
@@ -81,6 +81,7 @@ struct QuantizeFunctor<CPU, uint8_t> {
                   Tensor *out_min,
                   Tensor *out_max,
                   StatsFuture *future) {
+    MACE_UNUSED(future);
     const float *input_data = input->data<float>();
     const float in_min_data = in_min->data<float>()[0];
     const float in_max_data = in_max->data<float>()[0];
@@ -109,6 +110,7 @@ struct DequantizeFunctor<CPU, uint8_t> {
                   const Tensor *in_max,
                   Tensor *output,
                   StatsFuture *future) {
+    MACE_UNUSED(future);
     const uint8_t *input_data = input->data<uint8_t>();
     const float in_min_data = in_min->data<float>()[0];
     const float in_max_data = in_max->data<float>()[0];
@@ -137,6 +139,7 @@ struct RequantizeFunctor<CPU, uint8_t> {
                   Tensor *out_min,
                   Tensor *out_max,
                   StatsFuture *future) {
+    MACE_UNUSED(future);
     const int *input_data = input->data<int>();
     const float in_min_data = in_min->data<float>()[0];
     const float in_max_data = in_max->data<float>()[0];
diff --git a/mace/kernels/reshape.h b/mace/kernels/reshape.h
index 2405447accb388603205c94b9a9e36a21e8dc8da..221064cca94382c6dad3f8016ecf009d52a5f708 100644
--- a/mace/kernels/reshape.h
+++ b/mace/kernels/reshape.h
@@ -35,6 +35,7 @@ struct ReshapeFunctor {
                   const std::vector<index_t> &out_shape,
                   Tensor *output,
                   StatsFuture *future) {
+    MACE_UNUSED(future);
     output->ResizeWithBuffer(out_shape, input->UnderlyingBuffer());
   }
 };
diff --git a/mace/kernels/resize_bilinear.h b/mace/kernels/resize_bilinear.h
index 6054f8988c5e80443c91280707c5d33149d5a57b..c312fbd2a7006ee29fdeabd2c5ec4fe37d93e33b 100644
--- a/mace/kernels/resize_bilinear.h
+++ b/mace/kernels/resize_bilinear.h
@@ -137,6 +137,7 @@ struct ResizeBilinearFunctor<DeviceType::CPU, float>
     : ResizeBilinearFunctorBase(size, align_corners) {}
 
   void operator()(const Tensor *input, Tensor *output, StatsFuture *future) {
+    MACE_UNUSED(future);
     const index_t batch = input->dim(0);
     const index_t channels = input->dim(1);
     const index_t in_height = input->dim(2);
diff --git a/mace/kernels/slice.h b/mace/kernels/slice.h
index 6d45d8c3cedb8b0ec99d9329049de74f7443ac2d..16248fdeeac598674fc486cb817267825beda0e0 100644
--- a/mace/kernels/slice.h
+++ b/mace/kernels/slice.h
@@ -44,6 +44,7 @@ struct SliceFunctor : SliceFunctorBase {
   void operator()(const Tensor *input,
                   const std::vector<Tensor *> &output_list,
                   StatsFuture *future) {
+    MACE_UNUSED(future);
     const index_t input_channels = input->dim(axis_);
     const size_t outputs_count = output_list.size();
     const index_t output_channels = input_channels / outputs_count;
diff --git a/mace/kernels/softmax.h b/mace/kernels/softmax.h
index fae1d0651dd81ea9a67cd8c2df375a879bd5bc18..bd21547d2cf8294913781f1c1cb6bb3828170edb 100644
--- a/mace/kernels/softmax.h
+++ b/mace/kernels/softmax.h
@@ -39,6 +39,7 @@ struct SoftmaxFunctor;
 template<>
 struct SoftmaxFunctor<DeviceType::CPU, float> {
   void operator()(const Tensor *input, Tensor *output, StatsFuture *future) {
+    MACE_UNUSED(future);
     const index_t batch = input->dim(0);
     const index_t class_count = input->dim(1);
     const index_t class_size = input->dim(2) * input->dim(3);
diff --git a/mace/kernels/space_to_batch.h b/mace/kernels/space_to_batch.h
index 77b54990cf31239ea7e021b23c6a116080eaab7f..6703e5be37609bae6797d414c17fd18325ced57d 100644
--- a/mace/kernels/space_to_batch.h
+++ b/mace/kernels/space_to_batch.h
@@ -53,6 +53,10 @@ struct SpaceToBatchFunctor : SpaceToBatchFunctorBase {
                   const std::vector<index_t> &output_shape,
                   Tensor *batch_tensor,
                   StatsFuture *future) {
+    MACE_UNUSED(space_tensor);
+    MACE_UNUSED(output_shape);
+    MACE_UNUSED(batch_tensor);
+    MACE_UNUSED(future);
     MACE_NOT_IMPLEMENTED;
   }
 };
diff --git a/mace/kernels/transpose.h b/mace/kernels/transpose.h
index b5e029ed4e346b1487ac018e76ebb1356b23492d..3f49ee9c4f6198548f2178b70a70a64efa340086 100644
--- a/mace/kernels/transpose.h
+++ b/mace/kernels/transpose.h
@@ -30,6 +30,7 @@ struct TransposeFunctor {
   explicit TransposeFunctor(const std::vector<int> &dims) : dims_(dims) {}
 
   void operator()(const Tensor *input, Tensor *output, StatsFuture *future) {
+    MACE_UNUSED(future);
     Tensor::MappingGuard input_guard(input);
     Tensor::MappingGuard output_guard(output);
     const std::vector<index_t> &input_shape = input->shape();
diff --git a/mace/kernels/winograd_transform.h b/mace/kernels/winograd_transform.h
index 6f1662a2a29cc05090414f9ab30927b49ff69bab..06b6182e984d065ac47de82677a589aa1a077b75 100644
--- a/mace/kernels/winograd_transform.h
+++ b/mace/kernels/winograd_transform.h
@@ -51,6 +51,9 @@ struct WinogradTransformFunctor : WinogradTransformFunctorBase {
       : WinogradTransformFunctorBase(padding_type, paddings) {}
 
   void operator()(const Tensor *input, Tensor *output, StatsFuture *future) {
+    MACE_UNUSED(input);
+    MACE_UNUSED(output);
+    MACE_UNUSED(future);
     MACE_NOT_IMPLEMENTED;
   }
 };
@@ -105,6 +108,10 @@ struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
                   const Tensor *bias,
                   Tensor *output,
                   StatsFuture *future) {
+    MACE_UNUSED(input);
+    MACE_UNUSED(bias);
+    MACE_UNUSED(output);
+    MACE_UNUSED(future);
     MACE_NOT_IMPLEMENTED;
   }
 };
diff --git a/mace/ops/batch_to_space.cc b/mace/ops/batch_to_space.cc
index fc10cc5845efcc220d7a08d9084b8fafd0d3a130..fd6a4cdb80e7104780833cfa4cda4c4171359d8a 100644
--- a/mace/ops/batch_to_space.cc
+++ b/mace/ops/batch_to_space.cc
@@ -29,6 +29,8 @@ void Register_BatchToSpaceND(OperatorRegistry *op_registry) {
                                      .TypeConstraint<half>("T")
                                      .Build(),
                     BatchToSpaceNDOp<DeviceType::GPU, half>);
+#else
+  MACE_UNUSED(op_registry);
 #endif  // MACE_ENABLE_OPENCL
 }
 
diff --git a/mace/ops/batch_to_space.h b/mace/ops/batch_to_space.h
index 2a6e931c395cde54ef618e56a0bbbf55c3c7f243..78e7e15f12eadffe5053d383cb57835879e08cfa 100644
--- a/mace/ops/batch_to_space.h
+++ b/mace/ops/batch_to_space.h
@@ -38,7 +38,7 @@ class BatchToSpaceNDOp : public Operator<D, T> {
     Tensor *space_tensor = this->Output(OUTPUT);
 
     std::vector<index_t> output_shape(4, 0);
-    CalculateOutputShape(batch_tensor, space_tensor, output_shape.data());
+    CalculateOutputShape(batch_tensor, output_shape.data());
     functor_(space_tensor, output_shape, const_cast<Tensor *>(batch_tensor),
              future);
     return true;
@@ -46,7 +46,6 @@ class BatchToSpaceNDOp : public Operator<D, T> {
 
  private:
   inline void CalculateOutputShape(const Tensor *input_tensor,
-                                   Tensor *output,
                                    index_t *output_shape) {
     auto crops = OperatorBase::GetRepeatedArgument<int>("crops", {0, 0, 0, 0});
     auto block_shape =
diff --git a/mace/ops/channel_shuffle.h b/mace/ops/channel_shuffle.h
index e8e3139e38612cc56794e0212cccd89c116e856c..562d5ac2a5fd85ae8e9b26faaf2fa789403b9059 100644
--- a/mace/ops/channel_shuffle.h
+++ b/mace/ops/channel_shuffle.h
@@ -45,7 +45,6 @@ class ChannelShuffleOp : public Operator<D, T> {
     MACE_CHECK(channels % group_ == 0,
                "input channels must be an integral multiple of group. ",
                input->dim(3));
-    int channels_per_group = channels / group_;
     functor_(input, output, future);
 
     return true;
diff --git a/mace/ops/folded_batch_norm_test.cc b/mace/ops/folded_batch_norm_test.cc
index dcb35b8d34a451767d233d923feeb976e757c645..26cad38e3c10d39b83d1fa99e9cc77b231cb3bf7 100644
--- a/mace/ops/folded_batch_norm_test.cc
+++ b/mace/ops/folded_batch_norm_test.cc
@@ -30,7 +30,7 @@ void CalculateScaleOffset(const std::vector<float> &gamma,
                           std::vector<float> *scale,
                           std::vector<float> *offset) {
   size_t size = gamma.size();
-  for (int i = 0; i < size; ++i) {
+  for (size_t i = 0; i < size; ++i) {
     (*scale)[i] = gamma[i] / std::sqrt(var[i] + epsilon);
     (*offset)[i] = beta[i] - mean[i] * (*scale)[i];
   }
diff --git a/mace/ops/proposal_test.cc b/mace/ops/proposal_test.cc
index cc4cdbe6c541c4ddad2bea75d449fe7d2908f32a..d36fd3a294cf2ad137b420e3669dc025986c4d67 100644
--- a/mace/ops/proposal_test.cc
+++ b/mace/ops/proposal_test.cc
@@ -45,7 +45,7 @@ TEST_F(ProposalOpTest, CPUSimple) {
       .Finalize(net.NewOperatorDef());
 
   std::vector<float> scores(height * width * 18);
-  for (int i = 0 ; i < scores.size(); ++i) {
+  for (size_t i = 0 ; i < scores.size(); ++i) {
     scores[i] = i;
   }
 
diff --git a/mace/ops/quantize_test.cc b/mace/ops/quantize_test.cc
index 1672ac53d9d1df71f7a53b939aec76c409404e45..536e9ad73bbc05575edfa6110c3500d1e6998700 100644
--- a/mace/ops/quantize_test.cc
+++ b/mace/ops/quantize_test.cc
@@ -95,8 +95,6 @@ TEST_F(QuantizeTest, TestQuantizeTrend) {
   net.RunOp();
 
   auto output = net.GetTensor("Output");
-  auto output_min = net.GetTensor("OutputMin");
-  auto output_max = net.GetTensor("OutputMax");
 
   const uint8_t *output_data = net.GetTensor("Output")->data<uint8_t>();
   for (int i = 1; i < output->size(); ++i) {
diff --git a/mace/ops/space_to_batch.cc b/mace/ops/space_to_batch.cc
index 0564209a28b8071ae02ce6f9fdbb5eef4a7128f2..f2474b85d49ae2a205053a9aad44ac6be202e25e 100644
--- a/mace/ops/space_to_batch.cc
+++ b/mace/ops/space_to_batch.cc
@@ -30,6 +30,8 @@ void Register_SpaceToBatchND(OperatorRegistry *op_registry) {
                                      .TypeConstraint<half>("T")
                                      .Build(),
                     SpaceToBatchNDOp<DeviceType::GPU, half>);
+#else
+  MACE_UNUSED(op_registry);
 #endif  // MACE_ENABLE_OPENCL
 }
 
diff --git a/mace/ops/space_to_batch.h b/mace/ops/space_to_batch.h
index 9bec8f8527d53d11de55561a3aada60a1d81abc4..a9a0eb4bd60dffc3c3b6357636e54ad3b3d03dfa 100644
--- a/mace/ops/space_to_batch.h
+++ b/mace/ops/space_to_batch.h
@@ -39,7 +39,7 @@ class SpaceToBatchNDOp : public Operator<D, T> {
     Tensor *batch_tensor = this->Output(OUTPUT);
 
     std::vector<index_t> output_shape(4, 0);
-    CalculateOutputShape(space_tensor, batch_tensor, output_shape.data());
+    CalculateOutputShape(space_tensor, output_shape.data());
     functor_(const_cast<Tensor *>(space_tensor), output_shape, batch_tensor,
              future);
     return true;
@@ -47,7 +47,6 @@ class SpaceToBatchNDOp : public Operator<D, T> {
 
  private:
   inline void CalculateOutputShape(const Tensor *input_tensor,
-                                   Tensor *output,
                                    index_t *output_shape) {
     auto paddings =
         OperatorBase::GetRepeatedArgument<int>("paddings", {0, 0, 0, 0});
diff --git a/mace/ops/transpose.h b/mace/ops/transpose.h
index 2ec9281c41021634f2e77c69eee4dcf97626fc2d..ff7315bb03d784ec5cbc8cf7307552658ceccf6e 100644
--- a/mace/ops/transpose.h
+++ b/mace/ops/transpose.h
@@ -35,11 +35,11 @@ class TransposeOp : public Operator<D, T> {
     const Tensor *input = this->Input(INPUT);
     Tensor *output = this->Output(OUTPUT);
     const std::vector<index_t> &input_shape = input->shape();
-    MACE_CHECK(input_shape.size() == 4 && dims_.size() == 4
-                 || input_shape.size() == 2 && dims_.size() == 2,
+    MACE_CHECK((input_shape.size() == 4 && dims_.size() == 4)
+                 || (input_shape.size() == 2 && dims_.size() == 2),
                "rank should be 2 or 4");
     std::vector<index_t> output_shape;
-    for (int i = 0; i < dims_.size(); ++i) {
+    for (size_t i = 0; i < dims_.size(); ++i) {
       output_shape.push_back(input_shape[dims_[i]]);
     }
     output->Resize(output_shape);
diff --git a/mace/ops/winograd_inverse_transform.cc b/mace/ops/winograd_inverse_transform.cc
index bcee9d64eb145f3ee7f599a4bd2505c0ca423443..763f1f165fc0ed88a9c8b4f50a91cde40c5688f8 100644
--- a/mace/ops/winograd_inverse_transform.cc
+++ b/mace/ops/winograd_inverse_transform.cc
@@ -30,6 +30,8 @@ void Register_WinogradInverseTransform(OperatorRegistry *op_registry) {
                                      .TypeConstraint<half>("T")
                                      .Build(),
                     WinogradInverseTransformOp<DeviceType::GPU, half>);
+#else
+  MACE_UNUSED(op_registry);
 #endif  // MACE_ENABLE_OPENCL
 }
 
diff --git a/mace/ops/winograd_transform.cc b/mace/ops/winograd_transform.cc
index 5c2d53d4764cf38198f275393dc950e4e36e0bd1..8181cba947b3f4719d799d73b31fe9870e58e20e 100644
--- a/mace/ops/winograd_transform.cc
+++ b/mace/ops/winograd_transform.cc
@@ -30,6 +30,8 @@ void Register_WinogradTransform(OperatorRegistry *op_registry) {
                                      .TypeConstraint<half>("T")
                                      .Build(),
                     WinogradTransformOp<DeviceType::GPU, half>);
+#else
+  MACE_UNUSED(op_registry);
 #endif  // MACE_ENABLE_OPENCL
 }
 
diff --git a/mace/tools/validation/BUILD b/mace/tools/validation/BUILD
index 0406c7e0bc06b9bfecd60b19e4609c9b2f754211..151423ef1690a991fba29b1487cab54fff120acb 100644
--- a/mace/tools/validation/BUILD
+++ b/mace/tools/validation/BUILD
@@ -4,13 +4,15 @@ load("//mace:mace.bzl", "if_openmp_enabled", "if_android")
 cc_binary(
     name = "mace_run",
     srcs = ["mace_run.cc"],
+    copts = if_openmp_enabled(["-fopenmp"]) + if_android([
+        "-DMACE_ENABLE_OPENCL",
+    ]),
     linkopts = if_openmp_enabled(["-fopenmp"]),
     linkstatic = 1,
-    copts = if_android(["-DMACE_ENABLE_OPENCL"]),
     deps = [
         "//external:gflags_nothreads",
-        "//mace/codegen:generated_models",
         "//mace/codegen:generated_mace_engine_factory",
-        "//mace/core:core",
+        "//mace/codegen:generated_models",
+        "//mace/core",
     ],
 )
diff --git a/mace/tools/validation/mace_run.cc b/mace/tools/validation/mace_run.cc
index 0fa474a24caa0744dd4a5e0545c5dbacdcb4d797..dc3faaf07ab115f8ab5a30151878bdf4a4fd83eb 100644
--- a/mace/tools/validation/mace_run.cc
+++ b/mace/tools/validation/mace_run.cc
@@ -385,7 +385,7 @@ int Main(int argc, char **argv) {
     ParseShape(output_shapes[i], &output_shape_vec[i]);
   }
 
-  bool ret;
+  bool ret = false;
 #pragma omp parallel for
   for (int i = 0; i < FLAGS_restart_round; ++i) {
     VLOG(0) << "restart round " << i;
@@ -395,9 +395,8 @@ int Main(int argc, char **argv) {
   }
   if (ret) {
     return 0;
-  } else {
-    return -1;
   }
+  return -1;
 }
 
 }  // namespace validation
diff --git a/mace/utils/BUILD b/mace/utils/BUILD
index 57bec3d0ea8714ddb272a0117b96ae39567ced37..5af9ad607b2338f26018e719d4587a80a0ad1fdd 100644
--- a/mace/utils/BUILD
+++ b/mace/utils/BUILD
@@ -12,20 +12,18 @@ load("//mace:mace.bzl", "if_android")
 cc_library(
     name = "utils",
     srcs = [
-        "command_line_flags.cc",
         "logging.cc",
         "string_util.cc",
     ],
     hdrs = [
-        "command_line_flags.h",
         "env_time.h",
         "logging.h",
         "memory_logging.h",
+        "rwlock.h",
         "string_util.h",
         "timer.h",
         "tuner.h",
         "utils.h",
-        "rwlock.h",
     ],
     linkopts = if_android([
         "-llog",
@@ -35,24 +33,6 @@ cc_library(
     ],
 )
 
-cc_test(
-    name = "utils_test",
-    testonly = 1,
-    srcs = [
-        "utils_test.cc",
-    ],
-    linkopts = if_android([
-        "-pie",
-        "-lm",
-    ]),
-    linkstatic = 1,
-    deps = [
-        ":utils",
-        "@gtest//:gtest",
-        "@gtest//:gtest_main",
-    ],
-)
-
 cc_library(
     name = "utils_dev",
     srcs = [
diff --git a/mace/utils/command_line_flags.cc b/mace/utils/command_line_flags.cc
deleted file mode 100644
index 50a67056f2e4c1569ba8c136f32886d150d05366..0000000000000000000000000000000000000000
--- a/mace/utils/command_line_flags.cc
+++ /dev/null
@@ -1,232 +0,0 @@
-// Copyright 2018 Xiaomi, Inc.  All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "mace/utils/command_line_flags.h"
-
-#include <cstring>
-#include <iomanip>
-
-#include "mace/utils/logging.h"
-
-namespace mace {
-namespace utils {
-
-bool StringConsume(const std::string &x, std::string *arg) {
-  MACE_CHECK_NOTNULL(arg);
-  if ((arg->size() >= x.size()) &&
-      (memcmp(arg->data(), x.data(), x.size()) == 0)) {
-    *arg = arg->substr(x.size());
-    return true;
-  }
-  return false;
-}
-
-bool ParseStringFlag(std::string arg,
-                     std::string flag,
-                     std::string *dst,
-                     bool *value_parsing_ok) {
-  *value_parsing_ok = true;
-  if (StringConsume("--", &arg) && StringConsume(flag, &arg) &&
-      StringConsume("=", &arg)) {
-    *dst = arg;
-    return true;
-  }
-
-  return false;
-}
-
-bool ParseInt32Flag(std::string arg,
-                    std::string flag,
-                    int32_t *dst,
-                    bool *value_parsing_ok) {
-  *value_parsing_ok = true;
-  if (StringConsume("--", &arg) && StringConsume(flag, &arg) &&
-      StringConsume("=", &arg)) {
-    char extra;
-    if (sscanf(arg.data(), "%d%c", dst, &extra) != 1) {
-      LOG(ERROR) << "Couldn't interpret value " << arg << " for flag " << flag
-                 << ".";
-      *value_parsing_ok = false;
-    }
-    return true;
-  }
-
-  return false;
-}
-
-bool ParseInt64Flag(std::string arg,
-                    std::string flag,
-                    int64_t *dst,
-                    bool *value_parsing_ok) {
-  *value_parsing_ok = true;
-  if (StringConsume("--", &arg) && StringConsume(flag, &arg) &&
-      StringConsume("=", &arg)) {
-    char extra;
-    if (sscanf(arg.data(), "%lld%c", dst, &extra) != 1) {
-      LOG(ERROR) << "Couldn't interpret value " << arg << " for flag " << flag
-                 << ".";
-      *value_parsing_ok = false;
-    }
-    return true;
-  }
-
-  return false;
-}
-
-bool ParseBoolFlag(std::string arg, std::string flag,
-                   bool *dst, bool *value_parsing_ok) {
-  *value_parsing_ok = true;
-  if (StringConsume("--", &arg) && StringConsume(flag, &arg)) {
-    if (arg.empty()) {
-      *dst = true;
-      return true;
-    }
-
-    if (arg == "=true") {
-      *dst = true;
-      return true;
-    } else if (arg == "=false") {
-      *dst = false;
-      return true;
-    } else {
-      LOG(ERROR) << "Couldn't interpret value " << arg << " for flag " << flag
-                 << ".";
-      *value_parsing_ok = false;
-      return true;
-    }
-  }
-
-  return false;
-}
-
-bool ParseFloatFlag(std::string arg,
-                    std::string flag,
-                    float *dst,
-                    bool *value_parsing_ok) {
-  *value_parsing_ok = true;
-  if (StringConsume("--", &arg) && StringConsume(flag, &arg) &&
-      StringConsume("=", &arg)) {
-    char extra;
-    if (sscanf(arg.data(), "%f%c", dst, &extra) != 1) {
-      LOG(ERROR) << "Couldn't interpret value " << arg << " for flag " << flag
-                 << ".";
-      *value_parsing_ok = false;
-    }
-    return true;
-  }
-
-  return false;
-}
-
-}  // namespace utils
-
-Flag::Flag(const char *name, int *dst, const std::string &usage_text)
-    : name_(name), type_(TYPE_INT), int_value_(dst), usage_text_(usage_text) {}
-
-Flag::Flag(const char *name, int64_t *dst, const std::string &usage_text)
-    : name_(name),
-      type_(TYPE_INT64),
-      int64_value_(dst),
-      usage_text_(usage_text) {}
-
-Flag::Flag(const char *name, bool *dst, const std::string &usage_text)
-    : name_(name),
-      type_(TYPE_BOOL),
-      bool_value_(dst),
-      usage_text_(usage_text) {}
-
-Flag::Flag(const char *name, std::string *dst, const std::string &usage_text)
-    : name_(name),
-      type_(TYPE_STRING),
-      string_value_(dst),
-      usage_text_(usage_text) {}
-
-Flag::Flag(const char *name, float *dst, const std::string &usage_text)
-    : name_(name),
-      type_(TYPE_FLOAT),
-      float_value_(dst),
-      usage_text_(usage_text) {}
-
-bool Flag::Parse(std::string arg, bool *value_parsing_ok) const {
-  bool result = false;
-  if (type_ == TYPE_INT) {
-    result = utils::ParseInt32Flag(arg, name_, int_value_, value_parsing_ok);
-  } else if (type_ == TYPE_INT64) {
-    result = utils::ParseInt64Flag(arg, name_, int64_value_, value_parsing_ok);
-  } else if (type_ == TYPE_BOOL) {
-    result = utils::ParseBoolFlag(arg, name_, bool_value_, value_parsing_ok);
-  } else if (type_ == TYPE_STRING) {
-    result = utils::ParseStringFlag(arg, name_,
-                                    string_value_, value_parsing_ok);
-  } else if (type_ == TYPE_FLOAT) {
-    result = utils::ParseFloatFlag(arg, name_, float_value_, value_parsing_ok);
-  }
-  return result;
-}
-
-/*static*/ bool Flags::Parse(int *argc,
-                             char **argv,
-                             const std::vector<Flag> &flag_list) {
-  bool result = true;
-  std::vector<char *> unknown_flags;
-  for (int i = 1; i < *argc; ++i) {
-    if (std::string(argv[i]) == "--") {
-      while (i < *argc) {
-        unknown_flags.push_back(argv[i]);
-        ++i;
-      }
-      break;
-    }
-
-    bool was_found = false;
-    for (const Flag &flag : flag_list) {
-      bool value_parsing_ok;
-      was_found = flag.Parse(argv[i], &value_parsing_ok);
-      if (!value_parsing_ok) {
-        result = false;
-      }
-      if (was_found) {
-        break;
-      }
-    }
-    if (!was_found) {
-      unknown_flags.push_back(argv[i]);
-    }
-  }
-  // Passthrough any extra flags.
-  int dst = 1;  // Skip argv[0]
-  for (char *f : unknown_flags) {
-    argv[dst++] = f;
-  }
-  argv[dst++] = nullptr;
-  *argc = unknown_flags.size() + 1;
-  return result && (*argc < 2 || strcmp(argv[1], "--help") != 0);
-}
-
-std::string Flags::Usage(const std::string &cmdline,
-                         const std::vector<Flag> &flag_list) {
-  std::stringstream usage_text;
-  usage_text << "usage: " << cmdline << std::endl;
-
-  if (!flag_list.empty()) {
-    usage_text << "Flags: " << std::endl;
-  }
-  for (const Flag &flag : flag_list) {
-    usage_text << "\t" << std::left << std::setw(30) << flag.name_;
-    usage_text << flag.usage_text_ << std::endl;
-  }
-  return usage_text.str();
-}
-
-}  // namespace mace
diff --git a/mace/utils/command_line_flags.h b/mace/utils/command_line_flags.h
deleted file mode 100644
index d16eb49ed90067efa6654f7aa4f2c69567e80976..0000000000000000000000000000000000000000
--- a/mace/utils/command_line_flags.h
+++ /dev/null
@@ -1,63 +0,0 @@
-// Copyright 2018 Xiaomi, Inc.  All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef MACE_UTILS_COMMAND_LINE_FLAGS_H_
-#define MACE_UTILS_COMMAND_LINE_FLAGS_H_
-
-#include <string>
-#include <vector>
-
-namespace mace {
-
-class Flag {
- public:
-  Flag(const char *name, int *dst1, const std::string &usage_text);
-  Flag(const char *name, int64_t *dst1, const std::string &usage_text);
-  Flag(const char *name, bool *dst, const std::string &usage_text);
-  Flag(const char *name, std::string *dst, const std::string &usage_text);
-  Flag(const char *name, float *dst, const std::string &usage_text);
-
- private:
-  friend class Flags;
-
-  bool Parse(std::string arg, bool *value_parsing_ok) const;
-
-  std::string name_;
-  enum { TYPE_INT, TYPE_INT64, TYPE_BOOL, TYPE_STRING, TYPE_FLOAT } type_;
-  int *int_value_;
-  int64_t *int64_value_;
-  bool *bool_value_;
-  std::string *string_value_;
-  float *float_value_;
-  std::string usage_text_;
-};
-
-class Flags {
- public:
-  // Parse the command line represented by argv[0, ..., (*argc)-1] to find flag
-  // instances matching flags in flaglist[].  Update the variables associated
-  // with matching flags, and remove the matching arguments from (*argc, argv).
-  // Return true iff all recognized flag values were parsed correctly, and the
-  // first remaining argument is not "--help".
-  static bool Parse(int *argc, char **argv, const std::vector<Flag> &flag_list);
-
-  // Return a usage message with command line cmdline, and the
-  // usage_text strings in flag_list[].
-  static std::string Usage(const std::string &cmdline,
-                           const std::vector<Flag> &flag_list);
-};
-
-}  // namespace mace
-
-#endif  // MACE_UTILS_COMMAND_LINE_FLAGS_H_
diff --git a/mace/utils/string_util.h b/mace/utils/string_util.h
index e95bd9024c4788aca9e3d07bf7f4afd45fd7bfcf..576dd8c5973c4737fa64b96ae444d9a66ac93517 100644
--- a/mace/utils/string_util.h
+++ b/mace/utils/string_util.h
@@ -58,7 +58,7 @@ std::string MakeString(const std::vector<T> &args) {
   std::stringstream ss;
   ss << "[";
   const size_t size = args.size();
-  for (int i = 0; i < size; ++i) {
+  for (size_t i = 0; i < size; ++i) {
     ss << args[i];
     if (i < size - 1) {
       ss << ", ";
diff --git a/mace/utils/tuner.h b/mace/utils/tuner.h
index 719d7699e0479ddd32b3edd688a03223e83fd99f..b6e92489910143ac10b7d9e57cc7ab02d38ade0c 100644
--- a/mace/utils/tuner.h
+++ b/mace/utils/tuner.h
@@ -146,7 +146,7 @@ class Tuner {
     for (iter = 0; iter < num_runs; ++iter) {
       res = func(params, timer, tuning_result);
       total_time_us += timer->AccumulatedMicros();
-      if (iter >= 1 && total_time_us > 100000 || total_time_us > 200000) {
+      if ((iter >= 1 && total_time_us > 100000) || total_time_us > 200000) {
         ++iter;
         break;
       }
@@ -165,7 +165,7 @@ class Tuner {
                                   std::vector<param_type> *)> &func,
       Timer *timer,
       std::vector<param_type> *opt_params) {
-    RetType res;
+    RetType res = 0;
     double opt_time = std::numeric_limits<double>::max();
     auto params = param_generator();
     std::vector<param_type> tuning_result;
diff --git a/mace/utils/tuner_production.cc b/mace/utils/tuner_production.cc
index e5a5a81e8d376ff32285b9c482e177db33a713f7..04c216f360bb0e80f419388e55df0b3459a61b29 100644
--- a/mace/utils/tuner_production.cc
+++ b/mace/utils/tuner_production.cc
@@ -22,6 +22,7 @@ namespace mace {
 bool GetTuningParams(
     const char *path,
     std::unordered_map<std::string, std::vector<unsigned int>> *param_table) {
+  (void)(path);
   extern const std::map<std::string, std::vector<unsigned int>>
       kTuningParamsData;
   for (auto it = kTuningParamsData.begin(); it != kTuningParamsData.end();
diff --git a/mace/utils/tuner_test.cc b/mace/utils/tuner_test.cc
index 4fd0b7afb558a3b068d09fa2a161685a46a8693d..c618a18224d47b14a43786a077a0ba2d7c94d833 100644
--- a/mace/utils/tuner_test.cc
+++ b/mace/utils/tuner_test.cc
@@ -54,7 +54,7 @@ TEST_F(TunerTest, SimpleRun) {
 }
 
 TEST_F(TunerTest, SimpleTune) {
-  int expect = 3;
+  unsigned int expect = 3;
   auto TunerFunc = [&](const std::vector<unsigned int> &params, Timer *timer,
                        std::vector<uint32_t> *tuning_result) -> int {
     int res = 0;
diff --git a/tools/sh_commands.py b/tools/sh_commands.py
index dfa96ed4d381faaa16f6513117175b9abb703347..e74059fc92d6f96f9530b2e83688165951305f29 100644
--- a/tools/sh_commands.py
+++ b/tools/sh_commands.py
@@ -277,7 +277,7 @@ def bazel_build(target,
     stdout_buff = []
     process_output = make_output_processor(stdout_buff)
     if abi == "host":
-        p = sh.bazel(
+        bazel_args = (
             "build",
             "-c",
             "opt",
@@ -287,12 +287,17 @@ def bazel_build(target,
             target,
             "--copt=-std=c++11",
             "--copt=-D_GLIBCXX_USE_C99_MATH_TR1",
-            "--copt=-Werror=return-type",
+            "--copt=-Werror",
+            "--copt=-Wextra",
+            "--copt=-Wno-missing-field-initializers",
             "--copt=-O3",
             "--define",
             "openmp=%s" % str(enable_openmp).lower(),
             "--define",
             "production=%s" % str(production_mode).lower(),
+        )
+        p = sh.bazel(
+            *bazel_args,
             _out=process_output,
             _bg=True,
             _err_to_out=True)
@@ -311,7 +316,9 @@ def bazel_build(target,
             "--cpu=%s" % abi,
             "--copt=-std=c++11",
             "--copt=-D_GLIBCXX_USE_C99_MATH_TR1",
-            "--copt=-Werror=return-type",
+            "--copt=-Werror",
+            "--copt=-Wextra",
+            "--copt=-Wno-missing-field-initializers",
             "--copt=-DMACE_OBFUSCATE_LITERALS",
             "--copt=-O3",
             "--define",