diff --git a/.gitignore b/.gitignore
index b3a51dd83dbcba3ce393e112b6df5fb02310ba4a..c78b060c4a06e8597917b39f67d90fb53f466a29 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,5 +6,6 @@ cmake-build-debug/
 mace/codegen/models/
 mace/codegen/opencl/
 mace/codegen/opencl_bin/
+mace/codegen/tuning/
 mace/codegen/version/
 build/
diff --git a/mace/benchmark/model_throughput_test.cc b/mace/benchmark/model_throughput_test.cc
index bc2be7a6c61f4f7f796884cebed17d340dd2ebe1..8c3a84b3f3693d3e44e1929bd9523b1435753d05 100644
--- a/mace/benchmark/model_throughput_test.cc
+++ b/mace/benchmark/model_throughput_test.cc
@@ -25,6 +25,7 @@
 #include "mace/public/mace.h"
 #include "mace/utils/env_time.h"
 #include "mace/utils/logging.h"
+#include "mace/core/types.h"
 
 namespace mace {
 
@@ -72,10 +73,43 @@ extern const std::string ModelChecksum();
 
 namespace benchmark {
 
+void Split(const std::string &str,
+           char delims,
+           std::vector<std::string> *result) {
+  MACE_CHECK_NOTNULL(result);
+  std::string tmp = str;
+  while (!tmp.empty()) {
+    size_t next_offset = tmp.find(delims);
+    result->push_back(tmp.substr(0, next_offset));
+    if (next_offset == std::string::npos) {
+      break;
+    } else {
+      tmp = tmp.substr(next_offset + 1);
+    }
+  }
+}
+
+void SplitAndParseToInts(const std::string &str,
+                         char delims,
+                         std::vector<int64_t> *result) {
+  MACE_CHECK_NOTNULL(result);
+  std::string tmp = str;
+  while (!tmp.empty()) {
+    index_t dim = atoi(tmp.data());
+    result->push_back(dim);
+    size_t next_offset = tmp.find(delims);
+    if (next_offset == std::string::npos) {
+      break;
+    } else {
+      tmp = tmp.substr(next_offset + 1);
+    }
+  }
+}
+
 void ParseShape(const std::string &str, std::vector<int64_t> *shape) {
   std::string tmp = str;
   while (!tmp.empty()) {
-    int dim = atoi(tmp.data());
+    index_t dim = atoi(tmp.data());
     shape->push_back(dim);
     size_t next_offset = tmp.find(",");
     if (next_offset == std::string::npos) {
@@ -86,6 +120,14 @@ void ParseShape(const std::string &str, std::vector<int64_t> *shape) {
   }
 }
 
+std::string FormatName(const std::string input) {
+  std::string res = input;
+  for (size_t i = 0; i < input.size(); ++i) {
+    if (!::isalnum(res[i])) res[i] = '_';
+  }
+  return res;
+}
+
 DeviceType ParseDeviceType(const std::string &device_str) {
   if (device_str.compare("CPU") == 0) {
     return DeviceType::CPU;
@@ -100,6 +142,10 @@ DeviceType ParseDeviceType(const std::string &device_str) {
   }
 }
 
+DEFINE_string(input_node, "input_node0,input_node1",
+              "input nodes, separated by comma");
+DEFINE_string(output_node, "output_node0,output_node1",
+              "output nodes, separated by comma");
 DEFINE_string(input_shape, "1,224,224,3", "input shape, separated by comma");
 DEFINE_string(output_shape, "1,224,224,2", "output shape, separated by comma");
 DEFINE_string(input_file, "", "input file name");
@@ -113,7 +159,6 @@ int Main(int argc, char **argv) {
   gflags::ParseCommandLineFlags(&argc, &argv, true);
 
   LOG(INFO) << "mace version: " << MaceVersion();
-  LOG(INFO) << "mace git version: " << MaceGitVersion();
 #ifdef MACE_CPU_MODEL_TAG
   LOG(INFO) << "cpu model checksum: "
             << mace::MACE_CPU_MODEL_TAG::ModelChecksum();
@@ -126,7 +171,9 @@ int Main(int argc, char **argv) {
   LOG(INFO) << "dsp model checksum: "
             << mace::MACE_DSP_MODEL_TAG::ModelChecksum();
 #endif
+  LOG(INFO) << "Input node: [" << FLAGS_input_node<< "]";
   LOG(INFO) << "input_shape: " << FLAGS_input_shape;
+  LOG(INFO) << "Output node: [" << FLAGS_output_node<< "]";
   LOG(INFO) << "output_shape: " << FLAGS_output_shape;
   LOG(INFO) << "input_file: " << FLAGS_input_file;
   LOG(INFO) << "cpu_model_data_file: " << FLAGS_cpu_model_data_file;
@@ -134,31 +181,63 @@ int Main(int argc, char **argv) {
   LOG(INFO) << "dsp_model_data_file: " << FLAGS_dsp_model_data_file;
   LOG(INFO) << "run_seconds: " << FLAGS_run_seconds;
 
-  std::vector<int64_t> input_shape_vec;
-  std::vector<int64_t> output_shape_vec;
-  ParseShape(FLAGS_input_shape, &input_shape_vec);
-  ParseShape(FLAGS_output_shape, &output_shape_vec);
-
-  int64_t input_size =
-      std::accumulate(input_shape_vec.begin(), input_shape_vec.end(), 1,
-                      std::multiplies<int64_t>());
-  int64_t output_size =
-      std::accumulate(output_shape_vec.begin(), output_shape_vec.end(), 1,
-                      std::multiplies<int64_t>());
-  std::unique_ptr<float[]> input_data(new float[input_size]);
-  std::unique_ptr<float[]> cpu_output_data(new float[output_size]);
-  std::unique_ptr<float[]> gpu_output_data(new float[output_size]);
-  std::unique_ptr<float[]> dsp_output_data(new float[output_size]);
-
-  // load input
-  std::ifstream in_file(FLAGS_input_file, std::ios::in | std::ios::binary);
-  if (in_file.is_open()) {
-    in_file.read(reinterpret_cast<char *>(input_data.get()),
-                 input_size * sizeof(float));
-    in_file.close();
-  } else {
-    LOG(INFO) << "Open input file failed";
-    return -1;
+  std::vector<std::string> input_names;
+  std::vector<std::string> output_names;
+  std::vector<std::string> input_shapes;
+  std::vector<std::string> output_shapes;
+  Split(FLAGS_input_node, ',', &input_names);
+  Split(FLAGS_output_node, ',', &output_names);
+  Split(FLAGS_input_shape, ':', &input_shapes);
+  Split(FLAGS_output_shape, ':', &output_shapes);
+
+  const size_t input_count = input_shapes.size();
+  const size_t output_count = output_shapes.size();
+  std::vector<std::vector<int64_t>> input_shape_vec(input_count);
+  std::vector<std::vector<int64_t>> output_shape_vec(output_count);
+  for (size_t i = 0; i < input_count; ++i) {
+    ParseShape(input_shapes[i], &input_shape_vec[i]);
+  }
+  for (size_t i = 0; i < output_count; ++i) {
+    ParseShape(output_shapes[i], &output_shape_vec[i]);
+  }
+
+  std::map<std::string, mace::MaceTensor> inputs;
+  std::map<std::string, mace::MaceTensor> cpu_outputs;
+  std::map<std::string, mace::MaceTensor> gpu_outputs;
+  std::map<std::string, mace::MaceTensor> dsp_outputs;
+  for (size_t i = 0; i < input_count; ++i) {
+    // Allocate input and output
+    int64_t input_size =
+        std::accumulate(input_shape_vec[i].begin(), input_shape_vec[i].end(), 1,
+                        std::multiplies<int64_t>());
+    auto buffer_in = std::shared_ptr<float>(new float[input_size],
+                                            std::default_delete<float[]>());
+    // load input
+    std::ifstream in_file(FLAGS_input_file + "_" + FormatName(input_names[i]),
+                          std::ios::in | std::ios::binary);
+    if (in_file.is_open()) {
+      in_file.read(reinterpret_cast<char *>(buffer_in.get()),
+                   input_size * sizeof(float));
+      in_file.close();
+    } else {
+      LOG(FATAL) << "Open input file failed";
+    }
+    inputs[input_names[i]] = mace::MaceTensor(input_shape_vec[i], buffer_in);
+  }
+
+  for (size_t i = 0; i < output_count; ++i) {
+    int64_t output_size =
+        std::accumulate(output_shape_vec[i].begin(),
+                        output_shape_vec[i].end(), 1,
+                        std::multiplies<int64_t>());
+    auto buffer_out = std::shared_ptr<float>(new float[output_size],
+                                             std::default_delete<float[]>());
+    cpu_outputs[output_names[i]] = mace::MaceTensor(output_shape_vec[i],
+                                                    buffer_out);
+    gpu_outputs[output_names[i]] = mace::MaceTensor(output_shape_vec[i],
+                                                    buffer_out);
+    dsp_outputs[output_names[i]] = mace::MaceTensor(output_shape_vec[i],
+                                                    buffer_out);
   }
 
   int64_t t0, t1, init_micros;
@@ -170,11 +249,12 @@ int Main(int argc, char **argv) {
       FLAGS_cpu_model_data_file.c_str());
   NetDef cpu_net_def = mace::MACE_CPU_MODEL_TAG::CreateNet(cpu_model_data);
 
-  mace::MaceEngine cpu_engine(&cpu_net_def, DeviceType::CPU);
+  mace::MaceEngine cpu_engine(&cpu_net_def, DeviceType::CPU, input_names,
+                              output_names);
 
   LOG(INFO) << "CPU Warm up run";
   t0 = NowMicros();
-  cpu_engine.Run(input_data.get(), input_shape_vec, cpu_output_data.get());
+  cpu_engine.Run(inputs, &cpu_outputs);
   t1 = NowMicros();
   LOG(INFO) << "CPU 1st warm up run latency: " << t1 - t0 << " us";
 #endif
@@ -187,12 +267,13 @@ int Main(int argc, char **argv) {
       FLAGS_gpu_model_data_file.c_str());
   NetDef gpu_net_def = mace::MACE_GPU_MODEL_TAG::CreateNet(gpu_model_data);
 
-  mace::MaceEngine gpu_engine(&gpu_net_def, DeviceType::OPENCL);
+  mace::MaceEngine gpu_engine(&gpu_net_def, DeviceType::OPENCL, input_names,
+                              output_names);
   mace::MACE_GPU_MODEL_TAG::UnloadModelData(gpu_model_data);
 
   LOG(INFO) << "GPU Warm up run";
   t0 = NowMicros();
-  gpu_engine.Run(input_data.get(), input_shape_vec, gpu_output_data.get());
+  gpu_engine.Run(inputs, &gpu_outputs);
   t1 = NowMicros();
   LOG(INFO) << "GPU 1st warm up run latency: " << t1 - t0 << " us";
 #endif
@@ -202,15 +283,16 @@ int Main(int argc, char **argv) {
   LOG(INFO) << "Load & init dsp model and warm up";
   const unsigned char *dsp_model_data =
       mace::MACE_DSP_MODEL_TAG::LoadModelData(
-      FLAGS_gpu_model_data_file.c_str());
+      FLAGS_dsp_model_data_file.c_str());
   NetDef dsp_net_def = mace::MACE_DSP_MODEL_TAG::CreateNet(dsp_model_data);
 
-  mace::MaceEngine dsp_engine(&dsp_net_def, DeviceType::HEXAGON);
+  mace::MaceEngine dsp_engine(&dsp_net_def, DeviceType::HEXAGON, input_names,
+                              output_names);
   mace::MACE_DSP_MODEL_TAG::UnloadModelData(dsp_model_data);
 
   LOG(INFO) << "DSP Warm up run";
   t0 = NowMicros();
-  gpu_engine.Run(input_data.get(), input_shape_vec, dsp_output_data.get());
+  dsp_engine.Run(inputs, &dsp_outputs);
   t1 = NowMicros();
   LOG(INFO) << "DSP 1st warm up run latency: " << t1 - t0 << " us";
 #endif
@@ -226,7 +308,7 @@ int Main(int argc, char **argv) {
     int64_t micros = 0;
     int64_t start = NowMicros();
     for (; micros < run_micros; ++frames) {
-      cpu_engine.Run(input_data.get(), input_shape_vec, cpu_output_data.get());
+      cpu_engine.Run(inputs, &cpu_outputs);
       int64_t end = NowMicros();
       micros = end - start;
     }
@@ -240,7 +322,7 @@ int Main(int argc, char **argv) {
     int64_t micros = 0;
     int64_t start = NowMicros();
     for (; micros < run_micros; ++frames) {
-      gpu_engine.Run(input_data.get(), input_shape_vec, gpu_output_data.get());
+      gpu_engine.Run(inputs, &gpu_outputs);
       int64_t end = NowMicros();
       micros = end - start;
     }
@@ -254,7 +336,7 @@ int Main(int argc, char **argv) {
     int64_t micros = 0;
     int64_t start = NowMicros();
     for (; micros < run_micros; ++frames) {
-      dsp_engine.Run(input_data.get(), input_shape_vec, dsp_output_data.get());
+      dsp_engine.Run(inputs, &dsp_outputs);
       int64_t end = NowMicros();
       micros = end - start;
     }
diff --git a/mace/codegen/BUILD b/mace/codegen/BUILD
index b715869984d414234af5961a21cf4867bdb2601d..bc92a7bf13b871954b5f18de9e218fad7722b459 100644
--- a/mace/codegen/BUILD
+++ b/mace/codegen/BUILD
@@ -17,14 +17,8 @@ cc_library(
 )
 
 cc_library(
-    name = "generated_opencl_dev",
-    srcs = ["opencl/opencl_encrypt_program.cc"],
-    linkstatic = 1,
-)
-
-cc_library(
-    name = "generated_opencl_prod",
-    srcs = ["opencl/opencl_compiled_program.cc"],
+    name = "generated_opencl",
+    srcs = glob(["opencl/*.cc"]),
     linkstatic = 1,
 )
 
diff --git a/mace/codegen/opencl/opencl_compiled_program.cc b/mace/codegen/opencl/opencl_compiled_program.cc
deleted file mode 100644
index 62fbad33022b92d4d72a7f407cca18c5e4073fe3..0000000000000000000000000000000000000000
--- a/mace/codegen/opencl/opencl_compiled_program.cc
+++ /dev/null
@@ -1,16 +0,0 @@
-//
-// Copyright (c) 2017 XiaoMi All rights reserved.
-//
-
-// This is a generated file, DO NOT EDIT
-
-#include <map>
-#include <string>
-#include <vector>
-
-namespace mace {
-
-extern const std::map<std::string, std::vector<unsigned char>>
-    kCompiledProgramMap = {};
-
-}  // namespace mace
diff --git a/mace/codegen/tuning/tuning_params.cc b/mace/codegen/tuning/tuning_params.cc
deleted file mode 100644
index 8a8b7132a26a150d14ff9504b13aa668fc109285..0000000000000000000000000000000000000000
--- a/mace/codegen/tuning/tuning_params.cc
+++ /dev/null
@@ -1,16 +0,0 @@
-//
-// Copyright (c) 2017 XiaoMi All rights reserved.
-//
-
-// This is a generated file, DO NOT EDIT
-
-#include <map>
-#include <string>
-#include <vector>
-
-namespace mace {
-
-extern const std::map<std::string, std::vector<unsigned int>>
-    kTuningParamsData = {};
-
-}  // namespace mace
diff --git a/mace/core/BUILD b/mace/core/BUILD
index 9a957ad1d4becc788d554aa34b58d911428d60ea..1b4cdaa8d51c054a9f8b55d5c7e4266273724f5d 100644
--- a/mace/core/BUILD
+++ b/mace/core/BUILD
@@ -27,8 +27,6 @@ cc_library(
             "runtime/hexagon/*.cc",
         ],
         exclude = [
-            "runtime/opencl/opencl_production.cc",
-            "runtime/opencl/opencl_development.cc",
             "*_test.cc",
             "runtime/hexagon/hexagon_controller_dummy.cc",
         ],
@@ -50,15 +48,15 @@ cc_library(
     ]),
     deps = [
         ":opencl_headers",
-        "//mace/utils",
+        "//mace/codegen:generated_opencl",
         "//mace/codegen:generated_version",
+        "//mace/utils",
         "@half//:half",
     ] + if_production_mode([
+        "//mace/codegen:generated_tuning_params",
         "//mace/utils:utils_prod",
-        "//mace/core:opencl_prod",
     ]) + if_not_production_mode([
         "//mace/utils:utils_dev",
-        "//mace/core:opencl_dev",
     ]),
 )
 
@@ -90,26 +88,3 @@ cc_library(
     ],
     alwayslink = 1,
 )
-
-cc_library(
-    name = "opencl_dev",
-    srcs = ["runtime/opencl/opencl_development.cc"],
-    linkstatic = 1,
-    deps = [
-        ":opencl_headers",
-        "//mace/codegen:generated_opencl_dev",
-        "//mace/utils",
-    ],
-)
-
-cc_library(
-    name = "opencl_prod",
-    srcs = ["runtime/opencl/opencl_production.cc"],
-    linkstatic = 1,
-    deps = [
-        ":opencl_headers",
-        "//mace/codegen:generated_opencl_prod",
-        "//mace/codegen:generated_tuning_params",
-        "//mace/utils",
-    ],
-)
diff --git a/mace/core/file_storage.cc b/mace/core/file_storage.cc
new file mode 100644
index 0000000000000000000000000000000000000000..de9a67fa73a197d8bfe73ec27652fff2f0afd85f
--- /dev/null
+++ b/mace/core/file_storage.cc
@@ -0,0 +1,216 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#include "mace/core/file_storage.h"
+
+#include <fcntl.h>
+#include <limits.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <memory>
+#include <utility>
+
+
+#include "mace/utils/logging.h"
+
+namespace mace {
+
+class FileStorageFactory::Impl {
+ public:
+  explicit Impl(const std::string &path);
+
+  std::unique_ptr<KVStorage> CreateStorage(const std::string &name);
+
+ private:
+  std::string path_;
+};
+
+FileStorageFactory::Impl::Impl(const std::string &path): path_(path) {}
+std::unique_ptr<KVStorage> FileStorageFactory::Impl::CreateStorage(
+    const std::string &name) {
+  return std::move(std::unique_ptr<KVStorage>(
+      new FileStorage(path_ + "/" + name)));
+}
+
+FileStorageFactory::FileStorageFactory(const std::string &path):
+    impl_(new FileStorageFactory::Impl(path)) {}
+
+FileStorageFactory::~FileStorageFactory() = default;
+
+std::unique_ptr<KVStorage> FileStorageFactory::CreateStorage(
+    const std::string &name) {
+  return impl_->CreateStorage(name);
+}
+
+FileStorage::FileStorage(const std::string &file_path):
+    file_path_(file_path) {}
+
+int FileStorage::Load() {
+  struct stat st;
+  if (stat(file_path_.c_str(), &st) == -1) {
+    if (errno == ENOENT) {
+      LOG(INFO) << "File " << file_path_
+                << " does not exist";
+      return 0;
+    } else {
+      LOG(WARNING) << "Stat file " << file_path_
+                   << " failed, error code: " << errno;
+      return -1;
+    }
+  }
+  int fd = open(file_path_.c_str(), O_RDONLY);
+  if (fd < 0) {
+    if (errno == ENOENT) {
+      LOG(INFO) << "File " << file_path_
+                << " does not exist";
+      return 0;
+    } else {
+      LOG(WARNING) << "open file " << file_path_
+                   << " failed, error code: " << errno;
+      return -1;
+    }
+  }
+  size_t file_size = st.st_size;
+  unsigned char *file_data =
+    static_cast<unsigned char *>(mmap(nullptr, file_size, PROT_READ,
+          MAP_PRIVATE, fd, 0));
+  int res = 0;
+  if (file_data == MAP_FAILED) {
+    LOG(WARNING) << "mmap file " << file_path_
+                 << " failed, error code: " << errno;
+
+    res = close(fd);
+    if (res != 0) {
+      LOG(WARNING) << "close file " << file_path_
+                   << " failed, error code: " << errno;
+    }
+    return -1;
+  }
+  unsigned char *file_data_ptr = file_data;
+
+  const size_t int_size = sizeof(int32_t);
+
+  int64_t data_size = 0;
+  memcpy(&data_size, file_data_ptr, sizeof(int64_t));
+  file_data_ptr += sizeof(int64_t);
+  int32_t key_size = 0;
+  int32_t value_size = 0;
+  for (int i = 0; i < data_size; ++i) {
+    memcpy(&key_size, file_data_ptr, int_size);
+    file_data_ptr += int_size;
+    std::unique_ptr<char[]> key(new char[key_size+1]);
+    memcpy(&key[0], file_data_ptr, key_size);
+    file_data_ptr += key_size;
+    key[key_size] = '\0';
+
+    memcpy(&value_size, file_data_ptr, int_size);
+    file_data_ptr += int_size;
+    std::vector<unsigned char> value(value_size);
+    memcpy(value.data(), file_data_ptr, value_size);
+    file_data_ptr += value_size;
+
+    data_.emplace(std::string(&key[0]), value);
+  }
+
+  res = munmap(file_data, file_size);
+  if (res != 0) {
+    LOG(WARNING) << "munmap file " << file_path_
+                 << " failed, error code: " << errno;
+    res = close(fd);
+    if (res != 0) {
+      LOG(WARNING) << "close file " << file_path_
+                   << " failed, error code: " << errno;
+    }
+    return -1;
+  }
+  res = close(fd);
+  if (res != 0) {
+    LOG(WARNING) << "close file " << file_path_
+                 << " failed, error code: " << errno;
+    return -1;
+  }
+  return 0;
+}
+
+bool FileStorage::Insert(const std::string &key,
+                         const std::vector<unsigned char> &value) {
+  data_.emplace(key, value);
+  return true;
+}
+
+const std::vector<unsigned char> *FileStorage::Find(const std::string &key) {
+  auto iter = data_.find(key);
+  if (iter == data_.end()) return nullptr;
+
+  return &(iter->second);
+}
+
+int FileStorage::Flush() {
+  int fd = open(file_path_.c_str(), O_WRONLY | O_CREAT, 0600);
+  if (fd < 0) {
+    LOG(WARNING) << "open file " << file_path_
+                 << " failed, error code:" << errno;
+    return -1;
+  }
+
+  const size_t int_size = sizeof(int32_t);
+
+  int64_t data_size = sizeof(int64_t);
+  for (auto &kv : data_) {
+    data_size += 2 * int_size + kv.first.size() + kv.second.size();
+  }
+  std::unique_ptr<unsigned char[]> buffer(new unsigned char[data_size]);
+  unsigned char *buffer_ptr = &buffer[0];
+
+  int64_t num_of_data = data_.size();
+  memcpy(buffer_ptr, &num_of_data, sizeof(int64_t));
+  buffer_ptr += sizeof(int64_t);
+  for (auto &kv : data_) {
+    int32_t key_size = kv.first.size();
+    memcpy(buffer_ptr, &key_size, int_size);
+    buffer_ptr += int_size;
+
+    memcpy(buffer_ptr, kv.first.c_str(), kv.first.size());
+    buffer_ptr += kv.first.size();
+
+    int32_t value_size = kv.second.size();
+    memcpy(buffer_ptr, &value_size, int_size);
+    buffer_ptr += int_size;
+
+    memcpy(buffer_ptr, kv.second.data(), kv.second.size());
+    buffer_ptr += kv.second.size();
+  }
+  int res = 0;
+  buffer_ptr = &buffer[0];
+  int64_t remain_size = data_size;
+  while (remain_size > 0) {
+    size_t buffer_size = std::min<int64_t>(remain_size, SSIZE_MAX);
+    res = write(fd, buffer_ptr, buffer_size);
+    if (res == -1) {
+      LOG(WARNING) << "write file " << file_path_
+                   << " failed, error code: " << errno;
+      res = close(fd);
+      if (res != 0) {
+        LOG(WARNING) << "close file " << file_path_
+                     << " failed, error code: " << errno;
+      }
+      return -1;
+    }
+    remain_size -= buffer_size;
+    buffer_ptr += buffer_size;
+  }
+
+  res = close(fd);
+  if (res != 0) {
+    LOG(WARNING) << "close file " << file_path_
+                 << " failed, error code: " << errno;
+    return -1;
+  }
+  return 0;
+}
+
+};  // namespace mace
diff --git a/mace/core/file_storage.h b/mace/core/file_storage.h
new file mode 100644
index 0000000000000000000000000000000000000000..c29628cdd657d7a16d494128cfee908dfdc1f18f
--- /dev/null
+++ b/mace/core/file_storage.h
@@ -0,0 +1,34 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#ifndef MACE_CORE_FILE_STORAGE_H_
+#define MACE_CORE_FILE_STORAGE_H_
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "mace/public/mace_runtime.h"
+
+namespace mace {
+
+class FileStorage : public KVStorage {
+ public:
+  explicit FileStorage(const std::string &file_path);
+
+ public:
+  int Load() override;
+  bool Insert(const std::string &key,
+              const std::vector<unsigned char> &value) override;
+  const std::vector<unsigned char> *Find(const std::string &key) override;
+  int Flush() override;
+
+ private:
+  std::string file_path_;
+  std::map<std::string, std::vector<unsigned char>> data_;
+};
+
+}  // namespace mace
+
+#endif  // MACE_CORE_FILE_STORAGE_H_
diff --git a/mace/core/mace.cc b/mace/core/mace.cc
index 98b0190d242c2853faecaa40338630e5e7a8400e..484c9472af8e9e12124c78b054d539584e4d2801 100644
--- a/mace/core/mace.cc
+++ b/mace/core/mace.cc
@@ -4,8 +4,10 @@
 
 #include <memory>
 
+#include "mace/core/file_storage.h"
 #include "mace/core/net.h"
 #include "mace/core/runtime/hexagon/hexagon_control_wrapper.h"
+#include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/core/types.h"
 #include "mace/public/mace.h"
 
@@ -94,6 +96,7 @@ MaceEngine::Impl::Impl(const NetDef *net_def,
       net_(nullptr),
       hexagon_controller_(nullptr) {
   LOG(INFO) << "MACE version: " << MaceVersion();
+  // Set storage path for internal usage
   for (auto input_name : input_nodes) {
     ws_->CreateTensor(MakeString("mace_input_node_", input_name, ":0"),
                       GetDeviceAllocator(device_type_), DT_FLOAT);
@@ -173,6 +176,7 @@ MaceStatus MaceEngine::Impl::Run(
       LOG(FATAL) << "Net run failed";
     }
   }
+  OpenCLRuntime::Global()->SaveBuiltCLProgram();
   for (auto &output : *outputs) {
     Tensor *output_tensor =
         ws_->GetTensor(MakeString("mace_output_node_", output.first + ":0"));
diff --git a/mace/core/mace_runtime.cc b/mace/core/mace_runtime.cc
index da70f96f72b2e0c76d704773c34eefd2a789b5b1..5f94195370990480c0003836cbde8d2a529c70c3 100644
--- a/mace/core/mace_runtime.cc
+++ b/mace/core/mace_runtime.cc
@@ -8,6 +8,8 @@
 
 namespace mace {
 
+std::shared_ptr<KVStorageFactory> kStorageFactory = nullptr;
+
 void ConfigOpenCLRuntime(GPUPerfHint gpu_perf_hint,
                          GPUPriorityHint gpu_priority_hint) {
   VLOG(1) << "Set GPU configurations, gpu_perf_hint: " << gpu_perf_hint
@@ -15,6 +17,11 @@ void ConfigOpenCLRuntime(GPUPerfHint gpu_perf_hint,
   OpenCLRuntime::Configure(gpu_perf_hint, gpu_priority_hint);
 }
 
+void ConfigKVStorageFactory(std::shared_ptr<KVStorageFactory> storage_factory) {
+  VLOG(1) << "Set internal KV Storage Engine";
+  kStorageFactory = storage_factory;
+}
+
 void ConfigOmpThreads(int omp_num_threads) {
   VLOG(1) << "Config CPU omp_num_threads: " << omp_num_threads;
   SetOmpThreads(omp_num_threads);
diff --git a/mace/core/runtime/opencl/opencl_development.cc b/mace/core/runtime/opencl/opencl_development.cc
deleted file mode 100644
index 036a97ed6f19d7b5fb499d1166f5183f99d84bc4..0000000000000000000000000000000000000000
--- a/mace/core/runtime/opencl/opencl_development.cc
+++ /dev/null
@@ -1,39 +0,0 @@
-//
-// Copyright (c) 2017 XiaoMi All rights reserved.
-//
-
-#include <fstream>
-#include <map>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "mace/core/runtime/opencl/cl2_header.h"
-#include "mace/utils/logging.h"
-#include "mace/utils/utils.h"
-
-namespace mace {
-
-bool GetSourceOrBinaryProgram(const std::string &program_name,
-                              const std::string &binary_file_name_prefix,
-                              const cl::Context &context,
-                              const cl::Device &device,
-                              cl::Program *program,
-                              bool *is_binary) {
-  extern const std::map<std::string, std::vector<unsigned char>>
-      kEncryptedProgramMap;
-  *is_binary = false;
-  auto it_source = kEncryptedProgramMap.find(program_name);
-  if (it_source == kEncryptedProgramMap.end()) {
-    return false;
-  }
-  cl::Program::Sources sources;
-  std::string content(it_source->second.begin(), it_source->second.end());
-  std::string kernel_source = ObfuscateString(content);
-  sources.push_back(kernel_source);
-  *program = cl::Program(context, sources);
-
-  return true;
-}
-
-}  // namespace mace
diff --git a/mace/core/runtime/opencl/opencl_production.cc b/mace/core/runtime/opencl/opencl_production.cc
deleted file mode 100644
index f4df016f52911c9ed4c9a37cd4d7660de1621a43..0000000000000000000000000000000000000000
--- a/mace/core/runtime/opencl/opencl_production.cc
+++ /dev/null
@@ -1,32 +0,0 @@
-//
-// Copyright (c) 2017 XiaoMi All rights reserved.
-//
-
-#include <map>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "mace/core/runtime/opencl/cl2_header.h"
-#include "mace/utils/logging.h"
-
-namespace mace {
-
-bool GetSourceOrBinaryProgram(const std::string &program_name,
-                              const std::string &binary_file_name_prefix,
-                              const cl::Context &context,
-                              const cl::Device &device,
-                              cl::Program *program,
-                              bool *is_binary) {
-  extern const std::map<std::string, std::vector<unsigned char>>
-      kCompiledProgramMap;
-  *is_binary = true;
-  auto it_binary = kCompiledProgramMap.find(binary_file_name_prefix);
-  if (it_binary == kCompiledProgramMap.end()) {
-    return false;
-  }
-  *program = cl::Program(context, {device}, {it_binary->second});
-  return true;
-}
-
-}  // namespace mace
diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc
index a592b6fa119239ef09c293e602caa29fd66e8d23..f24333ac461e9cbff8708f4202ad0f7545e56eeb 100644
--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -10,35 +10,20 @@
 #include <mutex>  // NOLINT(build/c++11)
 #include <string>
 #include <vector>
+#include <utility>
 
+#include "mace/core/file_storage.h"
 #include "mace/core/runtime/opencl/opencl_extension.h"
 #include "mace/public/mace.h"
 #include "mace/utils/tuner.h"
 
 namespace mace {
-namespace {
-
-bool WriteFile(const std::string &filename,
-               bool binary,
-               const std::vector<unsigned char> &content) {
-  std::ios_base::openmode mode = std::ios_base::out | std::ios_base::trunc;
-  if (binary) {
-    mode |= std::ios::binary;
-  }
-  std::ofstream ofs(filename, mode);
-
-  ofs.write(reinterpret_cast<const char *>(&content[0]),
-            content.size() * sizeof(char));
-  ofs.close();
-  if (ofs.fail()) {
-    LOG(ERROR) << "Failed to write to file " << filename;
-    return false;
-  }
 
-  return true;
-}
-
-}  // namespace
+extern const std::map<std::string, std::vector<unsigned char>>
+    kCompiledProgramMap;
+extern const std::string kCompiledProgramPlatform;
+extern const std::map<std::string, std::vector<unsigned char>>
+    kEncryptedProgramMap;
 
 const std::string OpenCLErrorToString(cl_int error) {
   switch (error) {
@@ -194,19 +179,19 @@ void OpenCLProfilingTimer::ClearTiming() {
   accumulated_micros_ = 0;
 }
 
-GPUPerfHint OpenCLRuntime::gpu_perf_hint_ = GPUPerfHint::PERF_DEFAULT;
-GPUPriorityHint OpenCLRuntime::gpu_priority_hint_ =
+GPUPerfHint OpenCLRuntime::kGPUPerfHint = GPUPerfHint::PERF_DEFAULT;
+GPUPriorityHint OpenCLRuntime::kGPUPriorityHint =
     GPUPriorityHint::PRIORITY_DEFAULT;
 
 OpenCLRuntime *OpenCLRuntime::Global() {
-  static OpenCLRuntime runtime(gpu_perf_hint_, gpu_priority_hint_);
+  static OpenCLRuntime runtime(kGPUPerfHint, kGPUPriorityHint);
   return &runtime;
 }
 
 void OpenCLRuntime::Configure(GPUPerfHint gpu_perf_hint,
                               GPUPriorityHint gpu_priority_hint) {
-  OpenCLRuntime::gpu_perf_hint_ = gpu_perf_hint;
-  OpenCLRuntime::gpu_priority_hint_ = gpu_priority_hint;
+  OpenCLRuntime::kGPUPerfHint = gpu_perf_hint;
+  OpenCLRuntime::kGPUPriorityHint = gpu_priority_hint;
 }
 
 void GetAdrenoContextProperties(std::vector<cl_context_properties> *properties,
@@ -250,7 +235,8 @@ void GetAdrenoContextProperties(std::vector<cl_context_properties> *properties,
 }
 
 OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
-                             GPUPriorityHint gpu_priority_hint) {
+                             GPUPriorityHint gpu_priority_hint):
+    storage_(nullptr) {
   LoadOpenCLLibrary();
 
   std::vector<cl::Platform> all_platforms;
@@ -259,9 +245,12 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
     LOG(FATAL) << "No OpenCL platforms found";
   }
   cl::Platform default_platform = all_platforms[0];
-  VLOG(1) << "Using platform: " << default_platform.getInfo<CL_PLATFORM_NAME>()
-          << ", " << default_platform.getInfo<CL_PLATFORM_PROFILE>() << ", "
-          << default_platform.getInfo<CL_PLATFORM_VERSION>();
+  std::stringstream ss;
+  ss << default_platform.getInfo<CL_PLATFORM_NAME>()
+     << ", " << default_platform.getInfo<CL_PLATFORM_PROFILE>() << ", "
+     << default_platform.getInfo<CL_PLATFORM_VERSION>();
+  platform_info_ = ss.str();
+  VLOG(1) << "Using platform: " << platform_info_;
 
   // get default device (CPUs, GPUs) of the default platform
   std::vector<cl::Device> all_devices;
@@ -278,10 +267,10 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
       gpu_detected = true;
 
       const std::string device_name = device.getInfo<CL_DEVICE_NAME>();
-      gpu_type_ = ParseGPUTypeFromDeviceName(device_name);
+      gpu_type_ = ParseGPUType(device_name);
 
       const std::string device_version = device.getInfo<CL_DEVICE_VERSION>();
-      opencl_version_ = device_version.substr(7, 3);
+      opencl_version_ = ParseDeviceVersion(device_version);
 
       VLOG(1) << "Using device: " << device_name;
       break;
@@ -320,9 +309,19 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
                                                       &err);
   MACE_CHECK_CL_SUCCESS(err);
 
-  const char *kernel_path = getenv("MACE_KERNEL_PATH");
-  this->kernel_path_ =
-      std::string(kernel_path == nullptr ? "" : kernel_path) + "/";
+  this->program_map_changed_ = false;
+
+  extern std::shared_ptr<KVStorageFactory> kStorageFactory;
+  if (kStorageFactory != nullptr) {
+    const std::string cl_compiled_file_name = "mace_cl_compiled_program.bin";
+    storage_ = kStorageFactory->CreateStorage(cl_compiled_file_name);
+
+    if (platform_info_ != kCompiledProgramPlatform) {
+      if (storage_->Load() != 0) {
+        LOG(FATAL) << "Load opencl compiled kernel file failed";
+      }
+    }
+  }
 
   const char *out_of_range_check = getenv("MACE_OUT_OF_RANGE_CHECK");
   if (out_of_range_check != nullptr && strlen(out_of_range_check) == 1
@@ -348,45 +347,49 @@ cl::Device &OpenCLRuntime::device() { return *device_; }
 
 cl::CommandQueue &OpenCLRuntime::command_queue() { return *command_queue_; }
 
-std::string OpenCLRuntime::GenerateCLBinaryFilenamePrefix(
-    const std::string &filename_msg) {
-  // TODO(heliangliang) This can be long and slow, fix it
-  std::string filename_prefix = filename_msg;
-  for (auto it = filename_prefix.begin(); it != filename_prefix.end(); ++it) {
-    if (*it == ' ' || *it == '-' || *it == '=') {
-      *it = '_';
+bool OpenCLRuntime::BuildProgramFromBinary(
+    const std::string &built_program_key,
+    const std::string &build_options_str,
+    cl::Program *program) {
+  // Find from binary
+  if (kCompiledProgramPlatform != platform_info_) return false;
+  auto it_binary = kCompiledProgramMap.find(built_program_key);
+  if (it_binary == kCompiledProgramMap.end()) return false;
+
+  *program = cl::Program(context(), {device()}, {it_binary->second});
+  cl_int ret = program->build({device()}, build_options_str.c_str());
+  if (ret != CL_SUCCESS) {
+    if (program->getBuildInfo<CL_PROGRAM_BUILD_STATUS>(device()) ==
+        CL_BUILD_ERROR) {
+      std::string build_log =
+          program->getBuildInfo<CL_PROGRAM_BUILD_LOG>(device());
+      LOG(INFO) << "Program build log: " << build_log;
     }
+    LOG(WARNING) << "Build program "
+                 << built_program_key << " from Binary failed:"
+                 << (ret == CL_INVALID_PROGRAM ? "CL_INVALID_PROGRAM, possible "
+                     "cause 1: the MACE library is built from SoC 1 but is "
+                     "used on different SoC 2, possible cause 2: the MACE "
+                     "buffer is corrupted make sure your code has no "
+                     "out-of-range memory writing" : MakeString(ret));
+    return false;
   }
-  return MACE_OBFUSCATE_SYMBOL(filename_prefix);
+  VLOG(3) << "Program from Binary: " << built_program_key;
+  return true;
 }
 
-extern bool GetSourceOrBinaryProgram(const std::string &program_name,
-                                     const std::string &binary_file_name_prefix,
-                                     const cl::Context &context,
-                                     const cl::Device &device,
-                                     cl::Program *program,
-                                     bool *is_opencl_binary);
-
-void OpenCLRuntime::BuildProgram(const std::string &program_name,
-                                 const std::string &built_program_key,
-                                 const std::string &build_options,
-                                 cl::Program *program) {
-  MACE_CHECK_NOTNULL(program);
+bool OpenCLRuntime::BuildProgramFromCache(
+    const std::string &built_program_key,
+    const std::string &build_options_str,
+    cl::Program *program) {
+  // Find from binary
+  if (this->storage_ == nullptr) return false;
+  auto content = this->storage_->Find(built_program_key);
+  if (content == nullptr) {
+    return false;
+  }
 
-  std::string binary_file_name_prefix =
-      GenerateCLBinaryFilenamePrefix(built_program_key);
-  std::vector<unsigned char> program_vec;
-  bool is_opencl_binary;
-  const bool found =
-      GetSourceOrBinaryProgram(program_name, binary_file_name_prefix, context(),
-                               device(), program, &is_opencl_binary);
-  MACE_CHECK(found, "Program not found for ",
-             is_opencl_binary ? "binary: " : "source: ", built_program_key);
-
-  // Build program
-  std::string build_options_str =
-      build_options + " -Werror -cl-mad-enable -cl-fast-relaxed-math";
-  // TODO(heliangliang) -cl-unsafe-math-optimizations -cl-fast-relaxed-math
+  *program = cl::Program(context(), {device()}, {*content});
   cl_int ret = program->build({device()}, build_options_str.c_str());
   if (ret != CL_SUCCESS) {
     if (program->getBuildInfo<CL_PROGRAM_BUILD_STATUS>(device()) ==
@@ -395,20 +398,43 @@ void OpenCLRuntime::BuildProgram(const std::string &program_name,
           program->getBuildInfo<CL_PROGRAM_BUILD_LOG>(device());
       LOG(INFO) << "Program build log: " << build_log;
     }
-    LOG(FATAL) << "Build program from "
-               << (is_opencl_binary ? "binary: " : "source: ")
-               << built_program_key << " failed: "
-               << (ret == CL_INVALID_PROGRAM ? "CL_INVALID_PROGRAM, possible "
-                   "cause 1: the MACE library is built from SoC 1 but is "
-                   "used on different SoC 2, possible cause 2: the MACE "
-                   "buffer is corrupted make sure your code has no "
-                   "out-of-range memory writing" : MakeString(ret));
+    LOG(WARNING) << "Build program "
+                 << built_program_key << " from Cache failed:"
+                 << MakeString(ret);
+    return false;
   }
+  VLOG(3) << "Program from Cache: " << built_program_key;
+  return true;
+}
+
+void OpenCLRuntime::BuildProgramFromSource(
+    const std::string &program_name,
+    const std::string &built_program_key,
+    const std::string &build_options_str,
+    cl::Program *program) {
+  // Find from source
+  auto it_source = kEncryptedProgramMap.find(program_name);
+  if (it_source != kEncryptedProgramMap.end()) {
+    cl::Program::Sources sources;
+    std::string source(it_source->second.begin(), it_source->second.end());
+    std::string kernel_source = ObfuscateString(source);
+    sources.push_back(kernel_source);
+    *program = cl::Program(context(), sources);
+    cl_int ret = program->build({device()}, build_options_str.c_str());
+    if (ret != CL_SUCCESS) {
+      if (program->getBuildInfo<CL_PROGRAM_BUILD_STATUS>(device()) ==
+          CL_BUILD_ERROR) {
+        std::string build_log =
+            program->getBuildInfo<CL_PROGRAM_BUILD_LOG>(device());
+        LOG(INFO) << "Program build log: " << build_log;
+      }
+      LOG(WARNING) << "Build program "
+                   << program_name << " from source failed: "
+                   << MakeString(ret);
+      return;
+    }
 
-  if (!is_opencl_binary) {
-    // Write binary if necessary
-    std::string binary_filename =
-        kernel_path_ + binary_file_name_prefix + ".bin";
+    // Keep built program binary
     size_t device_list_size = 1;
     std::unique_ptr<size_t[]> program_binary_sizes(
         new size_t[device_list_size]);
@@ -432,7 +458,34 @@ void OpenCLRuntime::BuildProgram(const std::string &program_name,
         reinterpret_cast<unsigned char const *>(program_binaries[0].get()) +
             program_binary_sizes[0]);
 
-    MACE_CHECK(WriteFile(binary_filename, true, content));
+    if (this->storage_ != nullptr) {
+      this->storage_->Insert(built_program_key, content);
+      this->program_map_changed_ = true;
+    }
+
+    VLOG(3) << "Program from source: " << built_program_key;
+  }
+}
+
+void OpenCLRuntime::BuildProgram(const std::string &program_name,
+                                 const std::string &built_program_key,
+                                 const std::string &build_options,
+                                 cl::Program *program) {
+  MACE_CHECK_NOTNULL(program);
+
+  std::string build_options_str =
+      build_options + " -Werror -cl-mad-enable -cl-fast-relaxed-math";
+  // TODO(heliangliang) -cl-unsafe-math-optimizations -cl-fast-relaxed-math
+  bool ret = BuildProgramFromBinary(built_program_key,
+                                    build_options_str, program);
+  if (!ret) {
+    ret = BuildProgramFromCache(built_program_key,
+                                build_options_str, program);
+    // Fallback to source.
+    if (!ret) {
+      BuildProgramFromSource(program_name, built_program_key,
+                             build_options_str, program);
+    }
   }
 }
 
@@ -459,6 +512,15 @@ cl::Kernel OpenCLRuntime::BuildKernel(
   return cl::Kernel(program, kernel_name.c_str());
 }
 
+void OpenCLRuntime::SaveBuiltCLProgram() {
+  if (program_map_changed_ && storage_ != nullptr) {
+    if (storage_->Flush() != 0) {
+      LOG(FATAL) << "Store opencl compiled kernel to file failed";
+    }
+    program_map_changed_ = false;
+  }
+}
+
 void OpenCLRuntime::GetCallStats(const cl::Event &event, CallStats *stats) {
   if (stats != nullptr) {
     stats->start_micros =
@@ -480,7 +542,6 @@ uint64_t OpenCLRuntime::GetKernelMaxWorkGroupSize(const cl::Kernel &kernel) {
   return size;
 }
 
-// TODO(liuqi): not compatible with mali gpu.
 uint64_t OpenCLRuntime::GetKernelWaveSize(const cl::Kernel &kernel) {
   uint64_t size = 0;
   kernel.getWorkGroupInfo(*device_, CL_KERNEL_WAVE_SIZE_QCOM, &size);
@@ -496,7 +557,11 @@ const GPUType OpenCLRuntime::gpu_type() const {
   return gpu_type_;
 }
 
-const GPUType OpenCLRuntime::ParseGPUTypeFromDeviceName(
+const std::string OpenCLRuntime::platform_info() const {
+  return platform_info_;
+}
+
+const GPUType OpenCLRuntime::ParseGPUType(
     const std::string &device_name) {
   constexpr const char *kQualcommAdrenoGPUStr = "QUALCOMM Adreno(TM)";
   constexpr const char *kMaliGPUStr = "Mali";
@@ -512,6 +577,14 @@ const GPUType OpenCLRuntime::ParseGPUTypeFromDeviceName(
     return GPUType::UNKNOWN;
   }
 }
+const std::string OpenCLRuntime::ParseDeviceVersion(
+    const std::string &device_version) {
+  // OpenCL Device version string format:
+  // OpenCL<space><major_version.minor_version><space>\
+  // <vendor-specific information>
+  auto words = Split(device_version, ' ');
+  return words[1];
+}
 
 const bool OpenCLRuntime::IsOutOfRangeCheckEnabled() const {
   return out_of_range_check_;
diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h
index 8b29f145784a7b25ff7206b4a0bcbeaf1533475d..c06564908e56ba794963a8a70314760f31b5afc0 100644
--- a/mace/core/runtime/opencl/opencl_runtime.h
+++ b/mace/core/runtime/opencl/opencl_runtime.h
@@ -10,6 +10,7 @@
 #include <mutex>  // NOLINT(build/c++11)
 #include <set>
 #include <string>
+#include <vector>
 
 #include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
@@ -55,22 +56,27 @@ class OpenCLRuntime {
  public:
   static OpenCLRuntime *Global();
   static void Configure(GPUPerfHint, GPUPriorityHint);
+  static void Configure(std::shared_ptr<KVStorage> storage_engine);
 
   cl::Context &context();
   cl::Device &device();
   cl::CommandQueue &command_queue();
+  const GPUType gpu_type() const;
+  const std::string platform_info() const;
+
+  cl::Kernel BuildKernel(const std::string &program_name,
+                         const std::string &kernel_name,
+                         const std::set<std::string> &build_options);
 
   void GetCallStats(const cl::Event &event, CallStats *stats);
   uint64_t GetDeviceMaxWorkGroupSize();
   uint64_t GetKernelMaxWorkGroupSize(const cl::Kernel &kernel);
   uint64_t GetKernelWaveSize(const cl::Kernel &kernel);
   const bool IsNonUniformWorkgroupsSupported();
-  const GPUType ParseGPUTypeFromDeviceName(const std::string &device_name);
-  const GPUType gpu_type() const;
-  cl::Kernel BuildKernel(const std::string &program_name,
-                         const std::string &kernel_name,
-                         const std::set<std::string> &build_options);
   const bool IsOutOfRangeCheckEnabled() const;
+  const GPUType ParseGPUType(const std::string &device_name);
+  const std::string ParseDeviceVersion(const std::string &device_version);
+  void SaveBuiltCLProgram();
 
  private:
   OpenCLRuntime(GPUPerfHint, GPUPriorityHint);
@@ -82,7 +88,19 @@ class OpenCLRuntime {
                     const std::string &binary_file_name,
                     const std::string &build_options,
                     cl::Program *program);
-  std::string GenerateCLBinaryFilenamePrefix(const std::string &filename_msg);
+  bool BuildProgramFromBinary(
+      const std::string &built_program_key,
+      const std::string &build_options_str,
+      cl::Program *program);
+  bool BuildProgramFromCache(
+      const std::string &built_program_key,
+      const std::string &build_options_str,
+      cl::Program *program);
+  void BuildProgramFromSource(
+      const std::string &program_name,
+      const std::string &built_program_key,
+      const std::string &build_options_str,
+      cl::Program *program);
 
  private:
   // All OpenCL object must be a pointer and manually deleted before unloading
@@ -92,13 +110,15 @@ class OpenCLRuntime {
   std::shared_ptr<cl::CommandQueue> command_queue_;
   std::map<std::string, cl::Program> built_program_map_;
   std::mutex program_build_mutex_;
-  std::string kernel_path_;
   GPUType gpu_type_;
   std::string opencl_version_;
   bool out_of_range_check_;
+  std::string platform_info_;
+  bool program_map_changed_;
+  std::unique_ptr<KVStorage> storage_;
 
-  static GPUPerfHint gpu_perf_hint_;
-  static GPUPriorityHint gpu_priority_hint_;
+  static GPUPerfHint kGPUPerfHint;
+  static GPUPriorityHint kGPUPriorityHint;
 };
 
 }  // namespace mace
diff --git a/mace/examples/BUILD b/mace/examples/BUILD
index 7940c6bb67931aca077da743fbf51ee256e412a6..d572e11d319c3da42599055583ead80bd534e65e 100644
--- a/mace/examples/BUILD
+++ b/mace/examples/BUILD
@@ -2,8 +2,8 @@
 load("//mace:mace.bzl", "if_openmp_enabled")
 
 cc_binary(
-    name = "mace_run",
-    srcs = ["mace_run.cc"],
+    name = "example",
+    srcs = ["example.cc"],
     linkopts = if_openmp_enabled(["-fopenmp"]),
     linkstatic = 1,
     deps = [
diff --git a/mace/examples/example.cc b/mace/examples/example.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e70025d6c171087fad4393bc3620f728517ac564
--- /dev/null
+++ b/mace/examples/example.cc
@@ -0,0 +1,290 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+/**
+ * Usage:
+ * mace_run --model=mobi_mace.pb \
+ *          --input=input_node  \
+ *          --output=output_node  \
+ *          --input_shape=1,224,224,3   \
+ *          --output_shape=1,224,224,2   \
+ *          --input_file=input_data \
+ *          --output_file=mace.out  \
+ *          --model_data_file=model_data.data \
+ *          --device=OPENCL
+ */
+#include <malloc.h>
+#include <stdint.h>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <numeric>
+
+#include "gflags/gflags.h"
+#include "mace/public/mace.h"
+#include "mace/public/mace_runtime.h"
+#include "mace/utils/env_time.h"
+#include "mace/utils/logging.h"
+
+// #include "mace/codegen/models/${MACE_MODEL_TAG}/${MACE_MODEL_TAG}.h" instead
+namespace mace {
+namespace MACE_MODEL_TAG {
+
+extern const unsigned char *LoadModelData(const char *model_data_file);
+
+extern void UnloadModelData(const unsigned char *model_data);
+
+extern NetDef CreateNet(const unsigned char *model_data);
+
+extern const std::string ModelChecksum();
+
+}  // namespace MACE_MODEL_TAG
+}  // namespace mace
+
+namespace mace {
+namespace examples {
+
+namespace str_util {
+
+std::vector<std::string> Split(const std::string &str, char delims) {
+  std::vector<std::string> result;
+  std::string tmp = str;
+  while (!tmp.empty()) {
+    size_t next_offset = tmp.find(delims);
+    result.push_back(tmp.substr(0, next_offset));
+    if (next_offset == std::string::npos) {
+      break;
+    } else {
+      tmp = tmp.substr(next_offset + 1);
+    }
+  }
+  return result;
+}
+
+}  // namespace str_util
+
+void ParseShape(const std::string &str, std::vector<int64_t> *shape) {
+  std::string tmp = str;
+  while (!tmp.empty()) {
+    int dim = atoi(tmp.data());
+    shape->push_back(dim);
+    size_t next_offset = tmp.find(",");
+    if (next_offset == std::string::npos) {
+      break;
+    } else {
+      tmp = tmp.substr(next_offset + 1);
+    }
+  }
+}
+
+std::string FormatName(const std::string input) {
+  std::string res = input;
+  for (size_t i = 0; i < input.size(); ++i) {
+    if (!isalnum(res[i])) res[i] = '_';
+  }
+  return res;
+}
+
+DeviceType ParseDeviceType(const std::string &device_str) {
+  if (device_str.compare("CPU") == 0) {
+    return DeviceType::CPU;
+  } else if (device_str.compare("NEON") == 0) {
+    return DeviceType::NEON;
+  } else if (device_str.compare("OPENCL") == 0) {
+    return DeviceType::OPENCL;
+  } else if (device_str.compare("HEXAGON") == 0) {
+    return DeviceType::HEXAGON;
+  } else {
+    return DeviceType::CPU;
+  }
+}
+
+
+DEFINE_string(input_node,
+              "input_node0,input_node1",
+              "input nodes, separated by comma");
+DEFINE_string(input_shape,
+              "1,224,224,3:1,1,1,10",
+              "input shapes, separated by colon and comma");
+DEFINE_string(output_node,
+              "output_node0,output_node1",
+              "output nodes, separated by comma");
+DEFINE_string(output_shape,
+              "1,224,224,2:1,1,1,10",
+              "output shapes, separated by colon and comma");
+DEFINE_string(input_file,
+              "",
+              "input file name | input file prefix for multiple inputs.");
+DEFINE_string(output_file,
+              "",
+              "output file name | output file prefix for multiple outputs");
+DEFINE_string(model_data_file,
+              "",
+              "model data file name, used when EMBED_MODEL_DATA set to 0");
+DEFINE_string(device, "OPENCL", "CPU/NEON/OPENCL/HEXAGON");
+DEFINE_int32(round, 1, "round");
+DEFINE_int32(restart_round, 1, "restart round");
+DEFINE_int32(malloc_check_cycle, -1, "malloc debug check cycle, -1 to disable");
+DEFINE_int32(gpu_perf_hint, 2, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
+DEFINE_int32(gpu_priority_hint, 1, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
+DEFINE_int32(omp_num_threads, 8, "num of openmp threads");
+DEFINE_int32(cpu_power_option,
+             0,
+             "0:DEFAULT/1:HIGH_PERFORMANCE/2:BATTERY_SAVE");
+
+bool RunModel(const std::vector<std::string> &input_names,
+              const std::vector<std::vector<int64_t>> &input_shapes,
+              const std::vector<std::string> &output_names,
+              const std::vector<std::vector<int64_t>> &output_shapes) {
+  // load model
+  const unsigned char *model_data =
+      mace::MACE_MODEL_TAG::LoadModelData(FLAGS_model_data_file.c_str());
+  NetDef net_def = mace::MACE_MODEL_TAG::CreateNet(model_data);
+
+  DeviceType device_type = ParseDeviceType(FLAGS_device);
+
+  // config runtime
+  mace::ConfigOmpThreads(FLAGS_omp_num_threads);
+  mace::ConfigCPUPowerOption(
+      static_cast<CPUPowerOption>(FLAGS_cpu_power_option));
+  if (device_type == DeviceType::OPENCL) {
+    mace::ConfigOpenCLRuntime(
+        static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
+        static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
+  }
+
+  const std::string kernel_file_path =
+                  "/data/local/tmp/mace_run/cl";
+
+  // Config internal kv storage factory.
+  std::shared_ptr<KVStorageFactory> storage_factory(
+      new FileStorageFactory(kernel_file_path));
+  ConfigKVStorageFactory(storage_factory);
+  // Init model
+  mace::MaceEngine engine(&net_def, device_type, input_names,
+                          output_names);
+  if (device_type == DeviceType::OPENCL || device_type == DeviceType::HEXAGON) {
+    mace::MACE_MODEL_TAG::UnloadModelData(model_data);
+  }
+
+  const size_t input_count = input_names.size();
+  const size_t output_count = output_names.size();
+
+  std::map<std::string, mace::MaceTensor> inputs;
+  std::map<std::string, mace::MaceTensor> outputs;
+  for (size_t i = 0; i < input_count; ++i) {
+    // Allocate input and output
+    int64_t input_size =
+        std::accumulate(input_shapes[i].begin(), input_shapes[i].end(), 1,
+                        std::multiplies<int64_t>());
+    auto buffer_in = std::shared_ptr<float>(new float[input_size],
+                                            std::default_delete<float[]>());
+    // load input
+    std::ifstream in_file(FLAGS_input_file + "_" + FormatName(input_names[i]),
+                          std::ios::in | std::ios::binary);
+    if (in_file.is_open()) {
+      in_file.read(reinterpret_cast<char *>(buffer_in.get()),
+                   input_size * sizeof(float));
+      in_file.close();
+    } else {
+      LOG(INFO) << "Open input file failed";
+      return -1;
+    }
+    inputs[input_names[i]] = mace::MaceTensor(input_shapes[i], buffer_in);
+  }
+
+  for (size_t i = 0; i < output_count; ++i) {
+    int64_t output_size =
+        std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 1,
+                        std::multiplies<int64_t>());
+    auto buffer_out = std::shared_ptr<float>(new float[output_size],
+                                             std::default_delete<float[]>());
+    outputs[output_names[i]] = mace::MaceTensor(output_shapes[i], buffer_out);
+  }
+
+  LOG(INFO) << "Warm up run";
+  engine.Run(inputs, &outputs);
+
+  if (FLAGS_round > 0) {
+    LOG(INFO) << "Run model";
+    for (int i = 0; i < FLAGS_round; ++i) {
+      engine.Run(inputs, &outputs);
+    }
+  }
+
+  for (size_t i = 0; i < output_count; ++i) {
+    std::string output_name =
+        FLAGS_output_file + "_" + FormatName(output_names[i]);
+    std::ofstream out_file(output_name, std::ios::binary);
+    int64_t output_size =
+        std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 1,
+                        std::multiplies<int64_t>());
+    out_file.write(
+        reinterpret_cast<char *>(outputs[output_names[i]].data().get()),
+        output_size * sizeof(float));
+    out_file.flush();
+    out_file.close();
+  }
+
+  return true;
+}
+
+int Main(int argc, char **argv) {
+  gflags::SetUsageMessage("some usage message");
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
+
+  LOG(INFO) << "mace version: " << MaceVersion();
+  LOG(INFO) << "model checksum: " << mace::MACE_MODEL_TAG::ModelChecksum();
+  LOG(INFO) << "input node: " << FLAGS_input_node;
+  LOG(INFO) << "input shape: " << FLAGS_input_shape;
+  LOG(INFO) << "output node: " << FLAGS_output_node;
+  LOG(INFO) << "output shape: " << FLAGS_output_shape;
+  LOG(INFO) << "input_file: " << FLAGS_input_file;
+  LOG(INFO) << "output_file: " << FLAGS_output_file;
+  LOG(INFO) << "model_data_file: " << FLAGS_model_data_file;
+  LOG(INFO) << "device: " << FLAGS_device;
+  LOG(INFO) << "round: " << FLAGS_round;
+  LOG(INFO) << "restart_round: " << FLAGS_restart_round;
+  LOG(INFO) << "gpu_perf_hint: " << FLAGS_gpu_perf_hint;
+  LOG(INFO) << "gpu_priority_hint: " << FLAGS_gpu_priority_hint;
+  LOG(INFO) << "omp_num_threads: " << FLAGS_omp_num_threads;
+  LOG(INFO) << "cpu_power_option: " << FLAGS_cpu_power_option;
+
+  std::vector<std::string> input_names = str_util::Split(FLAGS_input_node, ',');
+  std::vector<std::string> output_names =
+      str_util::Split(FLAGS_output_node, ',');
+  std::vector<std::string> input_shapes =
+      str_util::Split(FLAGS_input_shape, ':');
+  std::vector<std::string> output_shapes =
+      str_util::Split(FLAGS_output_shape, ':');
+
+  const size_t input_count = input_shapes.size();
+  const size_t output_count = output_shapes.size();
+  std::vector<std::vector<int64_t>> input_shape_vec(input_count);
+  std::vector<std::vector<int64_t>> output_shape_vec(output_count);
+  for (size_t i = 0; i < input_count; ++i) {
+    ParseShape(input_shapes[i], &input_shape_vec[i]);
+  }
+  for (size_t i = 0; i < output_count; ++i) {
+    ParseShape(output_shapes[i], &output_shape_vec[i]);
+  }
+
+  bool ret;
+#pragma omp parallel for
+  for (int i = 0; i < FLAGS_restart_round; ++i) {
+    VLOG(0) << "restart round " << i;
+    ret =
+        RunModel(input_names, input_shape_vec, output_names, output_shape_vec);
+  }
+  if (ret) {
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+}  // namespace examples
+}  // namespace mace
+
+int main(int argc, char **argv) { mace::examples::Main(argc, argv); }
diff --git a/mace/kernels/activation.h b/mace/kernels/activation.h
index 67b01458bdabb60dd56797a4912a18f6386944ae..e62181bc3ae743e2662b3a364537de6e1d8ed88d 100644
--- a/mace/kernels/activation.h
+++ b/mace/kernels/activation.h
@@ -90,18 +90,22 @@ void DoActivation(const T *input_ptr,
 
 template <typename T>
 void PReLUActivation(const T *input_ptr,
-                     const index_t size,
+                     const index_t outer_size,
                      const index_t input_chan,
+                     const index_t inner_size,
                      const T *alpha_ptr,
                      T *output_ptr) {
-#pragma omp parallel for
-  for (index_t i = 0; i < size; ++i) {
-    const index_t chan_idx = i % input_chan;
-    T in = input_ptr[i];
-    if (in < 0) {
-      output_ptr[i] = in * alpha_ptr[chan_idx];
-    } else {
-      output_ptr[i] = in;
+#pragma omp parallel for collapse(3)
+  for (index_t i = 0; i < outer_size; ++i) {
+    for (index_t chan_idx = 0; chan_idx < input_chan; ++chan_idx) {
+      for (index_t j = 0; j < inner_size; ++j) {
+        index_t idx = i * input_chan * inner_size + chan_idx * inner_size + j;
+        if (input_ptr[idx] < 0) {
+          output_ptr[idx] = input_ptr[idx] * alpha_ptr[chan_idx];
+        } else {
+          output_ptr[idx] = input_ptr[idx];
+        }
+      }
     }
   }
 }
@@ -121,7 +125,9 @@ class ActivationFunctor {
     if (activation_ == PRELU) {
       MACE_CHECK_NOTNULL(alpha);
       const T *alpha_ptr = alpha->data<T>();
-      PReLUActivation(input_ptr, output->size(), input->dim(3), alpha_ptr,
+      const index_t outer_size = output->dim(0) * output->dim(1)
+          * output->dim(2);
+      PReLUActivation(input_ptr, outer_size, input->dim(3), 1, alpha_ptr,
                       output_ptr);
     } else {
       DoActivation(input_ptr, output_ptr, output->size(), activation_,
diff --git a/mace/kernels/arm/activation.cc b/mace/kernels/arm/activation.cc
index 30f21e03c5333292fd1e44aa71684a2e2cfcbfc7..59b94ffac8a6c1d84d580253d7694cd54fe6ffc3 100644
--- a/mace/kernels/arm/activation.cc
+++ b/mace/kernels/arm/activation.cc
@@ -17,7 +17,9 @@ void ActivationFunctor<DeviceType::NEON, float>::operator()(
   if (activation_ == PRELU) {
     MACE_CHECK_NOTNULL(alpha);
     const float *alpha_ptr = alpha->data<float>();
-    PReLUActivation(input_ptr, output->size(), input->dim(1), alpha_ptr,
+    const index_t outer_size = output->dim(0);
+    const index_t inner_size = output->dim(2) * output->dim(3);
+    PReLUActivation(input_ptr, outer_size, input->dim(1), inner_size, alpha_ptr,
                     output_ptr);
   } else {
     DoActivation(input_ptr, output_ptr, output->size(), activation_,
diff --git a/mace/kernels/arm/conv_2d.cc b/mace/kernels/arm/conv_2d.cc
index e50cac08bd1f4d7d7693079d0f47f35962ff2e10..7fc16cda27b4c3c93d491aa9caf6372247df6e96 100644
--- a/mace/kernels/arm/conv_2d.cc
+++ b/mace/kernels/arm/conv_2d.cc
@@ -7,6 +7,7 @@
 
 // winograd is always superior to neon impl during benchmark
 #define USE_WINOGRAD 1
+#define WINOGRAD_OUT_TILE_SIZE 6
 
 namespace mace {
 namespace kernels {
@@ -162,10 +163,11 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
 
   if (USE_WINOGRAD && filter_h == 3 && filter_w == 3 && stride_h == 1
     && stride_w == 1
-    && dilation_h == 1 && dilation_w == 1) {
-    extra_output_height = RoundUp<index_t>(height, 2);
+    && dilation_h == 1 && dilation_w == 1
+    && input_channels >= 8 && channels >= 8) {
+    extra_output_height = RoundUp<index_t>(height, WINOGRAD_OUT_TILE_SIZE);
     extra_input_height = std::max(padded_input_height, extra_output_height + 2);
-    extra_output_width = RoundUp<index_t>(width, 2);
+    extra_output_width = RoundUp<index_t>(width, WINOGRAD_OUT_TILE_SIZE);
     extra_input_width = std::max(padded_input_width, extra_output_width + 2);
     if (extra_input_height != padded_input_height) {
       pad_bottom += (extra_input_height - padded_input_height);
@@ -174,12 +176,15 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
       pad_right += (extra_input_width - padded_input_width);
     }
 
-    index_t tile_height_count = (extra_output_height + 1) / 2;
-    index_t tile_width_count = (extra_output_width + 1) / 2;
+    index_t tile_height_count = extra_output_height / WINOGRAD_OUT_TILE_SIZE;
+    index_t tile_width_count = extra_output_width / WINOGRAD_OUT_TILE_SIZE;
     index_t tile_count = tile_height_count * tile_width_count;
-    transformed_input_.Resize({16, batch, input_channels, tile_count});
-    transformed_filter_.Resize({16, channels, input_channels});
-    transformed_output_.Resize({16, batch, channels, tile_count});
+    index_t in_tile_area =
+      (WINOGRAD_OUT_TILE_SIZE + 2) * (WINOGRAD_OUT_TILE_SIZE + 2);
+    transformed_input_.Resize({in_tile_area, batch, input_channels,
+                               tile_count});
+    transformed_filter_.Resize({in_tile_area, channels, input_channels});
+    transformed_output_.Resize({in_tile_area, batch, channels, tile_count});
 
     conv_func = [=](const float *pad_input, float *pad_output) {
       WinoGradConv3x3s1(pad_input,
@@ -189,6 +194,7 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
                         extra_input_width,
                         input_channels,
                         channels,
+                        WINOGRAD_OUT_TILE_SIZE,
                         transformed_input_.mutable_data<float>(),
                         transformed_filter_.mutable_data<float>(),
                         transformed_output_.mutable_data<float>(),
diff --git a/mace/kernels/arm/conv_winograd.cc b/mace/kernels/arm/conv_winograd.cc
index c0509689f832d702ed23afdbd29c5b53abf42773..272e3c5227645ff228113c6d226765617aee724b 100644
--- a/mace/kernels/arm/conv_winograd.cc
+++ b/mace/kernels/arm/conv_winograd.cc
@@ -8,19 +8,20 @@
 #include "mace/kernels/arm/conv_winograd.h"
 #include "mace/kernels/gemm.h"
 #include "mace/utils/utils.h"
+#include "mace/utils/logging.h"
 
 namespace mace {
 namespace kernels {
 
 namespace {
 // NCHW => TNCB (T: in tile pixels, B: tile indices)
-void TransformInput(const float *input,
-                    const index_t batch,
-                    const index_t in_height,
-                    const index_t in_width,
-                    const index_t in_channels,
-                    const index_t tile_count,
-                    float *output) {
+void TransformInput4x4(const float *input,
+                       const index_t batch,
+                       const index_t in_height,
+                       const index_t in_width,
+                       const index_t in_channels,
+                       const index_t tile_count,
+                       float *output) {
   const index_t stride = batch * in_channels * tile_count;
   const index_t in_height_width = in_height * in_width;
 
@@ -101,12 +102,124 @@ void TransformInput(const float *input,
   }
 }
 
+// NCHW => TNCB (T: in tile pixels, B: tile indices)
+/**
+ * BT =
+⎡1   0    -21/4    0    21/4     0    -1  0⎤
+⎢                                          ⎥
+⎢0   1      1    -17/4  -17/4    1    1   0⎥
+⎢                                          ⎥
+⎢0   -1     1    17/4   -17/4   -1    1   0⎥
+⎢                                          ⎥
+⎢0  1/2    1/4   -5/2   -5/4     2    1   0⎥
+⎢                                          ⎥
+⎢0  -1/2   1/4    5/2   -5/4    -2    1   0⎥
+⎢                                          ⎥
+⎢0   2      4    -5/2    -5     1/2   1   0⎥
+⎢                                          ⎥
+⎢0   -2     4     5/2    -5    -1/2   1   0⎥
+⎢                                          ⎥
+⎣0   -1     0    21/4     0    -21/4  0   1⎦
+
+ * @param input
+ * @param batch
+ * @param in_height
+ * @param in_width
+ * @param in_channels
+ * @param tile_count
+ * @param output
+ */
+void TransformInput8x8(const float *input,
+                       const index_t batch,
+                       const index_t in_height,
+                       const index_t in_width,
+                       const index_t in_channels,
+                       const index_t tile_count,
+                       float *output) {
+  const index_t stride = batch * in_channels * tile_count;
+  const index_t in_height_width = in_height * in_width;
+
+#pragma omp parallel for
+  for (index_t nc = 0; nc < batch * in_channels; ++nc) {
+    index_t tile_index = nc * tile_count;
+    float s[8][8];
+    for (index_t h = 0; h < in_height - 2; h += 6) {
+      for (index_t w = 0; w < in_width - 2; w += 6) {
+        index_t tile_offset = nc * in_height_width + h * in_width + w;
+        for (int i = 0; i < 8; ++i) {
+          float d0, d1, d2, d3, d4, d5, d6, d7;
+          d0 = input[tile_offset];
+          d1 = input[tile_offset + 1];
+          d2 = input[tile_offset + 2];
+          d3 = input[tile_offset + 3];
+          d4 = input[tile_offset + 4];
+          d5 = input[tile_offset + 5];
+          d6 = input[tile_offset + 6];
+          d7 = input[tile_offset + 7];
+
+          s[i][0] = d0 - d6 + (d4 - d2) * 5.25;
+          s[i][7] = d7 - d1 + (d3 - d5) * 5.25;
+
+          float u = d2 + d6 - d4 * 4.25;
+          float v = d1 + d5 - d3 * 4.25;
+          s[i][1] = u + v;
+          s[i][2] = u - v;
+
+          u = d6 + d2 * 0.25 - d4 * 1.25;
+          v = d1 * 0.5 - d3 * 2.5 + d5 * 2;
+          s[i][3] = u + v;
+          s[i][4] = u - v;
+
+          u = d6 + (d2 - d4 * 1.25) * 4;
+          v = d1 * 2 - d3 * 2.5 + d5 * 0.5;
+          s[i][5] = u + v;
+          s[i][6] = u - v;
+
+          tile_offset += in_width;
+        }
+
+        for (int i = 0; i < 8; ++i) {
+          float d0, d1, d2, d3, d4, d5, d6, d7;
+          d0 = s[0][i];
+          d1 = s[1][i];
+          d2 = s[2][i];
+          d3 = s[3][i];
+          d4 = s[4][i];
+          d5 = s[5][i];
+          d6 = s[6][i];
+          d7 = s[7][i];
+
+          output[tile_index + i * stride] = d0 - d6 + (d4 - d2) * 5.25;
+          output[tile_index + (56 + i) * stride] = d7 - d1 + (d3 - d5) * 5.25;
+
+          float u = d2 + d6 - d4 * 4.25;
+          float v = d1 + d5 - d3 * 4.25;
+          output[tile_index + (8 + i) * stride] = u + v;
+          output[tile_index + (16 + i) * stride] = u - v;
+
+          u = d6 + d2 * 0.25 - d4 * 1.25;
+          v = d1 * 0.5 - d3 * 2.5 + d5 * 2;
+          output[tile_index + (24 + i) * stride] = u + v;
+          output[tile_index + (32 + i) * stride] = u - v;
+
+          u = d6 + (d2 - d4 * 1.25) * 4;
+          v = d1 * 2 - d3 * 2.5 + d5 * 0.5;
+          output[tile_index + (40 + i) * stride] = u + v;
+          output[tile_index + (48 + i) * stride] = u - v;
+        }
+
+        ++tile_index;
+      }
+    }
+  }
+}
+
 // OCHW => TOC
 // no need to optimize, it will exist in converter
-void TransformFilter(const float *filter,
-                     const index_t in_channels,
-                     const index_t out_channels,
-                     float *output) {
+void TransformFilter4x4(const float *filter,
+                        const index_t in_channels,
+                        const index_t out_channels,
+                        float *output) {
   const index_t stride = out_channels * in_channels;
 
 #pragma omp parallel for collapse(2)
@@ -171,6 +284,83 @@ void TransformFilter(const float *filter,
   }
 }
 
+// OCHW => TOC
+// no need to optimize, it will exist in converter
+/**
+ * G =
+⎡ 1      0      0  ⎤
+⎢                  ⎥
+⎢-2/9  -2/9   -2/9 ⎥
+⎢                  ⎥
+⎢-2/9   2/9   -2/9 ⎥
+⎢                  ⎥
+⎢1/90  1/45   2/45 ⎥
+⎢                  ⎥
+⎢1/90  -1/45  2/45 ⎥
+⎢                  ⎥
+⎢1/45  1/90   1/180⎥
+⎢                  ⎥
+⎢1/45  -1/90  1/180⎥
+⎢                  ⎥
+⎣ 0      0      1  ⎦
+ *
+ * @param filter
+ * @param in_channels
+ * @param out_channels
+ * @param output
+ */
+void TransformFilter8x8(const float *filter,
+                        const index_t in_channels,
+                        const index_t out_channels,
+                        float *output) {
+  const index_t stride = out_channels * in_channels;
+
+  const float G[8][3] = {
+    {1.0f, 0.0f, 0.0f},
+    {-2.0f / 9, -2.0f / 9, -2.0f / 9},
+    {-2.0f / 9, 2.0f / 9, -2.0f / 9},
+    {1.0f / 90, 1.0f / 45, 2.0f / 45},
+    {1.0f / 90, -1.0f / 45, 2.0f / 45},
+    {1.0f / 45, 1.0f / 90, 1.0f / 180},
+    {1.0f / 45, -1.0f / 90, 1.0f / 180},
+    {0.0f, 0.0f, 1.0f}
+  };
+
+#pragma omp parallel for collapse(2)
+  for (index_t m = 0; m < out_channels; ++m) {
+    for (index_t c = 0; c < in_channels; ++c) {
+      // load filter
+      index_t filter_offset = (m * in_channels + c) * 9;
+      float g0, g1, g2, g3, g4, g5, g6, g7, g8;
+      g0 = filter[filter_offset];
+      g1 = filter[filter_offset + 1];
+      g2 = filter[filter_offset + 2];
+      g3 = filter[filter_offset + 3];
+      g4 = filter[filter_offset + 4];
+      g5 = filter[filter_offset + 5];
+      g6 = filter[filter_offset + 6];
+      g7 = filter[filter_offset + 7];
+      g8 = filter[filter_offset + 8];
+
+      float s[3][8];
+      for (int i = 0; i < 8; ++i) {
+        s[0][i] = g0 * G[i][0] + g1 * G[i][1] + g2 * G[i][2];
+        s[1][i] = g3 * G[i][0] + g4 * G[i][1] + g5 * G[i][2];
+        s[2][i] = g6 * G[i][0] + g7 * G[i][1] + g8 * G[i][2];
+      }
+
+      // store output
+      index_t output_offset = m * in_channels + c;
+      for (int i = 0; i < 8; ++i) {
+        for (int j = 0; j < 8; ++j) {
+          output[output_offset + (i * 8 + j) * stride] =
+            G[i][0] * s[0][j] + G[i][1] * s[1][j] + G[i][2] * s[2][j];
+        }
+      }
+    }
+  }
+}
+
 // TOC * TNCB => TNOB
 void BatchGemm(const float *input,
                const float *filter,
@@ -178,17 +368,24 @@ void BatchGemm(const float *input,
                index_t in_channels,
                index_t out_channels,
                index_t tile_count,
+               int out_tile_size,
                float *output) {
   const index_t in_stride = batch * in_channels * tile_count;
   const index_t in_channels_tile_count = in_channels * tile_count;
   const index_t filter_stride = out_channels * in_channels;
   const index_t out_stride = batch * out_channels * tile_count;
   const index_t out_channels_tile_count = out_channels * tile_count;
-
+  const int in_tile_area = (out_tile_size + 2) * (out_tile_size + 2);
   if (batch == 1) {
-    Gemm(filter, input, 16, out_channels, in_channels, tile_count, output);
+    Gemm(filter,
+         input,
+         in_tile_area,
+         out_channels,
+         in_channels,
+         tile_count,
+         output);
   } else {
-    for (int i = 0; i < 16; ++i) {
+    for (int i = 0; i < in_tile_area; ++i) {
       for (int b = 0; b < batch; ++b) {
         const float
           *in_ptr = input + i * in_stride + b * in_channels_tile_count;
@@ -207,13 +404,13 @@ void BatchGemm(const float *input,
 }
 
 // TNOB => ToNOB => NOHoWo
-void TransformOutput(const float *input,
-                     index_t batch,
-                     index_t out_height,
-                     index_t out_width,
-                     index_t out_channels,
-                     index_t tile_count,
-                     float *output) {
+void TransformOutput4x4(const float *input,
+                        index_t batch,
+                        index_t out_height,
+                        index_t out_width,
+                        index_t out_channels,
+                        index_t tile_count,
+                        float *output) {
   const index_t in_stride = batch * out_channels * tile_count;
 
 #pragma omp parallel for
@@ -272,39 +469,102 @@ void TransformOutput(const float *input,
   }
 }
 
-void ConvRef3x3s1(const float *input,
-                  const float *filter,
-                  const index_t batch,
-                  const index_t in_height,
-                  const index_t in_width,
-                  const index_t in_channels,
-                  const index_t out_channels,
-                  float *output) {
-  index_t out_height = in_height - 2;
-  index_t out_width = in_width - 2;
+// TNOB => ToNOB => NOHoWo
+/**
+ * AT =
+⎡1  1  1   1    1   32  32   0⎤
+⎢                             ⎥
+⎢0  1  -1  2   -2   16  -16  0⎥
+⎢                             ⎥
+⎢0  1  1   4    4   8    8   0⎥
+⎢                             ⎥
+⎢0  1  -1  8   -8   4   -4   0⎥
+⎢                             ⎥
+⎢0  1  1   16  16   2    2   0⎥
+⎢                             ⎥
+⎣0  1  -1  32  -32  1   -1   1⎦
+ *
+ * @param input
+ * @param batch
+ * @param out_height
+ * @param out_width
+ * @param out_channels
+ * @param tile_count
+ * @param output
+ */
+void TransformOutput8x8(const float *input,
+                        index_t batch,
+                        index_t out_height,
+                        index_t out_width,
+                        index_t out_channels,
+                        index_t tile_count,
+                        float *output) {
+  const index_t in_stride = batch * out_channels * tile_count;
 
-#pragma omp parallel for collapse(4)
-  for (index_t b = 0; b < batch; ++b) {
-    for (index_t m = 0; m < out_channels; ++m) {
-      for (index_t h = 0; h < out_height; ++h) {
-        for (index_t w = 0; w < out_width; ++w) {
-          index_t out_offset =
-            ((b * out_channels + m) * out_height + h) * out_width + w;
-          output[out_offset] = 0;
-          for (index_t c = 0; c < in_channels; ++c) {
-            for (index_t kh = 0; kh < 3; ++kh) {
-              for (index_t kw = 0; kw < 3; ++kw) {
-                index_t ih = h + kh;
-                index_t iw = w + kw;
-                index_t in_offset =
-                  ((b * in_channels + c) * in_height + ih) * in_width + iw;
-                index_t
-                  filter_offset = (((m * in_channels) + c) * 3 + kh) * 3 + kw;
-                output[out_offset] += input[in_offset] * filter[filter_offset];
-              }
-            }
-          }
+#pragma omp parallel for
+  for (index_t nm = 0; nm < batch * out_channels; ++nm) {
+    index_t tile_offset = nm * tile_count;
+    float s[8][6];
+    for (index_t h = 0; h < out_height; h += 6) {
+      for (index_t w = 0; w < out_width; w += 6) {
+        index_t tile_offset_tmp = tile_offset;
+        for (int i = 0; i < 8; ++i) {
+          float d0, d1, d2, d3, d4, d5, d6, d7;
+          d0 = input[tile_offset_tmp + 0 * in_stride];
+          d1 = input[tile_offset_tmp + 1 * in_stride];
+          d2 = input[tile_offset_tmp + 2 * in_stride];
+          d3 = input[tile_offset_tmp + 3 * in_stride];
+          d4 = input[tile_offset_tmp + 4 * in_stride];
+          d5 = input[tile_offset_tmp + 5 * in_stride];
+          d6 = input[tile_offset_tmp + 6 * in_stride];
+          d7 = input[tile_offset_tmp + 7 * in_stride];
+
+          float u = d1 + d2;
+          float v = d1 - d2;
+          float w = d3 + d4;
+          float x = d3 - d4;
+          float y = d5 + d6;
+          float z = d5 - d6;
+
+          s[i][0] = d0 + u + w + y * 32;
+          s[i][1] = v + x + x + z * 16;
+          s[i][2] = u + w * 4 + y * 8;
+          s[i][3] = v + x * 8 + z * 4;
+          s[i][4] = u + w * 16 + y + y;
+          s[i][5] = v + x * 32 + z + d7;
+
+          tile_offset_tmp += 8 * in_stride;
+        }
+
+        index_t out_offset = nm * out_height * out_width + h * out_width + w;
+
+        for (int i = 0; i < 6; ++i) {
+          float d0, d1, d2, d3, d4, d5, d6, d7;
+          d0 = s[0][i];
+          d1 = s[1][i];
+          d2 = s[2][i];
+          d3 = s[3][i];
+          d4 = s[4][i];
+          d5 = s[5][i];
+          d6 = s[6][i];
+          d7 = s[7][i];
+
+          float u = d1 + d2;
+          float v = d1 - d2;
+          float w = d3 + d4;
+          float x = d3 - d4;
+          float y = d5 + d6;
+          float z = d5 - d6;
+
+          output[out_offset + 0 * out_width + i] = d0 + u + w + y * 32;
+          output[out_offset + 1 * out_width + i] = v + x + x + z * 16;
+          output[out_offset + 2 * out_width + i] = u + w * 4 + y * 8;
+          output[out_offset + 3 * out_width + i] = v + x * 8 + z * 4;
+          output[out_offset + 4 * out_width + i] = u + w * 16 + y + y;
+          output[out_offset + 5 * out_width + i] = v + x * 32 + z + d7;
         }
+
+        ++tile_offset;
       }
     }
   }
@@ -318,6 +578,7 @@ void WinoGradConv3x3s1(const float *input,
                        const index_t in_width,
                        const index_t in_channels,
                        const index_t out_channels,
+                       const int out_tile_size,
                        float *transformed_input,
                        float *transformed_filter,
                        float *transformed_output,
@@ -325,22 +586,52 @@ void WinoGradConv3x3s1(const float *input,
                        float *output) {
   index_t out_height = in_height - 2;
   index_t out_width = in_width - 2;
-  index_t tile_height_count = (out_height + 1) / 2;
-  index_t tile_width_count = (out_width + 1) / 2;
+  index_t tile_height_count =
+    RoundUpDiv(out_height, static_cast<index_t>(out_tile_size));
+  index_t tile_width_count =
+    RoundUpDiv(out_width, static_cast<index_t>(out_tile_size));
   index_t tile_count = tile_height_count * tile_width_count;
 
-  TransformInput(input,
-                 batch,
-                 in_height,
-                 in_width,
-                 in_channels,
-                 tile_count,
-                 transformed_input);
+  switch (out_tile_size) {
+    case 2:
+      TransformInput4x4(input,
+                        batch,
+                        in_height,
+                        in_width,
+                        in_channels,
+                        tile_count,
+                        transformed_input);
+      break;
+    case 6:
+      TransformInput8x8(input,
+                        batch,
+                        in_height,
+                        in_width,
+                        in_channels,
+                        tile_count,
+                        transformed_input);
+      break;
+    default:MACE_NOT_IMPLEMENTED;
+  }
 
   // TODO(liyin): put it in model converter, but do not worry, it is fast and
   // will only do once
   if (!is_filter_transformed) {
-    TransformFilter(filter, in_channels, out_channels, transformed_filter);
+    switch (out_tile_size) {
+      case 2:
+        TransformFilter4x4(filter,
+                           in_channels,
+                           out_channels,
+                           transformed_filter);
+        break;
+      case 6:
+        TransformFilter8x8(filter,
+                           in_channels,
+                           out_channels,
+                           transformed_filter);
+        break;
+      default:MACE_NOT_IMPLEMENTED;
+    }
   }
 
   BatchGemm(transformed_input,
@@ -349,15 +640,30 @@ void WinoGradConv3x3s1(const float *input,
             in_channels,
             out_channels,
             tile_count,
+            out_tile_size,
             transformed_output);
 
-  TransformOutput(transformed_output,
-                  batch,
-                  out_height,
-                  out_width,
-                  out_channels,
-                  tile_count,
-                  output);
+  switch (out_tile_size) {
+    case 2:
+      TransformOutput4x4(transformed_output,
+                         batch,
+                         out_height,
+                         out_width,
+                         out_channels,
+                         tile_count,
+                         output);
+      break;
+    case 6:
+      TransformOutput8x8(transformed_output,
+                         batch,
+                         out_height,
+                         out_width,
+                         out_channels,
+                         tile_count,
+                         output);
+      break;
+    default:MACE_NOT_IMPLEMENTED;
+  }
 }
 
 void WinoGradConv3x3s1(const float *input,
@@ -367,16 +673,21 @@ void WinoGradConv3x3s1(const float *input,
                        const index_t in_width,
                        const index_t in_channels,
                        const index_t out_channels,
+                       const int out_tile_size,
                        float *output) {
   index_t out_height = in_height - 2;
   index_t out_width = in_width - 2;
-  index_t tile_height_count = (out_height + 1) / 2;
-  index_t tile_width_count = (out_width + 1) / 2;
+  index_t tile_height_count =
+    RoundUpDiv(out_height, static_cast<index_t>(out_tile_size));
+  index_t tile_width_count =
+    RoundUpDiv(out_width, static_cast<index_t>(out_tile_size));
   index_t tile_count = tile_height_count * tile_width_count;
-
-  index_t transformed_input_size = 16 * batch * in_channels * tile_count;
-  index_t transformed_filter_size = 16 * out_channels * in_channels;
-  index_t transformed_output_size = 16 * batch * out_channels * tile_count;
+  index_t in_tile_area = (out_tile_size + 2) * (out_tile_size + 2);
+  index_t transformed_input_size =
+    in_tile_area * batch * in_channels * tile_count;
+  index_t transformed_filter_size = in_tile_area * out_channels * in_channels;
+  index_t
+    transformed_output_size = in_tile_area * batch * out_channels * tile_count;
 
   float *transformed_input = new float[transformed_input_size];  // TNCB
   float *transformed_filter = new float[transformed_filter_size];  // TOC
@@ -389,6 +700,7 @@ void WinoGradConv3x3s1(const float *input,
                     in_width,
                     in_channels,
                     out_channels,
+                    out_tile_size,
                     transformed_input,
                     transformed_filter,
                     transformed_output,
@@ -400,5 +712,44 @@ void WinoGradConv3x3s1(const float *input,
   delete[]transformed_output;
 }
 
+void ConvRef3x3s1(const float *input,
+                  const float *filter,
+                  const index_t batch,
+                  const index_t in_height,
+                  const index_t in_width,
+                  const index_t in_channels,
+                  const index_t out_channels,
+                  float *output) {
+  index_t out_height = in_height - 2;
+  index_t out_width = in_width - 2;
+
+#pragma omp parallel for collapse(4)
+  for (index_t b = 0; b < batch; ++b) {
+    for (index_t m = 0; m < out_channels; ++m) {
+      for (index_t h = 0; h < out_height; ++h) {
+        for (index_t w = 0; w < out_width; ++w) {
+          index_t out_offset =
+            ((b * out_channels + m) * out_height + h) * out_width + w;
+          output[out_offset] = 0;
+          for (index_t c = 0; c < in_channels; ++c) {
+            for (index_t kh = 0; kh < 3; ++kh) {
+              for (index_t kw = 0; kw < 3; ++kw) {
+                index_t ih = h + kh;
+                index_t iw = w + kw;
+                index_t in_offset =
+                  ((b * in_channels + c) * in_height + ih) * in_width + iw;
+                index_t
+                  filter_offset = (((m * in_channels) + c) * 3 + kh) * 3 + kw;
+                output[out_offset] +=
+                  input[in_offset] * filter[filter_offset];
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
 }  // namespace kernels
 }  // namespace mace
diff --git a/mace/kernels/arm/conv_winograd.h b/mace/kernels/arm/conv_winograd.h
index 7611d65ae5e2a57b4542df40bc4d6bef3d04538d..d058a29c159f07ddb92a984f6e93ba38926c3f62 100644
--- a/mace/kernels/arm/conv_winograd.h
+++ b/mace/kernels/arm/conv_winograd.h
@@ -21,6 +21,7 @@ void WinoGradConv3x3s1(const float *input,
                        const index_t in_width,
                        const index_t in_channels,
                        const index_t out_channels,
+                       const int out_tile_size,
                        float *output);
 
 void WinoGradConv3x3s1(const float *input,
@@ -30,12 +31,22 @@ void WinoGradConv3x3s1(const float *input,
                        const index_t in_width,
                        const index_t in_channels,
                        const index_t out_channels,
+                       const int out_tile_size,
                        float *transformed_input,
                        float *transformed_filter,
                        float *transformed_output,
                        bool is_filter_transformed,
                        float *output);
 
+void ConvRef3x3s1(const float *input,
+                  const float *filter,
+                  const index_t batch,
+                  const index_t in_height,
+                  const index_t in_width,
+                  const index_t in_channels,
+                  const index_t out_channels,
+                  float *output);
+
 }  // namespace kernels
 }  // namespace mace
 
diff --git a/mace/kernels/arm/conv_winograd_test.cc b/mace/kernels/arm/conv_winograd_test.cc
index 52be053bd9e5cb2d57a5c2d3c1b1fce322752996..4cb591ec1b6ae422fa963ec1505aed8df2dc874c 100644
--- a/mace/kernels/arm/conv_winograd_test.cc
+++ b/mace/kernels/arm/conv_winograd_test.cc
@@ -58,11 +58,12 @@ TEST(ConvWinogradTest, winograd) {
                              in_width,
                              in_channels,
                              out_channels,
+                             6,
                              output_data);
 
   // test
   for (index_t i = 0; i < output_size; ++i) {
-    EXPECT_NEAR(output_data_ref[i], output_data[i], 0.1);
+    EXPECT_NEAR(output_data_ref[i], output_data[i], 0.1) << " with index " << i;
   }
 
   delete[]input_data;
diff --git a/mace/kernels/gemm.cc b/mace/kernels/gemm.cc
index 00be4829802ede5fadbc0244917f56fcf0dd6025..b200c7650d596d868022d292029fa29fb1abcc18 100644
--- a/mace/kernels/gemm.cc
+++ b/mace/kernels/gemm.cc
@@ -13,22 +13,6 @@ namespace mace {
 namespace kernels {
 
 namespace {
-void GemmRef(const float *A,
-             const float *B,
-             const index_t height,
-             const index_t K,
-             const index_t width,
-             float *C) {
-  memset(C, 0, sizeof(float) * height * width);
-  for (int i = 0; i < height; ++i) {
-    for (int j = 0; j < width; ++j) {
-      for (int k = 0; k < K; ++k) {
-        C[i * width + j] += A[i * K + k] * B[k * width + j];
-      }
-    }
-  }
-}
-
 inline void GemmBlock(const float *A,
                       const float *B,
                       const index_t height,
@@ -49,8 +33,8 @@ inline void GemmBlock(const float *A,
 // TODO(liyin): may need implement 883 since RGB
 inline void Gemm884(const float *a_ptr,
                     const float *b_ptr,
-                    index_t stride_w,
                     index_t stride_k,
+                    index_t stride_w,
                     float *c_ptr) {
 #if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
   float32x4_t a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14,
@@ -136,29 +120,300 @@ inline void GemmTile(const float *A,
                      float *C) {
   index_t h, w, k;
   for (h = 0; h + 7 < height; h += 8) {
-    for (w = 0; w + 3 < width; w += 4) {
-      for (k = 0; k + 7 < K; k += 8) {
-        const float *a_ptr = A + (h * stride_k + k);
+    for (k = 0; k + 7 < K; k += 8) {
+      const float *a_ptr = A + (h * stride_k + k);
+
+#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
+
+#ifdef __clang__
+      int nw = width >> 2;
+      if (nw > 0) {
+        // load A
+        float32x4_t a0, a1, a2, a3, a4, a5, a6, a7,
+          a8, a9, a10, a11, a12, a13, a14, a15;
+        a0 = vld1q_f32(a_ptr);
+        a1 = vld1q_f32(a_ptr + 4);
+        a2 = vld1q_f32(a_ptr + 1 * stride_k);
+        a3 = vld1q_f32(a_ptr + 1 * stride_k + 4);
+        a4 = vld1q_f32(a_ptr + 2 * stride_k);
+        a5 = vld1q_f32(a_ptr + 2 * stride_k + 4);
+        a6 = vld1q_f32(a_ptr + 3 * stride_k);
+        a7 = vld1q_f32(a_ptr + 3 * stride_k + 4);
+        a8 = vld1q_f32(a_ptr + 4 * stride_k);
+        a9 = vld1q_f32(a_ptr + 4 * stride_k + 4);
+        a10 = vld1q_f32(a_ptr + 5 * stride_k);
+        a11 = vld1q_f32(a_ptr + 5 * stride_k + 4);
+        a12 = vld1q_f32(a_ptr + 6 * stride_k);
+        a13 = vld1q_f32(a_ptr + 6 * stride_k + 4);
+        a14 = vld1q_f32(a_ptr + 7 * stride_k);
+        a15 = vld1q_f32(a_ptr + 7 * stride_k + 4);
+
+        const float *b_ptr0 = B + k * stride_w;
+        const float *b_ptr1 = B + (k + 1) * stride_w;
+        const float *b_ptr2 = B + (k + 2) * stride_w;
+        const float *b_ptr3 = B + (k + 3) * stride_w;
+        const float *b_ptr4 = B + (k + 4) * stride_w;
+        const float *b_ptr5 = B + (k + 5) * stride_w;
+        const float *b_ptr6 = B + (k + 6) * stride_w;
+        const float *b_ptr7 = B + (k + 7) * stride_w;
+
+        float *c_ptr0 = C + h * stride_w;
+        float *c_ptr1 = C + (h + 1) * stride_w;
+        float *c_ptr2 = C + (h + 2) * stride_w;
+        float *c_ptr3 = C + (h + 3) * stride_w;
+        float *c_ptr4 = C + (h + 4) * stride_w;
+        float *c_ptr5 = C + (h + 5) * stride_w;
+        float *c_ptr6 = C + (h + 6) * stride_w;
+        float *c_ptr7 = C + (h + 7) * stride_w;
+
+        asm volatile(
+        "0:                                 \n"
+
+          "prfm   pldl1keep, [%1, #128]       \n"
+          "ld1    {v24.4s}, [%1]              \n"
+
+          // load b: 0-7
+          "prfm   pldl1keep, [%9, #128]       \n"
+          "ld1    {v16.4s}, [%9], #16         \n"
+
+          "prfm   pldl1keep, [%10, #128]      \n"
+          "ld1    {v17.4s}, [%10], #16        \n"
+
+          "prfm   pldl1keep, [%11, #128]      \n"
+          "ld1    {v18.4s}, [%11], #16        \n"
+
+          "prfm   pldl1keep, [%12, #128]      \n"
+          "ld1    {v19.4s}, [%12], #16        \n"
+
+          "prfm   pldl1keep, [%2, #128]       \n"
+          "ld1    {v25.4s}, [%2]              \n"
+
+          "prfm   pldl1keep, [%13, #128]      \n"
+          "ld1    {v20.4s}, [%13], #16        \n"
+
+          "prfm   pldl1keep, [%14, #128]      \n"
+          "ld1    {v21.4s}, [%14], #16        \n"
+
+          "prfm   pldl1keep, [%15, #128]      \n"
+          "ld1    {v22.4s}, [%15], #16        \n"
+
+          "prfm   pldl1keep, [%16, #128]      \n"
+          "ld1    {v23.4s}, [%16], #16        \n"
+
+          "prfm   pldl1keep, [%3, #128]       \n"
+          "ld1    {v26.4s}, [%3]              \n"
+
+          "fmla   v24.4s, v16.4s, %34.s[0]    \n"
+          "fmla   v24.4s, v17.4s, %34.s[1]    \n"
+          "fmla   v24.4s, v18.4s, %34.s[2]    \n"
+          "fmla   v24.4s, v19.4s, %34.s[3]    \n"
+
+          "fmla   v24.4s, v20.4s, %35.s[0]    \n"
+          "fmla   v24.4s, v21.4s, %35.s[1]    \n"
+          "fmla   v24.4s, v22.4s, %35.s[2]    \n"
+          "fmla   v24.4s, v23.4s, %35.s[3]    \n"
+
+          "st1    {v24.4s}, [%1], #16         \n"
+
+          "fmla   v25.4s, v16.4s, %36.s[0]    \n"
+          "fmla   v25.4s, v17.4s, %36.s[1]    \n"
+          "fmla   v25.4s, v18.4s, %36.s[2]    \n"
+          "fmla   v25.4s, v19.4s, %36.s[3]    \n"
+
+          "fmla   v25.4s, v20.4s, %37.s[0]    \n"
+          "fmla   v25.4s, v21.4s, %37.s[1]    \n"
+          "fmla   v25.4s, v22.4s, %37.s[2]    \n"
+          "fmla   v25.4s, v23.4s, %37.s[3]    \n"
+
+          "prfm   pldl1keep, [%4, #128]       \n"
+          "ld1    {v24.4s}, [%4]              \n"
+
+          "st1    {v25.4s}, [%2], #16         \n"
+
+          "fmla   v26.4s, v16.4s, %38.s[0]    \n"
+          "fmla   v26.4s, v17.4s, %38.s[1]    \n"
+          "fmla   v26.4s, v18.4s, %38.s[2]    \n"
+          "fmla   v26.4s, v19.4s, %38.s[3]    \n"
+
+          "fmla   v26.4s, v20.4s, %39.s[0]    \n"
+          "fmla   v26.4s, v21.4s, %39.s[1]    \n"
+          "fmla   v26.4s, v22.4s, %39.s[2]    \n"
+          "fmla   v26.4s, v23.4s, %39.s[3]    \n"
+
+          "prfm   pldl1keep, [%5, #128]       \n"
+          "ld1    {v25.4s}, [%5]              \n"
+
+          "st1    {v26.4s}, [%3], #16         \n"
+
+          "fmla   v24.4s, v16.4s, %40.s[0]    \n"
+          "fmla   v24.4s, v17.4s, %40.s[1]    \n"
+          "fmla   v24.4s, v18.4s, %40.s[2]    \n"
+          "fmla   v24.4s, v19.4s, %40.s[3]    \n"
+
+          "fmla   v24.4s, v20.4s, %41.s[0]    \n"
+          "fmla   v24.4s, v21.4s, %41.s[1]    \n"
+          "fmla   v24.4s, v22.4s, %41.s[2]    \n"
+          "fmla   v24.4s, v23.4s, %41.s[3]    \n"
+
+          "prfm   pldl1keep, [%6, #128]       \n"
+          "ld1    {v26.4s}, [%6]              \n"
+
+          "st1    {v24.4s}, [%4], #16         \n"
+
+          "fmla   v25.4s, v16.4s, %42.s[0]    \n"
+          "fmla   v25.4s, v17.4s, %42.s[1]    \n"
+          "fmla   v25.4s, v18.4s, %42.s[2]    \n"
+          "fmla   v25.4s, v19.4s, %42.s[3]    \n"
+
+          "fmla   v25.4s, v20.4s, %43.s[0]    \n"
+          "fmla   v25.4s, v21.4s, %43.s[1]    \n"
+          "fmla   v25.4s, v22.4s, %43.s[2]    \n"
+          "fmla   v25.4s, v23.4s, %43.s[3]    \n"
+
+          "prfm   pldl1keep, [%7, #128]       \n"
+          "ld1    {v24.4s}, [%7]              \n"
+
+          "st1    {v25.4s}, [%5], #16         \n"
+
+          "fmla   v26.4s, v16.4s, %44.s[0]    \n"
+          "fmla   v26.4s, v17.4s, %44.s[1]    \n"
+          "fmla   v26.4s, v18.4s, %44.s[2]    \n"
+          "fmla   v26.4s, v19.4s, %44.s[3]    \n"
+
+          "fmla   v26.4s, v20.4s, %45.s[0]    \n"
+          "fmla   v26.4s, v21.4s, %45.s[1]    \n"
+          "fmla   v26.4s, v22.4s, %45.s[2]    \n"
+          "fmla   v26.4s, v23.4s, %45.s[3]    \n"
+
+          "prfm   pldl1keep, [%8, #128]       \n"
+          "ld1    {v25.4s}, [%8]              \n"
+
+          "st1    {v26.4s}, [%6], #16         \n"
+
+          "fmla   v24.4s, v16.4s, %46.s[0]    \n"
+          "fmla   v24.4s, v17.4s, %46.s[1]    \n"
+          "fmla   v24.4s, v18.4s, %46.s[2]    \n"
+          "fmla   v24.4s, v19.4s, %46.s[3]    \n"
+
+          "fmla   v24.4s, v20.4s, %47.s[0]    \n"
+          "fmla   v24.4s, v21.4s, %47.s[1]    \n"
+          "fmla   v24.4s, v22.4s, %47.s[2]    \n"
+          "fmla   v24.4s, v23.4s, %47.s[3]    \n"
+
+          "st1    {v24.4s}, [%7], #16         \n"
+
+          "fmla   v25.4s, v16.4s, %48.s[0]    \n"
+          "fmla   v25.4s, v17.4s, %48.s[1]    \n"
+          "fmla   v25.4s, v18.4s, %48.s[2]    \n"
+          "fmla   v25.4s, v19.4s, %48.s[3]    \n"
+
+          "fmla   v25.4s, v20.4s, %49.s[0]    \n"
+          "fmla   v25.4s, v21.4s, %49.s[1]    \n"
+          "fmla   v25.4s, v22.4s, %49.s[2]    \n"
+          "fmla   v25.4s, v23.4s, %49.s[3]    \n"
+
+          "st1    {v25.4s}, [%8], #16         \n"
+
+          "subs   %w0, %w0, #1                \n"
+          "bne    0b                          \n"
+        : "=r"(nw),      // 0
+        "=r"(c_ptr0),  // 1
+        "=r"(c_ptr1),  // 2
+        "=r"(c_ptr2),  // 3
+        "=r"(c_ptr3),  // 4
+        "=r"(c_ptr4),  // 5
+        "=r"(c_ptr5),  // 6
+        "=r"(c_ptr6),  // 7
+        "=r"(c_ptr7),  // 8
+        "=r"(b_ptr0),  // 9
+        "=r"(b_ptr1),  // 10
+        "=r"(b_ptr2),  // 11
+        "=r"(b_ptr3),  // 12
+        "=r"(b_ptr4),  // 13
+        "=r"(b_ptr5),  // 14
+        "=r"(b_ptr6),  // 15
+        "=r"(b_ptr7)   // 16
+        : "0"(nw),       // 17
+        "1"(c_ptr0),   // 18
+        "2"(c_ptr1),   // 19
+        "3"(c_ptr2),   // 20
+        "4"(c_ptr3),   // 21
+        "5"(c_ptr4),   // 22
+        "6"(c_ptr5),   // 23
+        "7"(c_ptr6),   // 24
+        "8"(c_ptr7),   // 25
+        "9"(b_ptr0),   // 26
+        "10"(b_ptr1),  // 27
+        "11"(b_ptr2),  // 28
+        "12"(b_ptr3),  // 29
+        "13"(b_ptr4),  // 30
+        "14"(b_ptr5),  // 31
+        "15"(b_ptr6),  // 32
+        "16"(b_ptr7),  // 33
+        "w"(a0),       // 34
+        "w"(a1),       // 35
+        "w"(a2),       // 36
+        "w"(a3),       // 37
+        "w"(a4),       // 38
+        "w"(a5),       // 39
+        "w"(a6),       // 40
+        "w"(a7),       // 41
+        "w"(a8),       // 42
+        "w"(a9),       // 43
+        "w"(a10),      // 44
+        "w"(a11),      // 45
+        "w"(a12),      // 46
+        "w"(a13),      // 47
+        "w"(a14),      // 48
+        "w"(a15)       // 49
+        : "cc", "memory",
+        "v16",
+        "v17",
+        "v18",
+        "v19",
+        "v20",
+        "v21",
+        "v22",
+        "v23",
+        "v24",
+        "v25",
+        "v26"
+        );
+
+        w = (width >> 2) << 2;
+      }
+#else  // gcc
+      for (w = 0; w + 3 < width; w += 4) {
+        const float *b_ptr = B + (k * stride_w + w);
+        float *c_ptr = C + (h * stride_w + w);
+        Gemm884(a_ptr, b_ptr, stride_k, stride_w, c_ptr);
+      }
+#endif
+
+#else
+      for (w = 0; w + 3 < width; w += 4) {
         const float *b_ptr = B + (k * stride_w + w);
         float *c_ptr = C + (h * stride_w + w);
-        Gemm884(a_ptr, b_ptr, stride_w, stride_k, c_ptr);
+        GemmBlock(a_ptr, b_ptr, 8, 8, 4, stride_k, stride_w, c_ptr);
       }
-      if (k < K) {
+#endif
+
+      if (w < width) {
         const float *a_ptr = A + (h * stride_k + k);
         const float *b_ptr = B + (k * stride_w + w);
         float *c_ptr = C + (h * stride_w + w);
-        GemmBlock(a_ptr, b_ptr, 8, K - k, 4, stride_k, stride_w, c_ptr);
+        GemmBlock(a_ptr, b_ptr, 8, 8, width - w, stride_k, stride_w, c_ptr);
       }
     }
-    if (w < width) {
-      const float *a_ptr = A + h * stride_k;
-      const float *b_ptr = B + w;
-      float *c_ptr = C + (h * stride_w + w);
+    if (k < K) {
+      const float *a_ptr = A + (h * stride_k + k);
+      const float *b_ptr = B + k * stride_w;
+      float *c_ptr = C + h * stride_w;
       GemmBlock(a_ptr,
                 b_ptr,
                 8,
-                K,
-                width - w,
+                K - k,
+                width,
                 stride_k,
                 stride_w,
                 c_ptr);
@@ -243,5 +498,21 @@ void Gemm(const float *A,
   }  // n
 }
 
+void GemmRef(const float *A,
+             const float *B,
+             const index_t height,
+             const index_t K,
+             const index_t width,
+             float *C) {
+  memset(C, 0, sizeof(float) * height * width);
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; ++j) {
+      for (int k = 0; k < K; ++k) {
+        C[i * width + j] += A[i * K + k] * B[k * width + j];
+      }
+    }
+  }
+}
+
 }  // namespace kernels
 }  // namespace mace
diff --git a/mace/kernels/gemm.h b/mace/kernels/gemm.h
index d17eab83e12d5531eed8bdaddd6af352b179b327..eec69e8ef7675a28a3fa79a8f5bf5f8e957d6865 100644
--- a/mace/kernels/gemm.h
+++ b/mace/kernels/gemm.h
@@ -22,6 +22,13 @@ void Gemm(const float *A,
           const index_t width,
           float *C);
 
+void GemmRef(const float *A,
+             const float *B,
+             const index_t height,
+             const index_t K,
+             const index_t width,
+             float *C);
+
 }  // namespace kernels
 }  // namespace mace
 
diff --git a/mace/kernels/gemm_test.cc b/mace/kernels/gemm_test.cc
index 9e4b964daf5922a6fd0912ca65d9b01225aaf2fb..dacb93eebcab3aae16035477d4a591780f6b32a6 100644
--- a/mace/kernels/gemm_test.cc
+++ b/mace/kernels/gemm_test.cc
@@ -31,7 +31,7 @@ TEST(GEMMTest, gemm) {
                 [&gen, &nd] {
                   return nd(gen);
                 });
-  kernels::Gemm(A, B, N, K, M, C);
+  kernels::Gemm(A, B, 1, N, K, M, C);
   kernels::GemmRef(A, B, N, K, M, C_ref);
 
   for (int i = 0; i < N * M; ++i) {
diff --git a/mace/kernels/slice.h b/mace/kernels/slice.h
index 0fea6b586fd7c81790538fa4a318980cf072a96d..36cda42f345ff9bb1a4d118cb6627608ef698309 100644
--- a/mace/kernels/slice.h
+++ b/mace/kernels/slice.h
@@ -6,6 +6,7 @@
 #define MACE_KERNELS_SLICE_H_
 
 #include <memory>
+#include <functional>
 #include <vector>
 
 #include "mace/core/future.h"
@@ -17,20 +18,34 @@
 namespace mace {
 namespace kernels {
 
+struct SliceFunctorBase {
+  explicit SliceFunctorBase(const int32_t axis) : axis_(axis) {}
+
+  int32_t axis_;
+};
+
 template<DeviceType D, typename T>
-struct SliceFunctor {
+struct SliceFunctor : SliceFunctorBase {
+  explicit SliceFunctor(const int32_t axis) : SliceFunctorBase(axis) {}
+
   void operator()(const Tensor *input,
                   const std::vector<Tensor *> &output_list,
                   StatsFuture *future) {
-    const index_t outer_size = input->dim(0) * input->dim(1) * input->dim(2);
-    const index_t input_channels = input->dim(3);
+    const index_t input_channels = input->dim(axis_);
     const size_t outputs_count = output_list.size();
     const index_t output_channels = input_channels / outputs_count;
     std::vector<T *> output_ptrs(output_list.size(), nullptr);
+    std::vector<index_t> output_shape(input->shape());
+    output_shape[axis_] = output_channels;
 
-    std::vector<index_t> output_shape({input->dim(0), input->dim(1),
-                                       input->dim(2), output_channels});
-
+    const index_t outer_size = std::accumulate(output_shape.begin(),
+                                               output_shape.begin() + axis_,
+                                               1,
+                                               std::multiplies<index_t>());
+    const index_t inner_size = std::accumulate(output_shape.begin() + axis_ + 1,
+                                               output_shape.end(),
+                                               1,
+                                               std::multiplies<index_t>());
     for (size_t i= 0; i < outputs_count; ++i) {
       output_list[i]->Resize(output_shape);
       output_ptrs[i] = output_list[i]->mutable_data<T>();
@@ -39,25 +54,27 @@ struct SliceFunctor {
 
 #pragma omp parallel for
     for (int outer_idx = 0; outer_idx < outer_size; ++outer_idx) {
-      int input_idx = outer_idx * input_channels;
-      int output_idx = outer_idx * output_channels;
+      int input_idx = outer_idx * input_channels * inner_size;
+      int output_idx = outer_idx * output_channels * inner_size;
       for (size_t i = 0; i < outputs_count; ++i) {
         if (DataTypeCanUseMemcpy(DataTypeToEnum<T>::v())) {
           memcpy(output_ptrs[i]+output_idx, input_ptr+input_idx,
-                 output_channels * sizeof(T));
+                 output_channels * inner_size * sizeof(T));
         } else {
-          for (index_t k = 0; k < output_channels; ++k) {
+          for (index_t k = 0; k < output_channels * inner_size; ++k) {
             *(output_ptrs[i] + output_idx + k) = *(input_ptr + input_idx + k);
           }
         }
-        input_idx += output_channels;
+        input_idx += output_channels * inner_size;
       }
     }
   }
 };
 
 template<typename T>
-struct SliceFunctor<DeviceType::OPENCL, T> {
+struct SliceFunctor<DeviceType::OPENCL, T> : SliceFunctorBase {
+  explicit SliceFunctor(const int32_t axis) : SliceFunctorBase(axis) {}
+
   void operator()(const Tensor *input,
                   const std::vector<Tensor *> &output_list,
                   StatsFuture *future);
diff --git a/mace/ops/activation_test.cc b/mace/ops/activation_test.cc
index 18bddcc75e4555cdc16d3cd63e828281ce7dd0e8..034a591fa21c0455bb9f6ccb35e6f12b494f4dd0 100644
--- a/mace/ops/activation_test.cc
+++ b/mace/ops/activation_test.cc
@@ -249,14 +249,26 @@ void TestSimplePrelu() {
     net.RunOp(D);
   }
 
-  auto expected = CreateTensor<float>(
-      {2, 2, 2, 2},
-      {-14, 7, -12, 6, -10, -15, -8, -12, -6, 3, -4, 2, -2, -3, 0, 0});
+  if (D == DeviceType::NEON) {
+    auto expected = CreateTensor<float>(
+        {2, 2, 2, 2},
+        {-14, 7, -12, 6, -15, -15, -12, -12, -6, 3, -4, 2, -3, -3, 0, 0});
+    ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
+  } else {
+    auto expected = CreateTensor<float>(
+        {2, 2, 2, 2},
+        {-14, 7, -12, 6, -10, -15, -8, -12, -6, 3, -4, 2, -2, -3, 0, 0});
+    ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
+  }
+}
 
-  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
+TEST_F(ActivationOpTest, CPUSimplePrelu) {
+  TestSimplePrelu<DeviceType::CPU>();
 }
 
-TEST_F(ActivationOpTest, CPUSimplePrelu) { TestSimplePrelu<DeviceType::CPU>(); }
+TEST_F(ActivationOpTest, NEONSimplePrelu) {
+  TestSimplePrelu<DeviceType::NEON>();
+}
 
 TEST_F(ActivationOpTest, OPENCLSimplePrelu) {
   TestSimplePrelu<DeviceType::OPENCL>();
diff --git a/mace/ops/slice.cc b/mace/ops/slice.cc
index d482b0288bbcaaa8d7144ad98de10b50d4c7db40..e8fe0acc2ae5b78bdf68ba0dc8f25a7225a911f7 100644
--- a/mace/ops/slice.cc
+++ b/mace/ops/slice.cc
@@ -24,6 +24,11 @@ void Register_Slice(OperatorRegistry *op_registry) {
                                      .TypeConstraint<half>("T")
                                      .Build(),
                     SliceOp<DeviceType::OPENCL, half>);
+  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice")
+                                     .Device(DeviceType::NEON)
+                                     .TypeConstraint<float>("T")
+                                     .Build(),
+                    SliceOp<DeviceType::NEON, float>);
 }
 
 }  // namespace ops
diff --git a/mace/ops/slice.h b/mace/ops/slice.h
index a1a6ad112578c41f5966a0a61951a730c50cf245..228693450de1e8a460ef921766e87b08f2141176 100644
--- a/mace/ops/slice.h
+++ b/mace/ops/slice.h
@@ -17,14 +17,16 @@ template <DeviceType D, typename T>
 class SliceOp : public Operator<D, T> {
  public:
   SliceOp(const OperatorDef &op_def, Workspace *ws)
-      : Operator<D, T>(op_def, ws) {}
+      : Operator<D, T>(op_def, ws),
+        functor_(OperatorBase::GetSingleArgument<int>("axis", 3)) {}
 
   bool Run(StatsFuture *future) override {
     MACE_CHECK(this->OutputSize() >= 2)
       << "There must be at least two outputs for slicing";
     const Tensor *input = this->Input(INPUT);
     const std::vector<Tensor *> output_list = this->Outputs();
-    MACE_CHECK((input->dim(3) % this->OutputSize()) == 0)
+    const int32_t slice_axis = OperatorBase::GetSingleArgument<int>("axis", 3);
+    MACE_CHECK((input->dim(slice_axis) % this->OutputSize()) == 0)
       << "Outputs do not split input equally.";
 
     functor_(input, output_list, future);
diff --git a/mace/ops/slice_test.cc b/mace/ops/slice_test.cc
index bd0244d3cd9453785c30a219c6011c82edd2bd84..ad507af380a29ee0a3d138e983d4fe34396ad75f 100644
--- a/mace/ops/slice_test.cc
+++ b/mace/ops/slice_test.cc
@@ -16,7 +16,7 @@ namespace test {
 class SliceOpTest : public OpsTestBase {};
 
 template<DeviceType D, typename T>
-void RandomTest(const int num_outputs) {
+void RandomTest(const int num_outputs, const int axis) {
   static unsigned int seed = time(NULL);
   const index_t output_channels = 4 * (1 + rand_r(&seed) % 10);
   const index_t input_channels = num_outputs * output_channels;
@@ -27,7 +27,11 @@ void RandomTest(const int num_outputs) {
   // Construct graph
   OpsTestNet net;
 
-  std::vector<index_t> input_shape({batch, height, width, input_channels});
+  std::vector<index_t> input_shape;
+  if (axis == 1)
+    input_shape = {batch, input_channels, height, width};
+  else if (axis == 3)
+    input_shape = {batch, height, width, input_channels};
   const index_t input_size = std::accumulate(input_shape.begin(),
                                              input_shape.end(),
                                              1,
@@ -49,7 +53,7 @@ void RandomTest(const int num_outputs) {
         .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
         .Finalize(net.NewOperatorDef());
   } else {
-    auto builder = OpDefBuilder("Slice", "SliceTest");
+    auto builder = OpDefBuilder("Slice", "SliceTest").AddIntArg("axis", axis);
     builder.Input("Input");
     for (int i = 0; i < num_outputs; ++i) {
       builder = builder.Output(MakeString("Output", i));
@@ -70,9 +74,17 @@ void RandomTest(const int num_outputs) {
   }
 
   // Check
-  std::vector<index_t> expected_shape({batch, height, width, output_channels});
+  std::vector<index_t> expected_shape;
+  if (axis == 1)
+    expected_shape = {batch, output_channels, height, width};
+  else if (axis == 3)
+    expected_shape = {batch, height, width, output_channels};
   const index_t outer_size = std::accumulate(expected_shape.begin(),
-                                             expected_shape.end() - 1,
+                                             expected_shape.begin() + axis,
+                                             1,
+                                             std::multiplies<index_t>());
+  const index_t inner_size = std::accumulate(expected_shape.begin() + axis + 1,
+                                             expected_shape.end(),
                                              1,
                                              std::multiplies<index_t>());
   const float *input_ptr = input_data.data();
@@ -83,8 +95,9 @@ void RandomTest(const int num_outputs) {
     Tensor::MappingGuard output_mapper(output);
     output_ptr = output->data<float>();
     for (int outer_idx = 0; outer_idx < outer_size; ++outer_idx) {
-      const int idx = outer_idx * input_channels + i * output_channels;
-      for (int j = 0; j < output_channels; ++j) {
+      const int idx = (outer_idx * input_channels + i * output_channels)
+          *  inner_size;
+      for (int j = 0; j < output_channels * inner_size; ++j) {
         ASSERT_NEAR(*output_ptr++, input_ptr[idx + j], 1e-2) << "with output "
           << i << " index " << idx + j;
       }
@@ -93,21 +106,27 @@ void RandomTest(const int num_outputs) {
 }
 
 TEST_F(SliceOpTest, CPU) {
-  RandomTest<DeviceType::CPU, float>(2);
-  RandomTest<DeviceType::CPU, float>(4);
-  RandomTest<DeviceType::CPU, float>(11);
+  RandomTest<DeviceType::CPU, float>(2, 3);
+  RandomTest<DeviceType::CPU, float>(4, 3);
+  RandomTest<DeviceType::CPU, float>(11, 3);
+}
+
+TEST_F(SliceOpTest, CPUAxis1) {
+  RandomTest<DeviceType::CPU, float>(2, 1);
+  RandomTest<DeviceType::CPU, float>(4, 1);
+  RandomTest<DeviceType::CPU, float>(11, 1);
 }
 
 TEST_F(SliceOpTest, OPENCLFloat) {
-  RandomTest<DeviceType::OPENCL, float>(2);
-  RandomTest<DeviceType::OPENCL, float>(4);
-  RandomTest<DeviceType::OPENCL, float>(11);
+  RandomTest<DeviceType::OPENCL, float>(2, 3);
+  RandomTest<DeviceType::OPENCL, float>(4, 3);
+  RandomTest<DeviceType::OPENCL, float>(11, 3);
 }
 
 TEST_F(SliceOpTest, OPENCLHalf) {
-  RandomTest<DeviceType::OPENCL, half>(2);
-  RandomTest<DeviceType::OPENCL, half>(4);
-  RandomTest<DeviceType::OPENCL, half>(11);
+  RandomTest<DeviceType::OPENCL, half>(2, 3);
+  RandomTest<DeviceType::OPENCL, half>(4, 3);
+  RandomTest<DeviceType::OPENCL, half>(11, 3);
 }
 
 }  // namespace test
diff --git a/mace/public/mace_runtime.h b/mace/public/mace_runtime.h
index 63ca972324a9ff538d088dc407460e5853e67a59..8acb1cace385a876e97d62a2c2563c0bd13bd497 100644
--- a/mace/public/mace_runtime.h
+++ b/mace/public/mace_runtime.h
@@ -8,6 +8,11 @@
 #ifndef MACE_PUBLIC_MACE_RUNTIME_H_
 #define MACE_PUBLIC_MACE_RUNTIME_H_
 
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
 namespace mace {
 
 enum GPUPerfHint {
@@ -26,10 +31,42 @@ enum GPUPriorityHint {
 
 enum CPUPowerOption { DEFAULT = 0, HIGH_PERFORMANCE = 1, BATTERY_SAVE = 2 };
 
+class KVStorage {
+ public:
+  // return: 0 for success, -1 for error
+  virtual int Load() = 0;
+  virtual bool Insert(const std::string &key,
+                      const std::vector<unsigned char> &value) = 0;
+  virtual const std::vector<unsigned char> *Find(const std::string &key) = 0;
+  // return: 0 for success, -1 for error
+  virtual int Flush() = 0;
+};
+
+class KVStorageFactory {
+ public:
+  virtual std::unique_ptr<KVStorage> CreateStorage(const std::string &name) = 0;
+};
+
+class FileStorageFactory : public KVStorageFactory {
+ public:
+  explicit FileStorageFactory(const std::string &path);
+
+  ~FileStorageFactory();
+
+  std::unique_ptr<KVStorage> CreateStorage(const std::string &name) override;
+
+ private:
+  class Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
+void ConfigKVStorageFactory(std::shared_ptr<KVStorageFactory> storage_factory);
+
 void ConfigOpenCLRuntime(GPUPerfHint, GPUPriorityHint);
 void ConfigOmpThreads(int omp_num_threads);
 void ConfigCPUPowerOption(CPUPowerOption power_option);
 
+
 }  // namespace mace
 
 #endif  // MACE_PUBLIC_MACE_RUNTIME_H_
diff --git a/mace/python/tools/binary_codegen.py b/mace/python/tools/binary_codegen.py
index 9956624b41086a1cb91403a21fd25b79c56a2efd..3be2b086461dd70e7fa6066f086d91442601dec5 100644
--- a/mace/python/tools/binary_codegen.py
+++ b/mace/python/tools/binary_codegen.py
@@ -25,9 +25,9 @@ def generate_cpp_source():
     with open(binary_path, "rb") as f:
       binary_array = np.fromfile(f, dtype=np.uint8)
 
+    print "Generate binary from", binary_path
     idx = 0
     size, = struct.unpack("Q", binary_array[idx:idx+8])
-    print size
     idx += 8
     for _ in xrange(size):
       key_size, = struct.unpack("i", binary_array[idx:idx+4])
@@ -64,7 +64,7 @@ def parse_args():
   parser.add_argument(
       "--binary_dirs",
       type=str,
-      default="cl_bin0/,cl_bin1/",
+      default="",
       help="The binaries file path.")
   parser.add_argument(
       "--binary_file_name",
diff --git a/mace/python/tools/caffe_converter_lib.py b/mace/python/tools/caffe_converter_lib.py
index 166bb6ec3012f2f3075e6dc5577dd5a9e6832463..7a94d2805def0c5120fc2692cf69e541bd4c122b 100644
--- a/mace/python/tools/caffe_converter_lib.py
+++ b/mace/python/tools/caffe_converter_lib.py
@@ -68,14 +68,26 @@ def BlobToNPArray(blob):
 
 class Shapes(object):
   @staticmethod
-  def conv_pool_shape(input_shape, filter_shape, paddings, strides, dilations, round_func):
+  def conv_pool_shape(input_shape, filter_shape, paddings, strides, dilations, round_func, input_format='NHWC'):
     output_shape = np.zeros_like(input_shape)
     output_shape[0] = input_shape[0]
-    output_shape[1] = int(round_func((input_shape[1] + paddings[0] - filter_shape[0]
-                                      - (filter_shape[0] - 1) * (dilations[0] - 1)) / float(strides[0]))) + 1
-    output_shape[2] = int(round_func((input_shape[2] + paddings[1] - filter_shape[1]
-                                      - (filter_shape[1] - 1) * (dilations[1] - 1)) / float(strides[1]))) + 1
-    output_shape[3] = filter_shape[2]
+    if input_format == 'NHWC':
+      # input format: NHWC, filter format: HWOI
+      output_shape[1] = int(round_func((input_shape[1] + paddings[0] - filter_shape[0]
+                                        - (filter_shape[0] - 1) * (dilations[0] - 1)) / float(strides[0]))) + 1
+      output_shape[2] = int(round_func((input_shape[2] + paddings[1] - filter_shape[1]
+                                        - (filter_shape[1] - 1) * (dilations[1] - 1)) / float(strides[1]))) + 1
+      output_shape[3] = filter_shape[2]
+    elif input_format == 'NCHW':
+      # input format: NCHW, filter format: OIHW
+      output_shape[1] = filter_shape[0]
+      output_shape[2] = int(round_func((input_shape[2] + paddings[0] - filter_shape[2]
+                                        - (filter_shape[2] - 1) * (dilations[0] - 1)) / float(strides[0]))) + 1
+      output_shape[3] = int(round_func((input_shape[3] + paddings[1] - filter_shape[3]
+                                        - (filter_shape[3] - 1) * (dilations[1] - 1)) / float(strides[1]))) + 1
+    else:
+      raise Exception("format %s is not supported" % input_format)
+
     return output_shape
 
   @staticmethod
@@ -93,8 +105,13 @@ class Shapes(object):
     return output_shape
 
   @staticmethod
-  def slice_shape(input_shape, num_output):
-    return [input_shape[0], input_shape[1], input_shape[2], input_shape[3]/num_output]
+  def slice_shape(input_shape, num_output, input_format='NHWC'):
+    if input_format == 'NHWC':
+      return [input_shape[0], input_shape[1], input_shape[2], input_shape[3]/num_output]
+    elif input_format == 'NCHW':
+      return [input_shape[0], input_shape[1]/num_output, input_shape[2], input_shape[3]]
+    else:
+      raise Exception("format %s is not supported" % input_format)
 
 # outputs' name is [op.name + '_' + #]
 class CaffeConverter(object):
@@ -168,7 +185,10 @@ class CaffeConverter(object):
     arg.i = self.dt
     data_format_arg = op_def.arg.add()
     data_format_arg.name = 'data_format'
-    data_format_arg.s = 'NHWC'
+    if self.device == 'neon':
+      data_format_arg.s = 'NCHW'
+    else:
+      data_format_arg.s = 'NHWC'
     op_def.name = op.name
     op_def.type = mace_type
     op_def.input.extend([name+':0' for name in self.inputs_map[op.name]])
@@ -342,7 +362,11 @@ class CaffeConverter(object):
 
     # Add filter
     weight_tensor_name = op.name + '_weight:0'
-    weight_data = op.data[0].transpose((2, 3, 0, 1))
+    if self.device == 'neon':
+      weight_data = op.data[0]
+    else:
+      # OIHW -> HWOI
+      weight_data = op.data[0].transpose((2, 3, 0, 1))
     self.add_tensor(weight_tensor_name, weight_data)
 
     if self.device == 'gpu':
@@ -376,10 +400,11 @@ class CaffeConverter(object):
     final_op = op
     self.resolved_ops.add(op.name)
 
+    input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
     output_shape = Shapes.conv_pool_shape(op.get_single_parent().output_shape_map[op.layer.bottom[0]],
       weight_data.shape,
       paddings, strides, dilations,
-      math.floor)
+      math.floor, input_format)
     op.output_shape_map[op.layer.top[0]] = output_shape
 
     if len(self.ops_map[final_op.name].children) == 1 \
@@ -399,9 +424,13 @@ class CaffeConverter(object):
     self.net_def.op.extend([op_def])
 
   def check_winograd_conv(self, op):
+    # TODO: support winograd conv on neon
+    if self.device == 'neon':
+      return False
     param = op.layer.convolution_param
     filter_shape = np.asarray(op.data[0].shape)
-    filter_shape = filter_shape[[2, 3, 0, 1]]
+    if self.device != 'neon':
+      filter_shape = filter_shape[[2, 3, 0, 1]]  # OIHW -> HWOI
     paddings, strides, _ = self.add_stride_pad_kernel_arg(param, None)
 
     dilations = [1, 1]
@@ -411,17 +440,21 @@ class CaffeConverter(object):
       elif len(param.dilation) == 2:
         dilations = [param.dilation[0], param.dilation[1]]
 
+    input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
     output_shape = Shapes.conv_pool_shape(
       op.get_single_parent().output_shape_map[op.layer.bottom[0]],
-      filter_shape, paddings, strides, dilations, math.floor)
+      filter_shape, paddings, strides, dilations, math.floor, input_format)
     width = output_shape[0] * ((output_shape[1] + 1)/2) * ((output_shape[2]+1)/2)
-    return self.winograd and self.device == 'gpu' and \
-           filter_shape[0] == 3 and (filter_shape[0] == filter_shape[1]) and \
-           dilations[0] == 1 and (dilations[0] == dilations[1]) and \
-           (strides[0] == 1) and (strides[0] == strides[1]) and \
-           (16 * filter_shape[2] < OPENCL_IMAGE_MAX_SIZE) and \
-           (16 * filter_shape[3] < OPENCL_IMAGE_MAX_SIZE) and \
-           (width < OPENCL_IMAGE_MAX_SIZE)
+    if self.winograd and dilations[0] == 1 and (dilations[0] == dilations[1]) and \
+            (strides[0] == 1) and (strides[0] == strides[1]):
+      if self.device == 'gpu':
+        return filter_shape[0] == 3 and (filter_shape[0] == filter_shape[1]) and \
+               (16 * filter_shape[2] < OPENCL_IMAGE_MAX_SIZE) and \
+               (16 * filter_shape[3] < OPENCL_IMAGE_MAX_SIZE) and \
+               (width < OPENCL_IMAGE_MAX_SIZE)
+      elif self.device == 'neon':
+        return filter_shape[2] == 3 and (filter_shape[2] == filter_shape[3])
+    return False
 
   def convert_winograd_conv(self, op):
     # Add filter
@@ -435,11 +468,13 @@ class CaffeConverter(object):
     paddings, strides, _ = self.add_stride_pad_kernel_arg(param, None)
 
     filter_shape = np.asarray(op.data[0].shape)
-    filter_shape = filter_shape[[2, 3, 0, 1]]
+    if self.device != 'neon':
+      filter_shape = filter_shape[[2, 3, 0, 1]]  # OIHW -> HWOI
 
+    input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
     output_shape = Shapes.conv_pool_shape(
       op.get_single_parent().output_shape_map[op.layer.bottom[0]],
-      filter_shape, paddings, strides, [1, 1], math.floor)
+      filter_shape, paddings, strides, [1, 1], math.floor, input_format)
 
     # Input transform
     wt_op = mace_pb2.OperatorDef()
@@ -455,8 +490,12 @@ class CaffeConverter(object):
     wt_output_name = wt_op.name + ":0"
     wt_op.output.extend([wt_output_name])
     wt_output_shape = mace_pb2.OutputShape()
-    wt_output_width = output_shape[0] * ((output_shape[1] + 1)/2) * ((output_shape[2]+1)/2)
-    wt_output_shape.dims.extend([16, filter_shape[3], wt_output_width, 1])
+    if self.device != 'neon':
+      wt_output_width = output_shape[0] * ((output_shape[1] + 1)/2) * ((output_shape[2]+1)/2)
+      wt_output_shape.dims.extend([16, filter_shape[3], wt_output_width, 1])
+    else:
+      wt_output_width = output_shape[0] * ((output_shape[2] + 1)/2) * ((output_shape[3]+1)/2)
+      wt_output_shape.dims.extend([16, filter_shape[1], wt_output_width, 1])
     wt_op.output_shape.extend([wt_output_shape])
 
     # MatMul
@@ -470,7 +509,10 @@ class CaffeConverter(object):
     matmul_output_name = matmul_op.name + ":0"
     matmul_op.output.extend([matmul_output_name])
     matmul_output_shape = mace_pb2.OutputShape()
-    matmul_output_shape.dims.extend([16, filter_shape[2], wt_output_width, 1])
+    if self.device != 'neon':
+      matmul_output_shape.dims.extend([16, filter_shape[2], wt_output_width, 1])
+    else:
+      matmul_output_shape.dims.extend([16, filter_shape[0], wt_output_width, 1])
     matmul_op.output_shape.extend([matmul_output_shape])
 
     # Inverse transform
@@ -483,10 +525,10 @@ class CaffeConverter(object):
     batch_arg.i = output_shape[0]
     height_arg = iwt_op.arg.add()
     height_arg.name = 'height'
-    height_arg.i = output_shape[1]
+    height_arg.i = output_shape[1] if self.device != 'neon' else output_shape[2]
     width_arg = iwt_op.arg.add()
     width_arg.name = 'width'
-    width_arg.i = output_shape[2]
+    width_arg.i = output_shape[2] if self.device != 'neon' else output_shape[3]
     iwt_op.name = op.name + '_inverse_transform'
     iwt_op.type = 'WinogradInverseTransform'
     iwt_op.input.extend([matmul_output_name])
@@ -589,8 +631,9 @@ class CaffeConverter(object):
 
     weight_data = op.data[0].reshape(-1, op.data[0].shape[-1])
     assert weight_data.shape[1] == (input_shape[1] * input_shape[2] * input_shape[3])
-    weight_data = weight_data.reshape(-1, input_shape[3], input_shape[1], input_shape[2])
-    weight_data = weight_data.transpose((0, 2, 3, 1)).reshape(weight_data.shape[0], -1)
+    if self.device != 'neon':
+      weight_data = weight_data.reshape(-1, input_shape[3], input_shape[1], input_shape[2])
+      weight_data = weight_data.transpose((0, 2, 3, 1)).reshape(weight_data.shape[0], -1)
     self.add_tensor(weight_tensor_name, weight_data)
     if self.device == 'gpu':
       if (weight_data.shape[0] + 3) / 4 > OPENCL_IMAGE_MAX_SIZE \
@@ -665,9 +708,12 @@ class CaffeConverter(object):
     kernel_arg.name = 'kernels'
     kernel_arg.ints.extend(kernels)
 
-    filter_shape = [kernels[0], kernels[1], input_shape[3], input_shape[3]]
+    filter_shape = [kernels[0], kernels[1], input_shape[3], input_shape[3]] \
+      if self.device != 'neon' else \
+      [input_shape[1], input_shape[1], kernels[0], kernels[1]]
+    input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
     output_shape = Shapes.conv_pool_shape(input_shape, filter_shape,
-      paddings, strides, [1, 1], math.ceil)
+      paddings, strides, [1, 1], math.ceil, input_format)
     op.output_shape_map[op.layer.top[0]] = output_shape
 
     op_def.output.extend([op.name + ':0'])
@@ -720,7 +766,7 @@ class CaffeConverter(object):
     op_def = self.CommonConvert(op, 'Concat')
     axis_arg = op_def.arg.add()
     axis_arg.name = 'axis'
-    axis_arg.i = 3
+    axis_arg.i = 3 if self.device != 'neon' else 1
     try:
       if op.layer.concat_param.HasFeild('axis'):
         axis_arg.i = op.concat_param.axis
@@ -766,13 +812,19 @@ class CaffeConverter(object):
       if len(param.slice_point) > 0:
         raise Exception('Mace do not support slice with slice_point')
 
+    axis_arg = op_def.arg.add()
+    axis_arg.name = 'axis'
+    axis_arg.i = 3 if self.device != 'neon' else 1
+
     input_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
     num_outputs = len(op.layer.top)
-    if (input_shape[3] % num_outputs) != 0 or \
-        (self.device == 'gpu' and ((input_shape[3] / num_outputs) % 4 != 0)) :
+    input_channels = input_shape[axis_arg.i]
+    if (input_channels % num_outputs) != 0 or \
+        (self.device == 'gpu' and ((input_channels / num_outputs) % 4 != 0)):
       raise Exception('Mace do not support slice with input shape '
                       + str(input_shape) + ' and number of output ' + str(num_outputs))
-    output_shape = Shapes.slice_shape(input_shape, num_outputs)
+    input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
+    output_shape = Shapes.slice_shape(input_shape, num_outputs, input_format)
     for i in range(len(op.layer.top)):
       op.output_shape_map[op.layer.top[i]] = output_shape
       self.add_output_shape(op_def, output_shape)
@@ -790,10 +842,15 @@ class CaffeConverter(object):
     self.resolved_ops.add(op.name)
 
   def convert_reshape(self, op):
-    op_def = self.CommonConvert(op, 'ReOrganize')
+    if self.device == 'neon':
+      op_def = self.CommonConvert(op, 'Reshape')
+    else:
+      op_def = self.CommonConvert(op, 'ReOrganize')
     input_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
     output_shape = input_shape
-    shape_param = np.asarray(op.layer.reshape_param.shape.dim)[[0, 3, 2, 1]]
+    shape_param = np.asarray(op.layer.reshape_param.shape.dim)
+    if self.device != 'neon':
+      shape_param = shape_param[[0, 3, 1, 2]]
     for i in range(len(shape_param)):
       if shape_param[i] != 0:
         output_shape[i] = shape_param[i]
@@ -867,15 +924,50 @@ class CaffeConverter(object):
     assert len(input_nodes) == len(input_shapes)
     for i in range(len(input_nodes)):
       input_op = self.ops_map[input_nodes[i]]
+      input_shape = input_shapes[i] if self.device != 'neon' else \
+        [input_shapes[i][0], input_shapes[i][3], input_shapes[i][1], input_shapes[i][2]]
       if input_op.layer is not None:
-        input_op.output_shape_map[input_op.layer.top[0]] = input_shapes[i]
+        input_op.output_shape_map[input_op.layer.top[0]] = input_shape
       else:
-        input_op.output_shape_map[input_op.name] = input_shapes[i]
+        input_op.output_shape_map[input_op.name] = input_shape
+
+  def add_neon_input_transform(self, names):
+    for name in names:
+      new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0"
+      op_def = self.net_def.op.add()
+      op_def.name = name
+      op_def.type = 'Transpose'
+      op_def.input.extend([new_input_name])
+      op_def.output.extend([name+':0'])
+
+      dims_arg = op_def.arg.add()
+      dims_arg.name = 'dims'
+      dims_arg.ints.extend([0, 3, 1, 2])  # NHWC -> NCHW
+
+      arg = op_def.arg.add()
+      arg.name = 'T'
+      arg.i = self.dt
+
+  def add_neon_output_transform(self, names):
+    for name in names:
+      output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0"
+      op_def = self.net_def.op.add()
+      op_def.name = output_name[:-2]
+      op_def.type = 'Transpose'
+      op_def.input.extend([name+':0'])
+      op_def.output.extend([output_name])
+
+      dims_arg = op_def.arg.add()
+      dims_arg.name = 'dims'
+      dims_arg.ints.extend([0, 2, 3, 1])  # NCHW -> NHWC
 
   def convert(self, input_nodes, input_shapes, output_nodes):
     if self.device == 'gpu':
       self.add_input_transform(input_nodes)
 
+    if self.device == 'neon':
+      self.add_neon_input_transform(input_nodes)
+
     assert self.ops[0].type == 'Input'
     self.add_input_op_shape(input_nodes, input_shapes)
 
@@ -924,6 +1016,9 @@ class CaffeConverter(object):
     if self.device == 'cpu':
       self.replace_in_out_name(input_nodes, output_nodes)
 
+    if self.device == 'neon':
+      self.add_neon_output_transform(output_nodes)
+
     for op in self.ops:
       if op.name not in self.resolved_ops:
         print 'Unresolve Op: %s with type %s' % (op.name, op.type)
diff --git a/mace/python/tools/opencl_codegen.py b/mace/python/tools/opencl_codegen.py
index 04bda9a82ff46d02d7cfb9a1ebc1a19ee7f7b9c2..96f4c6a6d11b27c60f01b83e35ecd47bca863e05 100644
--- a/mace/python/tools/opencl_codegen.py
+++ b/mace/python/tools/opencl_codegen.py
@@ -1,6 +1,7 @@
 import argparse
 import os
 import sys
+import struct
 
 import numpy as np
 
@@ -14,30 +15,49 @@ FLAGS = None
 
 def generate_cpp_source():
   maps = {}
-  cl_binary_dir_arr = FLAGS.cl_binary_dirs.split(",")
-  for cl_binary_dir in cl_binary_dir_arr:
-    if not os.path.exists(cl_binary_dir):
-      print("Input cl_binary_dir " + cl_binary_dir + " doesn't exist!")
-    for file_name in os.listdir(cl_binary_dir):
-      file_path = os.path.join(cl_binary_dir, file_name)
-      if file_path[-4:] == ".bin":
-        # read binary
-        f = open(file_path, "rb")
-        binary_array = np.fromfile(f, dtype=np.uint8)
-        f.close()
-
-        maps[file_name[:-4]] = []
-        for ele in binary_array:
-          maps[file_name[:-4]].append(hex(ele))
+  platform_info = ''
+  binary_dirs = FLAGS.cl_binary_dirs.strip().split(",")
+  for binary_dir in binary_dirs:
+    binary_path = os.path.join(binary_dir, FLAGS.built_kernel_file_name)
+    if not os.path.exists(binary_path):
+      continue
+
+    print 'generate opencl code from', binary_path
+    with open(binary_path, "rb") as f:
+      binary_array = np.fromfile(f, dtype=np.uint8)
+
+    idx = 0
+    size, = struct.unpack("Q", binary_array[idx:idx+8])
+    idx += 8
+    for _ in xrange(size):
+      key_size, = struct.unpack("i", binary_array[idx:idx+4])
+      idx += 4
+      key, = struct.unpack(str(key_size) + "s", binary_array[idx:idx+key_size])
+      idx += key_size
+      value_size, = struct.unpack("i", binary_array[idx:idx+4])
+      idx += 4
+      maps[key] = []
+      value = struct.unpack(str(value_size) + "B",
+                            binary_array[idx:idx+value_size])
+      idx += value_size
+      for ele in value:
+        maps[key].append(hex(ele))
+
+    cl_platform_info_path = os.path.join(binary_dir, FLAGS.platform_info_file_name)
+    with open(cl_platform_info_path, 'r') as f:
+      curr_platform_info = f.read()
+    if platform_info != "":
+      assert(curr_platform_info == platform_info)
+    platform_info = curr_platform_info
 
   env = jinja2.Environment(loader=jinja2.FileSystemLoader(sys.path[0]))
-  return env.get_template('str2vec_maps.cc.jinja2').render(
+  return env.get_template('opencl_compiled_kernel.cc.jinja2').render(
     maps = maps,
     data_type = 'unsigned char',
-    variable_name = 'kCompiledProgramMap'
+    variable_name = 'kCompiledProgramMap',
+    platform_info = platform_info,
   )
 
-
 def main(unused_args):
 
   cpp_cl_binary_source = generate_cpp_source()
@@ -54,7 +74,17 @@ def parse_args():
   parser.add_argument(
       "--cl_binary_dirs",
       type=str,
-      default="cl_bin0/,cl_bin1/,cl_bin2/",
+      default="",
+      help="The cl binaries directories.")
+  parser.add_argument(
+      "--built_kernel_file_name",
+      type=str,
+      default="",
+      help="The cl binaries directories.")
+  parser.add_argument(
+      "--platform_info_file_name",
+      type=str,
+      default="",
       help="The cl binaries directories.")
   parser.add_argument(
       "--output_path",
diff --git a/mace/python/tools/opencl_compiled_kernel.cc.jinja2 b/mace/python/tools/opencl_compiled_kernel.cc.jinja2
new file mode 100644
index 0000000000000000000000000000000000000000..391b03f0cd64b9c07ad4fe3b5345a3aaa7c1cd69
--- /dev/null
+++ b/mace/python/tools/opencl_compiled_kernel.cc.jinja2
@@ -0,0 +1,29 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+// This is a generated file, DO NOT EDIT
+
+#include <map>
+#include <string>
+#include <vector>
+
+namespace mace {
+
+extern const std::map<std::string, std::vector<{{data_type}}>> {{variable_name}} =
+{
+  {% for key, value in maps.iteritems() %}
+  {
+    "{{key}}",
+    {
+      {%- for ele in value -%}
+      {{ele}},
+      {%- endfor -%}
+    }
+  },  // {{key}}
+{% endfor %}
+};
+
+extern const std::string kCompiledProgramPlatform = {{platform_info|tojson}};
+
+}  // namespace mace
diff --git a/mace/tools/validation/BUILD b/mace/tools/validation/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..a4e1cb876dd508b9a1a365c2350345301b9a3af4
--- /dev/null
+++ b/mace/tools/validation/BUILD
@@ -0,0 +1,14 @@
+# Examples
+load("//mace:mace.bzl", "if_openmp_enabled")
+
+cc_binary(
+    name = "mace_run",
+    srcs = ["mace_run.cc"],
+    linkopts = if_openmp_enabled(["-fopenmp"]),
+    linkstatic = 1,
+    deps = [
+        "//external:gflags_nothreads",
+        "//mace/codegen:generated_models",
+        "//mace/core:core",
+    ],
+)
diff --git a/mace/examples/mace_run.cc b/mace/tools/validation/mace_run.cc
similarity index 84%
rename from mace/examples/mace_run.cc
rename to mace/tools/validation/mace_run.cc
index f71ca08d34467c58a5e8d7cffb1269f16f4622f6..d4fa7c29a00073b9aff74d0a107e620540fc45cb 100644
--- a/mace/examples/mace_run.cc
+++ b/mace/tools/validation/mace_run.cc
@@ -16,12 +16,14 @@
  */
 #include <malloc.h>
 #include <stdint.h>
+#include <cstdio>
 #include <cstdlib>
 #include <fstream>
 #include <iostream>
 #include <numeric>
 
 #include "gflags/gflags.h"
+#include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/public/mace.h"
 #include "mace/public/mace_runtime.h"
 #include "mace/utils/env_time.h"
@@ -100,6 +102,20 @@ DeviceType ParseDeviceType(const std::string &device_str) {
   }
 }
 
+void WriteOpenCLPlatformInfo(const std::string &output_dir) {
+  std::string platform_info = OpenCLRuntime::Global()->platform_info();
+  const std::string cl_platform_info_file_name = output_dir
+      + "/mace_cl_platform_info.txt";
+
+  std::ofstream ofs(cl_platform_info_file_name);
+  if (ofs.is_open()) {
+    ofs << platform_info;
+    ofs.close();
+  } else {
+    LOG(WARNING) << "Write opencl platform info failed.";
+  }
+}
+
 struct mallinfo LogMallinfoChange(struct mallinfo prev) {
   struct mallinfo curr = mallinfo();
   if (prev.arena != curr.arena) {
@@ -189,8 +205,8 @@ bool RunModel(const std::vector<std::string> &input_names,
       mace::MACE_MODEL_TAG::LoadModelData(FLAGS_model_data_file.c_str());
   NetDef net_def = mace::MACE_MODEL_TAG::CreateNet(model_data);
   int64_t t1 = NowMicros();
-  LOG(INFO) << "CreateNetDef latency: " << t1 - t0 << " us";
-  int64_t init_micros = t1 - t0;
+  double create_net_millis = (t1 - t0) / 1000.0;
+  LOG(INFO) << "CreateNetDef latency: " << create_net_millis << " ms";
 
   DeviceType device_type = ParseDeviceType(FLAGS_device);
   LOG(INFO) << "Runing with device type: " << device_type;
@@ -205,17 +221,26 @@ bool RunModel(const std::vector<std::string> &input_names,
         static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
   }
 
+  const char *kernel_path = getenv("MACE_CL_PROGRAM_PATH");
+  const std::string kernel_file_path =
+      std::string(kernel_path == nullptr ?
+                  "/data/local/tmp/mace_run/cl_program" : kernel_path);
+
   // Init model
   LOG(INFO) << "Run init";
-  t0 = NowMicros();
+  std::shared_ptr<KVStorageFactory> storage_factory(
+      new FileStorageFactory(kernel_file_path));
+  ConfigKVStorageFactory(storage_factory);
   mace::MaceEngine engine(&net_def, device_type, input_names, output_names);
   if (device_type == DeviceType::OPENCL || device_type == DeviceType::HEXAGON) {
     mace::MACE_MODEL_TAG::UnloadModelData(model_data);
   }
-  t1 = NowMicros();
-  init_micros += t1 - t0;
-  LOG(INFO) << "Net init latency: " << t1 - t0 << " us";
-  LOG(INFO) << "Total init latency: " << init_micros << " us";
+  int64_t t2 = NowMicros();
+  double mace_engine_ctor_millis = (t2 - t1) / 1000.0;
+  double init_millis = (t2 - t0) / 1000.0;
+  LOG(INFO) << "MaceEngine constructor latency: "
+            << mace_engine_ctor_millis << " ms";
+  LOG(INFO) << "Total init latency: " << init_millis << " ms";
 
   const size_t input_count = input_names.size();
   const size_t output_count = output_names.size();
@@ -253,14 +278,16 @@ bool RunModel(const std::vector<std::string> &input_names,
   }
 
   LOG(INFO) << "Warm up run";
-  t0 = NowMicros();
+  int64_t t3 = NowMicros();
   engine.Run(inputs, &outputs);
-  t1 = NowMicros();
-  LOG(INFO) << "1st warm up run latency: " << t1 - t0 << " us";
+  int64_t t4 = NowMicros();
+  double warmup_millis = (t4 - t3) / 1000.0;
+  LOG(INFO) << "1st warm up run latency: " << warmup_millis << " ms";
 
+  double model_run_millis = -1;
   if (FLAGS_round > 0) {
     LOG(INFO) << "Run model";
-    t0 = NowMicros();
+    int64_t t0 = NowMicros();
     struct mallinfo prev = mallinfo();
     for (int i = 0; i < FLAGS_round; ++i) {
       engine.Run(inputs, &outputs);
@@ -269,10 +296,20 @@ bool RunModel(const std::vector<std::string> &input_names,
         prev = LogMallinfoChange(prev);
       }
     }
-    t1 = NowMicros();
-    LOG(INFO) << "Average latency: " << (t1 - t0) / FLAGS_round << " us";
+    int64_t t1 = NowMicros();
+    model_run_millis = (t1 - t0) / 1000.0 / FLAGS_round;
+    LOG(INFO) << "Average latency: " << model_run_millis << " ms";
   }
 
+  // Metrics reporting tools depends on the format, keep in consistent
+  printf("================================================================\n");
+  printf("      create_net engine_ctor        init      warmup     run_avg\n");
+  printf("================================================================\n");
+  printf("time %11.3f %11.3f %11.3f %11.3f %11.3f\n", create_net_millis,
+         mace_engine_ctor_millis, init_millis, warmup_millis, model_run_millis);
+
+  WriteOpenCLPlatformInfo(kernel_file_path);
+
   for (size_t i = 0; i < output_count; ++i) {
     std::string output_name =
         FLAGS_output_file + "_" + FormatName(output_names[i]);
diff --git a/mace/utils/utils.h b/mace/utils/utils.h
index 54ab5793d229bba9453ae38fae6ec260f3c9cb6e..a9ca2a40aae252c10ce96d5968987b68af9894ea 100644
--- a/mace/utils/utils.h
+++ b/mace/utils/utils.h
@@ -9,6 +9,7 @@
 #include <sstream>
 #include <string>
 #include <utility>
+#include <vector>
 
 namespace mace {
 template <typename Integer>
@@ -96,5 +97,20 @@ inline std::string ObfuscateSymbol(const std::string &src) {
 #define MACE_OBFUSCATE_SYMBOL(str) (str)
 #endif
 
+inline std::vector<std::string> Split(const std::string &str, char delims) {
+  std::vector<std::string> result;
+  std::string tmp = str;
+  while (!tmp.empty()) {
+    size_t next_offset = tmp.find(delims);
+    result.push_back(tmp.substr(0, next_offset));
+    if (next_offset == std::string::npos) {
+      break;
+    } else {
+      tmp = tmp.substr(next_offset + 1);
+    }
+  }
+  return result;
+}
+
 }  // namespace mace
 #endif  // MACE_UTILS_UTILS_H_
diff --git a/tools/bazel_adb_run.py b/tools/bazel_adb_run.py
index 185167673aca32cc1b188234892ae98d551d329d..c8c9ddb46f9ae640824a0b6d4faaea3a700c6993 100644
--- a/tools/bazel_adb_run.py
+++ b/tools/bazel_adb_run.py
@@ -33,10 +33,16 @@ def ops_benchmark_stdout_processor(stdout, device_properties, abi):
     line = line.strip()
     parts = line.split()
     if len(parts) == 5 and parts[0].startswith("BM_"):
-      metrics["%s.time_ms" % parts[0]] = str(float(parts[1])/1000000.0)
+      metrics["%s.time_ms" % parts[0]] = str(float(parts[1])/1e6)
       metrics["%s.input_mb_per_sec" % parts[0]] = parts[3]
       metrics["%s.gmacc_per_sec" % parts[0]] = parts[4]
-  sh_commands.falcon_push_metrics(metrics, device_properties, abi,
+
+  platform = device_properties["ro.board.platform"].replace(" ", "-")
+  model = device_properties["ro.product.model"].replace(" ", "-")
+  tags = {"ro.board.platform": platform,
+          "ro.product.model": model,
+          "abi": abi}
+  sh_commands.falcon_push_metrics(metrics, tags=tags,
                                   endpoint="mace_ops_benchmark")
 
 def parse_args():
@@ -88,6 +94,7 @@ def main(unused_args):
 
   # generate sources
   sh_commands.gen_encrypted_opencl_source()
+  sh_commands.gen_compiled_opencl_source()
   sh_commands.gen_mace_version()
 
   for target_abi in target_abis:
diff --git a/tools/build_mace_run.sh b/tools/build_mace_run.sh
index 658ae26745f330cd0dded36da0f24d85d1bb9361..8f22ee33966692798cbb4ee25d427dc4579ff265 100644
--- a/tools/build_mace_run.sh
+++ b/tools/build_mace_run.sh
@@ -30,7 +30,7 @@ if [ x"$TARGET_ABI" = x"host" ]; then
     --copt="-O3" \
     $PRODUCTION_MODE_BUILD_FLAGS || exit 1
 
-  bazel build --verbose_failures -c opt --strip always //mace/examples:mace_run \
+  bazel build --verbose_failures -c opt --strip always //mace/tools/validation:mace_run \
     --copt="-std=c++11" \
     --copt="-D_GLIBCXX_USE_C99_MATH_TR1" \
     --copt="-Werror=return-type" \
@@ -47,7 +47,7 @@ else
     NEON_ENABLE_FLAG="--define neon=true"
   fi
 
-  bazel build --verbose_failures -c opt --strip always //mace/examples:mace_run \
+  bazel build --verbose_failures -c opt --strip always //mace/tools/validation:mace_run \
     --crosstool_top=//external:android/crosstool \
     --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
     --cpu=${TARGET_ABI} \
@@ -64,14 +64,13 @@ else
     $HEXAGON_MODE_BUILD_FLAG || exit 1
 fi
 
-if [ "$PRODUCTION_MODE" = 1 ]; then
-  cp $GENERATED_MODEL_LIB_PATH $MODEL_OUTPUT_DIR/libmace_${MODEL_TAG}.a
-fi
+rm -rf $MODEL_OUTPUT_DIR/libmace_${MODEL_TAG}.a
+cp $GENERATED_MODEL_LIB_PATH $MODEL_OUTPUT_DIR/libmace_${MODEL_TAG}.a
 
 if [ -f "$MODEL_OUTPUT_DIR/mace_run" ]; then
   rm -rf $MODEL_OUTPUT_DIR/mace_run
 fi
-cp bazel-bin/mace/examples/mace_run $MODEL_OUTPUT_DIR
+cp bazel-bin/mace/tools/validation/mace_run $MODEL_OUTPUT_DIR
 if [ "$EMBED_MODEL_DATA" = 0 ]; then
   cp mace/codegen/models/${MODEL_TAG}/${MODEL_TAG}.data $MODEL_OUTPUT_DIR
 fi
diff --git a/tools/build_production_code.sh b/tools/build_production_code.sh
index 7987f67fa7747177517aeef621b2a137ce4a9f9d..4072019429ecd61d4b7e18f0cbbed17e1851dc09 100644
--- a/tools/build_production_code.sh
+++ b/tools/build_production_code.sh
@@ -35,9 +35,9 @@ build_target()
 }
 
 if [ x"$TARGET_ABI" = x"host" ]; then
-  build_host_target //mace/codegen:generated_opencl_prod
+  build_host_target //mace/codegen:generated_opencl
   build_host_target //mace/codegen:generated_tuning_params
 else
-  build_target //mace/codegen:generated_opencl_prod
+  build_target //mace/codegen:generated_opencl
   build_target //mace/codegen:generated_tuning_params
 fi
diff --git a/tools/build_run_throughput_test.sh b/tools/build_run_throughput_test.sh
index ef148e6ab3cd954b99a9f653ea299e7086708ab9..75387b62879f528e67be1acc78ea96225d299cb8 100644
--- a/tools/build_run_throughput_test.sh
+++ b/tools/build_run_throughput_test.sh
@@ -72,9 +72,11 @@ adb -s $DEVICE_ID </dev/null shell \
   MACE_KERNEL_PATH=$KERNEL_DIR \
   MACE_LIMIT_OPENCL_KERNEL_TIME=${LIMIT_OPENCL_KERNEL_TIME} \
   ${PHONE_DATA_DIR}/model_throughput_test \
+  --input_node="${INPUT_NODES}" \
   --input_shape="${INPUT_SHAPES}" \
+  --output_node="${OUTPUT_NODES}" \
   --output_shape="${OUTPUT_SHAPES}" \
-  --input_file=${PHONE_DATA_DIR}/${INPUT_FILE_NAME}_${INPUT_NODES} \
+  --input_file=${PHONE_DATA_DIR}/${INPUT_FILE_NAME} \
   --cpu_model_data_file=${PHONE_DATA_DIR}/${CPU_MODEL_TAG}.data \
   --gpu_model_data_file=${PHONE_DATA_DIR}/${GPU_MODEL_TAG}.data \
   --dsp_model_data_file=${PHONE_DATA_DIR}/${DSP_MODEL_TAG}.data \
diff --git a/tools/clear_env.sh b/tools/clear_env.sh
index ef14f55bf2d6433b8134258d12e8cf24c91f3efb..1652bb52a4ad2e08763e1b1fddcf05487c82a383 100644
--- a/tools/clear_env.sh
+++ b/tools/clear_env.sh
@@ -26,4 +26,3 @@ if [ x"$TARGET_ABI" != x"host" ]; then
 fi
 
 rm -rf mace/codegen/models
-git checkout -- mace/codegen/opencl/opencl_compiled_program.cc mace/codegen/tuning/tuning_params.cc
diff --git a/tools/env.sh b/tools/env.sh
index 5ef841a1ed14eb75f8a75dc3f7b87d0ec7b55db6..634fb93c6d5bf6726c461066d60446601ca8b62a 100644
--- a/tools/env.sh
+++ b/tools/env.sh
@@ -3,12 +3,14 @@ LIBMACE_TAG=`git describe --abbrev=0 --tags`
 
 MACE_SOURCE_DIR=`/bin/pwd`
 PHONE_DATA_DIR="/data/local/tmp/mace_run"
-KERNEL_DIR="${PHONE_DATA_DIR}/cl/"
+COMPILED_PROGRAM_DIR="${PHONE_DATA_DIR}/cl_program/"
 CODEGEN_DIR=${MACE_SOURCE_DIR}/mace/codegen
 MODEL_CODEGEN_DIR=${CODEGEN_DIR}/models/${MODEL_TAG}
 CL_CODEGEN_DIR=${CODEGEN_DIR}/opencl
 TUNING_CODEGEN_DIR=${CODEGEN_DIR}/tuning
 VERSION_SOURCE_PATH=${CODEGEN_DIR}/version
+CL_BUILT_KERNEL_FILE_NAME=mace_cl_compiled_program.bin
+CL_PLATFORM_INFO_FILE_NAME=mace_cl_platform_info.txt
 if [ -z ${EMBED_MODEL_DATA} ]; then
   EMBED_MODEL_DATA=1
 fi
diff --git a/tools/generate_opencl_code.sh b/tools/generate_opencl_code.sh
new file mode 100644
index 0000000000000000000000000000000000000000..37d1fb10c9e3f2474e7000c29d58cfb5aeed5a9e
--- /dev/null
+++ b/tools/generate_opencl_code.sh
@@ -0,0 +1,58 @@
+#!/usr/bin/env bash
+
+Usage() {
+  echo "Usage: bash tools/genenrate_opencl_code.sh type [target_soc] [cl_bin_dirs] [pull_or_not]"
+}
+
+if [ $# -lt 1 ]; then
+  Usage
+  exit 1
+fi
+
+CURRENT_DIR=`dirname $0`
+source ${CURRENT_DIR}/env.sh
+
+TYPE=$1
+TARGET_SOC=$2
+CL_BIN_DIRS=$3
+PULL_OR_NOT=$4
+
+mkdir -p ${CL_CODEGEN_DIR}
+
+if [ x"$TYPE" == x"source" ];then
+  python mace/python/tools/encrypt_opencl_codegen.py \
+    --cl_kernel_dir=./mace/kernels/opencl/cl/ \
+    --output_path=${CODEGEN_DIR}/opencl/opencl_encrypt_program.cc || exit 1
+elif [ x"$#" == x"1" ];then
+
+  python mace/python/tools/opencl_codegen.py \
+    --built_kernel_file_name=${CL_BUILT_KERNEL_FILE_NAME} \
+    --platform_info_file_name=${CL_PLATFORM_INFO_FILE_NAME} \
+    --output_path=${CL_CODEGEN_DIR}/opencl_compiled_program.cc || exit 1
+
+else
+  RESULT_VALUE=`echo_device_id_by_soc $TARGET_SOC`
+  if [ $? -ne 0 ]; then
+    echo $RESULT_VALUE
+    exit 1
+  else
+    DEVICE_ID=$RESULT_VALUE
+  fi
+
+  if [ "$PULL_OR_NOT" = 1 ]; then
+    CL_BIN_DIR=${CL_BIN_DIRS}
+    mkdir -p ${CL_BIN_DIR}
+    rm -rf ${CL_BIN_DIR}/${CL_BUILT_KERNEL_FILE_NAME}
+    rm -rf ${CL_BIN_DIR}/${CL_PLATFORM_INFO_FILE_NAME}
+    if [ x"$TARGET_ABI" != x"host" ]; then
+      adb -s $DEVICE_ID pull ${COMPILED_PROGRAM_DIR}/. ${CL_BIN_DIR} > /dev/null
+    fi
+  fi
+
+  python mace/python/tools/opencl_codegen.py \
+    --built_kernel_file_name=${CL_BUILT_KERNEL_FILE_NAME} \
+    --platform_info_file_name=${CL_PLATFORM_INFO_FILE_NAME} \
+    --cl_binary_dirs=${CL_BIN_DIRS} \
+    --output_path=${CL_CODEGEN_DIR}/opencl_compiled_program.cc || exit 1
+fi
+
diff --git a/tools/generate_production_code.sh b/tools/generate_production_code.sh
deleted file mode 100644
index f4f021c4e930e50a5c969568e0ecda0891357914..0000000000000000000000000000000000000000
--- a/tools/generate_production_code.sh
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/bin/bash
-
-Usage() {
-  echo "Usage: bash tools/generate_production_code.sh target_soc cl_bin_dirs pull_or_not"
-}
-
-if [ $# -lt 3 ]; then
-  Usage
-  exit 1
-fi
-
-CURRENT_DIR=`dirname $0`
-source ${CURRENT_DIR}/env.sh
-
-TARGET_SOC=$1
-CL_BIN_DIRS=$2
-PULL_OR_NOT=$3
-
-RESULT_VALUE=`echo_device_id_by_soc $TARGET_SOC`
-if [ $? -ne 0 ]; then
-  echo $RESULT_VALUE
-  exit 1
-else
-  DEVICE_ID=$RESULT_VALUE
-fi
-
-if [ "$PULL_OR_NOT" = 1 ]; then
-  CL_BIN_DIR=${CL_BIN_DIRS}
-  rm -rf ${CL_BIN_DIR}
-  mkdir -p ${CL_BIN_DIR}
-  if [ x"$TARGET_ABI" != x"host" ]; then
-    adb -s $DEVICE_ID pull ${KERNEL_DIR}/. ${CL_BIN_DIR} > /dev/null
-    adb -s $DEVICE_ID pull ${PHONE_DATA_DIR}/mace_run.config ${CL_BIN_DIR} > /dev/null
-  fi
-fi
-
-python mace/python/tools/opencl_codegen.py \
-  --cl_binary_dirs=${CL_BIN_DIRS} \
-  --output_path=${CL_CODEGEN_DIR}/opencl_compiled_program.cc
-
-python mace/python/tools/binary_codegen.py \
-  --binary_dirs=${CL_BIN_DIRS} \
-  --binary_file_name=mace_run.config \
-  --output_path=${TUNING_CODEGEN_DIR}/tuning_params.cc
diff --git a/tools/generate_tuning_param_code.sh b/tools/generate_tuning_param_code.sh
new file mode 100644
index 0000000000000000000000000000000000000000..4d116c9fec7c140a2078e0d6f28e1c5181468e51
--- /dev/null
+++ b/tools/generate_tuning_param_code.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+CURRENT_DIR=`dirname $0`
+source ${CURRENT_DIR}/env.sh
+
+mkdir -p ${TUNING_CODEGEN_DIR}
+
+if [ "$#" -eq "0" ]; then
+  python mace/python/tools/binary_codegen.py \
+    --binary_file_name=mace_run.config \
+    --output_path=${TUNING_CODEGEN_DIR}/tuning_params.cc
+else
+
+  TARGET_SOC=$1
+  BIN_DIRS=$2
+  PULL_OR_NOT=$3
+
+  RESULT_VALUE=`echo_device_id_by_soc $TARGET_SOC`
+  if [ $? -ne 0 ]; then
+    echo $RESULT_VALUE
+    exit 1
+  else
+    DEVICE_ID=$RESULT_VALUE
+  fi
+
+  if [ "$PULL_OR_NOT" = 1 ]; then
+    mkdir -p ${BIN_DIRS}
+    rm -rf ${BIN_DIRS}/mace_run.config
+    if [ x"$TARGET_ABI" != x"host" ]; then
+      adb -s $DEVICE_ID pull ${PHONE_DATA_DIR}/mace_run.config ${BIN_DIRS} > /dev/null
+    fi
+  fi
+
+  python mace/python/tools/binary_codegen.py \
+  --binary_dirs=${BIN_DIRS} \
+  --binary_file_name=mace_run.config \
+  --output_path=${TUNING_CODEGEN_DIR}/tuning_params.cc
+fi
+
+
diff --git a/tools/generate_opencl_and_version_code.sh b/tools/generate_version_code.sh
similarity index 54%
rename from tools/generate_opencl_and_version_code.sh
rename to tools/generate_version_code.sh
index ee32acc7dac7fdfc83beb0aab1be26106efbf6e0..7b1c33d219a6a886c29335deff8c4bc6661c39c4 100644
--- a/tools/generate_opencl_and_version_code.sh
+++ b/tools/generate_version_code.sh
@@ -1,13 +1,10 @@
 #!/bin/bash
 
+CL_KERNEL_DIR_TAG=$1
+
 CURRENT_DIR=`dirname $0`
 source ${CURRENT_DIR}/env.sh
 
-python mace/python/tools/encrypt_opencl_codegen.py \
-    --cl_kernel_dir=./mace/kernels/opencl/cl/ \
-    --output_path=${CODEGEN_DIR}/opencl/opencl_encrypt_program.cc || exit 1
-
-
 rm -rf ${CODEGEN_DIR}/version
 mkdir ${CODEGEN_DIR}/version
 bash mace/tools/git/gen_version_source.sh ${CODEGEN_DIR}/version/version.cc || exit 1
diff --git a/tools/mace_tools.py b/tools/mace_tools.py
index f9dd9181b2e55717598b9d526ccf4505750ee0e0..d229573a5f1210068eb00616b9cf9b5e7b360b17 100644
--- a/tools/mace_tools.py
+++ b/tools/mace_tools.py
@@ -9,6 +9,7 @@
 import argparse
 import hashlib
 import os
+import sh
 import shutil
 import subprocess
 import sys
@@ -58,10 +59,41 @@ def get_global_runtime(configs):
   return global_runtime
 
 
-def generate_opencl_and_version_code():
-  command = "bash tools/generate_opencl_and_version_code.sh"
+def generate_version_code():
+  command = "bash tools/generate_version_code.sh"
   run_command(command)
 
+def generate_opencl_source_code():
+  command = "bash tools/generate_opencl_code.sh source"
+  run_command(command)
+
+def generate_opencl_binay_code(target_soc, model_output_dirs, pull_or_not):
+  cl_bin_dirs = []
+  for d in model_output_dirs:
+    cl_bin_dirs.append(os.path.join(d, "opencl_bin"))
+  cl_bin_dirs_str = ",".join(cl_bin_dirs)
+  if not cl_bin_dirs:
+    command = "bash tools/generate_opencl_code.sh binary"
+  else:
+    command = "bash tools/generate_opencl_code.sh {} {} {} {}".format(
+      'binary', target_soc, cl_bin_dirs_str, int(pull_or_not))
+  run_command(command)
+
+def generate_tuning_param_code(target_soc, model_output_dirs, pull_or_not):
+  cl_bin_dirs = []
+  for d in model_output_dirs:
+    cl_bin_dirs.append(os.path.join(d, "opencl_bin"))
+  cl_bin_dirs_str = ",".join(cl_bin_dirs)
+  if not cl_bin_dirs:
+    command = "bash tools/generate_tuning_param_code.sh"
+  else:
+    command = "bash tools/generate_tuning_param_code.sh {} {} {}".format(
+      target_soc, cl_bin_dirs_str, int(pull_or_not))
+  run_command(command)
+
+def generate_code(target_soc, model_output_dirs, pull_or_not):
+  generate_opencl_binay_code(target_soc, model_output_dirs, pull_or_not)
+  generate_tuning_param_code(target_soc, model_output_dirs, pull_or_not)
 
 def clear_env(target_soc):
   command = "bash tools/clear_env.sh {}".format(target_soc)
@@ -111,18 +143,41 @@ def build_mace_run(production_mode, model_output_dir, hexagon_mode):
   run_command(command)
 
 
-def tuning_run(target_soc,
+def tuning_run(model_name,
+               target_runtime,
+               target_abi,
+               target_soc,
                model_output_dir,
                running_round,
                tuning,
-               production_mode,
                restart_round,
                option_args=''):
-  command = "bash tools/tuning_run.sh {} {} {} {} {} {} \"{}\"".format(
-      target_soc, model_output_dir, running_round, int(tuning),
-      int(production_mode), restart_round, option_args)
-  run_command(command)
-
+  # TODO(yejianwu) refactoring the hackish code
+  stdout_buff = []
+  process_output = sh_commands.make_output_processor(stdout_buff)
+  p = sh.bash("tools/tuning_run.sh", target_soc, model_output_dir,
+              running_round, int(tuning),
+              restart_round, option_args, _out=process_output,
+              _bg=True, _err_to_out=True)
+  p.wait()
+  metrics = {}
+  for line in stdout_buff:
+    line = line.strip()
+    parts = line.split()
+    if len(parts) == 6 and parts[0].startswith("time"):
+      metrics["%s.create_net_ms" % model_name] = str(float(parts[1]))
+      metrics["%s.mace_engine_ctor_ms" % model_name] = str(float(parts[2]))
+      metrics["%s.init_ms" % model_name] = str(float(parts[3]))
+      metrics["%s.warmup_ms" % model_name] = str(float(parts[4]))
+      if float(parts[5]) > 0:
+        metrics["%s.avg_latency_ms" % model_name] = str(float(parts[5]))
+  tags = {"ro.board.platform": target_soc,
+          "abi": target_abi,
+          # "runtime": target_runtime, # TODO(yejianwu) Add the actual runtime
+          "round": running_round, # TODO(yejianwu) change this to source/binary
+          "tuning": tuning}
+  sh_commands.falcon_push_metrics(metrics, endpoint="mace_model_benchmark",
+                                  tags=tags)
 
 def benchmark_model(target_soc, model_output_dir, option_args=''):
   command = "bash tools/benchmark.sh {} {} \"{}\"".format(
@@ -130,9 +185,10 @@ def benchmark_model(target_soc, model_output_dir, option_args=''):
   run_command(command)
 
 
-def run_model(target_soc, model_output_dir, running_round, restart_round,
-              option_args):
-  tuning_run(target_soc, model_output_dir, running_round, False, False,
+def run_model(model_name, target_runtime, target_abi, target_soc,
+              model_output_dir, running_round, restart_round, option_args):
+  tuning_run(model_name, target_runtime, target_abi, target_soc,
+             model_output_dir, running_round, False,
              restart_round, option_args)
 
 
@@ -146,25 +202,28 @@ def generate_production_code(target_soc, model_output_dirs, pull_or_not):
   run_command(command)
 
 
-def build_mace_run_prod(target_soc, model_output_dir, tuning, global_runtime):
-  if "dsp" == global_runtime:
+def build_mace_run_prod(model_name, target_runtime, target_abi, target_soc,
+                        model_output_dir, tuning):
+  if "dsp" == target_runtime:
     hexagon_mode = True
   else:
     hexagon_mode = False
 
+  generate_code(target_soc, [], False)
   production_or_not = False
   build_mace_run(production_or_not, model_output_dir, hexagon_mode)
   tuning_run(
+      model_name,
+      target_runtime,
+      target_abi,
       target_soc,
       model_output_dir,
       running_round=0,
       tuning=tuning,
-      production_mode=production_or_not,
       restart_round=1)
 
+  generate_code(target_soc, [model_output_dir], True)
   production_or_not = True
-  pull_or_not = True
-  generate_production_code(target_soc, [model_output_dir], pull_or_not)
   build_mace_run(production_or_not, model_output_dir, hexagon_mode)
 
 
@@ -188,8 +247,7 @@ def build_production_code():
 
 
 def merge_libs_and_tuning_results(target_soc, output_dir, model_output_dirs):
-  pull_or_not = False
-  generate_production_code(target_soc, model_output_dirs, pull_or_not)
+  generate_code(target_soc, model_output_dirs, False)
   build_production_code()
 
   model_output_dirs_str = ",".join(model_output_dirs)
@@ -202,6 +260,26 @@ def packaging_lib_file(output_dir):
   command = "bash tools/packaging_lib.sh {}".format(output_dir)
   run_command(command)
 
+def download_model_files(model_file_path,
+                         model_output_dir,
+                         weight_file_path=""):
+  if model_file_path.startswith("http://") or \
+      model_file_path.startswith("https://"):
+    os.environ["MODEL_FILE_PATH"] = model_output_dir + "/model.pb"
+    urllib.urlretrieve(model_file_path, os.environ["MODEL_FILE_PATH"])
+
+  if weight_file_path.startswith("http://") or \
+      weight_file_path.startswith("https://"):
+    os.environ[
+      "WEIGHT_FILE_PATH"] = model_output_dir + "/model.caffemodel"
+    urllib.urlretrieve(weight_file_path,
+      os.environ["WEIGHT_FILE_PATH"])
+
+def md5sum(str):
+  md5 = hashlib.md5()
+  md5.update(str)
+  return md5.hexdigest()
+
 
 def parse_model_configs():
   with open(FLAGS.config) as f:
@@ -268,7 +346,9 @@ def main(unused_args):
       shutil.rmtree(os.path.join(FLAGS.output_dir, os.environ["PROJECT_NAME"]))
       os.makedirs(os.path.join(FLAGS.output_dir, os.environ["PROJECT_NAME"]))
 
-  generate_opencl_and_version_code()
+    generate_version_code()
+    generate_opencl_source_code()
+
   option_args = ' '.join([arg for arg in unused_args if arg.startswith('--')])
 
   available_socs = sh_commands.adb_get_all_socs()
@@ -285,6 +365,7 @@ def main(unused_args):
       print("Error: devices with SoCs are not connected %s" % missing_socs)
       exit(1)
 
+
   for target_soc in target_socs:
     for target_abi in configs["target_abis"]:
       global_runtime = get_global_runtime(configs)
@@ -292,9 +373,9 @@ def main(unused_args):
       os.environ["TARGET_ABI"] = target_abi
       model_output_dirs = []
       for model_name in configs["models"]:
+        print '=======================', model_name, '======================='
         # Transfer params by environment
         os.environ["MODEL_TAG"] = model_name
-        print '=======================', model_name, '======================='
         model_config = configs["models"][model_name]
         input_file_list = model_config.get("validation_inputs_data", [])
         for key in model_config:
@@ -307,9 +388,8 @@ def main(unused_args):
           else:
             os.environ[key.upper()] = str(model_config[key])
 
-        md5 = hashlib.md5()
-        md5.update(model_config["model_file_path"])
-        model_path_digest = md5.hexdigest()
+        # Create model build directory
+        model_path_digest = md5sum(model_config["model_file_path"])
         model_output_dir = "%s/%s/%s/%s/%s/%s/%s" % (FLAGS.output_dir,
                                                      os.environ["PROJECT_NAME"],
                                                      "build", model_name,
@@ -323,21 +403,8 @@ def main(unused_args):
           os.makedirs(model_output_dir)
           clear_env(target_soc)
 
-        # Support http:// and https://
-        if model_config["model_file_path"].startswith(
-            "http://") or model_config["model_file_path"].startswith(
-                "https://"):
-          os.environ["MODEL_FILE_PATH"] = model_output_dir + "/model.pb"
-          urllib.urlretrieve(model_config["model_file_path"],
-                             os.environ["MODEL_FILE_PATH"])
-
-        if model_config["platform"] == "caffe" and (
-            model_config["weight_file_path"].startswith("http://") or
-            model_config["weight_file_path"].startswith("https://")):
-          os.environ[
-              "WEIGHT_FILE_PATH"] = model_output_dir + "/model.caffemodel"
-          urllib.urlretrieve(model_config["weight_file_path"],
-                             os.environ["WEIGHT_FILE_PATH"])
+        download_model_files(model_config["model_file_path"],
+          model_output_dir, model_config.get("weight_file_path", ""))
 
         if FLAGS.mode == "build" or FLAGS.mode == "run" or FLAGS.mode == "validate"\
             or FLAGS.mode == "benchmark" or FLAGS.mode == "all":
@@ -346,12 +413,13 @@ def main(unused_args):
 
         if FLAGS.mode == "build" or FLAGS.mode == "all":
           generate_model_code()
-          build_mace_run_prod(target_soc, model_output_dir, FLAGS.tuning,
-                              global_runtime)
+          build_mace_run_prod(model_name, global_runtime, target_abi,
+                              target_soc, model_output_dir, FLAGS.tuning)
 
         if FLAGS.mode == "run" or FLAGS.mode == "validate" or FLAGS.mode == "all":
-          run_model(target_soc, model_output_dir, FLAGS.round,
-                    FLAGS.restart_round, option_args)
+          run_model(model_name, global_runtime, target_abi, target_soc,
+                    model_output_dir, FLAGS.round, FLAGS.restart_round,
+                    option_args)
 
         if FLAGS.mode == "benchmark":
           benchmark_model(target_soc, model_output_dir, option_args)
diff --git a/tools/merge_libs.sh b/tools/merge_libs.sh
index cc4cd4b054d4b14c8a1b25e4dbfd4d582d054884..90bfa7d96a104775bc344975649c47e411f783a3 100644
--- a/tools/merge_libs.sh
+++ b/tools/merge_libs.sh
@@ -41,14 +41,13 @@ LIBMACE_TEMP_DIR=`mktemp -d -t libmace.XXXX`
 echo "create ${LIBMACE_BUILD_DIR}/${TARGET_ABI}/libmace_${PROJECT_NAME}.${TARGET_SOC}.a" > ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
 
 if [ x"$TARGET_ABI" = x"host" ]; then
-  echo "addlib bazel-bin/mace/codegen/libgenerated_opencl_prod.pic.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
+  echo "addlib bazel-bin/mace/codegen/libgenerated_opencl.pic.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
   echo "addlib bazel-bin/mace/codegen/libgenerated_tuning_params.pic.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
 else
-  echo "addlib bazel-bin/mace/codegen/libgenerated_opencl_prod.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
+  echo "addlib bazel-bin/mace/codegen/libgenerated_opencl.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
   echo "addlib bazel-bin/mace/codegen/libgenerated_tuning_params.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
   echo "addlib bazel-bin/mace/codegen/libgenerated_version.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
   echo "addlib bazel-bin/mace/core/libcore.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
-  echo "addlib bazel-bin/mace/core/libopencl_prod.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
   echo "addlib bazel-bin/mace/kernels/libkernels.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
   echo "addlib bazel-bin/mace/utils/libutils.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
   echo "addlib bazel-bin/mace/utils/libutils_prod.a" >> ${LIBMACE_TEMP_DIR}/libmace_${PROJECT_NAME}.mri
diff --git a/tools/sh_commands.py b/tools/sh_commands.py
index 21dcef8be73707f3e32a6ba7ca3ba9d2b598c28c..666ce7b0bff61461cf547ec416114c77e808683d 100644
--- a/tools/sh_commands.py
+++ b/tools/sh_commands.py
@@ -70,22 +70,20 @@ def adb_run(serialno, host_bin_path, bin_name,
             out_of_range_check=1):
   host_bin_full_path = "%s/%s" % (host_bin_path, bin_name)
   device_bin_full_path = "%s/%s" % (device_bin_path, bin_name)
-  device_cl_path = "%s/cl" % device_bin_path
   props = adb_getprop_by_serialno(serialno)
   print("=====================================================================")
   print("Run on device: %s, %s, %s" % (serialno, props["ro.board.platform"],
                                        props["ro.product.model"]))
   sh.adb("-s", serialno, "shell", "rm -rf %s" % device_bin_path)
   sh.adb("-s", serialno, "shell", "mkdir -p %s" % device_bin_path)
-  sh.adb("-s", serialno, "shell", "mkdir -p %s" % device_cl_path)
   print("Push %s to %s" % (host_bin_full_path, device_bin_full_path))
-  sh.adb("-s", serialno, "push", host_bin_full_path, device_bin_path)
+  sh.adb("-s", serialno, "push", host_bin_full_path, device_bin_full_path)
   print("Run %s" % device_bin_full_path)
   stdout_buff=[]
   process_output = make_output_processor(stdout_buff)
   p = sh.adb("-s", serialno, "shell",
-             "MACE_OUT_OF_RANGE_CHECK=%d MACE_OPENCL_PROFILING=%d MACE_KERNEL_PATH=%s MACE_CPP_MIN_VLOG_LEVEL=%d %s %s" %
-             (out_of_range_check, opencl_profiling, device_cl_path, vlog_level, device_bin_full_path, args),
+             "MACE_OUT_OF_RANGE_CHECK=%d MACE_OPENCL_PROFILING=%d MACE_CPP_MIN_VLOG_LEVEL=%d %s %s" %
+             (out_of_range_check, opencl_profiling, vlog_level, device_bin_full_path, args),
              _out=process_output, _bg=True, _err_to_out=True)
   p.wait()
   return "".join(stdout_buff)
@@ -131,6 +129,7 @@ def bazel_target_to_bin(target):
 ################################
 # TODO this should be refactored
 def gen_encrypted_opencl_source(codegen_path="mace/codegen"):
+  sh.mkdir("-p", "%s/opencl" % codegen_path)
   sh.python("mace/python/tools/encrypt_opencl_codegen.py",
             "--cl_kernel_dir=./mace/kernels/opencl/cl/",
             "--output_path=%s/opencl/opencl_encrypt_program.cc" % codegen_path)
@@ -140,24 +139,32 @@ def gen_mace_version(codegen_path="mace/codegen"):
   sh.bash("mace/tools/git/gen_version_source.sh",
           "%s/version/version.cc" % codegen_path)
 
+def gen_compiled_opencl_source(codegen_path="mace/codegen"):
+  sh.mkdir("-p", "%s/opencl" % codegen_path)
+  sh.python("mace/python/tools/opencl_codegen.py",
+            "--output_path=%s/opencl/opencl_compiled_program.cc" % codegen_path)
+
 ################################
 # falcon
 ################################
-def falcon_tags(platform, model, abi):
-  return "ro.board.platform=%s,ro.product.model=%s,abi=%s" % (platform, model, abi)
-
-def falcon_push_metrics(metrics, device_properties, abi, endpoint="mace_dev"):
+def falcon_tags(tags_dict):
+  tags = ""
+  for k, v in tags_dict.iteritems():
+    if tags == "":
+      tags = "%s=%s" % (k, v)
+    else:
+      tags = tags + ",%s=%s" % (k, v)
+  return tags
+
+def falcon_push_metrics(metrics, endpoint="mace_dev", tags={}):
   cli = falcon_cli.FalconCli.connect(server="transfer.falcon.miliao.srv",
                                      port=8433,
                                      debug=False)
-  platform = device_properties["ro.board.platform"].replace(" ", "-")
-  model = device_properties["ro.product.model"].replace(" ", "-")
-  tags = falcon_tags(platform, model, abi)
   ts = int(time.time())
   falcon_metrics = [{
       "endpoint": endpoint,
       "metric": key,
-      "tags": tags,
+      "tags": falcon_tags(tags),
       "timestamp": ts,
       "value": value,
       "step": 86400,
diff --git a/tools/tuning_run.sh b/tools/tuning_run.sh
index c4e8dbe99e403b6d067b4a802022f341e6a49141..98fc9e092b88b55ec573bc688c95e14ec3242149 100644
--- a/tools/tuning_run.sh
+++ b/tools/tuning_run.sh
@@ -1,10 +1,10 @@
 #!/bin/bash
 
 Usage() {
-  echo "Usage: bash tools/tuning_run.sh target_soc model_output_dir round tuning production_mode"
+  echo "Usage: bash tools/tuning_run.sh target_soc model_output_dir round tuning "
 }
 
-if [ $# -lt 7 ]; then
+if [ $# -lt 6 ]; then
   Usage
   exit 1
 fi
@@ -16,9 +16,8 @@ TARGET_SOC=$1
 MODEL_OUTPUT_DIR=$2
 ROUND=$3
 TUNING_OR_NOT=$4
-PRODUCTION_MODE=$5
-RESTART_ROUND=$6
-OPTION_ARGS=$7
+RESTART_ROUND=$5
+OPTION_ARGS=$6
 
 echo $OPTION_ARGS
 
@@ -45,16 +44,14 @@ if [ x"$TARGET_ABI" = x"host" ]; then
     --restart_round=1 \
     $OPTION_ARGS || exit 1
 else
-  if [[ "${TUNING_OR_NOT}" != "0" && "$PRODUCTION_MODE" != 1 ]];then
+  if [[ "${TUNING_OR_NOT}" != "0" ]];then
     tuning_flag=1
   else
     tuning_flag=0
   fi
   
   adb -s $DEVICE_ID shell "mkdir -p ${PHONE_DATA_DIR}" || exit 1
-  if [ "$PRODUCTION_MODE" = 0 ]; then
-    adb -s $DEVICE_ID shell "mkdir -p ${KERNEL_DIR}" || exit 1
-  fi
+  adb -s $DEVICE_ID shell "mkdir -p ${COMPILED_PROGRAM_DIR}" || exit 1
 
   IFS=',' read -r -a INPUT_NAMES <<< "${INPUT_NODES}"
   for NAME in "${INPUT_NAMES[@]}";do
@@ -72,7 +69,7 @@ else
     MACE_TUNING=${tuning_flag} \
     MACE_CPP_MIN_VLOG_LEVEL=$VLOG_LEVEL \
     MACE_RUN_PARAMETER_PATH=${PHONE_DATA_DIR}/mace_run.config \
-    MACE_KERNEL_PATH=$KERNEL_DIR \
+    MACE_CL_PROGRAM_PATH=$COMPILED_PROGRAM_DIR \
     MACE_LIMIT_OPENCL_KERNEL_TIME=${LIMIT_OPENCL_KERNEL_TIME} \
     ${PHONE_DATA_DIR}/mace_run \
     --input_node="${INPUT_NODES}" \