Commit e4860f68 authored by Liangliang He

Refactor OpenCL static dependencies

Parent 2bee11ed
......@@ -25,7 +25,7 @@ config_setting(
)
config_setting(
name = "is_profiling",
name = "profiling_enabled",
define_values = {
"profiling": "true",
},
......
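With the setting renamed to profiling_enabled, OpenCL profiling is toggled from the command line through the define above; a usage sketch (the target label appears later in this diff):

    bazel build --define profiling=true //mace/core:opencl_runtime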
......@@ -7,7 +7,7 @@ package(
licenses(["notice"]) # Apache 2.0
load("//mace:mace.bzl", "if_android", "if_profiling")
load("//mace:mace.bzl", "if_android", "if_profiling_enabled")
cc_library(
name = "opencl_runtime",
......@@ -15,76 +15,48 @@ cc_library(
"runtime/opencl/*.cc",
]),
hdrs = glob([
"runtime/opencl/cl.hpp",
"runtime/opencl/cl2.hpp",
"runtime/opencl/*.h",
]),
copts = ["-std=c++11"] + if_profiling(["-D__ENABLE_PROFILING"]),
copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"] +
if_profiling_enabled(["-DMACE_OPENCL_PROFILING"]),
linkopts = ["-ldl"],
deps = [
":logging",
":core",
"//mace/utils:logging",
"//mace/utils:tuner",
"@opencl_headers//:opencl20_headers",
],
alwayslink = 1,
)
cc_library(
name = "logging",
srcs = [
"logging.cc",
],
hdrs = [
"logging.h",
],
copts = ["-std=c++11"],
linkopts = if_android([
"-llog",
]),
)
cc_library(
name = "core",
srcs = glob(
["*.cc",],
exclude=[
"logging.cc",
]),
hdrs = glob(
["*.h"],
exclude=[
"logging.h",
]),
copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"] + if_android([
"-D__USE_OPENCL",
]),
linkopts = ["-ldl"] + if_android([
"-pie",
]),
srcs = glob(["*.cc"]),
hdrs = glob(["*.h"]),
copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
linkopts = if_android(["-pie"]),
deps = [
":logging",
"//mace/proto:stats_proto",
"//mace/utils",
":opencl_runtime",
"//mace/utils:utils_hdrs",
"//mace/utils:logging",
],
)
# Main program for tests
cc_library(
name = "test_benchmark_main",
testonly = 1,
srcs = glob([
"testing/*.cc",
]),
hdrs = glob([
"testing/*.h",
]),
copts = [
"-std=c++11",
"-D_GLIBCXX_USE_C99_MATH_TR1",
hdrs = [
"testing/test_benchmark.h",
],
srcs = [
"testing/test_benchmark.cc",
"testing/test_benchmark_main.cc",
],
linkopts = ["-lm"],
copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
alwayslink = 1,
deps = [
":core",
"//mace/utils:utils_hdrs",
],
alwayslink = 1,
)
......@@ -3,9 +3,6 @@
//
#include "mace/core/allocator.h"
#ifdef __USE_OPENCL
#include "mace/core/opencl_allocator.h"
#endif
namespace mace {
......@@ -25,8 +22,5 @@ Allocator *GetDeviceAllocator(DeviceType type) {
MACE_REGISTER_ALLOCATOR(DeviceType::CPU, new CPUAllocator());
MACE_REGISTER_ALLOCATOR(DeviceType::NEON, new CPUAllocator());
#ifdef __USE_OPENCL
MACE_REGISTER_ALLOCATOR(DeviceType::OPENCL, new OpenCLAllocator());
#endif
} // namespace mace
......@@ -12,7 +12,7 @@
#include <string>
#include <vector>
#include "mace/core/logging.h"
#include "mace/utils/logging.h"
using std::set;
using std::map;
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_CORE_FUTURE_H_
#define MACE_CORE_FUTURE_H_
#include <functional>
#include "mace/utils/logging.h"
namespace mace {
struct CallStats {
int64_t start_micros;
int64_t end_micros;
};
struct OperatorStats {
std::string operator_name;
std::string type;
CallStats stats;
};
struct RunMetadata {
std::vector<OperatorStats> op_stats;
};
// Wait for the call to finish and get the stats if the param is not nullptr
struct StatsFuture {
std::function<void(CallStats *)> wait_fn = [](CallStats *) {
LOG(FATAL) << "wait_fn must be properly set";
};
};
} // namespace mace
#endif // MACE_CORE_FUTURE_H_
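A minimal usage sketch of the future-based API (hypothetical caller code, not part of this commit; op is any OperatorBase implementing the asynchronous Run introduced below): the op fills in wait_fn when it enqueues work, and the caller blocks only when it needs completion or timing stats.

    StatsFuture future;
    op->Run(&future);        // enqueue asynchronously (e.g. on OpenCL)
    CallStats stats;
    future.wait_fn(&stats);  // block until done and collect start/end micros
    int64_t micros = stats.end_micros - stats.start_micros;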
......@@ -7,7 +7,7 @@
#include <cstdint>
#include <vector>
#include <string>
#include "mace/core/logging.h"
#include "mace/utils/logging.h"
namespace mace {
......
......@@ -4,9 +4,6 @@
#include "mace/core/net.h"
#include "mace/utils/utils.h"
#ifdef __USE_OPENCL
#include "mace/core/runtime/opencl/opencl_runtime.h"
#endif
namespace mace {
......@@ -33,65 +30,51 @@ SimpleNet::SimpleNet(const std::shared_ptr<const NetDef> &net_def,
}
}
}
bool SimpleNet::Run(RunMetadata *run_metadata) {
VLOG(1) << "Running net " << name_;
for (auto &op : operators_) {
for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) {
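// On OpenCL, ops are enqueued asynchronously; wait on the future only when
// stats are requested or this is the last op, so all queued work is flushed
// before Run() returns.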
bool future_wait = (device_type_ == DeviceType::OPENCL &&
(run_metadata != nullptr ||
std::distance(iter, operators_.end()) == 1));
auto &op = *iter;
VLOG(1) << "Running operator " << op->debug_def().name() << "("
<< op->debug_def().type() << ").";
OperatorStats *op_stats = nullptr;
if (run_metadata) {
if (device_type_ != DeviceType::OPENCL) {
op_stats = run_metadata->add_op_stats();
op_stats->set_operator_name(op->debug_def().name());
op_stats->set_type(op->debug_def().type());
op_stats->set_all_start_micros(NowInMicroSec());
op_stats->set_op_start_rel_micros(NowInMicroSec() -
op_stats->all_start_micros());
bool ret;
CallStats call_stats;
if (future_wait) {
StatsFuture future;
ret = op->Run(&future);
if (run_metadata != nullptr) {
future.wait_fn(&call_stats);
} else {
future.wait_fn(nullptr);
}
} else if (run_metadata != nullptr) {
call_stats.start_micros = NowInMicroSec();
ret = op->Run(nullptr);
call_stats.end_micros = NowInMicroSec();
} else {
ret = op->Run(nullptr);
}
if (!op->Run()) {
if (run_metadata != nullptr) {
OperatorStats op_stats = { op->debug_def().name(),
op->debug_def().type(),
call_stats };
run_metadata->op_stats.emplace_back(op_stats);
}
if (!ret) {
LOG(ERROR) << "Operator failed: " << op->debug_def().name();
return false;
}
if (run_metadata) {
if (device_type_ == DeviceType::OPENCL) {
#ifndef __USE_OPENCL
LOG(FATAL) << "OpenCL is not supported";
#else
OpenCLRuntime::Get()->command_queue().finish();
op_stats = run_metadata->add_op_stats();
op_stats->set_operator_name(op->debug_def().name());
op_stats->set_type(op->debug_def().type());
op_stats->set_all_start_micros(
OpenCLRuntime::Get()->GetEventProfilingStartInfo() / 1000);
op_stats->set_op_start_rel_micros(
OpenCLRuntime::Get()->GetEventProfilingStartInfo() / 1000 -
op_stats->all_start_micros());
op_stats->set_op_end_rel_micros(
OpenCLRuntime::Get()->GetEventProfilingEndInfo() / 1000 -
op_stats->all_start_micros());
op_stats->set_all_end_rel_micros(
OpenCLRuntime::Get()->GetEventProfilingEndInfo() / 1000 -
op_stats->all_start_micros());
#endif
} else {
op_stats->set_op_end_rel_micros(NowInMicroSec() -
op_stats->all_start_micros());
op_stats->set_all_end_rel_micros(NowInMicroSec() -
op_stats->all_start_micros());
}
}
VLOG(1) << "Op " << op->debug_def().name()
<< " has shape: " << internal::MakeString(op->Output(0)->shape());
}
#ifdef __USE_OPENCL
if (device_type_ == DeviceType::OPENCL) {
OpenCLRuntime::Get()->command_queue().finish();
}
#endif
return true;
}
......
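A caller-side sketch of consuming the new struct-based RunMetadata (net is an assumed SimpleNet instance; the fields come from future.h above):

    RunMetadata metadata;
    if (net->Run(&metadata)) {
      for (const auto &s : metadata.op_stats) {
        VLOG(0) << s.operator_name << "(" << s.type << "): "
                << (s.stats.end_micros - s.stats.start_micros) << " us";
      }
    }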
......@@ -9,7 +9,6 @@
#include "mace/core/operator.h"
#include "mace/core/workspace.h"
#include "mace/core/mace.h"
#include "mace/proto/stats.pb.h"
namespace mace {
......
......@@ -7,6 +7,7 @@
#include "mace/core/common.h"
#include "mace/core/arg_helper.h"
#include "mace/core/future.h"
#include "mace/core/registry.h"
#include "mace/core/tensor.h"
#include "mace/core/workspace.h"
......@@ -55,7 +56,8 @@ class OperatorBase {
inline const vector<const Tensor *> &Inputs() const { return inputs_; }
inline const vector<Tensor *> &Outputs() { return outputs_; }
virtual bool Run() = 0;
// Run the op asynchronously (device-dependent); fill in the future if it is not nullptr.
virtual bool Run(StatsFuture *future) = 0;
inline const OperatorDef &debug_def() const {
MACE_CHECK(has_debug_def(), "operator_def was null!");
......@@ -100,7 +102,7 @@ class Operator : public OperatorBase {
}
}
}
virtual bool Run() override = 0;
virtual bool Run(StatsFuture *future) override = 0;
~Operator() noexcept override {}
};
......
This file diff has been collapsed.
......@@ -5,6 +5,8 @@
#ifndef MACE_CORE_RUNTIME_OPENCL_CL2_HEADER_H_
#define MACE_CORE_RUNTIME_OPENCL_CL2_HEADER_H_
// Do not include cl2.hpp directly; include this header instead.
#define CL_HPP_TARGET_OPENCL_VERSION 200
#include "mace/core/runtime/opencl/cl2.hpp"
......
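A usage sketch of the intended include discipline: every translation unit includes the wrapper so that CL_HPP_TARGET_OPENCL_VERSION is defined before cl2.hpp is parsed.

    #include "mace/core/runtime/opencl/cl2_header.h"  // never cl2.hpp directly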
......@@ -3,7 +3,7 @@
//
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/opencl_allocator.h"
#include "mace/core/runtime/opencl/opencl_allocator.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
namespace mace {
......@@ -37,7 +37,7 @@ OpenCLAllocator::OpenCLAllocator() {}
OpenCLAllocator::~OpenCLAllocator() {}
void *OpenCLAllocator::New(size_t nbytes) {
cl_int error;
cl::Buffer *buffer = new cl::Buffer(OpenCLRuntime::Get()->context(),
cl::Buffer *buffer = new cl::Buffer(OpenCLRuntime::Global()->context(),
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
nbytes, nullptr, &error);
MACE_CHECK(error == CL_SUCCESS);
......@@ -53,7 +53,7 @@ void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
cl_int error;
cl::Image2D *cl_image =
new cl::Image2D(OpenCLRuntime::Get()->context(),
new cl::Image2D(OpenCLRuntime::Global()->context(),
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
img_format,
image_shape[0], image_shape[1],
......@@ -79,7 +79,7 @@ void OpenCLAllocator::DeleteImage(void *buffer) {
void *OpenCLAllocator::Map(void *buffer, size_t nbytes) {
auto cl_buffer = static_cast<cl::Buffer *>(buffer);
auto queue = OpenCLRuntime::Get()->command_queue();
auto queue = OpenCLRuntime::Global()->command_queue();
// TODO(heliangliang) Non-blocking call
cl_int error;
void *mapped_ptr =
......@@ -101,7 +101,7 @@ void *OpenCLAllocator::MapImage(void *buffer,
mapped_image_pitch.resize(2);
cl_int error;
void *mapped_ptr =
OpenCLRuntime::Get()->command_queue().enqueueMapImage(*cl_image,
OpenCLRuntime::Global()->command_queue().enqueueMapImage(*cl_image,
CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
origin, region,
&mapped_image_pitch[0],
......@@ -114,12 +114,13 @@ void *OpenCLAllocator::MapImage(void *buffer,
void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) {
auto cl_buffer = static_cast<cl::Buffer *>(buffer);
auto queue = OpenCLRuntime::Get()->command_queue();
auto queue = OpenCLRuntime::Global()->command_queue();
MACE_CHECK(queue.enqueueUnmapMemObject(*cl_buffer, mapped_ptr, nullptr,
nullptr) == CL_SUCCESS);
}
bool OpenCLAllocator::OnHost() { return false; }
MACE_REGISTER_ALLOCATOR(DeviceType::OPENCL, new OpenCLAllocator());
} // namespace mace
......@@ -7,15 +7,17 @@
#include <memory>
#include <mutex>
#include "mace/core/logging.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/utils/logging.h"
#include "mace/utils/tuner.h"
#include <CL/opencl.h>
namespace mace {
namespace {
bool ReadFile(const std::string &filename, bool binary,
bool ReadFile(const std::string &filename,
bool binary,
std::vector<unsigned char> *content_ptr) {
MACE_CHECK_NOTNULL(content_ptr);
......@@ -55,7 +57,8 @@ bool ReadFile(const std::string &filename, bool binary,
return true;
}
bool WriteFile(const std::string &filename, bool binary,
bool WriteFile(const std::string &filename,
bool binary,
const std::vector<unsigned char> &content) {
std::ios_base::openmode mode = std::ios::out;
if (binary) {
......@@ -76,124 +79,92 @@ bool WriteFile(const std::string &filename, bool binary,
} // namespace
bool OpenCLRuntime::enable_profiling_ = false;
std::unique_ptr<cl::Event> OpenCLRuntime::profiling_ev_ = nullptr;
void OpenCLProfilingTimer::StartTiming() {}
OpenCLRuntime *OpenCLRuntime::Get() {
static std::once_flag init_once;
static OpenCLRuntime *instance = nullptr;
std::call_once(init_once, []() {
if (!mace::OpenCLLibrary::Supported()) {
LOG(ERROR) << "OpenCL not supported";
return;
}
void OpenCLProfilingTimer::StopTiming() {
OpenCLRuntime::Global()->command_queue().finish();
start_nanos_ = event_->getProfilingInfo<CL_PROFILING_COMMAND_START>();
stop_nanos_ = event_->getProfilingInfo<CL_PROFILING_COMMAND_END>();
}
std::vector<cl::Platform> all_platforms;
cl::Platform::get(&all_platforms);
if (all_platforms.size() == 0) {
LOG(ERROR) << "No OpenCL platforms found";
return;
}
cl::Platform default_platform = all_platforms[0];
VLOG(1) << "Using platform: "
<< default_platform.getInfo<CL_PLATFORM_NAME>() << ", "
<< default_platform.getInfo<CL_PLATFORM_PROFILE>() << ", "
<< default_platform.getInfo<CL_PLATFORM_VERSION>();
// get default device (CPUs, GPUs) of the default platform
std::vector<cl::Device> all_devices;
default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices);
if (all_devices.size() == 0) {
LOG(ERROR) << "No OpenCL devices found";
return;
}
double OpenCLProfilingTimer::ElapsedMicros() {
return (stop_nanos_ - start_nanos_) / 1000.0;
}
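A usage sketch of the new profiling timer (kernel, global_range, and local_range are assumed); it is only meaningful when the queue was created with CL_QUEUE_PROFILING_ENABLE, i.e. built with -DMACE_OPENCL_PROFILING:

    cl::Event event;
    runtime->command_queue().enqueueNDRangeKernel(
        kernel, cl::NullRange, global_range, local_range, nullptr, &event);
    OpenCLProfilingTimer timer(&event);
    timer.StopTiming();                      // finish() and read the counters
    double device_micros = timer.ElapsedMicros();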
bool gpu_detected = false;
cl::Device gpu_device;
for (auto device : all_devices) {
if (device.getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_GPU) {
gpu_device = device;
gpu_detected = true;
VLOG(1) << "Using device: " << device.getInfo<CL_DEVICE_NAME>();
break;
}
}
if (!gpu_detected) {
LOG(ERROR) << "No GPU device found";
return;
}
OpenCLRuntime *OpenCLRuntime::Global() {
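// C++11 guarantees thread-safe, exactly-once initialization of
// function-local statics, which replaces the former std::call_once-based
// lazy initialization.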
static OpenCLRuntime instance;
return &instance;
}
cl_command_queue_properties properties = 0;
#ifdef __ENABLE_PROFILING
enable_profiling_ = true;
profiling_ev_.reset(new cl::Event());
properties = CL_QUEUE_PROFILING_ENABLE;
#endif
OpenCLRuntime::OpenCLRuntime() {
LoadOpenCLLibrary();
// a context is like a "runtime link" to the device and platform;
// i.e. communication is possible
cl::Context context({gpu_device});
cl::CommandQueue command_queue(context, gpu_device, properties);
instance = new OpenCLRuntime(context, gpu_device, command_queue);
std::vector<cl::Platform> all_platforms;
cl::Platform::get(&all_platforms);
if (all_platforms.size() == 0) {
LOG(FATAL) << "No OpenCL platforms found";
}
cl::Platform default_platform = all_platforms[0];
VLOG(1) << "Using platform: " << default_platform.getInfo<CL_PLATFORM_NAME>()
<< ", " << default_platform.getInfo<CL_PLATFORM_PROFILE>() << ", "
<< default_platform.getInfo<CL_PLATFORM_VERSION>();
// get default device (CPUs, GPUs) of the default platform
std::vector<cl::Device> all_devices;
default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices);
if (all_devices.size() == 0) {
LOG(FATAL) << "No OpenCL devices found";
}
});
bool gpu_detected = false;
cl::Device gpu_device;
for (auto device : all_devices) {
if (device.getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_GPU) {
gpu_device = device;
gpu_detected = true;
VLOG(1) << "Using device: " << device.getInfo<CL_DEVICE_NAME>();
break;
}
}
if (!gpu_detected) {
LOG(FATAL) << "No GPU device found";
}
return instance;
}
cl_command_queue_properties properties = 0;
void OpenCLRuntime::EnableProfiling() { enable_profiling_ = true; }
#ifdef MACE_OPENCL_PROFILING
properties |= CL_QUEUE_PROFILING_ENABLE;
#endif
cl::Event *OpenCLRuntime::GetDefaultEvent() { return profiling_ev_.get(); }
// a context is like a "runtime link" to the device and platform;
// i.e. communication is possible
cl::Context context({gpu_device});
cl::CommandQueue command_queue(context, gpu_device, properties);
cl_ulong OpenCLRuntime::GetEventProfilingStartInfo() {
MACE_CHECK(profiling_ev_, "is NULL, should enable profiling first.");
return profiling_ev_->getProfilingInfo<CL_PROFILING_COMMAND_START>();
}
const char *kernel_path = getenv("MACE_KERNEL_PATH");
this->kernel_path_ = std::string(kernel_path == nullptr ? "" : kernel_path) + "/";
cl_ulong OpenCLRuntime::GetEventProfilingEndInfo() {
MACE_CHECK(profiling_ev_, "is NULL, should enable profiling first.");
return profiling_ev_->getProfilingInfo<CL_PROFILING_COMMAND_END>();
this->device_ = new cl::Device(gpu_device);
this->context_ = new cl::Context(context);
this->command_queue_ = new cl::CommandQueue(command_queue);
}
OpenCLRuntime::OpenCLRuntime(cl::Context context, cl::Device device,
cl::CommandQueue command_queue)
: context_(context), device_(device), command_queue_(command_queue) {
const char *kernel_path = getenv("MACE_KERNEL_PATH");
kernel_path_ = std::string(kernel_path == nullptr ? "" : kernel_path) + "/";
OpenCLRuntime::~OpenCLRuntime() {
built_program_map_.clear();
delete command_queue_;
delete context_;
delete device_;
UnloadOpenCLLibrary();
}
OpenCLRuntime::~OpenCLRuntime() {}
cl::Context &OpenCLRuntime::context() { return context_; }
cl::Context &OpenCLRuntime::context() { return *context_; }
cl::Device &OpenCLRuntime::device() { return device_; }
cl::Device &OpenCLRuntime::device() { return *device_; }
cl::CommandQueue &OpenCLRuntime::command_queue() { return command_queue_; }
cl::CommandQueue &OpenCLRuntime::command_queue() { return *command_queue_; }
cl::Program &OpenCLRuntime::program() {
// TODO(liuqi): unused; kept only for legacy code.
return program_;
}
// TODO(heliangliang) Support binary format
const std::map<std::string, std::string> OpenCLRuntime::program_map_ = {
{"addn", "addn.cl"},
{"batch_norm", "batch_norm.cl"},
{"bias_add", "bias_add.cl"},
{"buffer_to_image", "buffer_to_image.cl"},
{"conv_2d", "conv_2d.cl"},
{"conv_2d_1x1", "conv_2d_1x1.cl"},
{"conv_2d_3x3", "conv_2d_3x3.cl"},
{"depthwise_conv_3x3", "depthwise_conv_3x3.cl"},
{"pooling", "pooling.cl"},
{"relu", "relu.cl"},
{"concat", "concat.cl"},
{"resize_bilinear", "resize_bilinear.cl"},
{"space_to_batch", "space_to_batch.cl"},
};
std::string
OpenCLRuntime::GenerateCLBinaryFilenamePrefix(const std::string &filename_msg) {
std::string OpenCLRuntime::GenerateCLBinaryFilenamePrefix(
const std::string &filename_msg) {
std::string filename_prefix = filename_msg;
for (auto it = filename_prefix.begin(); it != filename_prefix.end(); ++it) {
if (*it == ' ' || *it == '-' || *it == '=') {
......@@ -262,7 +233,7 @@ void OpenCLRuntime::BuildProgram(const std::string &program_file_name,
program_binary_sizes.get(), nullptr);
MACE_CHECK(err == CL_SUCCESS) << "Error code: " << err;
std::unique_ptr<std::unique_ptr<unsigned char[]>[]> program_binaries(
new std::unique_ptr<unsigned char[]>[ device_list_size ]);
new std::unique_ptr<unsigned char[]>[device_list_size]);
for (cl_uint i = 0; i < device_list_size; ++i) {
program_binaries[i] = std::unique_ptr<unsigned char[]>(
new unsigned char[program_binary_sizes[i]]);
......@@ -281,16 +252,11 @@ void OpenCLRuntime::BuildProgram(const std::string &program_file_name,
}
}
cl::Kernel
OpenCLRuntime::BuildKernel(const std::string &program_name,
const std::string &kernel_name,
const std::set<std::string> &build_options) {
auto kernel_program_it = program_map_.find(program_name);
if (kernel_program_it == program_map_.end()) {
MACE_CHECK(false, program_name, " opencl kernel doesn't exist.");
}
std::string program_file_name = kernel_program_it->second;
cl::Kernel OpenCLRuntime::BuildKernel(
const std::string &program_name,
const std::string &kernel_name,
const std::set<std::string> &build_options) {
std::string program_file_name = program_name + ".cl";
std::string build_options_str;
for (auto &option : build_options) {
build_options_str += " " + option;
......@@ -312,15 +278,24 @@ OpenCLRuntime::BuildKernel(const std::string &program_name,
return cl::Kernel(program, kernel_name.c_str());
}
void OpenCLRuntime::GetCallStats(const cl::Event &event, CallStats *stats) {
if (stats != nullptr) {
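// cl::Event profiling counters are reported in nanoseconds; convert to
// microseconds.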
stats->start_micros =
event.getProfilingInfo<CL_PROFILING_COMMAND_START>() / 1000;
stats->end_micros =
event.getProfilingInfo<CL_PROFILING_COMMAND_END>() / 1000;
}
}
uint32_t OpenCLRuntime::GetDeviceMaxWorkGroupSize() {
unsigned long long size = 0;
device_.getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE, &size);
device_->getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE, &size);
return static_cast<uint32_t>(size);
}
uint32_t OpenCLRuntime::GetKernelMaxWorkGroupSize(const cl::Kernel &kernel) {
unsigned long long size = 0;
kernel.getWorkGroupInfo(device_, CL_KERNEL_WORK_GROUP_SIZE, &size);
kernel.getWorkGroupInfo(*device_, CL_KERNEL_WORK_GROUP_SIZE, &size);
return static_cast<uint32_t>(size);
}
......
......@@ -10,36 +10,42 @@
#include <mutex>
#include <set>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_wrapper.h"
#include "mace/utils/timer.h"
namespace mace {
class OpenCLRuntime {
public:
static OpenCLRuntime *Get();
class OpenCLProfilingTimer : public Timer {
public:
explicit OpenCLProfilingTimer(const cl::Event *event) : event_(event) {}
void StartTiming() override;
void StopTiming() override;
double ElapsedMicros() override;
static void EnableProfiling();
cl::Event *GetDefaultEvent();
cl_ulong GetEventProfilingStartInfo();
cl_ulong GetEventProfilingEndInfo();
private:
const cl::Event *event_;
double start_nanos_;
double stop_nanos_;
};
class OpenCLRuntime {
public:
static OpenCLRuntime *Global();
cl::Context &context();
cl::Device &device();
cl::CommandQueue &command_queue();
cl::Program &program();
void GetCallStats(const cl::Event &event, CallStats *stats);
uint32_t GetDeviceMaxWorkGroupSize();
uint32_t GetKernelMaxWorkGroupSize(const cl::Kernel& kernel);
cl::Kernel BuildKernel(const std::string &program_name,
const std::string &kernel_name,
const std::set<std::string> &build_options);
private:
OpenCLRuntime(cl::Context context,
cl::Device device,
cl::CommandQueue command_queue);
OpenCLRuntime();
~OpenCLRuntime();
OpenCLRuntime(const OpenCLRuntime&) = delete;
OpenCLRuntime &operator=(const OpenCLRuntime&) = delete;
......@@ -51,19 +57,14 @@ class OpenCLRuntime {
std::string GenerateCLBinaryFilenamePrefix(const std::string &filename_msg);
private:
static bool enable_profiling_;
static std::unique_ptr<cl::Event> profiling_ev_;
cl::Context context_;
cl::Device device_;
cl::CommandQueue command_queue_;
cl::Program program_;
// All OpenCL objects must be pointers and manually deleted before unloading
// the OpenCL library.
cl::Context *context_;
cl::Device *device_;
cl::CommandQueue *command_queue_;
std::map<std::string, cl::Program> built_program_map_;
std::mutex program_build_mutex_;
std::string kernel_path_;
static const std::map<std::string,
std::string> program_map_;
mutable std::map<std::string,
cl::Program> built_program_map_;
};
} // namespace mace
......
......@@ -7,13 +7,10 @@
namespace mace {
class OpenCLLibrary {
public:
static bool Supported();
static void Load();
static void Unload();
};
// These functions are not thread-safe.
void LoadOpenCLLibrary();
void UnloadOpenCLLibrary();
} // namespace mace
#endif // MACE_CORE_RUNTIME_OPENCL_OPENCL_WRAPPER_H_
......@@ -7,7 +7,7 @@
#include "mace/core/allocator.h"
#include "mace/core/common.h"
#include "mace/core/logging.h"
#include "mace/utils/logging.h"
#include "mace/core/types.h"
#include "mace/core/mace.h"
......
......@@ -9,8 +9,8 @@
#include <regex>
#include <vector>
#include "mace/core/logging.h"
#include "mace/core/testing/env_time.h"
#include "mace/utils/env_time.h"
#include "mace/utils/logging.h"
#include "mace/core/testing/test_benchmark.h"
namespace mace {
......@@ -82,7 +82,7 @@ void Benchmark::Run(const char *pattern) {
}
printf("%-*s %10s %10s\n", width, "Benchmark", "Time(ns)", "Iterations");
printf("%s\n", string(width + 22, '-').c_str());
printf("%s\n", std::string(width + 22, '-').c_str());
for (auto b : *all_benchmarks) {
if (!std::regex_match(b->name_, match, regex)) continue;
for (auto arg : b->args_) {
......@@ -128,7 +128,7 @@ void Benchmark::Run(int arg1, int arg2, int *run_count, double *run_seconds) {
int64_t iters = kMinIters;
while (true) {
accum_time = 0;
start_time = NowMicros();
start_time = utils::NowMicros();
bytes_processed = -1;
items_processed = -1;
label.clear();
......@@ -160,11 +160,11 @@ void Benchmark::Run(int arg1, int arg2, int *run_count, double *run_seconds) {
void BytesProcessed(int64_t n) { bytes_processed = n; }
void ItemsProcessed(int64_t n) { items_processed = n; }
void StartTiming() {
if (start_time == 0) start_time = NowMicros();
if (start_time == 0) start_time = utils::NowMicros();
}
void StopTiming() {
if (start_time != 0) {
accum_time += (NowMicros() - start_time);
accum_time += (utils::NowMicros() - start_time);
start_time = 0;
}
}
......
......@@ -3,13 +3,12 @@
//
// Simple benchmarking facility.
#ifndef MACE_TEST_BENCHMARK_H_
#define MACE_TEST_BENCHMARK_H_
#ifndef MACE_CORE_TESTING_TEST_BENCHMARK_H_
#define MACE_CORE_TESTING_TEST_BENCHMARK_H_
#include <utility>
#include <vector>
#include "mace/core/types.h"
#include <string>
#define MACE_BENCHMARK_CONCAT(a, b, c) a##b##c
#define BENCHMARK(n) \
......@@ -31,7 +30,7 @@ class Benchmark {
static void Run(const char *pattern);
private:
string name_;
std::string name_;
int num_args_;
std::vector<std::pair<int, int>> args_;
void (*fn0_)(int) = nullptr;
......@@ -51,4 +50,4 @@ void StopTiming();
} // namespace testing
} // namespace mace
#endif // MACE_TEST_BENCHMARK_H_
#endif // MACE_CORE_TESTING_TEST_BENCHMARK_H_
......@@ -335,4 +335,4 @@ bool HexagonControlWrapper::ExecuteGraphPreQuantize(const Tensor &input_tensor,
return true;
}
} // namespace mace
\ No newline at end of file
} // namespace mace
......@@ -3,7 +3,8 @@
//
#include "mace/dsp/hexagon_control_wrapper.h"
#include "mace/core/logging.h"
#include "mace/utils/logging.h"
#include "mace/utils/env_time.h"
#include "gtest/gtest.h"
using namespace mace;
......@@ -27,17 +28,14 @@ TEST(HexagonControlerWrapper, InputFloat) {
}
wrapper.ResetPerfInfo();
timeval tv1, tv2;
gettimeofday(&tv1, NULL);
int64_t start_micros = utils::NowMicros();
int round = 10;
for (int i = 0; i < round; ++i) {
VLOG(0) << wrapper.ExecuteGraph(input_tensor, &output_tensor);
}
gettimeofday(&tv2, NULL);
VLOG(0) << "avg duration: "
<< ((tv2.tv_sec - tv1.tv_sec) * 1000 +
(tv2.tv_usec - tv1.tv_usec) / 1000) /
round;
int64_t end_micros = utils::NowMicros();
VLOG(0) << "avg duration: " << (end_micros - start_micros) / (double)round
<< " ms";
wrapper.GetPerfInfo();
wrapper.PrintLog();
......@@ -95,4 +93,4 @@ TEST(HexagonControlerWrapper, PreQuantize) {
VLOG(0) << wrapper.TeardownGraph();
wrapper.Finalize();
}
\ No newline at end of file
}
......@@ -5,7 +5,7 @@
#ifndef MACE_HEXAGON_NN_OPS_H_
#define MACE_HEXAGON_NN_OPS_H_
#include "mace/core/logging.h"
#include "mace/utils/logging.h"
#include <unordered_map>
namespace mace {
......
......@@ -30,7 +30,7 @@ cc_test(
name = "util_test",
testonly = 1,
srcs = glob(["*_test.cc"]),
copts = ["-std=c++11"],
copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
linkopts = if_android([
"-ldl",
"-lm",
......
......@@ -6,11 +6,12 @@ cc_binary(
srcs = [
"helloworld.cc",
],
copts = ["-std=c++11"],
linkopts = ["-fopenmp"] + if_android(["-ldl"]),
copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
linkopts = ["-fopenmp"],
deps = [
"//mace/core",
"//mace/ops",
"//mace/core:opencl_runtime",
],
)
......@@ -18,8 +19,8 @@ cc_test(
name = "benchmark_example",
testonly = 1,
srcs = ["benchmark_example.cc"],
copts = ["-std=c++11"],
linkopts = ["-fopenmp"] + if_android(["-ldl"]),
copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
linkopts = ["-fopenmp"],
linkstatic = 1,
deps = [
"//mace/core",
......@@ -30,8 +31,8 @@ cc_test(
cc_binary(
name = "mace_run",
srcs = glob(["models/*/*.cc"] + ["mace_run.cc"]),
copts = ["-std=c++11"],
linkopts = ["-fopenmp"] + if_android(["-ldl"]),
copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1", "-v", "-ftime-report"],
linkopts = ["-fopenmp"],
linkstatic = 1,
deps = [
"//mace/core",
......
......@@ -3,6 +3,7 @@
//
#include "mace/core/net.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
using namespace mace;
......@@ -42,16 +43,10 @@ int main() {
net_def.add_op()->CopyFrom(op_def_1);
net_def.add_op()->CopyFrom(op_def_2);
auto input = net_def.add_tensors();
input->set_name("Input");
input->set_data_type(DataType::DT_FLOAT);
input->add_dims(2);
input->add_dims(3);
for (int i = 0; i < 6; ++i) {
input->add_float_data(i - 3);
}
VLOG(0) << net_def.DebugString();
alignas(4) unsigned char tensor_data[] = "012345678901234567890123";
const std::vector<int64_t> dims = {1, 2, 3, 1};
TensorProto input("Input", tensor_data, dims, DataType::DT_FLOAT);
net_def.mutable_tensors().push_back(input);
// Create workspace and input tensor
Workspace ws;
......
......@@ -25,8 +25,8 @@ cc_library(
linkopts = if_android(["-lm"]),
deps = [
"//mace/core",
"//mace/utils",
"//mace/utils:tuner",
"//mace/core:opencl_runtime",
"//mace/utils:utils_hdrs",
],
)
......
......@@ -5,6 +5,7 @@
#ifndef MACE_KERNELS_ADDN_H_
#define MACE_KERNELS_ADDN_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
namespace mace {
......@@ -15,7 +16,7 @@ struct AddNFunctorBase {};
template <DeviceType D, typename T>
struct AddNFunctor : AddNFunctorBase {
void operator()(const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor) {
Tensor *output_tensor, StatsFuture *future) {
output_tensor->ResizeLike(input_tensors[0]);
Tensor::MappingGuard output_map(output_tensor);
index_t size = input_tensors[0]->size();
......@@ -38,12 +39,14 @@ struct AddNFunctor : AddNFunctorBase {
template <>
void AddNFunctor<DeviceType::NEON, float>::operator()(
const std::vector<const Tensor *> &input_tensors, Tensor *output_tensor);
const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor,
StatsFuture *future);
template <typename T>
struct AddNFunctor<DeviceType::OPENCL, T> : AddNFunctorBase {
void operator()(const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor);
Tensor *output_tensor, StatsFuture *future);
};
} // namespace kernels
......
......@@ -5,6 +5,7 @@
#ifndef MACE_KERNELS_BATCH_NORM_H_
#define MACE_KERNELS_BATCH_NORM_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/core/mace.h"
......@@ -20,7 +21,8 @@ struct BatchNormFunctor {
const Tensor *offset,
const Tensor *mean,
const Tensor *var,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
// Batch normalization in the paper https://arxiv.org/abs/1502.03167 .
// The calculation formula for inference is
// Y = \frac{ \scale } { \sqrt{var+\variance_epsilon} } * X +
......@@ -80,7 +82,8 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()(
const Tensor *offset,
const Tensor *mean,
const Tensor *var,
Tensor *output);
Tensor *output,
StatsFuture *future);
template <typename T>
struct BatchNormFunctor<DeviceType::OPENCL, T> {
......@@ -91,7 +94,8 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> {
const Tensor *offset,
const Tensor *mean,
const Tensor *var,
Tensor *output);
Tensor *output,
StatsFuture *future);
};
} // namespace kernels
......
......@@ -5,6 +5,7 @@
#ifndef MACE_KERNELS_BIAS_ADD_H_
#define MACE_KERNELS_BIAS_ADD_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/core/mace.h"
......@@ -15,7 +16,8 @@ template <DeviceType D, typename T>
struct BiasAddFunctor {
void operator()(const Tensor *input,
const Tensor *bias,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
......@@ -51,14 +53,16 @@ template <>
void BiasAddFunctor<DeviceType::NEON, float>::operator()(
const Tensor *input,
const Tensor *bias,
Tensor *output);
Tensor *output,
StatsFuture *future);
*/
template <typename T>
struct BiasAddFunctor<DeviceType::OPENCL, T> {
void operator()(const Tensor *input,
const Tensor *bias,
Tensor *output);
Tensor *output,
StatsFuture *future);
};
} // namespace kernels
......
......@@ -5,6 +5,7 @@
#ifndef MACE_KERNELS_BUFFER_TO_IMAGE_H_
#define MACE_KERNELS_BUFFER_TO_IMAGE_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/opencl/helper.h"
......@@ -22,7 +23,8 @@ struct BufferToImageFunctor : BufferToImageFunctorBase{
BufferToImageFunctorBase(i2b) {}
void operator()(Tensor *input,
const BufferType type,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
MACE_NOT_IMPLEMENTED;
}
bool i2b_;
......@@ -34,7 +36,8 @@ struct BufferToImageFunctor<DeviceType::OPENCL, T> : BufferToImageFunctorBase{
BufferToImageFunctorBase(i2b) {}
void operator()(Tensor *input,
const BufferType type,
Tensor *output);
Tensor *output,
StatsFuture *future);
};
} // namespace kernels
......
......@@ -5,6 +5,7 @@
#ifndef MACE_KERNELS_CHANNEL_SHUFFLE_H_
#define MACE_KERNELS_CHANNEL_SHUFFLE_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
namespace mace {
......@@ -15,7 +16,8 @@ class ChannelShuffleFunctor {
public:
ChannelShuffleFunctor(const int group) : group_(group) {}
void operator()(const T *input, const index_t *input_shape, T *output) {
void operator()(const T *input, const index_t *input_shape,
T *output, StatsFuture *future) {
index_t batch = input_shape[0];
index_t channels = input_shape[1];
index_t height = input_shape[2];
......@@ -44,4 +46,4 @@ class ChannelShuffleFunctor {
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_CHANNEL_SHUFFLE_H_
\ No newline at end of file
#endif // MACE_KERNELS_CHANNEL_SHUFFLE_H_
......@@ -6,6 +6,7 @@
#define MACE_KERNELS_CONCAT_H_
#include "mace/core/common.h"
#include "mace/core/future.h"
#include "mace/core/types.h"
#include "mace/core/mace.h"
#include "mace/core/tensor.h"
......@@ -24,7 +25,8 @@ struct ConcatFunctor : ConcatFunctorBase {
ConcatFunctor(const int32_t axis): ConcatFunctorBase(axis){}
void operator()(const std::vector<const Tensor *> &input_list,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
const Tensor *input0 = input_list.front();
const int inputs_count = input_list.size();
......@@ -78,7 +80,7 @@ struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase{
ConcatFunctor(const int32_t axis): ConcatFunctorBase(axis){}
void operator()(const std::vector<const Tensor *> &input_list,
Tensor *output);
Tensor *output, StatsFuture *future);
};
......
......@@ -5,6 +5,7 @@
#ifndef MACE_KERNELS_CONV_2D_H_
#define MACE_KERNELS_CONV_2D_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/conv_pool_2d_util.h"
......@@ -32,7 +33,8 @@ struct Conv2dFunctor : Conv2dFunctorBase {
void operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
MACE_CHECK_NOTNULL(input);
MACE_CHECK_NOTNULL(filter);
MACE_CHECK_NOTNULL(output);
......@@ -130,7 +132,8 @@ template<>
void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output);
Tensor *output,
StatsFuture *future);
template<typename T>
struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
......@@ -142,7 +145,8 @@ struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
void operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output);
Tensor *output,
StatsFuture *future);
};
} // namespace kernels
......
......@@ -5,6 +5,7 @@
#ifndef MACE_KERNELS_DEPTHWISE_CONV_H_
#define MACE_KERNELS_DEPTHWISE_CONV_H_
#include "mace/core/future.h"
#include "mace/core/common.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/core/mace.h"
......@@ -23,7 +24,8 @@ struct DepthwiseConv2dFunctor {
void operator()(const Tensor *input, // NCHW
const Tensor *filter, // c_out, c_in, kernel_h, kernel_w
const Tensor *bias, // c_out
Tensor *output) {
Tensor *output,
StatsFuture *future) {
MACE_CHECK_NOTNULL(input);
MACE_CHECK_NOTNULL(filter);
MACE_CHECK_NOTNULL(bias);
......@@ -115,14 +117,16 @@ void DepthwiseConv2dFunctor<DeviceType::NEON, float>::operator()(
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output);
Tensor *output,
StatsFuture *future);
template <>
void DepthwiseConv2dFunctor<DeviceType::OPENCL, float>::operator()(
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output);
Tensor *output,
StatsFuture *future);
} // namespace kernels
} // namespace mace
......
......@@ -33,8 +33,10 @@ struct FusedConv2dFunctor : FusedConv2dFunctorBase {
void operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
Conv2dFunctor<D, T>(strides_, paddings_, dilations_)(input, filter, bias, output);
Tensor *output,
StatsFuture *future) {
Conv2dFunctor<D, T>(strides_, paddings_, dilations_)(input, filter, bias,
output, future);
T *output_data = output->mutable_data<T>();
T zero_value;
......@@ -62,7 +64,8 @@ struct FusedConv2dFunctor<DeviceType::OPENCL, T> : FusedConv2dFunctorBase {
void operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output);
Tensor *output,
StatsFuture *future);
};
} // namespace kernels
......
......@@ -5,6 +5,7 @@
#ifndef MACE_KERNELS_GLOBAL_AVG_POOLING_H_
#define MACE_KERNELS_GLOBAL_AVG_POOLING_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
namespace mace {
......@@ -12,7 +13,10 @@ namespace kernels {
template <DeviceType D, typename T>
struct GlobalAvgPoolingFunctor {
void operator()(const T *input, const index_t *input_shape, T *output) {
void operator()(const T *input,
const index_t *input_shape,
T *output,
StatsFuture *future) {
index_t batch = input_shape[0];
index_t channels = input_shape[1];
index_t height = input_shape[2];
......@@ -35,9 +39,10 @@ struct GlobalAvgPoolingFunctor {
template <>
void GlobalAvgPoolingFunctor<DeviceType::NEON, float>::operator()(
const float *input, const index_t *input_shape, float *output);
const float *input, const index_t *input_shape,
float *output, StatsFuture *future);
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_GLOBAL_AVG_POOLING_H_
\ No newline at end of file
#endif // MACE_KERNELS_GLOBAL_AVG_POOLING_H_
......@@ -10,7 +10,8 @@ namespace kernels {
template <>
void AddNFunctor<DeviceType::NEON, float>::operator()(
const std::vector<const Tensor *> &input_tensors, Tensor *output_tensor) {
const std::vector<const Tensor *> &input_tensors, Tensor *output_tensor,
StatsFuture *future) {
// TODO: neon mem copy
index_t size = output_tensor->size();
float *output_ptr = output_tensor->mutable_data<float>();
......
......@@ -15,7 +15,8 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()(
const Tensor *offset,
const Tensor *mean,
const Tensor *var,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
// Batch normalization in the paper https://arxiv.org/abs/1502.03167 .
// The calculation formula for inference is
// Y = \frac{ \scale } { \sqrt{var+\epsilon} } * X +
......
......@@ -44,7 +44,8 @@ template <>
void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
MACE_CHECK_NOTNULL(input);
MACE_CHECK_NOTNULL(filter);
MACE_CHECK_NOTNULL(output);
......@@ -79,7 +80,7 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
<< " stride " << strides_[0] << "x" << strides_[1]
<< " is not implemented yet, using slow version";
Conv2dFunctor<DeviceType::CPU, float>(strides_, paddings_, dilations_)(
input, filter, bias, output);
input, filter, bias, output, future);
return;
}
......
......@@ -29,7 +29,8 @@ void DepthwiseConv2dFunctor<DeviceType::NEON, float>::operator()(
const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
typedef void (*Conv2dNeonFunction)(
const float *input, const index_t *input_shape, const float *filter,
const index_t *filter_shape, const float *bias, float *output,
......@@ -53,7 +54,7 @@ void DepthwiseConv2dFunctor<DeviceType::NEON, float>::operator()(
<< " is not implemented yet, using slow version";
DepthwiseConv2dFunctor<DeviceType::CPU, float>(strides_, paddings_,
dilations_)(
input, filter, bias, output);
input, filter, bias, output, future);
return;
}
......@@ -77,4 +78,4 @@ void DepthwiseConv2dFunctor<DeviceType::NEON, float>::operator()(
}
} // namespace kernels
} // namespace mace
\ No newline at end of file
} // namespace mace
......@@ -10,7 +10,8 @@ namespace kernels {
template <>
void GlobalAvgPoolingFunctor<DeviceType::NEON, float>::operator()(
const float *input, const index_t *input_shape, float *output) {
const float *input, const index_t *input_shape,
float *output, StatsFuture *future) {
index_t batch = input_shape[0];
index_t channels = input_shape[1];
index_t height = input_shape[2];
......@@ -52,4 +53,4 @@ void GlobalAvgPoolingFunctor<DeviceType::NEON, float>::operator()(
};
} // namespace kernels
} // namespace mace
\ No newline at end of file
} // namespace mace
......@@ -56,7 +56,8 @@ extern void PoolingAvgNeonK3x3S2x2Padded(const float *input,
template <>
void PoolingFunctor<DeviceType::NEON, float>::operator()(
const Tensor *input_tensor,
Tensor *output_tensor) {
Tensor *output_tensor,
StatsFuture *future) {
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
......@@ -122,9 +123,9 @@ void PoolingFunctor<DeviceType::NEON, float>::operator()(
} else { // not implement yet
PoolingFunctor<DeviceType::CPU, float>(pooling_type_, kernels_, strides_,
padding_, dilations_)(
input_tensor, output_tensor);
input_tensor, output_tensor, future);
}
}
} // namespace kernels
} // namespace mace
\ No newline at end of file
} // namespace mace
......@@ -10,7 +10,8 @@ namespace kernels {
template <>
void ReluFunctor<DeviceType::NEON, float>::operator()(const Tensor *input_tensor,
Tensor *output_tensor) {
Tensor *output_tensor,
StatsFuture *future) {
const float *input = input_tensor->data<float>();
float *output = output_tensor->mutable_data<float>();
index_t size = input_tensor->size();
......@@ -66,4 +67,4 @@ void ReluFunctor<DeviceType::NEON, float>::operator()(const Tensor *input_tensor
};
} // namespace kernels
} // namespace mace
\ No newline at end of file
} // namespace mace
......@@ -13,7 +13,7 @@ namespace kernels {
template <typename T>
static void AddN(const std::vector<const Tensor *> &input_tensors,
Tensor *output) {
Tensor *output, StatsFuture *future) {
if (input_tensors.size() > 4) {
MACE_NOT_IMPLEMENTED;
}
......@@ -26,7 +26,7 @@ static void AddN(const std::vector<const Tensor *> &input_tensors,
const index_t width_pixels = channel_blocks * width;
const index_t batch_height_pixels = batch * height;
auto runtime = OpenCLRuntime::Get();
auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
......@@ -61,12 +61,13 @@ static void AddN(const std::vector<const Tensor *> &input_tensors,
{1, kwg_size}
};
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
addn_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1]),
cl::NDRange(params[0], params[1]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
......@@ -77,16 +78,25 @@ static void AddN(const std::vector<const Tensor *> &input_tensors,
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func);
func,
&timer);
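// Defer synchronization to the caller: Run() returns right after enqueueing,
// and the future waits on the event (optionally collecting profiling stats).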
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
template <typename T>
void AddNFunctor<DeviceType::OPENCL, T>::operator()(
const std::vector<const Tensor *> &input_tensors, Tensor *output_tensor) {
const std::vector<const Tensor *> &input_tensors,
Tensor *output_tensor,
StatsFuture *future) {
size_t size = input_tensors.size();
MACE_CHECK(size >= 2 && input_tensors[0] != nullptr);
......@@ -108,7 +118,7 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape);
output_tensor->ResizeImage(output_shape, output_image_shape);
AddN<T>(input_tensors, output_tensor);
AddN<T>(input_tensors, output_tensor, future);
};
template
......
......@@ -6,6 +6,7 @@
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
......@@ -18,8 +19,8 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *offset,
const Tensor *mean,
const Tensor *var,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
......@@ -27,7 +28,7 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
const index_t channel_blocks = RoundUpDiv4(channels);
auto runtime = OpenCLRuntime::Get();
auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
......@@ -72,12 +73,13 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
{15, 7, 9},
{1, kwg_size, 1}};
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
bm_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
......@@ -88,10 +90,18 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func);
func,
&timer);
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
template
......
......@@ -15,8 +15,8 @@ template <typename T>
void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *input,
const Tensor *bias,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
......@@ -28,7 +28,7 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = OpenCLRuntime::Get();
auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
......@@ -43,12 +43,19 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(
bias_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(bias->buffer())));
bias_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
cl::Event event;
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
bias_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS);
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
template
......
......@@ -12,7 +12,8 @@ namespace kernels {
template<typename T>
void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
const BufferType type,
Tensor *image) {
Tensor *image,
StatsFuture *future) {
MACE_CHECK(!buffer->is_image()) << "buffer must be buffer-type";
std::vector<size_t> image_shape;
if (!i2b_) {
......@@ -31,7 +32,7 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
}
auto runtime = OpenCLRuntime::Get();
auto runtime = OpenCLRuntime::Global();
string kernel_name;
switch (type) {
case FILTER:
......@@ -64,12 +65,20 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
1};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(b2f_kernel);
const std::vector<uint32_t> lws = {kwg_size, 1, 1};
cl::Event event;
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
b2f_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]));
cl::NDRange(lws[0], lws[1], lws[2]),
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
template struct BufferToImageFunctor<DeviceType::OPENCL, float>;
......
......@@ -14,7 +14,8 @@ namespace kernels {
static void Concat2(const Tensor *input0,
const Tensor *input1,
const DataType dt,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
......@@ -22,7 +23,7 @@ static void Concat2(const Tensor *input0,
const int channel_blk = RoundUpDiv4(channel);
auto runtime = OpenCLRuntime::Get();
auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options;
if (input0->dtype() == output->dtype()) {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
......@@ -73,12 +74,13 @@ static void Concat2(const Tensor *input0,
{15, 7, 9},
{1, kwg_size, 1}};
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
concat_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
......@@ -89,15 +91,24 @@ static void Concat2(const Tensor *input0,
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func);
func,
&timer);
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
template<typename T>
void ConcatFunctor<DeviceType::OPENCL, T>::operator()(const std::vector<const Tensor *> &input_list,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
const int inputs_count = input_list.size();
MACE_CHECK(inputs_count == 2 && axis_ == 3)
<< "Concat opencl kernel only support two elements with axis == 3";
......@@ -124,7 +135,8 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(const std::vector<const Te
switch (inputs_count) {
case 2:
Concat2(input_list[0], input_list[1], DataTypeToEnum<T>::value, output);
Concat2(input_list[0], input_list[1], DataTypeToEnum<T>::value,
output, future);
break;
default:MACE_NOT_IMPLEMENTED;
}
......
......@@ -11,37 +11,40 @@ namespace kernels {
extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
Tensor *output, StatsFuture *future);
extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
Tensor *output, StatsFuture *future);
extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
Tensor *output, StatsFuture *future);
extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
Tensor *output, StatsFuture *future);
extern void Conv2dOpencl(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const uint32_t stride, const int *padding,
const DataType dt, Tensor *output);
const DataType dt, Tensor *output,
StatsFuture *future);
template<typename T>
void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
Tensor *output,
StatsFuture *future);
// Selection matrix: kernel_size x stride_size
static const Conv2dOpenclFunction selector[5][2] = {
{Conv2dOpenclK1x1S1, Conv2dOpenclK1x1S2},
......@@ -74,9 +77,12 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if (kernel_h == kernel_w && kernel_h <= 5 &&
selector[kernel_h - 1][strides_[0] - 1] != nullptr) {
auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1];
conv2d_func(input, filter, bias, false, paddings.data(), DataTypeToEnum<T>::value, output);
conv2d_func(input, filter, bias, false, paddings.data(),
DataTypeToEnum<T>::value, output, future);
} else {
Conv2dOpencl(input, filter, bias, false, strides_[0], paddings.data(), DataTypeToEnum<T>::value, output);
Conv2dOpencl(input, filter, bias, false, strides_[0],
paddings.data(), DataTypeToEnum<T>::value,
output, future);
}
}
......
......@@ -18,7 +18,8 @@ void Conv1x1(const Tensor *input,
const bool fused_relu,
const int stride,
const DataType dt,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
......@@ -45,9 +46,7 @@ void Conv1x1(const Tensor *input,
built_options.emplace("-DFUSED_RELU");
}
auto runtime = OpenCLRuntime::Get();
auto program = runtime->program();
auto runtime = OpenCLRuntime::Global();
auto conv_2d_kernel = runtime->BuildKernel("conv_2d_1x1", "conv_2d_1x1", built_options);
uint32_t idx = 0;
......@@ -92,12 +91,13 @@ void Conv1x1(const Tensor *input,
{15, 7, 9},
{1, kwg_size, 1}};
};
cl::Event event;
auto func = [&](const std::vector<uint32_t>& params)->cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
conv_2d_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
......@@ -108,11 +108,18 @@ void Conv1x1(const Tensor *input,
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func);
func,
&timer);
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
extern void Conv2dOpenclK1x1S1(const Tensor *input,
......@@ -121,8 +128,9 @@ extern void Conv2dOpenclK1x1S1(const Tensor *input,
const bool fused_relu,
const int *padding,
const DataType dt,
Tensor *output) {
Conv1x1(input, filter, bias, fused_relu, 1, dt, output);
Tensor *output,
StatsFuture *future) {
Conv1x1(input, filter, bias, fused_relu, 1, dt, output, future);
};
extern void Conv2dOpenclK1x1S2(const Tensor *input,
......@@ -131,8 +139,9 @@ extern void Conv2dOpenclK1x1S2(const Tensor *input,
const bool fused_relu,
const int *padding,
const DataType dt,
Tensor *output) {
Conv1x1(input, filter, bias, fused_relu, 2, dt, output);
Tensor *output,
StatsFuture *future) {
Conv1x1(input, filter, bias, fused_relu, 2, dt, output, future);
};
} // namespace kernels
......
......@@ -15,7 +15,8 @@ namespace kernels {
static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const uint32_t stride, const int *padding,
const DataType dt, Tensor *output) {
const DataType dt, Tensor *output,
StatsFuture *future) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
......@@ -35,9 +36,7 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
built_options.emplace("-DFUSED_RELU");
}
auto runtime = OpenCLRuntime::Get();
auto program = runtime->program();
auto runtime = OpenCLRuntime::Global();
auto conv_2d_kernel = runtime->BuildKernel("conv_2d_3x3", "conv_2d_3x3", built_options);
uint32_t idx = 0;
......@@ -84,12 +83,13 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
{15, 7, 9},
{1, kwg_size, 1}};
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
conv_2d_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
......@@ -100,11 +100,19 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func);
func,
&timer);
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
void Conv2dOpenclK3x3S1(const Tensor *input,
const Tensor *filter,
......@@ -112,8 +120,9 @@ void Conv2dOpenclK3x3S1(const Tensor *input,
const bool fused_relu,
const int *padding,
const DataType dt,
Tensor *output) {
Conv2d3x3S12(input, filter, bias, fused_relu, 1, padding, dt, output);
Tensor *output,
StatsFuture *future) {
Conv2d3x3S12(input, filter, bias, fused_relu, 1, padding, dt, output, future);
};
void Conv2dOpenclK3x3S2(const Tensor *input,
......@@ -122,8 +131,9 @@ void Conv2dOpenclK3x3S2(const Tensor *input,
const bool fused_relu,
const int *padding,
const DataType dt,
Tensor *output) {
Conv2d3x3S12(input, filter, bias, fused_relu, 2, padding, dt, output);
Tensor *output,
StatsFuture *future) {
Conv2d3x3S12(input, filter, bias, fused_relu, 2, padding, dt, output, future);
};
} // namespace kernels
......
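GetCallStats itself is not part of this diff; a plausible implementation, assuming the command queue was created with profiling enabled, reads the device timestamps off the event:

  void OpenCLRuntime::GetCallStats(const cl::Event &event, CallStats *stats) {
    // CL_PROFILING_COMMAND_START/END are nanosecond device timestamps;
    // CallStats stores microseconds.
    stats->start_micros =
        event.getProfilingInfo<CL_PROFILING_COMMAND_START>() / 1000;
    stats->end_micros =
        event.getProfilingInfo<CL_PROFILING_COMMAND_END>() / 1000;
  }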
......@@ -15,7 +15,8 @@ namespace kernels {
void Conv2dOpencl(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const uint32_t stride, const int *padding,
const DataType dt, Tensor *output) {
const DataType dt, Tensor *output,
StatsFuture *future) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
......@@ -35,9 +36,7 @@ void Conv2dOpencl(const Tensor *input, const Tensor *filter,
built_options.emplace("-DFUSED_RELU");
}
auto runtime = OpenCLRuntime::Get();
auto program = runtime->program();
auto runtime = OpenCLRuntime::Global();
auto conv_2d_kernel = runtime->BuildKernel("conv_2d", "conv_2d", built_options);
uint32_t idx = 0;
......@@ -86,12 +85,13 @@ void Conv2dOpencl(const Tensor *input, const Tensor *filter,
{15, 7, 9},
{1, kwg_size, 1}};
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
conv_2d_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
......@@ -102,11 +102,19 @@ void Conv2dOpencl(const Tensor *input, const Tensor *filter,
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func);
func,
&timer);
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
} // namespace kernels
......
......@@ -8,17 +8,21 @@ namespace mace {
namespace kernels {
extern void DepthwiseConvOpenclK3x3S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, Tensor *output);
const Tensor *bias, Tensor *output,
StatsFuture *future);
extern void DepthwiseConvOpenclK3x3S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, Tensor *output);
const Tensor *bias, Tensor *output,
StatsFuture *future);
template <>
void DepthwiseConv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter,
const Tensor *bias, Tensor *output);
const Tensor *bias, Tensor *output,
StatsFuture *future);
// Selection matrix: kernel_size x stride_size
static const Conv2dOpenclFunction selector[5][2] = {
{nullptr, nullptr},
......@@ -38,7 +42,7 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor
<< " is not implemented yet, using slow version";
// TODO(heliangliang) The CPU/NEON kernel should map the buffer
DepthwiseConv2dFunctor<DeviceType::CPU, float>(strides_, paddings_, dilations_)(
input, filter, bias, output);
input, filter, bias, output, future);
return;
}
......@@ -46,9 +50,9 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor
if (paddings_[0] > 0 || paddings_[1] > 0) {
Tensor padded_input(GetDeviceAllocator(DeviceType::OPENCL), DataTypeToEnum<float>::v());
ConstructInputWithPadding(input, paddings_.data(), &padded_input);
conv2d_func(&padded_input, filter, bias, output);
conv2d_func(&padded_input, filter, bias, output, future);
    } else {
conv2d_func(input, filter, bias, output);
conv2d_func(input, filter, bias, output, future);
}
}
......
......@@ -14,7 +14,8 @@ static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const uint32_t stride,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
const index_t batch = output->dim(0);
const index_t channels = output->dim(1);
const index_t height = output->dim(2);
......@@ -30,7 +31,7 @@ static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input,
const index_t channel_blocks = (channels + 3) / 4;
const index_t pixel_blocks = (width + 3) / 4 * height;
auto runtime = OpenCLRuntime::Get();
auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(input->dtype()));
built_options.emplace(stride == 1 ? "-DSTRIDE_1" : "");
......@@ -57,26 +58,36 @@ static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input,
const uint32_t lws[3] = {static_cast<uint32_t>(1),
static_cast<uint32_t>(1),
static_cast<uint32_t>(256)};
cl::Event event;
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
conv_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS);
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
extern void DepthwiseConvOpenclK3x3S1(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
InnerDepthwiseConvOpenclK3x3S12(input, filter, bias, 1, output);
Tensor *output,
StatsFuture *future) {
InnerDepthwiseConvOpenclK3x3S12(input, filter, bias, 1, output, future);
};
extern void DepthwiseConvOpenclK3x3S2(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
InnerDepthwiseConvOpenclK3x3S12(input, filter, bias, 2, output);
Tensor *output,
StatsFuture *future) {
InnerDepthwiseConvOpenclK3x3S12(input, filter, bias, 2, output, future);
};
} // namespace kernels
......
......@@ -11,37 +11,43 @@ namespace kernels {
extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
Tensor *output,
StatsFuture *future);
extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
Tensor *output,
StatsFuture *future);
extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
Tensor *output,
StatsFuture *future);
extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
Tensor *output,
StatsFuture *future);
extern void Conv2dOpencl(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const uint32_t stride, const int *padding,
const DataType dt, Tensor *output);
const DataType dt, Tensor *output,
StatsFuture *future);
template<typename T>
void FusedConv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter,
const Tensor *bias, const bool fused_relu,
const int *padding, const DataType dt,
Tensor *output);
Tensor *output, StatsFuture *future);
// Selection matrix: kernel_size x stride_size
static const Conv2dOpenclFunction selector[5][2] = {
{Conv2dOpenclK1x1S1, Conv2dOpenclK1x1S2},
......@@ -73,9 +79,11 @@ void FusedConv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if (kernel_h == kernel_w && kernel_h <= 5 &&
selector[kernel_h - 1][strides_[0] - 1] != nullptr) {
auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1];
conv2d_func(input, filter, bias, true, paddings.data(), DataTypeToEnum<T>::value, output);
conv2d_func(input, filter, bias, true, paddings.data(),
DataTypeToEnum<T>::value, output, future);
} else {
Conv2dOpencl(input, filter, bias, true, strides_[0], paddings.data(), DataTypeToEnum<T>::value, output);
Conv2dOpencl(input, filter, bias, true, strides_[0], paddings.data(),
DataTypeToEnum<T>::value, output, future);
}
}
......
......@@ -17,7 +17,8 @@ static void Pooling(const Tensor *input,
const int pooling_size,
const PoolingType type,
const DataType dt,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
index_t batch = output->dim(0);
index_t out_height = output->dim(1);
index_t out_width = output->dim(2);
......@@ -25,7 +26,7 @@ static void Pooling(const Tensor *input,
index_t channel_blocks = (channels + 3) / 4;
auto runtime = OpenCLRuntime::Get();
auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options;
if (type == MAX && input->dtype() == output->dtype()) {
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
......@@ -85,12 +86,13 @@ static void Pooling(const Tensor *input,
{15, 7, 9},
{1, kwg_size, 1}};
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
pooling_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
......@@ -101,16 +103,27 @@ static void Pooling(const Tensor *input,
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func);
func,
&timer);
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
template<typename T>
void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
Tensor *output) {
MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1) << "Pooling opencl kernel not support dilation yet";
Tensor *output,
StatsFuture *future) {
MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1)
<< "Pooling opencl kernel not support dilation yet";
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
std::vector<index_t> filter_shape = {
......@@ -128,7 +141,7 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
output->ResizeImage(output_shape, output_image_shape);
Pooling(input, strides_, paddings.data(), kernels_[0], pooling_type_,
DataTypeToEnum<T>::value, output);
DataTypeToEnum<T>::value, output, future);
}
......
......@@ -14,7 +14,8 @@ namespace kernels {
template <typename T>
void ReluFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
Tensor *output) {
Tensor *output,
StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
......@@ -23,8 +24,7 @@ void ReluFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const index_t channel_blocks = RoundUpDiv4(channels);
auto runtime = OpenCLRuntime::Get();
auto program = runtime->program();
auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value;
......@@ -74,12 +74,13 @@ void ReluFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
{15, 7, 9},
{1, kwg_size, 1}};
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
relu_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
......@@ -90,10 +91,18 @@ void ReluFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func);
func,
&timer);
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
template
......
......@@ -14,7 +14,7 @@ namespace kernels {
template <typename T>
void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *input, Tensor *output) {
const Tensor *input, Tensor *output, StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t in_height = input->dim(1);
const index_t in_width = input->dim(2);
......@@ -38,7 +38,7 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
CalculateResizeScale(in_height, out_height, align_corners_);
float width_scale = CalculateResizeScale(in_width, out_width, align_corners_);
auto runtime = OpenCLRuntime::Get();
auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
......@@ -79,12 +79,13 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
{1, kwg_size / 128, 128},
{1, kwg_size, 1}};
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
rb_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
......@@ -95,11 +96,18 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func);
func,
&timer);
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
template struct ResizeBilinearFunctor<DeviceType::OPENCL, float>;
......
......@@ -17,8 +17,9 @@ template <>
void SpaceToBatchFunctor<DeviceType::OPENCL, float>::operator()(Tensor *space_tensor,
const Tensor *block_shape_tensor,
const Tensor *paddings_tensor,
Tensor *batch_tensor) {
auto runtime = OpenCLRuntime::Get();
Tensor *batch_tensor,
StatsFuture *future) {
auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(space_tensor->dtype()));
auto s2b_kernel = runtime->BuildKernel("space_to_batch", "space_to_batch", built_options);
......@@ -42,12 +43,19 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, float>::operator()(Tensor *space_te
const uint32_t lws[3] = {static_cast<uint32_t>(1),
static_cast<uint32_t>(8),
static_cast<uint32_t>(128)};
cl::Event event;
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
s2b_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS);
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
} // namespace kernels
......
......@@ -6,6 +6,7 @@
#define MACE_KERNELS_POOLING_H
#include <limits>
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/conv_pool_2d_util.h"
......@@ -49,7 +50,8 @@ struct PoolingFunctor : PoolingFunctorBase {
dilations) {}
void operator()(const Tensor *input_tensor,
Tensor *output_tensor) {
Tensor *output_tensor,
StatsFuture *future) {
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
......@@ -153,7 +155,8 @@ struct PoolingFunctor : PoolingFunctorBase {
template<>
void PoolingFunctor<DeviceType::NEON, float>::operator()(
const Tensor *input_tensor,
Tensor *output_tensor);
Tensor *output_tensor,
StatsFuture *future);
template<typename T>
struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
......@@ -166,7 +169,8 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
strides, padding,
dilations) {}
void operator()(const Tensor *input_tensor,
Tensor *output_tensor);
Tensor *output_tensor,
StatsFuture *future);
};
} // namespace kernels
......
......@@ -5,6 +5,7 @@
#ifndef MACE_KERNELS_RELU_H_
#define MACE_KERNELS_RELU_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
namespace mace {
......@@ -14,7 +15,7 @@ template <DeviceType D, typename T>
struct ReluFunctor {
T max_limit_;
void operator()(const Tensor *input, Tensor *output) {
void operator()(const Tensor *input, Tensor *output, StatsFuture *future) {
const T *input_ptr = input->data<T>();
T *output_ptr = output->mutable_data<T>();
index_t size = input->size();
......@@ -32,13 +33,14 @@ struct ReluFunctor {
template <>
void ReluFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
Tensor *output);
Tensor *output,
StatsFuture *future);
template <typename T>
struct ReluFunctor<DeviceType::OPENCL, T> {
T max_limit_;
void operator()(const Tensor *input, Tensor *output);
void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
};
} // namespace kernels
......
......@@ -4,6 +4,7 @@
#ifndef MACE_KERNELS_RESIZE_BILINEAR_H_
#define MACE_KERNELS_RESIZE_BILINEAR_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
namespace mace {
......@@ -122,7 +123,7 @@ struct ResizeBilinearFunctor : ResizeBilinearFunctorBase {
ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners)
: ResizeBilinearFunctorBase(size, align_corners) {}
void operator()(const Tensor *input, Tensor *output) {
void operator()(const Tensor *input, Tensor *output, StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t in_height = input->dim(1);
const index_t in_width = input->dim(2);
......@@ -167,7 +168,7 @@ struct ResizeBilinearFunctor<DeviceType::OPENCL, T> : ResizeBilinearFunctorBase
ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners)
: ResizeBilinearFunctorBase(size, align_corners) {}
void operator()(const Tensor *input, Tensor *output);
void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
};
} // namespace kernels
......
......@@ -5,6 +5,7 @@
#ifndef MACE_KERNELS_CONV_2D_H_
#define MACE_KERNELS_CONV_2D_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/core/mace.h"
......@@ -18,7 +19,8 @@ struct SpaceToBatchFunctor {
void operator()(Tensor *input_tensor,
const Tensor *block_shape_tensor,
const Tensor *paddings_tensor,
Tensor *output_tensor) {
Tensor *output_tensor,
StatsFuture *future) {
MACE_NOT_IMPLEMENTED;
}
......@@ -29,7 +31,8 @@ template <>
void SpaceToBatchFunctor<DeviceType::OPENCL, float>::operator()(Tensor *input_tensor,
const Tensor *block_shape_tensor,
const Tensor *paddings_tensor,
Tensor *output);
Tensor *output,
StatsFuture *future);
} // namespace kernels
} // namespace mace
......
......@@ -24,8 +24,8 @@ def if_android_arm64(a):
"//conditions:default": [],
})
def if_profiling(a):
def if_profiling_enabled(a):
return select({
"//mace:is_profiling": a,
"//mace:profiling_enabled": a,
"//conditions:default": [],
})
})
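How the MACE_OPENCL_PROFILING define (wired up through if_profiling_enabled above) gets consumed is not shown in this diff; a typical, assumed use is to create the command queue with profiling enabled so the event timestamps read by GetCallStats are valid:

  cl_command_queue_properties properties = 0;
  #ifdef MACE_OPENCL_PROFILING
  properties |= CL_QUEUE_PROFILING_ENABLE;  // required for CL_PROFILING_COMMAND_* queries
  #endif
  cl::CommandQueue command_queue(context, device, properties);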
......@@ -34,10 +34,7 @@ cc_library(
["*.h"],
exclude = ["ops_test_util.h"],
),
copts = [
"-std=c++11",
"-D_GLIBCXX_USE_C99_MATH_TR1",
],
copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
deps = [
"//mace/kernels",
],
......@@ -50,7 +47,7 @@ cc_test(
srcs = glob(
["*_test.cc"],
),
copts = ["-std=c++11"],
copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
linkopts = ["-fopenmp"],
linkstatic = 1,
deps = [
......@@ -64,12 +61,8 @@ cc_test(
name = "ops_benchmark",
testonly = 1,
srcs = glob(["*_benchmark.cc"]),
copts = [
"-std=c++11",
"-fopenmp",
"-D_GLIBCXX_USE_C99_MATH_TR1",
],
linkopts = ["-fopenmp"] + if_android(["-ldl"]),
copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
linkopts = ["-fopenmp"],
linkstatic = 1,
deps = [
":ops",
......
......@@ -16,7 +16,7 @@ class AddNOp : public Operator<D, T> {
AddNOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws) {}
bool Run() override {
bool Run(StatsFuture *future) override {
Tensor *output_tensor = this->outputs_[0];
int n = this->inputs_.size();
vector<const Tensor *> inputs(n, nullptr);
......@@ -24,7 +24,7 @@ class AddNOp : public Operator<D, T> {
inputs[i] = this->inputs_[i];
}
functor_(inputs, output_tensor);
functor_(inputs, output_tensor, future);
return true;
}
......
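With the new Run signature, a runner drives an operator roughly as sketched here (net scheduling details are not part of this diff):

  StatsFuture future;
  op->Run(&future);         // OpenCL ops enqueue and return before the kernel finishes
  future.wait_fn(nullptr);  // synchronize only; pass a CallStats* to also collect timing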
......@@ -19,7 +19,7 @@ class BatchNormOp : public Operator<D, T> {
OperatorBase::GetSingleArgument<float>("epsilon", static_cast<float>(1e-4));
}
bool Run() override {
bool Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
const Tensor *scale = this->Input(SCALE);
const Tensor *offset = this->Input(OFFSET);
......@@ -40,7 +40,7 @@ class BatchNormOp : public Operator<D, T> {
Tensor *output = this->Output(OUTPUT);
output->ResizeLike(input);
functor_(input, scale, offset, mean, var, output);
functor_(input, scale, offset, mean, var, output, future);
return true;
}
......
......@@ -88,7 +88,7 @@ TEST_F(BatchNormOpTest, SimpleRandomNeon) {
index_t height = 64;
index_t width = 64;
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("Input")
.Input("Scale")
......@@ -129,7 +129,7 @@ TEST_F(BatchNormOpTest, ComplexRandomNeon) {
index_t height = 103;
index_t width = 113;
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("Input")
.Input("Scale")
......@@ -172,7 +172,7 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
index_t width = 64;
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("Input")
.Input("Scale")
......@@ -237,7 +237,7 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
index_t width = 64;
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("Input")
.Input("Scale")
......@@ -303,7 +303,7 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
index_t width = 113;
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("Input")
.Input("Scale")
......@@ -369,7 +369,7 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
index_t width = 113;
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("Input")
.Input("Scale")
......
......@@ -53,14 +53,14 @@ class BatchToSpaceNDOp: public Operator<D, T> {
BatchToSpaceNDOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws), functor_(true) {}
bool Run() override {
bool Run(StatsFuture *future) override {
const Tensor *input_tensor = this->Input(INPUT);
const Tensor *block_shape_tensor = this->Input(BLOCK_SHAPE);
const Tensor *cropped_tensor = this->Input(CROPS);
Tensor *output = this->Output(OUTPUT);
BatchToSpaceHelper(input_tensor, block_shape_tensor, cropped_tensor, output);
functor_(output, block_shape_tensor, cropped_tensor, const_cast<Tensor*>(input_tensor));
functor_(output, block_shape_tensor, cropped_tensor, const_cast<Tensor*>(input_tensor), future);
return true;
}
......
......@@ -16,7 +16,7 @@ class BiasAddOp : public Operator<D, T> {
BiasAddOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws), functor_() {}
bool Run() override {
bool Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
const Tensor *bias = this->Input(BIAS);
......@@ -28,7 +28,7 @@ class BiasAddOp : public Operator<D, T> {
Tensor *output = this->Output(OUTPUT);
output->ResizeLike(input);
functor_(input, bias, output);
functor_(input, bias, output, future);
return true;
}
......
......@@ -68,7 +68,7 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
index_t width = 64 + rand() % 50;
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("Input")
.Input("Bias")
......@@ -114,7 +114,7 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
index_t width = 113 + rand() % 100;
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("Input")
.Input("Bias")
......
......@@ -16,14 +16,14 @@ class BufferToImageOp: public Operator<D, T> {
BufferToImageOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws) {}
bool Run() override {
bool Run(StatsFuture *future) override {
const Tensor *input_tensor = this->Input(INPUT);
kernels::BufferType type = static_cast<kernels::BufferType>(OperatorBase::GetSingleArgument<int>(
"buffer_type", static_cast<int>(kernels::FILTER)));
Tensor *output = this->Output(OUTPUT);
functor_(const_cast<Tensor *>(input_tensor), type, output);
functor_(const_cast<Tensor *>(input_tensor), type, output, future);
return true;
}
......
......@@ -20,7 +20,7 @@ class ChannelShuffleOp : public Operator<D, T> {
group_(OperatorBase::GetSingleArgument<int>("group", 1)),
functor_(this->group_) {}
bool Run() override {
bool Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
Tensor *output = this->Output(OUTPUT);
MACE_CHECK(input->shape()[1] % group_ == 0,
......@@ -29,7 +29,7 @@ class ChannelShuffleOp : public Operator<D, T> {
output->ResizeLike(input);
functor_(input->data<T>(), input->shape().data(),
output->mutable_data<T>());
output->mutable_data<T>(), future);
return true;
}
......
......@@ -10,7 +10,7 @@ class ChannelShuffleOpTest : public OpsTestBase {};
TEST_F(ChannelShuffleOpTest, C8G4) {
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("ChannelShuffle", "ChannelShuffleTest")
.Input("Input")
.Output("Output")
......
......@@ -17,7 +17,7 @@ class ConcatOp : public Operator<D, T> {
: Operator<D, T>(op_def, ws),
functor_(OperatorBase::GetSingleArgument<int>("axis", 3)){}
bool Run() override {
bool Run(StatsFuture *future) override {
MACE_CHECK(this->InputSize() >= 2) << "There must be at least two inputs to concat";
const std::vector<const Tensor *> input_list = this->Inputs();
const int32_t concat_axis = OperatorBase::GetSingleArgument<int>("axis", 3);
......@@ -30,7 +30,7 @@ class ConcatOp : public Operator<D, T> {
Tensor *output = this->Output(OUTPUT);
functor_(input_list, output);
functor_(input_list, output, future);
return true;
}
......
......@@ -12,7 +12,7 @@ class ConcatOpTest : public OpsTestBase {};
TEST_F(ConcatOpTest, CPUSimpleHorizon) {
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("Concat", "ConcatTest")
.Input("Input0")
.Input("Input1")
......@@ -49,7 +49,7 @@ TEST_F(ConcatOpTest, CPUSimpleHorizon) {
TEST_F(ConcatOpTest, CPUSimpleVertical) {
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("Concat", "ConcatTest")
.Input("Input0")
.Input("Input1")
......@@ -92,7 +92,7 @@ TEST_F(ConcatOpTest, CPURandom) {
int num_inputs = 2 + rand() % 10;
int axis = rand() % dim;
// Construct graph
auto &net = test_net();
OpsTestNet net;
auto builder = OpDefBuilder("Concat", "ConcatTest");
for (int i = 0; i < num_inputs; ++i) {
builder = builder.Input(("Input" + ToString(i)).c_str());
......
......@@ -22,13 +22,13 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> {
this->dilations_.data()) {
}
bool Run() override {
bool Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
const Tensor *filter = this->Input(FILTER);
const Tensor *bias = this->InputSize() >= 3 ? this->Input(BIAS) : nullptr;
Tensor *output = this->Output(OUTPUT);
functor_(input, filter, bias, output);
functor_(input, filter, bias, output, future);
return true;
}
......
......@@ -78,11 +78,12 @@ void TestSimple3x3SAME() {
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
#if __ARM_NEON
TEST_F(Conv2dOpTest, NEONSimple) {
TestSimple3x3VALID<DeviceType::NEON>();
TestSimple3x3SAME<DeviceType::NEON>();
}
#endif
template<DeviceType D, typename T>
void TestNHWCSimple3x3VALID() {
......@@ -233,9 +234,11 @@ void TestSimple3x3WithoutBias() {
}
#ifdef __ARM_NEON
TEST_F(Conv2dOpTest, NEONWithouBias) {
TestSimple3x3WithoutBias<DeviceType::NEON>();
}
#endif
template<DeviceType D, typename T>
void TestNHWCSimple3x3WithoutBias() {
......@@ -335,9 +338,11 @@ static void TestCombined3x3() {
}
#ifdef __ARM_NEON
TEST_F(Conv2dOpTest, NEONCombined) {
TestCombined3x3<DeviceType::NEON>();
}
#endif
template<DeviceType D, typename T>
static void TestNHWCCombined3x3() {
......
......@@ -23,7 +23,7 @@ class DepthwiseConv2dOp : public ConvPool2dOpBase<D, T> {
functor_.dilations_ = this->dilations_.data();
}
bool Run() override {
bool Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
const Tensor *filter = this->Input(FILTER);
const Tensor *bias = nullptr;
......@@ -46,7 +46,7 @@ class DepthwiseConv2dOp : public ConvPool2dOpBase<D, T> {
output->Resize(output_shape);
functor_.paddings_ = paddings;
functor_(input, filter, bias, output);
functor_(input, filter, bias, output, future);
return true;
}
......
......@@ -96,27 +96,33 @@ void TestNxNS12(const index_t height, const index_t width) {
}
#if __ARM_NEON
TEST_F(DepthwiseConv2dOpTest, NeonSimpleNxNS12) {
TestNxNS12<DeviceType::NEON>(4, 4);
}
#endif
TEST_F(DepthwiseConv2dOpTest, OpenCLSimpleNxNS12) {
TestNxNS12<DeviceType::OPENCL>(4, 4);
}
#if __ARM_NEON
TEST_F(DepthwiseConv2dOpTest, NeonAlignedNxNS12) {
TestNxNS12<DeviceType::NEON>(64, 64);
TestNxNS12<DeviceType::NEON>(128, 128);
}
#endif
TEST_F(DepthwiseConv2dOpTest, OpenCLAlignedNxNS12) {
TestNxNS12<DeviceType::OPENCL>(64, 64);
TestNxNS12<DeviceType::OPENCL>(128, 128);
}
#if __ARM_NEON
TEST_F(DepthwiseConv2dOpTest, NeonUnalignedNxNS12) {
TestNxNS12<DeviceType::NEON>(107, 113);
}
#endif
TEST_F(DepthwiseConv2dOpTest, OpenCLUnalignedNxNS12) {
TestNxNS12<DeviceType::OPENCL>(107, 113);
......
......@@ -22,13 +22,13 @@ class FusedConv2dOp : public ConvPool2dOpBase<D, T> {
this->dilations_.data()) {
}
bool Run() override {
bool Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
const Tensor *filter = this->Input(FILTER);
const Tensor *bias = this->InputSize() > 2 ? this->Input(BIAS) : nullptr;
Tensor *output = this->Output(OUTPUT);
functor_(input, filter, bias, output);
functor_(input, filter, bias, output, future);
return true;
}
......
......@@ -16,7 +16,7 @@ class GlobalAvgPoolingOp : public Operator<D, T> {
GlobalAvgPoolingOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws) {}
bool Run() override {
bool Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
Tensor *output = this->Output(OUTPUT);
......@@ -29,7 +29,7 @@ class GlobalAvgPoolingOp : public Operator<D, T> {
auto pooling_func = kernels::GlobalAvgPoolingFunctor<D, T>();
pooling_func(input->data<float>(), input->shape().data(),
output->mutable_data<float>());
output->mutable_data<float>(), future);
return true;
}
......
......@@ -10,7 +10,7 @@ class GlobalAvgPoolingOpTest : public OpsTestBase {};
TEST_F(GlobalAvgPoolingOpTest, 3x7x7_CPU) {
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("GlobalAvgPooling", "GlobalAvgPoolingTest")
.Input("Input")
.Output("Output")
......@@ -32,9 +32,10 @@ TEST_F(GlobalAvgPoolingOpTest, 3x7x7_CPU) {
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
#if __ARM_NEON
TEST_F(GlobalAvgPoolingOpTest, 3x7x7_NEON) {
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("GlobalAvgPooling", "GlobalAvgPoolingTest")
.Input("Input")
.Output("Output")
......@@ -55,3 +56,4 @@ TEST_F(GlobalAvgPoolingOpTest, 3x7x7_NEON) {
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
#endif
......@@ -16,13 +16,13 @@ class ImageToBufferOp: public Operator<D, T> {
ImageToBufferOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws), functor_(true) {}
bool Run() override {
bool Run(StatsFuture *future) override {
const Tensor *input_tensor = this->Input(INPUT);
Tensor *output = this->Output(OUTPUT);
kernels::BufferType type = static_cast<kernels::BufferType>(OperatorBase::GetSingleArgument<int>(
"buffer_type", static_cast<int>(kernels::FILTER)));
functor_(output, type, const_cast<Tensor *>(input_tensor));
functor_(output, type, const_cast<Tensor *>(input_tensor), future);
return true;
}
......
......@@ -176,7 +176,7 @@ class OpsTestNet {
void Sync() {
if (net_ && device_ == DeviceType::OPENCL) {
OpenCLRuntime::Get()->command_queue().finish();
OpenCLRuntime::Global()->command_queue().finish();
}
}
......@@ -188,20 +188,14 @@ class OpsTestNet {
};
class OpsTestBase : public ::testing::Test {
public:
OpsTestNet &test_net() { return test_net_; };
protected:
virtual void TearDown() {
auto ws = test_net_.ws();
auto tensor_names = ws->Tensors();
for (auto &name : tensor_names) {
ws->RemoveTensor(name);
}
virtual void SetUp() {
// OpenCLRuntime::CreateGlobal();
}
private:
OpsTestNet test_net_;
virtual void TearDown() {
// OpenCLRuntime::DestroyGlobal();
}
};
template <typename T>
......
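After this change a test can synchronize with the device in two ways, sketched below: the queue-wide barrier used by Sync() above, or the per-op event wait carried by a StatsFuture:

  OpenCLRuntime::Global()->command_queue().finish();  // barrier: waits for all enqueued kernels
  future.wait_fn(nullptr);                            // waits only for the op that filled `future`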
......@@ -23,11 +23,11 @@ class PoolingOp : public ConvPool2dOpBase<D, T> {
functor_(pooling_type_, kernels_.data(), this->strides_.data(),
this->padding_, this->dilations_.data()){};
bool Run() override {
bool Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
Tensor *output = this->Output(OUTPUT);
functor_(input, output);
functor_(input, output, future);
return true;
};
......
......@@ -15,7 +15,7 @@ class PoolingOpTest : public OpsTestBase {};
TEST_F(PoolingOpTest, MAX_VALID) {
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
......@@ -45,7 +45,7 @@ TEST_F(PoolingOpTest, MAX_VALID) {
TEST_F(PoolingOpTest, MAX_SAME) {
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
......@@ -71,7 +71,7 @@ TEST_F(PoolingOpTest, MAX_SAME) {
TEST_F(PoolingOpTest, MAX_VALID_DILATION) {
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
......@@ -98,7 +98,7 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) {
TEST_F(PoolingOpTest, MAX_k2x2s2x2) {
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
......@@ -245,7 +245,7 @@ TEST_F(PoolingOpTest, OPENCLUnalignedMaxPooling3S2) {
TEST_F(PoolingOpTest, AVG_VALID) {
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
......
......@@ -18,12 +18,12 @@ class ReluOp : public Operator<D, T> {
functor_.max_limit_ =
OperatorBase::GetSingleArgument<float>("max_limit", static_cast<float>(-1));
}
bool Run() override {
bool Run(StatsFuture *future) override {
const Tensor *input_tensor = this->inputs_[0];
Tensor *output_tensor = this->outputs_[0];
output_tensor->ResizeLike(input_tensor);
functor_(input_tensor, output_tensor);
functor_(input_tensor, output_tensor, future);
return true;
}
......
......@@ -53,9 +53,11 @@ TEST_F(ReluOpTest, CPUSimple) {
TestSimple<DeviceType::CPU>();
}
#if __ARM_NEON
TEST_F(ReluOpTest, NEONSimple) {
TestSimple<DeviceType::NEON>();
}
#endif
TEST_F(ReluOpTest, OPENCLSimple) {
TestSimple<DeviceType::OPENCL>();
......@@ -103,9 +105,11 @@ TEST_F(ReluOpTest, CPUUnalignedSimple) {
TestUnalignedSimple<DeviceType::CPU>();
}
#if __ARM_NEON
TEST_F(ReluOpTest, NEONUnalignedSimple) {
TestUnalignedSimple<DeviceType::NEON>();
}
#endif
TEST_F(ReluOpTest, OPENCLUnalignedSimple) {
TestUnalignedSimple<DeviceType::OPENCL>();
......@@ -157,9 +161,11 @@ TEST_F(ReluOpTest, CPUSimpleReluX) {
TestSimpleReluX<DeviceType::CPU>();
}
#if __ARM_NEON
TEST_F(ReluOpTest, NEONSimpleReluX) {
TestSimpleReluX<DeviceType::NEON>();
}
#endif
TEST_F(ReluOpTest, OPENCLSimpleReluX) {
TestSimpleReluX<DeviceType::OPENCL>();
......@@ -209,9 +215,11 @@ TEST_F(ReluOpTest, CPUUnalignedSimpleReluX) {
TestUnalignedSimpleReluX<DeviceType::CPU>();
}
#if __ARM_NEON
TEST_F(ReluOpTest, NEONUnalignedSimpleReluX) {
TestUnalignedSimpleReluX<DeviceType::NEON>();
}
#endif
TEST_F(ReluOpTest, OPENCLUnalignedSimpleReluX) {
TestUnalignedSimpleReluX<DeviceType::OPENCL>();
......
......@@ -19,14 +19,14 @@ class ResizeBilinearOp : public Operator<D, T> {
OperatorBase::GetRepeatedArgument<index_t>("size", {-1, -1}),
OperatorBase::GetSingleArgument<bool>("align_corners", false)) {}
bool Run() override {
bool Run(StatsFuture *future) override {
const Tensor *input = this->Input(0);
Tensor *output = this->Output(0);
MACE_CHECK(input->dim_size() == 4, "input must be 4-dimensional.",
input->dim_size());
functor_(input, output);
functor_(input, output, future);
return true;
}
......
......@@ -73,7 +73,6 @@ static void ResizeBilinearBenchmark(int iters,
BM_RESIZE_BILINEAR_MACRO(N, C, H0, W0, H1, W1, TYPE, OPENCL);
// SNPE 835 GPU: 6870us
BM_RESIZE_BILINEAR(1, 128, 120, 120, 480, 480, half);
BM_RESIZE_BILINEAR(1, 128, 120, 120, 480, 480, float);
BM_RESIZE_BILINEAR(1, 256, 7, 7, 15, 15, float);
......
......@@ -13,7 +13,7 @@ class ResizeBilinearTest : public OpsTestBase {};
TEST_F(ResizeBilinearTest, CPUResizeBilinearWOAlignCorners) {
testing::internal::LogToStderr();
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
.Input("Input")
.Output("Output")
......@@ -37,7 +37,7 @@ TEST_F(ResizeBilinearTest, CPUResizeBilinearWOAlignCorners) {
TEST_F(ResizeBilinearTest, ResizeBilinearWAlignCorners) {
testing::internal::LogToStderr();
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
.Input("Input")
.Output("Output")
......
......@@ -52,14 +52,14 @@ class SpaceToBatchNDOp : public Operator<D, T> {
SpaceToBatchNDOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws) {}
bool Run() override {
bool Run(StatsFuture *future) override {
const Tensor *input_tensor = this->Input(INPUT);
const Tensor *block_shape_tensor = this->Input(BLOCK_SHAPE);
const Tensor *paddings_tensor = this->Input(PADDINGS);
Tensor *output = this->Output(OUTPUT);
SpaceToBatchHelper(input_tensor, block_shape_tensor, paddings_tensor, output);
functor_(const_cast<Tensor*>(input_tensor), block_shape_tensor, paddings_tensor, output);
functor_(const_cast<Tensor*>(input_tensor), block_shape_tensor, paddings_tensor, output, future);
return true;
}
......
......@@ -10,16 +10,6 @@ licenses(["notice"]) # Apache 2.0
load("@com_google_protobuf//:protobuf.bzl", "py_proto_library")
proto_library(
name = "stats",
srcs = ["stats.proto"],
)
cc_proto_library(
name = "stats_proto",
deps = [":stats"],
)
py_proto_library(
name = "mace_py",
srcs = ["mace.proto"],
......
syntax = "proto2";
package mace;
message OperatorStats {
optional string operator_name = 1;
optional string type = 2;
optional int64 all_start_micros = 3;
optional int64 op_start_rel_micros = 4;
optional int64 op_end_rel_micros = 5;
optional int64 all_end_rel_micros = 6;
};
message RunMetadata {
repeated OperatorStats op_stats = 1;
}
......@@ -43,12 +43,3 @@ py_binary(
"//mace/proto:mace_py",
],
)
py_binary(
name = "tf_ops_stats",
srcs = ["tf_ops_stats.py"],
srcs_version = "PY2AND3",
deps = [
"@six_archive//:six",
],
)
......@@ -7,15 +7,10 @@ cc_library(
name = "stat_summarizer",
srcs = ["stat_summarizer.cc"],
hdrs = ["stat_summarizer.h"],
copts = ["-std=c++11"],
linkopts = ["-fopenmp"] + if_android([
"-ldl",
"-lm",
]),
copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
linkstatic = 1,
deps = [
"//mace/core",
"//mace/proto:stats_proto",
],
)
......@@ -24,8 +19,7 @@ cc_binary(
srcs = [
"benchmark_model.cc",
],
copts = ["-std=c++11"],
linkopts = ["-fopenmp"] + if_android(["-ldl"]),
copts = ["-std=c++11", "-D_GLIBCXX_USE_C99_MATH_TR1"],
linkstatic = 1,
deps = [
":stat_summarizer",
......
......@@ -4,7 +4,7 @@
#include "mace/tools/benchmark/stat_summarizer.h"
#include "mace/core/common.h"
#include "mace/proto/stats.pb.h"
#include "mace/core/operator.h"
#include <iomanip>
#include <queue>
......@@ -26,20 +26,21 @@ void StatSummarizer::ProcessMetadata(const RunMetadata &run_metadata) {
int64_t curr_total_us = 0;
int64_t mem_total = 0;
int64_t first_node_start_us = run_metadata.op_stats(0).all_start_micros();
MACE_CHECK(!run_metadata.op_stats.empty());
int64_t first_node_start_us = run_metadata.op_stats[0].stats.start_micros;
int node_num = 0;
for (const auto &ops : run_metadata.op_stats()) {
std::string name = ops.operator_name();
std::string op_type = ops.type();
for (const auto &ops : run_metadata.op_stats) {
std::string name = ops.operator_name;
std::string op_type = ops.type;
++node_num;
const int64_t curr_time = ops.all_end_rel_micros();
const int64_t curr_time = ops.stats.end_micros - ops.stats.start_micros;
curr_total_us += curr_time;
auto result = details_.emplace(name, Detail());
Detail *detail = &(result.first->second);
detail->start_us.UpdateStat(ops.all_start_micros() - first_node_start_us);
detail->start_us.UpdateStat(ops.stats.start_micros - first_node_start_us);
detail->rel_end_us.UpdateStat(curr_time);
// If this is the first pass, initialize some values.
......
......@@ -10,36 +10,43 @@ licenses(["notice"]) # Apache 2.0
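The summarizer now consumes the plain structs from mace/core/future.h instead of the deleted stats.proto. A sketch of how a benchmark loop could fill RunMetadata before calling ProcessMetadata (the per-op name/type accessors and the net_ops container are hypothetical):

  RunMetadata metadata;
  for (auto &op : net_ops) {
    StatsFuture future;
    op->Run(&future);
    OperatorStats op_stats;
    op_stats.operator_name = op->debug_def().name();  // hypothetical accessor
    op_stats.type = op->debug_def().type();           // hypothetical accessor
    future.wait_fn(&op_stats.stats);                  // fills start/end micros
    metadata.op_stats.push_back(op_stats);
  }
  stat_summarizer.ProcessMetadata(metadata);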
load("//mace:mace.bzl", "if_android")
cc_library(
name = "command_line_flags",
name = "logging",
srcs = [
"command_line_flags.cc",
"logging.cc",
],
hdrs = [
"command_line_flags.h",
"logging.h",
],
copts = ["-std=c++11"],
deps = [
"//mace/core",
],
linkopts = if_android([
"-llog",
]),
)
cc_library(
name = "utils",
name = "command_line_flags",
srcs = [
"command_line_flags.cc",
],
hdrs = [
"utils.h",
"command_line_flags.h",
],
copts = ["-std=c++11"],
deps = [
":logging",
],
)
cc_library(
name = "tuner",
hdrs = [
"tuner.h",
"timer.h",
],
copts = ["-std=c++11"],
deps = [
"//mace/core",
"//mace/core:opencl_runtime",
":utils_hdrs",
":logging",
],
)
......@@ -50,7 +57,7 @@ cc_test(
"tuner_test.cc",
],
copts = ["-std=c++11"],
linkopts = if_android(["-lm", "-ldl"]),
linkopts = if_android(["-pie", "-lm"]),
linkstatic = 1,
deps = [
":tuner",
......@@ -58,3 +65,22 @@ cc_test(
"@gtest//:gtest_main",
],
)
cc_library(
name = "utils_hdrs",
hdrs = [
"utils.h",
"env_time.h",
],
copts = ["-std=c++11"],
)
cc_library(
name = "utils",
deps = [
":utils_hdrs",
":tuner",
":command_line_flags",
":logging",
],
)
(7 more file diffs collapsed)