Merge branch 'vlog' into 'master'

Refactor logging and header dependencies. See merge request !247

Merge branch 'vlog' into 'master'
Refactor logging and header dependencies. See merge request !247
751fd049 · 刘琦 · ef68d37c · 05340dcb · 751fd049 · 751fd049
63 changed files
--- a/mace/core/BUILD
+++ b/mace/core/BUILD
@@ -37,26 +37,23 @@ cc_library(
    ]),
    hdrs = glob([
        "*.h",
-        "public/*.h",
        "runtime/opencl/cl2.hpp",
        "runtime/opencl/*.h",
        "runtime/hexagon/*.h",
    ]),
-    linkopts = ["-ldl",] + if_android([
+    linkopts = ["-ldl"] + if_android([
        "-pie",
        "-lm",
    ]),
    deps = [
        ":opencl_headers",
-        "//mace/utils:logging",
+        "//mace/utils",
-        "//mace/utils:tuner",
-        "//mace/utils:utils_hdrs",
        "//mace/codegen:generated_version",
    ] + if_production_mode([
-        "//mace/utils:tuner_prod",
+        "//mace/utils:utils_prod",
        "//mace/core:opencl_prod",
    ]) + if_not_production_mode([
-        "//mace/utils:tuner_dev",
+        "//mace/utils:utils_dev",
        "//mace/core:opencl_dev",
    ]),
 )
@@ -84,7 +81,7 @@ cc_library(
    ],
    deps = [
        ":core",
-        "//mace/utils:utils_hdrs",
+        "//mace/utils",
    ],
    alwayslink = 1,
 )
@@ -96,8 +93,7 @@ cc_library(
    deps = [
        ":opencl_headers",
        "//mace/codegen:generated_opencl_dev",
-        "//mace/utils:logging",
+        "//mace/utils",
-        "//mace/utils:utils_hdrs",
    ],
 )
@@ -108,6 +104,6 @@ cc_library(
    deps = [
        ":opencl_headers",
        "//mace/codegen:generated_opencl_prod",
-        "//mace/utils:logging",
+        "//mace/utils",
    ],
 )
--- a/mace/core/allocator.h
+++ b/mace/core/allocator.h
@@ -7,9 +7,9 @@
 #define MACE_CORE_ALLOCATOR_H_
 #include <malloc.h>
-#include "mace/core/common.h"
 #include "mace/core/registry.h"
-#include "mace/core/public/mace.h"
+#include "mace/public/mace.h"
 #include "mace/core/types.h"
 namespace mace {

--- a/mace/core/arg_helper.cc
+++ b/mace/core/arg_helper.cc
@@ -3,6 +3,7 @@
 //
 #include "mace/core/arg_helper.h"
+#include "mace/utils/logging.h"
 namespace mace {
@@ -43,7 +44,7 @@ bool SupportsLosslessConversion(const InputType &value) {
  T ArgumentHelper::GetSingleArgument<T>(const string &name,                  \
                                         const T &default_value) const {      \
    if (arg_map_.count(name) == 0) {                                          \
-      VLOG(1) << "Using default parameter value " << default_value            \
+      VLOG(3) << "Using default parameter value " << default_value            \
              << " for parameter " << name;                                   \
      return default_value;                                                   \
    }                                                                         \
@@ -82,12 +83,12 @@ INSTANTIATE_GET_SINGLE_ARGUMENT(string, s, false)
 #define INSTANTIATE_GET_REPEATED_ARGUMENT(T, fieldname,                   \
                                          enforce_lossless_conversion)    \
  template <>                                                             \
-  vector<T> ArgumentHelper::GetRepeatedArgument<T>(                       \
+  std::vector<T> ArgumentHelper::GetRepeatedArgument<T>(                       \
      const string &name, const std::vector<T> &default_value) const {    \
    if (arg_map_.count(name) == 0) {                                      \
      return default_value;                                               \
    }                                                                     \
-    vector<T> values;                                                     \
+    std::vector<T> values;                                                     \
    for (const auto &v : arg_map_.at(name).fieldname()) {                 \
      if (enforce_lossless_conversion) {                                  \
        auto supportsConversion =                                         \

--- a/mace/core/arg_helper.h
+++ b/mace/core/arg_helper.h
@@ -7,8 +7,7 @@
 #include <map>
-#include "mace/core/common.h"
+#include "mace/public/mace.h"
-#include "mace/core/public/mace.h"
 namespace mace {
@@ -42,7 +41,7 @@ class ArgumentHelper {
  }
  template <typename Def, typename T>
-  static vector<T> GetRepeatedArgument(
+  static std::vector<T> GetRepeatedArgument(
      const Def &def,
      const string &name,
      const std::vector<T> &default_value = std::vector<T>()) {
@@ -58,7 +57,7 @@ class ArgumentHelper {
  template <typename T>
  bool HasSingleArgumentOfType(const string &name) const;
  template <typename T>
-  vector<T> GetRepeatedArgument(
+  std::vector<T> GetRepeatedArgument(
      const string &name,
      const std::vector<T> &default_value = std::vector<T>()) const;

--- a/mace/core/common.h
+++ b/mace/core/common.h
-//
-// Copyright (c) 2017 XiaoMi All rights reserved.
-//
-#ifndef MACE_CORE_COMMON_H_
-#define MACE_CORE_COMMON_H_
-#include <algorithm>
-#include <map>
-#include <memory>
-#include <set>
-#include <string>
-#include <vector>
-#include "mace/utils/logging.h"
-using std::set;
-using std::map;
-using std::string;
-using std::unique_ptr;
-using std::vector;
-typedef int64_t index_t;
-// Disable the copy and assignment operator for a class.
-#ifndef DISABLE_COPY_AND_ASSIGN
-#define DISABLE_COPY_AND_ASSIGN(classname) \
- private:                                  \
-  classname(const classname &) = delete;   \
-  classname &operator=(const classname &) = delete
-#endif
-#define MACE_NOT_IMPLEMENTED MACE_CHECK(false, "not implemented")
-#endif  // MACE_CORE_COMMON_H_
--- a/mace/core/mace.cc
+++ b/mace/core/mace.cc
@@ -2,7 +2,7 @@
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //
-#include "mace/core/public/mace.h"
+#include "mace/public/mace.h"
 #include "mace/core/types.h"
 #include "mace/core/net.h"
 #include "mace/core/runtime/hexagon/hexagon_control_wrapper.h"
@@ -524,7 +524,7 @@ MaceEngine::MaceEngine(const NetDef *net_def, DeviceType device_type) :
    MACE_CHECK(hexagon_controller_->Config(), "hexagon config error");
    MACE_CHECK(hexagon_controller_->Init(), "hexagon init error");
    hexagon_controller_->SetDebugLevel(
-      static_cast<int>(mace::internal::LogMessage::MinVLogLevel()));
+      static_cast<int>(mace::logging::LogMessage::MinVLogLevel()));
    int dsp_mode = ArgumentHelper::GetSingleArgument<NetDef, int>(
        *net_def, "dsp_mode", 0);
    hexagon_controller_->SetGraphMode(dsp_mode);

--- a/mace/core/net.cc
+++ b/mace/core/net.cc
@@ -4,6 +4,7 @@
 #include "mace/core/net.h"
 #include "mace/utils/utils.h"
+#include "mace/utils/timer.h"
 #include "mace/utils/memory_logging.h"
 namespace mace {
@@ -24,11 +25,11 @@ SimpleNet::SimpleNet(const std::shared_ptr<const OperatorRegistry> op_registry,
  VLOG(1) << "Constructing SimpleNet " << net_def->name();
  for (int idx = 0; idx < net_def->op_size(); ++idx) {
    const auto &operator_def = net_def->op(idx);
-    VLOG(1) << "Creating operator " << operator_def.name() << ":"
+    VLOG(3) << "Creating operator " << operator_def.name() << "("
-            << operator_def.type();
+            << operator_def.type() << ")";
-    std::unique_ptr<OperatorBase> op{nullptr};
    OperatorDef temp_def(operator_def);
-    op = op_registry->CreateOperator(temp_def, ws, type, mode);
+    std::unique_ptr<OperatorBase> op(
+        op_registry->CreateOperator(temp_def, ws, type, mode));
    if (op) {
      operators_.emplace_back(std::move(op));
    }
@@ -37,14 +38,16 @@ SimpleNet::SimpleNet(const std::shared_ptr<const OperatorRegistry> op_registry,
 bool SimpleNet::Run(RunMetadata *run_metadata) {
  MACE_MEMORY_LOGGING_GUARD();
-  VLOG(1) << "Running net " << name_;
+  MACE_LATENCY_LOGGER(1, "Running net");
  for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) {
+    auto &op = *iter;
+    VLOG(3) << "Running operator " << op->debug_def().name() << "("
+            << op->debug_def().type() << ").";
+    MACE_LATENCY_LOGGER(2, "Running operator ", op->debug_def().name());
    bool future_wait = (device_type_ == DeviceType::OPENCL &&
                        (run_metadata != nullptr ||
                         std::distance(iter, operators_.end()) == 1));
-    auto &op = *iter;
-    VLOG(1) << "Running operator " << op->debug_def().name() << "("
-            << op->debug_def().type() << ").";
    bool ret;
    CallStats call_stats;
@@ -57,9 +60,9 @@ bool SimpleNet::Run(RunMetadata *run_metadata) {
        future.wait_fn(nullptr);
      }
    } else if (run_metadata != nullptr) {
-      call_stats.start_micros = NowInMicroSec();
+      call_stats.start_micros = NowMicros();
      ret = op->Run(nullptr);
-      call_stats.end_micros = NowInMicroSec();
+      call_stats.end_micros = NowMicros();
    } else {
      ret = op->Run(nullptr);
    }
@@ -75,8 +78,8 @@ bool SimpleNet::Run(RunMetadata *run_metadata) {
      return false;
    }
-    VLOG(1) << "Op " << op->debug_def().name()
+    VLOG(3) << "Operator " << op->debug_def().name()
-            << " has shape: " << internal::MakeString(op->Output(0)->shape());
+            << " has shape: " << MakeString(op->Output(0)->shape());
  }
  return true;
@@ -98,8 +101,8 @@ std::unique_ptr<NetBase> CreateNet(
    Workspace *ws,
    DeviceType type,
    const NetMode mode) {
-  unique_ptr<NetBase> net(new SimpleNet(op_registry, net_def, ws, type, mode));
+  std::unique_ptr<NetBase> net(new SimpleNet(op_registry, net_def, ws, type, mode));
  return net;
 }
-}  //  namespace mace
+}  // namespace mace
--- a/mace/core/net.h
+++ b/mace/core/net.h
@@ -5,9 +5,8 @@
 #ifndef MACE_CORE_NET_H_
 #define MACE_CORE_NET_H_
-#include "mace/core/common.h"
 #include "mace/core/operator.h"
-#include "mace/core/public/mace.h"
+#include "mace/public/mace.h"
 namespace mace {
@@ -45,7 +44,7 @@ class SimpleNet : public NetBase {
  bool Run(RunMetadata *run_metadata = nullptr) override;
 protected:
-  vector<unique_ptr<OperatorBase> > operators_;
+  std::vector<std::unique_ptr<OperatorBase> > operators_;
  DeviceType device_type_;
  DISABLE_COPY_AND_ASSIGN(SimpleNet);

--- a/mace/core/operator.h
+++ b/mace/core/operator.h
@@ -6,9 +6,8 @@
 #define MACE_CORE_OPERATOR_H
 #include "mace/core/arg_helper.h"
-#include "mace/core/common.h"
 #include "mace/core/future.h"
-#include "mace/core/public/mace.h"
+#include "mace/public/mace.h"
 #include "mace/core/registry.h"
 #include "mace/core/tensor.h"
 #include "mace/core/workspace.h"
@@ -37,8 +36,8 @@ class OperatorBase {
        *operator_def_, name);
  }
  template <typename T>
-  inline vector<T> GetRepeatedArgument(
+  inline std::vector<T> GetRepeatedArgument(
-      const string &name, const vector<T> &default_value = {}) const {
+      const string &name, const std::vector<T> &default_value = {}) const {
    MACE_CHECK(operator_def_, "operator_def was null!");
    return ArgumentHelper::GetRepeatedArgument<OperatorDef, T>(
        *operator_def_, name, default_value);
@@ -53,8 +52,8 @@ class OperatorBase {
  inline int InputSize() { return inputs_.size(); }
  inline int OutputSize() { return outputs_.size(); }
-  inline const vector<const Tensor *> &Inputs() const { return inputs_; }
+  inline const std::vector<const Tensor *> &Inputs() const { return inputs_; }
-  inline const vector<Tensor *> &Outputs() { return outputs_; }
+  inline const std::vector<Tensor *> &Outputs() { return outputs_; }
  // Run Op asynchronously (depends on device), return a future if not nullptr.
  virtual bool Run(StatsFuture *future) = 0;
@@ -74,8 +73,8 @@ class OperatorBase {
 protected:
  Workspace *operator_ws_;
  std::shared_ptr<const OperatorDef> operator_def_;
-  vector<const Tensor *> inputs_;
+  std::vector<const Tensor *> inputs_;
-  vector<Tensor *> outputs_;
+  std::vector<Tensor *> outputs_;
  DISABLE_COPY_AND_ASSIGN(OperatorBase);
 };

--- a/mace/core/registry.h
+++ b/mace/core/registry.h
@@ -5,7 +5,14 @@
 #ifndef MACE_CORE_REGISTRY_H_
 #define MACE_CORE_REGISTRY_H_
+#include <map>
+#include <memory>
 #include <mutex>
+#include <string>
+#include <vector>
+#include "mace/public/mace.h"
+#include "mace/utils/logging.h"
 namespace mace {
@@ -17,7 +24,7 @@ class Registry {
  Registry() : registry_() {}
  void Register(const SrcType &key, Creator creator) {
-    VLOG(2) << "Registering: " << key;
+    VLOG(3) << "Registering: " << key;
    std::lock_guard<std::mutex> lock(register_mutex_);
    MACE_CHECK(registry_.count(key) == 0, "Key already registered: ", key);
    registry_[key] = creator;
@@ -27,7 +34,7 @@ class Registry {
    return registry_.count(key) != 0;
  }
-  unique_ptr<ObjectType> Create(const SrcType &key, Args... args) const {
+  std::unique_ptr<ObjectType> Create(const SrcType &key, Args... args) const {
    if (registry_.count(key) == 0) {
      LOG(FATAL) << "Key not registered: " << key;
    }
@@ -37,8 +44,8 @@ class Registry {
  /**
   * Returns the keys currently registered as a vector.
   */
-  vector<SrcType> Keys() const {
+  std::vector<SrcType> Keys() const {
-    vector<SrcType> keys;
+    std::vector<SrcType> keys;
    for (const auto &it : registry_) {
      keys.push_back(it.first);
    }
@@ -62,7 +69,7 @@ class Registerer {
  }
  template <class DerivedType>
-  static unique_ptr<ObjectType> DefaultCreator(Args... args) {
+  static std::unique_ptr<ObjectType> DefaultCreator(Args... args) {
    return std::unique_ptr<ObjectType>(new DerivedType(args...));
  }
 };

--- a/mace/core/runtime/hexagon/hexagon_control_wrapper.cc
+++ b/mace/core/runtime/hexagon/hexagon_control_wrapper.cc
@@ -2,11 +2,13 @@
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //
-#include "mace/core/runtime/hexagon/hexagon_control_wrapper.h"
+#include <vector>
-#include "mace/core/runtime/hexagon/hexagon_nn_ops.h"
 #include <thread>
 #include <sys/time.h>
+#include "mace/core/runtime/hexagon/hexagon_control_wrapper.h"
+#include "mace/core/runtime/hexagon/hexagon_nn_ops.h"
 namespace {
  inline int64_t NowMicros() {
    struct timeval tv;
@@ -61,9 +63,9 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
  // const node
  std::thread const_thread([&]() {
    std::cout << "thread function\n";
-    vector<hexagon_nn_const_node> const_node_list;
+    std::vector<hexagon_nn_const_node> const_node_list;
    for (const ConstTensor &tensor_proto: net_def.tensors()) {
-      vector<int> tensor_shape(tensor_proto.dims().begin(),
+      std::vector<int> tensor_shape(tensor_proto.dims().begin(),
                               tensor_proto.dims().end());
      while (tensor_shape.size() < 4) {
        tensor_shape.insert(tensor_shape.begin(), 1);
@@ -110,11 +112,11 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
  std::thread op_thread([&]() {
    OpMap op_map;
    op_map.Init();
-    vector<hexagon_nn_op_node> op_node_list;
+    std::vector<hexagon_nn_op_node> op_node_list;
-    vector<vector<hexagon_nn_input>> cached_inputs;
+    std::vector<std::vector<hexagon_nn_input>> cached_inputs;
-    vector<vector<hexagon_nn_output>> cached_outputs;
+    std::vector<std::vector<hexagon_nn_output>> cached_outputs;
-    vector<hexagon_nn_input> inputs;
+    std::vector<hexagon_nn_input> inputs;
-    vector<hexagon_nn_output> outputs;
+    std::vector<hexagon_nn_output> outputs;
    for (const OperatorDef &op: net_def.op()) {
      int op_id = op_map.GetOpId(op.type());
@@ -172,7 +174,7 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
  // input info
  num_inputs_ = 0;
  for (const InputInfo &input_info: net_def.input_info()) {
-    vector<index_t> input_shape;
+    std::vector<index_t> input_shape;
    input_shape.insert(input_shape.begin(),
                       input_info.dims().begin(), input_info.dims().end());
    while (input_shape.size() < 4) {
@@ -186,7 +188,7 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
  // output info
  num_outputs_ = 0;
  for (const OutputInfo &output_info: net_def.output_info()) {
-    vector<index_t> output_shape;
+    std::vector<index_t> output_shape;
    output_shape.insert(output_shape.begin(),
                        output_info.dims().begin(), output_info.dims().end());
    while (output_shape.size() < 4) {
@@ -207,7 +209,7 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
  int64_t t2 = NowMicros();
-  VLOG(0) << "Setup time: " << t1 - t0 << " " << t2 - t1;
+  VLOG(1) << "Setup time: " << t1 - t0 << " " << t2 - t1;
  return res == 0;
 }
@@ -225,7 +227,7 @@ void HexagonControlWrapper::PrintLog() {
  MACE_CHECK(hexagon_nn_getlog(nn_id_,
                               reinterpret_cast<unsigned char *>(buf),
                               PRINT_BUFSIZE) == 0, "print log error");
-  LOG(INFO) << string(buf);
+  LOG(INFO) << std::string(buf);
  delete[]buf;
 }
@@ -236,7 +238,7 @@ void HexagonControlWrapper::PrintGraph() {
  MACE_CHECK(hexagon_nn_snpprint(nn_id_,
                                 reinterpret_cast<unsigned char *>(buf),
                                 PRINT_BUFSIZE) == 0, "print graph error");
-  LOG(INFO) << string(buf);
+  LOG(INFO) << std::string(buf);
  delete[]buf;
 }
@@ -253,7 +255,7 @@ void HexagonControlWrapper::SetGraphMode(int mode) {
 void HexagonControlWrapper::GetPerfInfo() {
  LOG(INFO) << "Get perf info";
-  vector<hexagon_nn_perfinfo> perf_info(MAX_NODE);
+  std::vector<hexagon_nn_perfinfo> perf_info(MAX_NODE);
  unsigned int n_items = 0;
  MACE_CHECK(
    hexagon_nn_get_perfinfo(nn_id_, perf_info.data(), MAX_NODE, &n_items) == 0,
@@ -263,7 +265,7 @@ void HexagonControlWrapper::GetPerfInfo() {
  std::unordered_map<std::string, std::pair<int, float>> node_type_counters;
  float total_duration = 0.0;
-  VLOG(0) << "items: " << n_items;
+  VLOG(1) << "items: " << n_items;
  for (int i = 0; i < n_items; ++i) {
    unsigned int node_id = perf_info[i].node_id;
    unsigned int node_type_id = perf_info[i].node_type;
@@ -309,7 +311,7 @@ bool HexagonControlWrapper::ExecuteGraph(const Tensor &input_tensor,
  MACE_ASSERT(num_outputs_ == 1, "Wrong outputs num");
  output_tensor->SetDtype(output_data_types_[0]);
  output_tensor->Resize(output_shapes_[0]);
-  vector<uint32_t> output_shape(4);
+  std::vector<uint32_t> output_shape(4);
  uint32_t output_bytes;
  int res = hexagon_nn_execute(nn_id_,
                               input_tensor.shape()[0],
@@ -336,8 +338,8 @@ bool HexagonControlWrapper::ExecuteGraph(const Tensor &input_tensor,
  return res == 0;
 };
-bool HexagonControlWrapper::ExecuteGraphNew(const vector<Tensor> &input_tensors,
+bool HexagonControlWrapper::ExecuteGraphNew(const std::vector<Tensor> &input_tensors,
-                                            vector<Tensor> *output_tensors) {
+                                            std::vector<Tensor> *output_tensors) {
  LOG(INFO) << "Execute graph new: " << nn_id_;
  int num_inputs = input_tensors.size();
  int num_outputs = output_tensors->size();
@@ -348,7 +350,7 @@ bool HexagonControlWrapper::ExecuteGraphNew(const vector<Tensor> &input_tensors,
  hexagon_nn_tensordef *outputs = new hexagon_nn_tensordef[num_outputs];
  for (int i = 0; i < num_inputs; ++i) {
-    vector<index_t> input_shape = input_tensors[i].shape();
+    std::vector<index_t> input_shape = input_tensors[i].shape();
    inputs[i].batches = input_shape[0];
    inputs[i].height = input_shape[1];
    inputs[i].width = input_shape[2];
@@ -372,7 +374,7 @@ bool HexagonControlWrapper::ExecuteGraphNew(const vector<Tensor> &input_tensors,
                                   outputs, num_outputs);
  for (int i = 0; i < num_outputs; ++i) {
-    vector<uint32_t> output_shape{outputs[i].batches, outputs[i].height,
+    std::vector<uint32_t> output_shape{outputs[i].batches, outputs[i].height,
                                  outputs[i].width, outputs[i].depth};
    MACE_ASSERT(output_shape == output_shapes_[i],
                "wrong output shape inferred");
@@ -387,8 +389,8 @@ bool HexagonControlWrapper::ExecuteGraphNew(const vector<Tensor> &input_tensors,
 bool HexagonControlWrapper::ExecuteGraphPreQuantize(const Tensor &input_tensor,
                                                    Tensor *output_tensor) {
-  vector<Tensor> input_tensors(3);
+  std::vector<Tensor> input_tensors(3);
-  vector<Tensor> output_tensors(3);
+  std::vector<Tensor> output_tensors(3);
  input_tensors[0].SetDtype(DT_UINT8);
  output_tensors[0].SetDtype(DT_UINT8);
  input_tensors[0].ResizeLike(input_tensor);

--- a/mace/core/runtime/hexagon/hexagon_control_wrapper.h
+++ b/mace/core/runtime/hexagon/hexagon_control_wrapper.h
@@ -5,11 +5,12 @@
 #ifndef MACE_DSP_HEXAGON_CONTROL_WRAPPER_H_
 #define MACE_DSP_HEXAGON_CONTROL_WRAPPER_H_
+#include <vector>
 #include "mace/core/runtime/hexagon/hexagon_controller.h"
 #include "mace/core/runtime/hexagon/quantize.h"
-#include "mace/core/common.h"
 #include "mace/core/tensor.h"
-#include "mace/core/public/mace.h"
+#include "mace/public/mace.h"
 #include "mace/core/serializer.h"
 namespace mace {
@@ -23,8 +24,8 @@ class HexagonControlWrapper {
  bool Finalize();
  bool SetupGraph(const NetDef& net_def);
  bool ExecuteGraph(const Tensor &input_tensor, Tensor *output_tensor);
-  bool ExecuteGraphNew(const vector<Tensor>& input_tensors,
+  bool ExecuteGraphNew(const std::vector<Tensor>& input_tensors,
-                       vector<Tensor> *output_tensors);
+                       std::vector<Tensor> *output_tensors);
  bool ExecuteGraphPreQuantize(const Tensor &input_tensor, Tensor *output_tensor);
  bool TeardownGraph();
@@ -47,10 +48,10 @@ class HexagonControlWrapper {
  Serializer serializer_;
  Quantizer quantizer_;
-  vector<vector<index_t>> input_shapes_;
+  std::vector<std::vector<index_t>> input_shapes_;
-  vector<vector<index_t>> output_shapes_;
+  std::vector<std::vector<index_t>> output_shapes_;
-  vector<DataType> input_data_types_;
+  std::vector<DataType> input_data_types_;
-  vector<DataType> output_data_types_;
+  std::vector<DataType> output_data_types_;
  uint32_t num_inputs_;
  uint32_t num_outputs_;

--- a/mace/core/runtime/hexagon/quantize.h
+++ b/mace/core/runtime/hexagon/quantize.h
@@ -5,7 +5,6 @@
 #ifndef MACE_DSP_UTIL_QUANTIZE_H_
 #define MACE_DSP_UTIL_QUANTIZE_H_
-#include "mace/core/common.h"
 #include "mace/core/tensor.h"
 namespace mace {

--- a/mace/core/runtime/opencl/opencl_preallocated_pooled_allocator.cc
+++ b/mace/core/runtime/opencl/opencl_preallocated_pooled_allocator.cc
@@ -18,7 +18,7 @@ void OpenCLPreallocatedPooledAllocator::PreallocateImage(int mem_id,
                                                           size_t> &image_shape,
                                                         DataType data_type) {
  MACE_CHECK(!this->HasImage(mem_id), "Memory already exists: ", mem_id);
-  VLOG(3) << "Preallocate OpenCL image: " << mem_id << " "
+  VLOG(2) << "Preallocate OpenCL image: " << mem_id << " "
          << image_shape[0] << ", " << image_shape[1];
  images_[mem_id] = std::move(std::unique_ptr<void, std::function<void(void *)>>(
    allocator->NewImage(image_shape, data_type), [this](void *p) {
@@ -27,4 +27,4 @@ void OpenCLPreallocatedPooledAllocator::PreallocateImage(int mem_id,
  image_shapes_[mem_id] = image_shape;
 }
 } // namespace mace
\ No newline at end of file
--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -8,7 +8,7 @@
 #include <mutex>
 #include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/core/public/mace.h"
+#include "mace/public/mace.h"
 #include "mace/utils/tuner.h"
 namespace mace {

--- a/mace/core/serializer.cc
+++ b/mace/core/serializer.cc
@@ -6,17 +6,17 @@
 namespace mace {
-unique_ptr<ConstTensor> Serializer::Serialize(const Tensor &tensor,
+std::unique_ptr<ConstTensor> Serializer::Serialize(const Tensor &tensor,
-                                              const string &name) {
+                                                   const std::string &name) {
  MACE_NOT_IMPLEMENTED;
  return nullptr;
 }
-unique_ptr<Tensor> Serializer::Deserialize(const ConstTensor &proto,
+std::unique_ptr<Tensor> Serializer::Deserialize(const ConstTensor &proto,
-                                           DeviceType type) {
+                                                DeviceType type) {
-  unique_ptr<Tensor> tensor(
+  std::unique_ptr<Tensor> tensor(
      new Tensor(GetDeviceAllocator(type), proto.data_type()));
-  vector<index_t> dims;
+  std::vector<index_t> dims;
  for (const index_t d : proto.dims()) {
    dims.push_back(d);
  }

--- a/mace/core/serializer.h
+++ b/mace/core/serializer.h
@@ -5,9 +5,8 @@
 #ifndef MACE_CORE_SERIALIZER_H_
 #define MACE_CORE_SERIALIZER_H_
-#include "mace/core/common.h"
 #include "mace/core/tensor.h"
-#include "mace/core/public/mace.h"
+#include "mace/public/mace.h"
 namespace mace {
@@ -16,9 +15,9 @@ class Serializer {
  Serializer() {}
  ~Serializer() {}
-  unique_ptr<ConstTensor> Serialize(const Tensor &tensor, const string &name);
+  std::unique_ptr<ConstTensor> Serialize(const Tensor &tensor, const std::string &name);
-  unique_ptr<Tensor> Deserialize(const ConstTensor &proto, DeviceType type);
+  std::unique_ptr<Tensor> Deserialize(const ConstTensor &proto, DeviceType type);
  DISABLE_COPY_AND_ASSIGN(Serializer);
 };

--- a/mace/core/tensor.h
+++ b/mace/core/tensor.h
@@ -6,10 +6,9 @@
 #define MACE_CORE_TENSOR_H_
 #include "mace/core/allocator.h"
-#include "mace/core/common.h"
 #include "mace/utils/logging.h"
 #include "mace/core/types.h"
-#include "mace/core/public/mace.h"
+#include "mace/public/mace.h"
 #include "preallocated_pooled_allocator.h"
 namespace mace {
@@ -32,7 +31,7 @@ namespace mace {
    CASE(uint16_t, SINGLE_ARG(STMTS))                          \
    CASE(int16_t, SINGLE_ARG(STMTS))                           \
    CASE(int8_t, SINGLE_ARG(STMTS))                            \
-    CASE(string, SINGLE_ARG(STMTS))                            \
+    CASE(std::string, SINGLE_ARG(STMTS))                       \
    CASE(int64_t, SINGLE_ARG(STMTS))                           \
    CASE(bool, SINGLE_ARG(STMTS))                              \
    case DT_INVALID:                                           \
@@ -92,9 +91,9 @@ class Tensor {
  inline void SetDtype(DataType dtype) { dtype_ = dtype; }
-  inline const vector<index_t> &shape() const { return shape_; }
+  inline const std::vector<index_t> &shape() const { return shape_; }
-  inline const vector<size_t> &image_shape() const { return image_shape_; }
+  inline const std::vector<size_t> &image_shape() const { return image_shape_; }
  inline const bool is_image() const { return is_image_; }
@@ -174,7 +173,7 @@ class Tensor {
    return static_cast<T *>(raw_mutable_data());
  }
-  inline void Resize(const vector<index_t> &shape) {
+  inline void Resize(const std::vector<index_t> &shape) {
    MACE_CHECK(!is_image_ || buffer_ == nullptr,
               "Resize is not for image, use ResizeImage instead.");
    is_image_ = false;
@@ -194,7 +193,7 @@ class Tensor {
    }
  }
-  inline void ResizeImage(const vector<index_t> &shape,
+  inline void ResizeImage(const std::vector<index_t> &shape,
                          const std::vector<size_t> &image_shape) {
    MACE_CHECK(is_image_ || buffer_ == nullptr,
               "ResizeImage is not for buffer, use Resize instead.");
@@ -260,7 +259,7 @@ class Tensor {
  inline void CopyWithCast(const SrcType *src, size_t size) {
    MACE_CHECK(static_cast<index_t>(size) == size_,
               "copy src and dst with different size.");
-    unique_ptr<DstType[]> buffer(new DstType[size]);
+    std::unique_ptr<DstType[]> buffer(new DstType[size]);
    for (size_t i = 0; i < size; ++i) {
      buffer[i] = static_cast<DstType>(src[i]);
    }
@@ -335,7 +334,7 @@ class Tensor {
      if (tensor_ != nullptr) tensor_->Unmap();
    }
-    inline const vector<size_t> &mapped_image_pitch() const { return mapped_image_pitch_; }
+    inline const std::vector<size_t> &mapped_image_pitch() const { return mapped_image_pitch_; }
   private:
    const Tensor *tensor_;
@@ -358,7 +357,7 @@ class Tensor {
  std::unique_ptr<void, std::function<void(void*)>> buffer_;
  // Mapped buffer
  mutable void *data_;
-  vector<index_t> shape_;
+  std::vector<index_t> shape_;
  // Image for opencl
  bool unused_;
  bool is_image_;

--- a/mace/core/testing/test_benchmark.cc
+++ b/mace/core/testing/test_benchmark.cc
@@ -120,7 +120,7 @@ void Benchmark::Run(int arg1, int arg2, int *run_count, double *run_seconds) {
  int64_t iters = kMinIters;
  while (true) {
    accum_time = 0;
-    start_time = utils::NowMicros();
+    start_time = NowMicros();
    bytes_processed = -1;
    macc_processed = -1;
    label.clear();
@@ -152,11 +152,11 @@ void Benchmark::Run(int arg1, int arg2, int *run_count, double *run_seconds) {
 void BytesProcessed(int64_t n) { bytes_processed = n; }
 void MaccProcessed(int64_t n) { macc_processed = n; }
 void StartTiming() {
-  if (start_time == 0) start_time = utils::NowMicros();
+  if (start_time == 0) start_time = NowMicros();
 }
 void StopTiming() {
  if (start_time != 0) {
-    accum_time += (utils::NowMicros() - start_time);
+    accum_time += (NowMicros() - start_time);
    start_time = 0;
  }
 }

--- a/mace/core/types.cc
+++ b/mace/core/types.cc
@@ -2,7 +2,11 @@
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //
+#include <map>
+#include <cstdint>
 #include "mace/core/types.h"
+#include "mace/utils/logging.h"
 namespace mace {
@@ -70,4 +74,4 @@ size_t GetEnumTypeSize(const DataType dt) {
  }
 }
 }  //  namespace mace
\ No newline at end of file
--- a/mace/core/types.h
+++ b/mace/core/types.h
@@ -5,13 +5,15 @@
 #ifndef MACE_CORE_TYPES_H_
 #define MACE_CORE_TYPES_H_
-#include "mace/core/common.h"
+#include <cstdint>
-#include "mace/core/public/mace.h"
-#include "mace/core/half.h"
+#include "mace/core/half.h"
+#include "mace/public/mace.h"
 namespace mace {
+typedef int64_t index_t;
 using half = half_float::half;
 bool DataTypeCanUseMemcpy(DataType dt);
@@ -57,7 +59,7 @@ MATCH_TYPE_AND_ENUM(uint16_t, DT_UINT16);
 MATCH_TYPE_AND_ENUM(uint8_t, DT_UINT8);
 MATCH_TYPE_AND_ENUM(int16_t, DT_INT16);
 MATCH_TYPE_AND_ENUM(int8_t, DT_INT8);
-MATCH_TYPE_AND_ENUM(string, DT_STRING);
+MATCH_TYPE_AND_ENUM(std::string, DT_STRING);
 MATCH_TYPE_AND_ENUM(int64_t, DT_INT64);
 MATCH_TYPE_AND_ENUM(uint32_t, DT_UINT32);
 MATCH_TYPE_AND_ENUM(bool, DT_BOOL);

--- a/mace/core/workspace.cc
+++ b/mace/core/workspace.cc
@@ -2,44 +2,48 @@
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //
+#include <string>
+#include <vector>
 #include "mace/core/workspace.h"
 #include "mace/core/serializer.h"
 #include "mace/core/arg_helper.h"
 #include "mace/core/runtime/opencl/opencl_preallocated_pooled_allocator.h"
+#include "mace/utils/timer.h"
 namespace mace {
-vector<string> Workspace::Tensors() const {
+std::vector<std::string> Workspace::Tensors() const {
-  vector<string> names;
+  std::vector<std::string> names;
  for (auto &entry : tensor_map_) {
    names.push_back(entry.first);
  }
  return names;
 }
-Tensor *Workspace::CreateTensor(const string &name,
+Tensor *Workspace::CreateTensor(const std::string &name,
                                Allocator *alloc,
                                DataType type) {
  if (HasTensor(name)) {
-    VLOG(1) << "Tensor " << name << " already exists. Skipping.";
+    VLOG(3) << "Tensor " << name << " already exists. Skipping.";
  } else {
-    VLOG(1) << "Creating Tensor " << name;
+    VLOG(3) << "Creating Tensor " << name;
    tensor_map_[name] = std::move(std::unique_ptr<Tensor>(new Tensor(alloc, type)));
  }
  return GetTensor(name);
 }
-bool Workspace::RemoveTensor(const string &name) {
+bool Workspace::RemoveTensor(const std::string &name) {
  auto it = tensor_map_.find(name);
  if (it != tensor_map_.end()) {
-    VLOG(1) << "Removing blob " << name << " from this workspace.";
+    VLOG(3) << "Removing blob " << name << " from this workspace.";
    tensor_map_.erase(it);
    return true;
  }
  return false;
 }
-const Tensor *Workspace::GetTensor(const string &name) const {
+const Tensor *Workspace::GetTensor(const std::string &name) const {
  if (tensor_map_.count(name)) {
    return tensor_map_.at(name).get();
  } else {
@@ -60,19 +64,21 @@ void Workspace::RemoveUnsedTensor() {
  }
 }
-Tensor *Workspace::GetTensor(const string &name) {
+Tensor *Workspace::GetTensor(const std::string &name) {
  return const_cast<Tensor *>(
      static_cast<const Workspace *>(this)->GetTensor(name));
 }
 void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
+  MACE_LATENCY_LOGGER(1, "Load model tensors");
  Serializer serializer;
  for (auto &tensor_proto : net_def.tensors()) {
-    VLOG(1) << "Load tensor: " << tensor_proto.name()
+    MACE_LATENCY_LOGGER(2, "Load tensor ", tensor_proto.name());
+    VLOG(3) << "Load tensor: " << tensor_proto.name()
            << ", with data type: " << tensor_proto.data_type()
            << ", has shape: "
-            << internal::MakeString(vector<index_t>(tensor_proto.dims().begin(),
+            << MakeString(std::vector<index_t>(tensor_proto.dims().begin(),
-                                                    tensor_proto.dims().end()));
+                                          tensor_proto.dims().end()));
    tensor_map_[tensor_proto.name()] =
        serializer.Deserialize(tensor_proto, type);
  }
@@ -114,7 +120,7 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) {
                                              {mem_block.x(), mem_block.y()},
                                              dtype);
  }
-  VLOG(1) << "Preallocate image to tensors";
+  VLOG(3) << "Preallocate image to tensors";
  auto allocator = GetDeviceAllocator(DeviceType::OPENCL);
  for (auto &op: net_def.op()) {
    if (op.has_mem_id()) {

--- a/mace/core/workspace.h
+++ b/mace/core/workspace.h
@@ -5,36 +5,35 @@
 #ifndef MACE_CORE_WORKSPACE_H_
 #define MACE_CORE_WORKSPACE_H_
-#include "mace/core/common.h"
 #include "mace/core/tensor.h"
-#include "mace/core/public/mace.h"
+#include "mace/public/mace.h"
 #include "mace/core/preallocated_pooled_allocator.h"
 namespace mace {
 class Workspace {
 public:
-  typedef map<string, std::unique_ptr<Tensor>> TensorMap;
+  typedef std::map<std::string, std::unique_ptr<Tensor>> TensorMap;
  Workspace()
    : preallocated_allocator_(nullptr) {}
  ~Workspace() {}
-  vector<string> Tensors() const;
+  std::vector<std::string> Tensors() const;
-  Tensor *CreateTensor(const string &name, Allocator *alloc, DataType type);
+  Tensor *CreateTensor(const std::string &name, Allocator *alloc, DataType type);
-  bool RemoveTensor(const string &name);
+  bool RemoveTensor(const std::string &name);
  void RemoveUnsedTensor();
-  inline bool HasTensor(const string &name) const {
+  inline bool HasTensor(const std::string &name) const {
    return tensor_map_.count(name);
  }
-  const Tensor *GetTensor(const string &name) const;
+  const Tensor *GetTensor(const std::string &name) const;
-  Tensor *GetTensor(const string &name);
+  Tensor *GetTensor(const std::string &name);
  void LoadModelTensor(const NetDef &net_def, DeviceType type);

--- a/mace/kernels/BUILD
+++ b/mace/kernels/BUILD
@@ -23,7 +23,7 @@ cc_library(
    linkopts = if_android(["-lm"]),
    deps = [
        "//mace/core",
-        "//mace/utils:utils_hdrs",
+        "//mace/utils",
    ],
 )

--- a/mace/kernels/batch_norm.h
+++ b/mace/kernels/batch_norm.h
@@ -10,7 +10,7 @@
 #endif
 #include "mace/core/future.h"
-#include "mace/core/public/mace.h"
+#include "mace/public/mace.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/tensor.h"
 #include "mace/kernels/activation.h"

--- a/mace/kernels/bias_add.h
+++ b/mace/kernels/bias_add.h
@@ -7,7 +7,7 @@
 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
-#include "mace/core/public/mace.h"
+#include "mace/public/mace.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
 namespace mace {

--- a/mace/kernels/concat.h
+++ b/mace/kernels/concat.h
@@ -5,10 +5,9 @@
 #ifndef MACE_KERNELS_CONCAT_H_
 #define MACE_KERNELS_CONCAT_H_
-#include "mace/core/common.h"
 #include "mace/core/future.h"
 #include "mace/core/types.h"
-#include "mace/core/public/mace.h"
+#include "mace/public/mace.h"
 #include "mace/core/tensor.h"
 #include "mace/core/runtime/opencl/cl2_header.h"

--- a/mace/kernels/conv_pool_2d_util.cc
+++ b/mace/kernels/conv_pool_2d_util.cc
@@ -183,7 +183,6 @@ void ConstructInputWithPadding(const Tensor *input_tensor,
                               const int *paddings,
                               Tensor *output_tensor,
                               bool padding_same_value) {
-  VLOG(1) << "input: " << input_tensor->NumElements();
  Tensor::MappingGuard input_mapper(input_tensor);
  const float *input = input_tensor->data<float>();
  const index_t *input_shape = input_tensor->shape().data();
@@ -255,7 +254,6 @@ void ConstructNHWCInputWithPadding(const Tensor *input_tensor,
                                   const int *paddings,
                                   Tensor *output_tensor,
                                   bool padding_same_value) {
-  VLOG(1) << "input: " << input_tensor->NumElements();
  Tensor::MappingGuard input_mapper(input_tensor);
  const float *input = input_tensor->data<float>();
  const index_t *input_shape = input_tensor->shape().data();

--- a/mace/kernels/depthwise_conv2d.h
+++ b/mace/kernels/depthwise_conv2d.h
@@ -9,9 +9,8 @@
 #include <arm_neon.h>
 #endif
-#include "mace/core/common.h"
 #include "mace/core/future.h"
-#include "mace/core/public/mace.h"
+#include "mace/public/mace.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/kernels/conv_pool_2d_util.h"

--- a/mace/kernels/neon/avg_pooling_neon_2x2.cc
+++ b/mace/kernels/neon/avg_pooling_neon_2x2.cc
@@ -6,8 +6,6 @@
 #include <float.h>
 #include <limits>
-#include "mace/core/common.h"
 namespace mace {
 namespace kernels {

--- a/mace/kernels/neon/conv_2d_neon_1x1.cc
+++ b/mace/kernels/neon/conv_2d_neon_1x1.cc
@@ -3,7 +3,6 @@
 //
 #include <arm_neon.h>
-#include "mace/core/common.h"
 #include "mace/utils/utils.h"
 namespace mace {

--- a/mace/kernels/neon/conv_2d_neon_3x3.cc
+++ b/mace/kernels/neon/conv_2d_neon_3x3.cc
@@ -3,7 +3,6 @@
 //
 #include <arm_neon.h>
-#include "mace/core/common.h"
 namespace mace {
 namespace kernels {

--- a/mace/kernels/neon/conv_2d_neon_5x5.cc
+++ b/mace/kernels/neon/conv_2d_neon_5x5.cc
@@ -5,7 +5,6 @@
 #define MACE_KERNELS_NEON_CONV_2D_NEON_5X5_H_
 #include <arm_neon.h>
-#include "mace/core/common.h"
 namespace mace {
 namespace kernels {

--- a/mace/kernels/opencl/addn.cc
+++ b/mace/kernels/opencl/addn.cc
@@ -52,7 +52,7 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
    built_options.emplace("-Daddn=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-    built_options.emplace("-DINPUT_NUM=" + ToString(input_tensors.size()));
+    built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
    kernel_ = runtime->BuildKernel("addn", kernel_name, built_options);
    uint32_t idx = 0;

--- a/mace/kernels/opencl/buffer_to_image.cc
+++ b/mace/kernels/opencl/buffer_to_image.cc
@@ -33,7 +33,7 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
  size_t gws[2] = {image_shape[0],
                   image_shape[1]};
-  string kernel_name;
+  std::string kernel_name;
  switch (type) {
    case CONV2D_FILTER:
      kernel_name = i2b_ ? "filter_image_to_buffer" : "filter_buffer_to_image";
@@ -59,7 +59,7 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
      kernel_name = i2b_ ? "winograd_filter_image_to_buffer" : "winograd_filter_buffer_to_image";
      break;
  }
-  string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
+  std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
  std::set<std::string> built_options;
  std::stringstream kernel_name_ss;
  kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;

--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -44,7 +44,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
    built_options.emplace("-Dconv_2d_1x1=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-    built_options.emplace("-DSTRIDE=" + ToString(stride));
+    built_options.emplace(MakeString("-DSTRIDE=", stride));
    if (bias != nullptr) {
      built_options.emplace("-DBIAS");
    }

--- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
@@ -3,7 +3,6 @@
 //
 #include "mace/kernels/conv_2d.h"
-#include "mace/core/common.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/kernels/activation.h"
 #include "mace/kernels/opencl/helper.h"
@@ -43,7 +42,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
    built_options.emplace(bias != nullptr ? "-DBIAS" : "");
-    built_options.emplace("-DSTRIDE=" + ToString(stride));
+    built_options.emplace(MakeString("-DSTRIDE=", stride));
    switch (activation) {
      case NOOP:
        break;

--- a/mace/kernels/opencl/conv_2d_opencl_general.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_general.cc
@@ -3,7 +3,6 @@
 //
 #include "mace/kernels/conv_2d.h"
-#include "mace/core/common.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/kernels/activation.h"
 #include "mace/kernels/opencl/helper.h"
@@ -43,7 +42,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
    built_options.emplace(bias != nullptr ? "-DBIAS" : "");
-    built_options.emplace("-DSTRIDE=" + ToString(stride));
+    built_options.emplace(MakeString("-DSTRIDE=", stride));
    switch (activation) {
      case NOOP:
        break;

--- a/mace/kernels/opencl/depthwise_conv_opencl.cc
+++ b/mace/kernels/opencl/depthwise_conv_opencl.cc
@@ -59,7 +59,7 @@ void DepthwiseConv2d(cl::Kernel *kernel,
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
    built_options.emplace(bias != nullptr ? "-DBIAS" : "");
-    built_options.emplace("-DSTRIDE=" + ToString(stride));
+    built_options.emplace(MakeString("-DSTRIDE=", stride));
    switch (activation) {
      case NOOP:
        break;

--- a/mace/kernels/opencl/eltwise_opencl.cc
+++ b/mace/kernels/opencl/eltwise_opencl.cc
@@ -33,7 +33,7 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
    built_options.emplace("-Deltwise=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-    built_options.emplace("-DELTWISE_TYPE=" + ToString(type_));
+    built_options.emplace(MakeString("-DELTWISE_TYPE=", type_));
    if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM");
    kernel_ = runtime->BuildKernel("eltwise", kernel_name, built_options);

--- a/mace/kernels/opencl/space_to_batch_opencl.cc
+++ b/mace/kernels/opencl/space_to_batch_opencl.cc
@@ -5,7 +5,6 @@
 #ifndef MACE_KERNELS_OPENCL_SPACE_TO_BATCH_H_
 #define MACE_KERNELS_OPENCL_SPACE_TO_BATCH_H_
-#include "mace/core/common.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/kernels/space_to_batch.h"
 #include "mace/kernels/opencl/helper.h"

--- a/mace/kernels/opencl/winograd_transform.cc
+++ b/mace/kernels/opencl/winograd_transform.cc
@@ -32,7 +32,7 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *i
    CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, image_shape);
    output_tensor->ResizeImage(output_shape, image_shape);
-    string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
+    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
    std::set<std::string> built_options;
    built_options.emplace("-Dwinograd_transform_2x2=" + obfuscated_kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DataTypeToEnum<T>::value));
@@ -77,7 +77,7 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(const Te
  output_tensor->ResizeImage(output_shape, image_shape);
  if (kernel_.get() == nullptr) {
-    string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2");
+    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2");
    std::set<std::string> built_options;
    built_options.emplace("-Dwinograd_inverse_transform_2x2=" + obfuscated_kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DataTypeToEnum<T>::value));

--- a/mace/kernels/softmax.h
+++ b/mace/kernels/softmax.h
@@ -7,7 +7,7 @@
 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
-#include "mace/core/public/mace.h"
+#include "mace/public/mace.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
 namespace mace {

--- a/mace/kernels/space_to_batch.h
+++ b/mace/kernels/space_to_batch.h
@@ -7,7 +7,7 @@
 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
-#include "mace/core/public/mace.h"
+#include "mace/public/mace.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
 namespace mace {

--- a/mace/ops/addn.h
+++ b/mace/ops/addn.h
@@ -5,6 +5,8 @@
 #ifndef MACE_OPS_ADDN_H_
 #define MACE_OPS_ADDN_H_
+#include <vector>
 #include "mace/core/operator.h"
 #include "mace/kernels/addn.h"
@@ -19,7 +21,7 @@ class AddNOp : public Operator<D, T> {
  bool Run(StatsFuture *future) override {
    Tensor *output_tensor = this->Output(0);
    int n = this->inputs_.size();
-    vector<const Tensor *> inputs(n, nullptr);
+    std::vector<const Tensor *> inputs(n, nullptr);
    inputs[0] = this->Input(0);
    for (int i = 1; i < n; ++i) {
      inputs[i] = this->Input(i);

--- a/mace/ops/addn_benchmark.cc
+++ b/mace/ops/addn_benchmark.cc
@@ -15,19 +15,19 @@ static void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
  OpsTestNet net;
  // Add input data
  for (int i = 0; i < inputs; ++i) {
-    net.AddRandomInput<D, float>(internal::MakeString("Input", i).c_str(),
+    net.AddRandomInput<D, float>(MakeString("Input", i).c_str(),
                                 {n, h, w, c});
  }
  if (D == DeviceType::OPENCL) {
    for (int i = 0; i < inputs; ++i) {
-      BufferToImage<D, T>(net, internal::MakeString("Input", i).c_str(),
+      BufferToImage<D, T>(net, MakeString("Input", i).c_str(),
-                          internal::MakeString("InputImage", i).c_str(),
+                          MakeString("InputImage", i).c_str(),
                          kernels::BufferType::IN_OUT_CHANNEL);
    }
    OpDefBuilder op_def_builder("AddN", "AddNBM");
    for (int i = 0; i < inputs; ++i) {
-      op_def_builder.Input(internal::MakeString("InputImage", i).c_str());
+      op_def_builder.Input(MakeString("InputImage", i).c_str());
    }
    op_def_builder.Output("OutputImage")
        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
@@ -35,7 +35,7 @@ static void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
  } else {
    OpDefBuilder op_def_builder("AddN", "AddNBM");
    for (int i = 0; i < inputs; ++i) {
-      op_def_builder.Input(internal::MakeString("Input", i).c_str());
+      op_def_builder.Input(MakeString("Input", i).c_str());
    }
    op_def_builder.Output("Output")
        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))

--- a/mace/ops/addn_test.cc
+++ b/mace/ops/addn_test.cc
@@ -75,13 +75,13 @@ void RandomTest() {
    OpsTestNet net;
    auto op_def = OpDefBuilder("AddN", "AddNTest");
    for (int i = 0; i < input_num; ++i) {
-      op_def.Input("Input" + ToString(i));
+      op_def.Input(MakeString("Input", i));
    }
    op_def.Output("Output").Finalize(net.NewOperatorDef());
    // Add input data
    for (int i = 0; i < input_num; ++i) {
-      net.AddRandomInput<D, float>("Input" + ToString(i), {n, h, w, c});
+      net.AddRandomInput<D, float>(MakeString("Input", i), {n, h, w, c});
    }
    // run on cpu
@@ -92,14 +92,14 @@ void RandomTest() {
    // run on gpu
    for (int i = 0; i < input_num; ++i) {
-      BufferToImage<D, half>(net, "Input" + ToString(i),
+      BufferToImage<D, half>(net, MakeString("Input", i),
-                             "InputImage" + ToString(i),
+                             MakeString("InputImage", i),
                             kernels::BufferType::IN_OUT_CHANNEL);
    }
    auto op_def_cl = OpDefBuilder("AddN", "AddNTest");
    for (int i = 0; i < input_num; ++i) {
-      op_def_cl.Input("InputImage" + ToString(i));
+      op_def_cl.Input(MakeString("InputImage", i));
    }
    op_def_cl.Output("OutputImage")
        .AddIntArg("T", static_cast<int>(DataType::DT_HALF))

--- a/mace/ops/concat_test.cc
+++ b/mace/ops/concat_test.cc
@@ -95,7 +95,7 @@ TEST_F(ConcatOpTest, CPURandom) {
  OpsTestNet net;
  auto builder = OpDefBuilder("Concat", "ConcatTest");
  for (int i = 0; i < num_inputs; ++i) {
-    builder = builder.Input(("Input" + ToString(i)).c_str());
+    builder = builder.Input(MakeString("Input", i));
  }
  builder.AddIntArg("axis", axis)
      .Output("Output")
@@ -113,7 +113,7 @@ TEST_F(ConcatOpTest, CPURandom) {
    GenerateRandomRealTypeData(input_shapes[i], inputs[i]);
    input_ptrs[i] = inputs[i].data();
    net.AddInputFromArray<DeviceType::CPU, float>(
-        ("Input" + ToString(i)).c_str(), input_shapes[i], inputs[i]);
+        MakeString("Input", i), input_shapes[i], inputs[i]);
  }
  // Run
@@ -148,8 +148,8 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
  // Construct graph
  OpsTestNet net;
  for (int i = 0; i < num_inputs; ++i) {
-    const std::string input_name = ("Input" + ToString(i)).c_str();
+    const std::string input_name = MakeString("Input", i);
-    const std::string image_name = ("InputImage" + ToString(i)).c_str();
+    const std::string image_name = MakeString("InputImage", i);
    concat_axis_size += shapes[i][axis];
    net.AddRandomInput<DeviceType::OPENCL, float>(input_name, shapes[i]);
    BufferToImage<DeviceType::OPENCL, T>(net, input_name, image_name,
@@ -158,7 +158,7 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
  auto builder = OpDefBuilder("Concat", "ConcatTest");
  for (int i = 0; i < num_inputs; ++i) {
-    const std::string image_name = ("InputImage" + ToString(i)).c_str();
+    const std::string image_name = MakeString("InputImage", i);
    builder = builder.Input(image_name);
  }
  builder.AddIntArg("axis", axis)
@@ -188,7 +188,7 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
          std::accumulate(shapes[i].begin() + axis, shapes[i].end(), 1,
                          std::multiplies<index_t>());
-      const std::string input_name = ("Input" + ToString(i)).c_str();
+      const std::string input_name = MakeString("Input", i);
      const Tensor *input_tensor = net.GetTensor(input_name.data());
      Tensor::MappingGuard input_guard(input_tensor);
      const float *input_ptr = input_tensor->data<float>() + k * num_elements;

--- a/mace/ops/ops_test_util.h
+++ b/mace/ops/ops_test_util.h
@@ -8,7 +8,6 @@
 #include <type_traits>
 #include "gtest/gtest.h"
-#include "mace/core/common.h"
 #include "mace/core/net.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/core/tensor.h"
@@ -239,9 +238,9 @@ void GenerateRandomIntTypeData(const std::vector<index_t> &shape,
 }
 template <typename T>
-unique_ptr<Tensor> CreateTensor(const std::vector<index_t> &shape,
+std::unique_ptr<Tensor> CreateTensor(const std::vector<index_t> &shape,
                                const std::vector<T> &data) {
-  unique_ptr<Tensor> res(
+  std::unique_ptr<Tensor> res(
      new Tensor(GetDeviceAllocator(DeviceType::CPU), DataTypeToEnum<T>::v()));
  res->Resize(shape);
  T *input_data = res->mutable_data<T>();

--- a/mace/ops/reshape.h
+++ b/mace/ops/reshape.h
@@ -29,9 +29,9 @@ class ReshapeOp : public Operator<D, T> {
        MACE_CHECK(unknown_idx == -1) << "Only one input size may be -1";
        unknown_idx = i;
        out_shape.push_back(1);
-      } else if (shape_[i] < 0) {
-        VLOG(ERROR) << "Shape must be non-negative";
      } else {
+        MACE_CHECK(shape_[i] >= 0) << "Shape must be non-negative: "
+                                   << shape_[i];
        out_shape.push_back(shape_[i]);
        product *= shape_[i];
      }

--- a/mace/ops/resize_bilinear_test.cc
+++ b/mace/ops/resize_bilinear_test.cc
@@ -21,7 +21,7 @@ TEST_F(ResizeBilinearTest, CPUResizeBilinearWOAlignCorners) {
      .Finalize(net.NewOperatorDef());
  // Add input data
-  vector<float> input(24);
+  std::vector<float> input(24);
  std::iota(begin(input), end(input), 0);
  net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 2, 4, 3}, input);
@@ -46,7 +46,7 @@ TEST_F(ResizeBilinearTest, ResizeBilinearWAlignCorners) {
      .Finalize(net.NewOperatorDef());
  // Add input data
-  vector<float> input(24);
+  std::vector<float> input(24);
  std::iota(begin(input), end(input), 0);
  net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 2, 4, 3}, input);

--- a/mace/ops/space_to_batch_test.cc
+++ b/mace/ops/space_to_batch_test.cc
@@ -70,7 +70,7 @@ void TestBidirectionalTransform(const std::vector<index_t> &space_shape,
                                const std::vector<int> &padding_data,
                                const std::vector<index_t> &batch_shape,
                                const std::vector<float> &batch_data) {
-  auto space_tensor = unique_ptr<Tensor>(new Tensor(
+  auto space_tensor = std::unique_ptr<Tensor>(new Tensor(
      GetDeviceAllocator(DeviceType::OPENCL), DataTypeToEnum<T>::v()));
  space_tensor->Resize(space_shape);
  {
@@ -82,7 +82,7 @@ void TestBidirectionalTransform(const std::vector<index_t> &space_shape,
    memcpy(space_ptr, space_data.data(), space_data.size() * sizeof(T));
  }
-  auto batch_tensor = unique_ptr<Tensor>(new Tensor(
+  auto batch_tensor = std::unique_ptr<Tensor>(new Tensor(
      GetDeviceAllocator(DeviceType::OPENCL), DataTypeToEnum<T>::v()));
  batch_tensor->Resize(batch_shape);
  {
@@ -165,7 +165,7 @@ TEST(SpaceToBatchTest, MultiBatchAndChannelData) {
 //  const int batch_size = std::accumulate(batch_shape.begin(),
 //  batch_shape.end(), 1, std::multiplies<int>());
 //
-//  auto space_tensor = unique_ptr<Tensor>(new
+//  auto space_tensor = std::unique_ptr<Tensor>(new
 //  Tensor(GetDeviceAllocator(DeviceType::OPENCL),
 //                                                    DataTypeToEnum<float>::v()));
 //  space_tensor->Resize(space_shape);
@@ -185,7 +185,7 @@ TEST(SpaceToBatchTest, MultiBatchAndChannelData) {
 //    VLOG(0) << "open space file failed";
 //  }
 //
-//  auto batch_tensor = unique_ptr<Tensor>(new
+//  auto batch_tensor = std::unique_ptr<Tensor>(new
 //  Tensor(GetDeviceAllocator(DeviceType::OPENCL),
 //                                                    DataTypeToEnum<float>::v()));
 //  std::vector<float> batch_data(batch_size, 0.0);

--- a/mace/public/BUILD
+++ b/mace/public/BUILD
+# Description:
+# MACE public API.
+#
+package(
+    default_visibility = ["//visibility:public"],
+)
+licenses(["notice"])  # Apache 2.0
+# Header-only target: exports the public API header, no sources of its own.
+cc_library(
+    name = "public",
+    hdrs = [
+        "mace.h",
+    ],
+)
--- a/mace/core/public/mace.h
+++ b/mace/core/public/mace.h
@@ -4,6 +4,7 @@
 #ifndef MACE_CORE_MACE_H_
 #define MACE_CORE_MACE_H_
 #include <cstdint>
 #include <vector>
 #include <string>
@@ -33,6 +34,14 @@ inline const char *MaceVersion() {
 extern const char *MaceGitVersion();
+// Disable the copy and assignment operator for a class.
+// The expansion begins with `private:`, so any members declared after the
+// macro inside the class body are private as well. The definition is guarded
+// by #ifndef so an existing DISABLE_COPY_AND_ASSIGN is left in place.
+#ifndef DISABLE_COPY_AND_ASSIGN
+#define DISABLE_COPY_AND_ASSIGN(classname) \
+ private:                                  \
+  classname(const classname &) = delete;   \
+  classname &operator=(const classname &) = delete
+#endif
 enum NetMode {
  INIT = 0,
  NORMAL = 1
@@ -378,5 +387,6 @@ class MaceEngine {
  std::unique_ptr<HexagonControlWrapper> hexagon_controller_;
 };
-} //  namespace mace
+}  // namespace mace
-#endif //  MACE_CORE_MACE_H_
+#endif  // MACE_CORE_MACE_H_
--- a/mace/utils/BUILD
+++ b/mace/utils/BUILD
@@ -10,61 +10,64 @@ licenses(["notice"])  # Apache 2.0
 load("//mace:mace.bzl", "if_android")
 cc_library(
-    name = "logging",
+    name = "utils",
    srcs = [
+        "command_line_flags.cc",
        "logging.cc",
    ],
    hdrs = [
+        "command_line_flags.h",
+        "env_time.h",
        "logging.h",
        "memory_logging.h",
+        "string_util.h",
+        "timer.h",
+        "tuner.h",
+        "utils.h",
    ],
    linkopts = if_android([
        "-llog",
    ]),
-)
-cc_library(
-    name = "command_line_flags",
-    srcs = [
-        "command_line_flags.cc",
-    ],
-    hdrs = [
-        "command_line_flags.h",
-    ],
    deps = [
-        ":logging",
+        "//mace/public",
    ],
 )
-cc_library(
+cc_test(
-    name = "tuner",
+    name = "utils_test",
-    hdrs = [
+    testonly = 1,
-        "timer.h",
+    srcs = [
-        "tuner.h",
+        "utils_test.cc",
    ],
+    linkopts = if_android([
+        "-pie",
+        "-lm",
+    ]),
+    linkstatic = 1,
    deps = [
-        ":logging",
+        ":utils",
-        ":utils_hdrs",
+        "@gtest//:gtest",
+        "@gtest//:gtest_main",
    ],
 )
 cc_library(
-    name = "tuner_dev",
+    name = "utils_dev",
    srcs = [
        "tuner_development.cc",
    ],
    deps = [
-        ":tuner",
+        ":utils",
    ],
 )
 cc_library(
-    name = "tuner_prod",
+    name = "utils_prod",
    srcs = [
        "tuner_production.cc",
    ],
    deps = [
-        ":tuner",
+        ":utils",
        "//mace/codegen:generated_tuning_params",
    ],
 )
@@ -81,44 +84,7 @@ cc_test(
    ]),
    linkstatic = 1,
    deps = [
-        ":tuner",
+        ":utils_dev",
-        ":tuner_dev",
-        "@gtest//:gtest",
-        "@gtest//:gtest_main",
-    ],
-)
-cc_library(
-    name = "utils_hdrs",
-    hdrs = [
-        "env_time.h",
-        "utils.h",
-    ],
-)
-cc_library(
-    name = "utils",
-    deps = [
-        ":command_line_flags",
-        ":logging",
-        ":tuner",
-        ":utils_hdrs",
-    ],
-)
-cc_test(
-    name = "utils_test",
-    testonly = 1,
-    srcs = [
-        "utils_test.cc",
-    ],
-    linkopts = if_android([
-        "-pie",
-        "-lm",
-    ]),
-    linkstatic = 1,
-    deps = [
-        ":utils_hdrs",
        "@gtest//:gtest",
        "@gtest//:gtest_main",
    ],

--- a/mace/utils/env_time.h
+++ b/mace/utils/env_time.h
@@ -12,15 +12,12 @@
 namespace mace {
-namespace utils {
 inline int64_t NowMicros() {
  struct timeval tv;
  gettimeofday(&tv, nullptr);
  return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
 }
-}  // namespace testing
 }  // namespace mace
 #endif  // MACE_UTILS_ENV_TIME_H
--- a/mace/utils/logging.cc
+++ b/mace/utils/logging.cc
@@ -12,7 +12,7 @@
 #endif
 namespace mace {
-namespace internal {
+namespace logging {
 LogMessage::LogMessage(const char *fname, int line, int severity)
    : fname_(fname), line_(line), severity_(severity) {}
@@ -77,7 +77,7 @@ int64_t LogLevelStrToInt(const char *mace_env_var_val) {
  // Ideally we would use env_var / safe_strto64, but it is
  // hard to use here without pulling in a lot of dependencies,
  // so we use std:istringstream instead
-  string min_log_level(mace_env_var_val);
+  std::string min_log_level(mace_env_var_val);
  std::istringstream ss(min_log_level);
  int64_t level;
  if (!(ss >> level)) {
@@ -120,5 +120,5 @@ LogMessageFatal::~LogMessageFatal() {
  abort();
 }
-}  // namespace internal
+}  // namespace logging
 }  // namespace mace
--- a/mace/utils/logging.h
+++ b/mace/utils/logging.h
@@ -10,56 +10,21 @@
 #include <string>
 #include <vector>
+#include "mace/utils/env_time.h"
+#include "mace/public/mace.h"
+#include "mace/utils/string_util.h"
 #undef ERROR
 namespace mace {
 const int INFO = 0;            // base_logging::INFO;
 const int WARNING = 1;         // base_logging::WARNING;
 const int ERROR = 2;           // base_logging::ERROR;
 const int FATAL = 3;           // base_logging::FATAL;
 const int NUM_SEVERITIES = 4;  // base_logging::NUM_SEVERITIES;
-namespace internal {
+namespace logging {
-using std::string;
-inline void MakeStringInternal(std::stringstream & /*ss*/) {}
-template <typename T>
-inline void MakeStringInternal(std::stringstream &ss, const T &t) {
-  ss << t;
-}
-template <typename T, typename... Args>
-inline void MakeStringInternal(std::stringstream &ss,
-                               const T &t,
-                               const Args &... args) {
-  MakeStringInternal(ss, t);
-  MakeStringInternal(ss, args...);
-}
-template <typename... Args>
-string MakeString(const Args &... args) {
-  std::stringstream ss;
-  MakeStringInternal(ss, args...);
-  return ss.str();
-}
-template <typename T>
-string MakeString(const std::vector<T> &args) {
-  std::stringstream ss;
-  for (const T &arg : args) {
-    ss << arg << ", ";
-  }
-  return ss.str();
-}
-// Specializations for already-a-string types.
-template <>
-inline string MakeString(const string &str) {
-  return str;
-}
-inline string MakeString(const char *c_str) { return string(c_str); }
 class LogMessage : public std::basic_ostringstream<char> {
 public:
@@ -89,30 +54,25 @@ class LogMessageFatal : public LogMessage {
 };
 #define _MACE_LOG_INFO \
-  ::mace::internal::LogMessage(__FILE__, __LINE__, mace::INFO)
+  ::mace::logging::LogMessage(__FILE__, __LINE__, mace::INFO)
 #define _MACE_LOG_WARNING \
-  ::mace::internal::LogMessage(__FILE__, __LINE__, mace::WARNING)
+  ::mace::logging::LogMessage(__FILE__, __LINE__, mace::WARNING)
 #define _MACE_LOG_ERROR \
-  ::mace::internal::LogMessage(__FILE__, __LINE__, mace::ERROR)
+  ::mace::logging::LogMessage(__FILE__, __LINE__, mace::ERROR)
-#define _MACE_LOG_FATAL ::mace::internal::LogMessageFatal(__FILE__, __LINE__)
+#define _MACE_LOG_FATAL ::mace::logging::LogMessageFatal(__FILE__, __LINE__)
 #define _MACE_LOG_QFATAL _MACE_LOG_FATAL
 #define LOG(severity) _MACE_LOG_##severity
-#ifdef IS_MOBILE_PLAMACEORM
+// Set MACE_CPP_MIN_VLOG_LEVEL environment to update minimum log
-// Turn VLOG off when under mobile devices for considerations of binary size.
-#define VLOG_IS_ON(lvl) ((lvl) <= 0)
-#else
-// Otherwise, Set MACE_CPP_MIN_VLOG_LEVEL environment to update minimum log
 // level
 // of VLOG
-#define VLOG_IS_ON(lvl) ((lvl) <= ::mace::internal::LogMessage::MinVLogLevel())
+#define VLOG_IS_ON(lvl) ((lvl) <= ::mace::logging::LogMessage::MinVLogLevel())
-#endif
 #define VLOG(lvl)      \
  if (VLOG_IS_ON(lvl)) \
-  ::mace::internal::LogMessage(__FILE__, __LINE__, mace::INFO)
+  ::mace::logging::LogMessage(__FILE__, __LINE__, mace::INFO)
 // MACE_CHECK/MACE_ASSERT dies with a fatal error if condition is not true.
 // MACE_ASSERT is controlled by NDEBUG ('-c opt' for bazel) while MACE_CHECK
@@ -121,16 +81,14 @@ class LogMessageFatal : public LogMessage {
 //    MACE_CHECK(fp->Write(x) == 4)
 //    MACE_CHECK(fp->Write(x) == 4, "Write failed")
 // which are not correct for MACE_ASSERT.
+// MakeString is fully qualified so that MACE_CHECK / MACE_ASSERT still expand
+// correctly when used outside of namespace mace, as the previous
+// ::mace::internal::MakeString spelling allowed.
+// NOTE(review): assumes MakeString now lives in namespace mace (string_util.h)
+// -- confirm.
 #define MACE_CHECK(condition, ...)              \
   if (!(condition))                             \
   LOG(FATAL) << "Check failed: " #condition " " \
-             << ::mace::internal::MakeString(__VA_ARGS__)
+             << ::mace::MakeString(__VA_ARGS__)
 #ifndef NDEBUG
 #define MACE_ASSERT(condition, ...)              \
   if (!(condition))                              \
   LOG(FATAL) << "Assert failed: " #condition " " \
-             << ::mace::internal::MakeString(__VA_ARGS__)
+             << ::mace::MakeString(__VA_ARGS__)
 #else
 #define MACE_ASSERT(condition, ...) ((void)0)
 #endif
@@ -138,16 +96,46 @@ class LogMessageFatal : public LogMessage {
 template <typename T>
 T &&CheckNotNull(const char *file, int line, const char *exprtext, T &&t) {
  if (t == nullptr) {
-    LogMessageFatal(file, line) << string(exprtext);
+    LogMessageFatal(file, line) << std::string(exprtext);
  }
  return std::forward<T>(t);
 }
-#define MACE_CHECK_NOTNULL(val)                      \
+#define MACE_CHECK_NOTNULL(val)                     \
-  ::mace::internal::CheckNotNull(__FILE__, __LINE__, \
+  ::mace::logging::CheckNotNull(__FILE__, __LINE__, \
-                                 "'" #val "' Must be non NULL", (val))
+                                "'" #val "' Must be non NULL", (val))
+#define MACE_NOT_IMPLEMENTED MACE_CHECK(false, "not implemented")
+// Scoped latency reporter. When VLOG_IS_ON(vlog_level) holds, the constructor
+// records NowMicros() and the destructor writes
+// "<message> latency: <elapsed> us" to VLOG(vlog_level). When it does not
+// hold, neither the clock read nor the log statement is executed.
+class LatencyLogger {
+ public:
+  // vlog_level: verbosity level used both for the VLOG_IS_ON checks and for
+  //             the final VLOG statement.
+  // message:    prefix of the emitted log line; copied into the object.
+  LatencyLogger(int vlog_level, const std::string &message)
+      : vlog_level_(vlog_level), message_(message), start_micros_(0) {
+    if (VLOG_IS_ON(vlog_level_)) {
+      start_micros_ = NowMicros();
+    }
+  }
+  ~LatencyLogger() {
+    if (VLOG_IS_ON(vlog_level_)) {
+      int64_t stop_micros = NowMicros();
+      VLOG(vlog_level_) << message_
+                        << " latency: " << stop_micros - start_micros_ << " us";
+    }
+  }
+ private:
+  const int vlog_level_;
+  const std::string message_;
+  // Initialized to 0 in the constructor so the destructor never reads an
+  // indeterminate value if VLOG_IS_ON evaluates differently at destruction
+  // than it did at construction.
+  int64_t start_micros_;
+  DISABLE_COPY_AND_ASSIGN(LatencyLogger);
+};
+// Token-pasting helpers: the extra level of indirection lets the preprocessor
+// expand __LINE__ to its number before `##` is applied.
+#define MACE_LATENCY_LOGGER_CONCAT_IMPL(a, b) a##b
+#define MACE_LATENCY_LOGGER_CONCAT(a, b) MACE_LATENCY_LOGGER_CONCAT_IMPL(a, b)
+// Declares a LatencyLogger local named latency_logger_<line number> in the
+// current scope, so several uses in one scope do not clash as long as they
+// are on different source lines. The message arguments are only formatted
+// when VLOG_IS_ON(vlog_level) holds; otherwise an empty message is passed.
+#define MACE_LATENCY_LOGGER(vlog_level, ...)                       \
+  ::mace::logging::LatencyLogger MACE_LATENCY_LOGGER_CONCAT(       \
+      latency_logger_, __LINE__)(                                  \
+      vlog_level,                                                  \
+      VLOG_IS_ON(vlog_level) ? ::mace::MakeString(__VA_ARGS__) : "")
-}  // namespace internal
+}  // namespace logging
 }  // namespace mace
 #endif  // MACE_UTILS_LOGGING_H_
--- a/mace/utils/string_util.h
+++ b/mace/utils/string_util.h
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+#ifndef MACE_UTILS_STRING_UTIL_H_
+#define MACE_UTILS_STRING_UTIL_H_
+#include <sstream>
+#include <string>
+#include <vector>
+namespace mace {
+// Implementation helpers for MakeString. They live in a named namespace
+// rather than an unnamed one: an unnamed namespace in a header gives every
+// translation unit its own copy of the helpers, so the externally linked
+// MakeString templates would refer to different entities in each of them.
+namespace string_util_internal {
+// Terminates the recursion when there is nothing left to append.
+inline void MakeStringInternal(std::stringstream & /*ss*/) {}
+// Appends a single value to the stream using its operator<<.
+template <typename T>
+inline void MakeStringInternal(std::stringstream &ss, const T &t) {
+  ss << t;
+}
+// Appends the first value, then recurses on the remaining ones.
+template <typename T, typename... Args>
+inline void MakeStringInternal(std::stringstream &ss,
+                               const T &t,
+                               const Args &... args) {
+  MakeStringInternal(ss, t);
+  MakeStringInternal(ss, args...);
+}
+}  // namespace string_util_internal
+// Streams every argument, in order and without separators, into one string.
+// Returns an empty string when called without arguments.
+template <typename... Args>
+std::string MakeString(const Args &... args) {
+  std::stringstream ss;
+  string_util_internal::MakeStringInternal(ss, args...);
+  return ss.str();
+}
+// Formats a vector as "[e0, e1, ...]"; an empty vector yields "[]".
+template <typename T>
+std::string MakeString(const std::vector<T> &args) {
+  std::stringstream ss;
+  ss << "[";
+  const size_t size = args.size();
+  // size_t index: avoids the signed/unsigned comparison against size.
+  for (size_t i = 0; i < size; ++i) {
+    // The separator goes before every element except the first.
+    if (i > 0) {
+      ss << ", ";
+    }
+    ss << args[i];
+  }
+  ss << "]";
+  return ss.str();
+}
+// Specializations for already-a-string types.
+template <>
+inline std::string MakeString(const std::string &str) {
+  return str;
+}
+// Copies a C string; c_str is passed straight to the std::string constructor.
+inline std::string MakeString(const char *c_str) { return std::string(c_str); }
+}  // namespace mace
+#endif  // MACE_UTILS_STRING_UTIL_H_
--- a/mace/utils/timer.h
+++ b/mace/utils/timer.h
@@ -6,6 +6,7 @@
 #define MACE_UTILS_TIMER_H_
 #include "mace/utils/env_time.h"
+#include "mace/utils/logging.h"
 namespace mace {
@@ -24,11 +25,11 @@ class WallClockTimer : public Timer {
  WallClockTimer() : accumulated_micros_(0) {}
  void StartTiming() override {
-    start_micros_ = mace::utils::NowMicros();
+    start_micros_ = NowMicros();
  }
  void StopTiming() override {
-    stop_micros_ = mace::utils::NowMicros();
+    stop_micros_ = NowMicros();
  }
  void AccumulateTiming() override {
@@ -54,6 +55,8 @@ class WallClockTimer : public Timer {
  double start_micros_;
  double stop_micros_;
  double accumulated_micros_;
+  DISABLE_COPY_AND_ASSIGN(WallClockTimer);
 };
 }  // namespace mace

--- a/mace/utils/tuner.h
+++ b/mace/utils/tuner.h
@@ -51,15 +51,15 @@ class Tuner {
      // tune
      std::vector<param_type> opt_param = default_param;
      RetType res = Tune<RetType>(param_generator, func, timer, &opt_param);
-      VLOG(1) << "Tuning result. "
+      VLOG(3) << "Tuning result. "
-              << param_key << ": " << internal::MakeString(opt_param);
+              << param_key << ": " << MakeString(opt_param);
      param_table_[obfucated_param_key] = opt_param;
      return res;
    } else {
      // run
      if (param_table_.find(obfucated_param_key) != param_table_.end()) {
-        VLOG(1) << param_key << ": "
+        VLOG(3) << param_key << ": "
-                << internal::MakeString(param_table_[obfucated_param_key]);
+                << MakeString(param_table_[obfucated_param_key]);
        return func(param_table_[obfucated_param_key], nullptr, nullptr);
      } else {
 #ifndef MACE_DISABLE_NO_TUNING_WARNING
@@ -82,7 +82,7 @@ class Tuner {
  Tuner &operator=(const Tuner &) = delete;
  inline void WriteRunParameters() {
-    VLOG(1) << path_;
+    VLOG(3) << path_;
    if (path_ != nullptr) {
      std::ofstream ofs(path_, std::ios::binary | std::ios::out);
      if (ofs.is_open()) {
@@ -92,7 +92,7 @@ class Tuner {
          int32_t key_size = kp.first.size();
          ofs.write(reinterpret_cast<char *>(&key_size), sizeof(key_size));
          ofs.write(kp.first.c_str(), key_size);
-          VLOG(1) << "Write tuning param: " << kp.first.c_str();
+          VLOG(3) << "Write tuning param: " << kp.first.c_str();
          auto &params = kp.second;
          int32_t params_size = params.size() * sizeof(param_type);
@@ -100,7 +100,7 @@ class Tuner {
                    sizeof(params_size));
          for (auto &param : params) {
            ofs.write(reinterpret_cast<char *>(&param), sizeof(params_size));
-            VLOG(1) << param;
+            VLOG(3) << param;
          }
        }
        ofs.close();

--- a/mace/utils/utils.h
+++ b/mace/utils/utils.h
@@ -40,19 +40,6 @@ Integer CeilQuotient(Integer a, Integer b) {
  return (a + b - 1) / b;
 }
-inline int64_t NowInMicroSec() {
-  struct timeval tv;
-  gettimeofday(&tv, nullptr);
-  return static_cast<int64_t>(tv.tv_sec * 1000000 + tv.tv_usec);
-}
-template <typename T>
-inline std::string ToString(T v) {
-  std::ostringstream ss;
-  ss << v;
-  return ss.str();
-}
 inline std::string ObfuscateString(const std::string &src,
                                   const std::string &lookup_table) {
  std::string dest;

--- a/tools/export_lib.sh
+++ b/tools/export_lib.sh
@@ -39,18 +39,18 @@ libmace_targets=(
  "//mace/kernels:kernels"
  "//mace/codegen:generated_version"
  "//mace/core:core"
-  "//mace/utils:logging"
+  "//mace/utils:utils"
 )
 libmace_dev_targets=(
  "//mace/codegen:generated_opencl_dev"
  "//mace/core:opencl_dev"
-  "//mace/utils:tuner_dev"
+  "//mace/utils:utils_dev"
 )
 libmace_prod_targets=(
  "//mace/core:opencl_prod"
-  "//mace/utils:tuner_prod"
+  "//mace/utils:utils_prod"
 )
 all_targets=(${libmace_targets[*]} ${libmace_dev_targets[*]} ${libmace_prod_targets[*]})
@@ -155,11 +155,14 @@ merge_libs "libmace_prod" "${libmace_prod_targets[*]}"
 echo "Step 5: Export lib"
 rm -rf ${EXPORT_INCLUDE_DIR}
-mkdir -p ${EXPORT_INCLUDE_DIR}/mace/core/public
+mkdir -p ${EXPORT_INCLUDE_DIR}/mace/public
+mkdir -p ${EXPORT_INCLUDE_DIR}/mace/utils
 rm -rf ${EXPORT_LIB_DIR}
 mkdir -p ${EXPORT_LIB_DIR}
-cp ${MACE_SOURCE_DIR}/mace/core/public/* ${EXPORT_INCLUDE_DIR}/mace/core/public || exit 1
+cp ${MACE_SOURCE_DIR}/mace/public/*.h ${EXPORT_INCLUDE_DIR}/mace/public/ || exit 1
+# utils is not part of public API
+cp ${MACE_SOURCE_DIR}/mace/utils/*.h ${EXPORT_INCLUDE_DIR}/mace/utils/ || exit 1
 cp ${LIBMACE_TEMP_DIR}/libmace.a ${LIBMACE_TEMP_DIR}/libmace_dev.a ${LIBMACE_TEMP_DIR}/libmace_prod.a ${EXPORT_LIB_DIR}/ || exit 1
 echo "Step 6: Remove temporary file"