diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 9cad515c2bef7e44769ce0d452a96ed5bad7e3f0..a6f6b42fc5f3f172213bc076753dfe30f8814c8e 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -112,16 +112,13 @@ model_tests:
     - if [ -z "$TARGET_SOCS" ]; then TARGET_SOCS=random; fi
     - python tools/converter.py convert --config=${CONF_FILE}  --target_socs=$TARGET_SOCS --model_graph_format=file --model_data_format=file --cl_mem_type=buffer
     - python tools/converter.py run --config=${CONF_FILE} --target_socs=$TARGET_SOCS --device_yml=${DEVICE_CONF_FILE} --round=1 --target_abis=armeabi-v7a,arm64 --validate --model_graph_format=file --model_data_format=file
-    - python tools/converter.py run --config=${CONF_FILE} --target_socs=$TARGET_SOCS --device_yml=${DEVICE_CONF_FILE} --example --target_abis=armeabi-v7a,arm64 --round=1 --validate --model_graph_format=file --model_data_format=file
     - CONF_FILE=mace-models/mobilenet-v2/mobilenet-v2-host.yml
     - python tools/converter.py convert --config=${CONF_FILE} --target_socs=$TARGET_SOCS --model_graph_format=file --model_data_format=file
     - python tools/converter.py run --config=${CONF_FILE} --target_socs=$TARGET_SOCS --round=1 --validate --model_graph_format=file --model_data_format=file --address_sanitizer
-    - python tools/converter.py run --config=${CONF_FILE} --target_socs=$TARGET_SOCS --example --round=1 --validate --model_graph_format=file --model_data_format=file
-    - python tools/converter.py benchmark --config=${CONF_FILE} --target_socs=$TARGET_SOCS --round=5 --model_graph_format=file --model_data_format=file
+    - python tools/converter.py run --config=${CONF_FILE} --target_socs=$TARGET_SOCS --round=5 --model_graph_format=file --model_data_format=file --benchmark
     - python tools/converter.py convert --config=${CONF_FILE} --target_socs=$TARGET_SOCS --model_graph_format=code --model_data_format=file
     - python tools/converter.py run --config=${CONF_FILE} --target_socs=$TARGET_SOCS --round=1 --validate --model_graph_format=code --model_data_format=file
-    - python tools/converter.py run --config=${CONF_FILE} --target_socs=$TARGET_SOCS --example --round=1 --validate --model_graph_format=code --model_data_format=file
-    - python tools/converter.py benchmark --config=${CONF_FILE} --target_socs=$TARGET_SOCS --round=5 --model_graph_format=code --model_data_format=file
+    - python tools/converter.py run --config=${CONF_FILE} --target_socs=$TARGET_SOCS --round=5 --model_graph_format=code --model_data_format=file --benchmark
     - rm -rf mace-models
 
 quantization_tests:
@@ -141,7 +138,6 @@ quantization_tests:
       do
       python tools/converter.py convert --config=${CONF_FILE} --target_socs=$TARGET_SOCS --model_graph_format=file --model_data_format=file || exit 1;
       python tools/converter.py run --config=${CONF_FILE} --target_socs=$TARGET_SOCS --device_yml=${DEVICE_CONF_FILE} --round=1 --validate --model_graph_format=file --model_data_format=file || exit 1;
-      python tools/converter.py run --config=${CONF_FILE} --target_socs=$TARGET_SOCS --device_yml=${DEVICE_CONF_FILE} --example --round=1 --validate --layers=0 --model_graph_format=file --model_data_format=file || exit 1;
       done
     - rm -rf mace-models
   only:
@@ -162,7 +158,6 @@ dynamic_linking_test:
       fi
     - if [ -z "$TARGET_SOCS" ]; then TARGET_SOCS=random; fi
     - python tools/converter.py convert --config=${CONF_FILE}  --target_socs=$TARGET_SOCS --model_graph_format=file --model_data_format=file --cl_mem_type=buffer
-    - python tools/converter.py run --config=${CONF_FILE} --target_socs=$TARGET_SOCS --device_yml=${DEVICE_CONF_FILE} --example --mace_lib_type=dynamic --target_abis=armeabi-v7a,arm64 --round=1 --validate --model_graph_format=file --model_data_format=file
     - rm -rf mace-models
   only:
     - triggers
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 08d70f98bf7d68c56b8e066f0357b7751e16df6e..2f60c0a7a9c9b20a15a66de07d0acf9b0e9af0ef 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -12,7 +12,6 @@ option(MACE_ENABLE_HEXAGON_HTA "whether to enable Hexagon HTA support"      OFF)
 option(MACE_ENABLE_MTK_APU     "whether to enable MTK APU support"          OFF)
 option(MACE_ENABLE_TESTS       "whether to build c++ unit tests"            OFF)
 option(MACE_ENABLE_BENCHMARKS  "whether to build c++ micro benchmarks"      OFF)
-option(MACE_ENABLE_EXAMPLES    "whether to build examples"                  OFF)
 option(MACE_ENABLE_OPT_SIZE    "whether to build with optimized binary size" ON)
 option(MACE_ENABLE_OBFUSCATE   "whether to build with code obfuscation"      ON)
 option(MACE_ENABLE_CCACHE      "whether to build with ccache"                ON)
@@ -131,10 +130,6 @@ include_directories("${PROJECT_BINARY_DIR}") # proto
 add_subdirectory(include)
 add_subdirectory(mace)
 
-if(MACE_ENABLE_EXAMPLES)
-  add_subdirectory(examples)
-endif(MACE_ENABLE_EXAMPLES)
-
 if(MACE_ENABLE_TESTS OR MACE_ENABLE_BENCHMARKS)
   add_subdirectory(test)
 endif(MACE_ENABLE_TESTS OR MACE_ENABLE_BENCHMARKS)
diff --git a/docs/user_guide/advanced_usage.rst b/docs/user_guide/advanced_usage.rst
index 14c77cae38e0b6d29e2b9052ed86867be58b2f48..b86b37ef09215712bf2c5a5a20405658034b55c2 100644
--- a/docs/user_guide/advanced_usage.rst
+++ b/docs/user_guide/advanced_usage.rst
@@ -236,7 +236,7 @@ Convert model(s) to C++ code
 
     * **3. Deployment**
         * Link `libmace.a` and `${library_name}.a` to your target.
-        * Refer to \ ``mace/examples/example.cc``\ for full usage. The following list the key steps.
+        * Refer to \ ``mace/tools/mace_run.cc``\ for full usage. The following lists the key steps.
 
         .. code:: cpp
 
@@ -404,7 +404,7 @@ the detailed information is in :doc:`benchmark`.
 .. code:: sh
 
     # Benchmark model, get detailed statistics of each Op.
-    python tools/converter.py benchmark --config=/path/to/model_deployment_file.yml
+    python tools/converter.py run --config=/path/to/model_deployment_file.yml --benchmark
 
 
 .. warning::
@@ -424,17 +424,17 @@ the detailed information is in :doc:`benchmark`.
         * - --omp_num_threads
           - int
           - -1
-          - ``run``/``benchmark``
+          - ``run``
           - number of threads
         * - --cpu_affinity_policy
           - int
           - 1
-          - ``run``/``benchmark``
+          - ``run``
           - 0:AFFINITY_NONE/1:AFFINITY_BIG_ONLY/2:AFFINITY_LITTLE_ONLY
         * - --gpu_perf_hint
           - int
           - 3
-          - ``run``/``benchmark``
+          - ``run``
           - 0:DEFAULT/1:LOW/2:NORMAL/3:HIGH
         * - --gpu_priority_hint
           - int
@@ -449,7 +449,6 @@ Use ``-h`` to get detailed help.
     python tools/converter.py -h
     python tools/converter.py build -h
     python tools/converter.py run -h
-    python tools/converter.py benchmark -h
 
 Reduce Library Size
 -------------------
diff --git a/docs/user_guide/basic_usage.rst b/docs/user_guide/basic_usage.rst
index 3ec5886ef8836671c39600d20e227f78ac2903b6..759ca760dd55be1fe6dd13ae8930a91b97f42d34 100644
--- a/docs/user_guide/basic_usage.rst
+++ b/docs/user_guide/basic_usage.rst
@@ -75,8 +75,8 @@ Here we use the mobilenet-v2 model as an example.
 
     .. code:: sh
 
-        # Run example
-        python tools/converter.py run --config=/path/to/mace-models/mobilenet-v2/mobilenet-v2.yml --example
+        # Run model
+        python tools/converter.py run --config=/path/to/mace-models/mobilenet-v2/mobilenet-v2.yml
 
     	# Test model run time
         python tools/converter.py run --config=/path/to/mace-models/mobilenet-v2/mobilenet-v2.yml --round=100
@@ -233,7 +233,7 @@ to run and validate your model.
     .. code:: sh
 
         # Benchmark model, get detailed statistics of each Op.
-        python tools/converter.py benchmark --config=/path/to/your/model_deployment_file.yml
+        python tools/converter.py run --config=/path/to/your/model_deployment_file.yml --benchmark
 
 
 =======================================
@@ -308,7 +308,7 @@ header files.
                 └── mace_run_static
 
 
-Please refer to \ ``mace/examples/example.cc``\ for full usage. The following list the key steps.
+Please refer to \ ``mace/tools/mace_run.cc``\ for full usage. The following lists the key steps.
 
 .. code:: cpp
 
diff --git a/docs/user_guide/benchmark.rst b/docs/user_guide/benchmark.rst
index 7992c70f06375c9fc89ca64622a7ecf686d7ef67..f6a058952a098e41dcc3d7e49238bc4d4486aeaf 100644
--- a/docs/user_guide/benchmark.rst
+++ b/docs/user_guide/benchmark.rst
@@ -68,7 +68,7 @@ Usage
 
     .. code:: bash
 
-        python tools/converter.py benchmark --config=/path/to/your/model_deployment.yml
+        python tools/converter.py run --config=/path/to/your/model_deployment.yml --benchmark
 
 ======
 Output
@@ -76,29 +76,6 @@ Output
 
     .. code:: bash
 
-        I benchmark_model.cc:158 ---------------------------------------------------------------------
-        I benchmark_model.cc:158                                Warm Up
-        I benchmark_model.cc:158 ----------------------------------------------------------------------
-        I benchmark_model.cc:158 | round | first(ms) | curr(ms) | min(ms) | max(ms) | avg(ms) |   std |
-        I benchmark_model.cc:158 ----------------------------------------------------------------------
-        I benchmark_model.cc:158 |     1 |    51.481 |   51.481 |  51.481 |  51.481 |  51.481 | 0.000 |
-        I benchmark_model.cc:158 ----------------------------------------------------------------------
-        I benchmark_model.cc:158
-        I benchmark_model.cc:158 ------------------------------------------------------------------------
-        I benchmark_model.cc:158                          Run without statistics
-        I benchmark_model.cc:158 -------------------------------------------------------------------------
-        I benchmark_model.cc:158 | round | first(ms) | curr(ms) | min(ms) | max(ms) | avg(ms) |      std |
-        I benchmark_model.cc:158 -------------------------------------------------------------------------
-        I benchmark_model.cc:158 |   100 |    30.272 |   31.390 |  29.938 |  45.966 |  30.913 | 1850.983 |
-        I benchmark_model.cc:158 -------------------------------------------------------------------------
-        I benchmark_model.cc:158
-        I benchmark_model.cc:158 -----------------------------------------------------------------------
-        I benchmark_model.cc:158                           Run with statistics
-        I benchmark_model.cc:158 ------------------------------------------------------------------------
-        I benchmark_model.cc:158 | round | first(ms) | curr(ms) | min(ms) | max(ms) | avg(ms) |     std |
-        I benchmark_model.cc:158 ------------------------------------------------------------------------
-        I benchmark_model.cc:158 |   100 |    32.358 |   33.327 |  32.293 |  33.607 |  33.002 | 310.435 |
-        I benchmark_model.cc:158 ------------------------------------------------------------------------
         I statistics.cc:343 ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
         I statistics.cc:343                                                                                      Sort by Run Order
         I statistics.cc:343 ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
diff --git a/docs/user_guide/quantization_usage.rst b/docs/user_guide/quantization_usage.rst
index 2caecf6ef16fad87cad15ced132369ea52e8e9b3..320f16f3947ac08698ce92295e9f8f9b2d142752 100644
--- a/docs/user_guide/quantization_usage.rst
+++ b/docs/user_guide/quantization_usage.rst
@@ -52,7 +52,7 @@ MACE provides tools to do statistics with following steps:
 		rename 's/^/input/' *
 
 		# Run with input tensors
-		python tools/converter.py run --config ../mace-models/inception-v3/inception-v3.yml --example
+		python tools/converter.py run --config ../mace-models/inception-v3/inception-v3.yml \
 			--quantize_stat --input_dir /path/to/directory/of/input/tensors > range_log
 
 
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
deleted file mode 100644
index 24b69cd3ea02f2b2a63bec9e98c0797957569cbf..0000000000000000000000000000000000000000
--- a/examples/CMakeLists.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-file(GLOB MACE_EXAMPLE_SRCS
-  cli/example.cc
-)
-add_executable(mace_example ${MACE_EXAMPLE_SRCS})
-target_link_libraries(mace_example PUBLIC
-  mace_static
-  gflags
-)
-
-install(TARGETS mace_example RUNTIME DESTINATION bin)
diff --git a/examples/cli/BUILD.bazel b/examples/cli/BUILD.bazel
deleted file mode 100644
index ce3c1ea79b3eaca2a39b31caca7bd2d7bc058407..0000000000000000000000000000000000000000
--- a/examples/cli/BUILD.bazel
+++ /dev/null
@@ -1,80 +0,0 @@
-# Examples
-load(
-    "//mace:mace.bzl",
-    "if_android",
-    "if_darwin",
-    "if_hexagon_enabled",
-    "if_hta_enabled",
-    "if_linux",
-    "if_opencl_enabled",
-    "if_openmp_enabled",
-)
-
-cc_binary(
-    name = "example_static",
-    srcs = ["example.cc"],
-    copts = [
-        "-Werror",
-        "-Wextra",
-    ] + if_opencl_enabled([
-        "-DMACE_ENABLE_OPENCL",
-    ]),
-    linkopts = [
-        "-lm",
-        "-ldl",
-    ] + if_linux(["-lpthread"]) + if_darwin(
-        ["-lpthread"],
-        default_value = ["-fuse-ld=gold"],
-    ) + if_openmp_enabled([
-        "-fopenmp",
-    ]) + if_android([
-        "-pie",
-        "-llog",
-    ]),
-    linkstatic = 1,
-    deps = [
-        "//external:gflags_nothreads",
-        "//mace/codegen:generated_mace_engine_factory",
-        "//mace/codegen:generated_models",
-        "//mace/libmace",
-    ] + if_opencl_enabled([
-        "//mace/codegen:generated_opencl_binary",
-        "//mace/codegen:generated_opencl_parameter",
-    ]) + if_hexagon_enabled([
-        "//third_party/nnlib:libhexagon",
-    ]) + if_hta_enabled([
-        "//third_party/hta",
-    ]),
-)
-
-cc_binary(
-    name = "example_dynamic",
-    srcs = ["example.cc"],
-    copts = [
-        "-Werror",
-        "-Wextra",
-        "-Wno-missing-field-initializers",
-    ] + if_opencl_enabled([
-        "-DMACE_ENABLE_OPENCL",
-    ]),
-    linkopts = [
-        "-lm",
-        "-ldl",
-    ] + if_linux(["-lpthread"]) + if_darwin(
-        ["-lpthread"],
-        default_value = ["-fuse-ld=gold"],
-    ) + if_android([
-        "-pie",
-        "-llog",
-    ]),
-    linkstatic = 0,
-    deps = [
-        "//external:gflags_nothreads",
-        "//mace/codegen:generated_mace_engine_factory",
-        "//mace/codegen:generated_models",
-        "//mace/libmace:libmace_dynamic",
-    ] + if_opencl_enabled([
-        "//mace/codegen:generated_opencl_binary",
-        "//mace/codegen:generated_opencl_parameter",
-    ]),
-)
diff --git a/examples/cli/README.md b/examples/cli/README.md
deleted file mode 100644
index 50e64f950e80afa1cb72199df3f68e0c0e7b518b..0000000000000000000000000000000000000000
--- a/examples/cli/README.md
+++ /dev/null
@@ -1,23 +0,0 @@
-Examples
-=======
-
-* Convert model
-
-```
-python tools/converter.py convert --config=/path/to/your/model_deployment_file
-```
-
-* Run example
-```
-python tools/converter.py run --config=/path/to/your/model_deployment_file --example
-```
-
-* Validate result
-```
-python tools/converter.py run --config=/path/to/your/model_deployment_file --example --validate
-```
-
-* Check the logs
-```
-adb logcat
-```
diff --git a/examples/cli/example.cc b/examples/cli/example.cc
deleted file mode 100644
index 103138447cea0ed7a54f8374e88c4cdb0e69ca92..0000000000000000000000000000000000000000
--- a/examples/cli/example.cc
+++ /dev/null
@@ -1,465 +0,0 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <dirent.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <unistd.h>
-#include <cstdint>
-#include <cstdlib>
-#include <cstring>
-#include <fstream>
-#include <iostream>
-#include <numeric>
-
-#include "gflags/gflags.h"
-#include "mace/port/env.h"
-#include "mace/port/file_system.h"
-#include "mace/public/mace.h"
-#include "mace/utils/logging.h"
-#include "mace/utils/memory.h"
-#include "mace/utils/string_util.h"
-// if convert model to code.
-#ifdef MODEL_GRAPH_FORMAT_CODE
-#include "mace/codegen/engine/mace_engine_factory.h"
-#endif
-
-#ifdef MACE_ENABLE_OPENCL
-namespace mace {
-const unsigned char *LoadOpenCLBinary();
-size_t OpenCLBinarySize();
-const unsigned char *LoadOpenCLParameter();
-size_t OpenCLParameterSize();
-}  // namespace mace
-#endif
-
-
-namespace mace {
-namespace examples {
-
-void ParseShape(const std::string &str, std::vector<int64_t> *shape) {
-  std::string tmp = str;
-  while (!tmp.empty()) {
-    int dim = atoi(tmp.data());
-    shape->push_back(dim);
-    size_t next_offset = tmp.find(",");
-    if (next_offset == std::string::npos) {
-      break;
-    } else {
-      tmp = tmp.substr(next_offset + 1);
-    }
-  }
-}
-
-std::string FormatName(const std::string input) {
-  std::string res = input;
-  for (size_t i = 0; i < input.size(); ++i) {
-    if (!isalnum(res[i])) res[i] = '_';
-  }
-  return res;
-}
-
-DeviceType ParseDeviceType(const std::string &device_str) {
-  if (device_str.compare("CPU") == 0) {
-    return DeviceType::CPU;
-  } else if (device_str.compare("GPU") == 0) {
-    return DeviceType::GPU;
-  } else if (device_str.compare("HEXAGON") == 0) {
-    return DeviceType::HEXAGON;
-  } else if (device_str.compare("HTA") == 0) {
-    return DeviceType::HTA;
-  } else {
-    return DeviceType::CPU;
-  }
-}
-
-DataFormat ParseDataFormat(const std::string &data_format_str) {
-  if (data_format_str == "NHWC") {
-    return DataFormat::NHWC;
-  } else if (data_format_str == "NCHW") {
-    return DataFormat::NCHW;
-  } else if (data_format_str == "OIHW") {
-    return DataFormat::OIHW;
-  } else {
-    return DataFormat::NONE;
-  }
-}
-
-DEFINE_string(model_name,
-              "",
-              "model name in model deployment file");
-DEFINE_string(input_node,
-              "",
-              "input nodes, separated by comma,"
-              "example: input_node0,input_node1");
-DEFINE_string(input_shape,
-              "",
-              "input shapes, separated by colon and comma, "
-              "example: 1,224,224,3:1,1,1,10");
-DEFINE_string(output_node,
-              "output_node0,output_node1",
-              "output nodes, separated by comma");
-DEFINE_string(output_shape,
-              "",
-              "output shapes, separated by colon and comma, "
-              "example: 1,224,224,2:1,1,1,10");
-DEFINE_string(input_data_format,
-              "NHWC",
-              "input data formats, NONE|NHWC|NCHW");
-DEFINE_string(output_data_format,
-              "NHWC",
-              "output data formats, NONE|NHWC|NCHW");
-DEFINE_string(input_file,
-              "",
-              "input file name | input file prefix for multiple inputs.");
-DEFINE_string(output_file,
-              "",
-              "output file name | output file prefix for multiple outputs");
-DEFINE_string(input_dir,
-              "",
-              "input directory name");
-DEFINE_string(output_dir,
-              "",
-              "output directory name");
-DEFINE_string(opencl_binary_file,
-              "",
-              "compiled opencl binary file path");
-DEFINE_string(opencl_parameter_file,
-              "",
-              "tuned OpenCL parameter file path");
-DEFINE_string(model_data_file,
-              "",
-              "model data file name, used when model_data_format == file");
-DEFINE_string(model_file,
-              "",
-              "model file name, used when load mace model in pb");
-DEFINE_string(device, "GPU", "CPU/GPU/HEXAGON");
-DEFINE_int32(round, 1, "round");
-DEFINE_int32(restart_round, 1, "restart round");
-DEFINE_int32(malloc_check_cycle, -1, "malloc debug check cycle, -1 to disable");
-DEFINE_int32(gpu_perf_hint, 2, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
-DEFINE_int32(gpu_priority_hint, 1, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
-DEFINE_int32(omp_num_threads, -1, "num of openmp threads");
-DEFINE_int32(cpu_affinity_policy, 1,
-             "0:AFFINITY_NONE/1:AFFINITY_BIG_ONLY/2:AFFINITY_LITTLE_ONLY");
-
-bool RunModel(const std::vector<std::string> &input_names,
-              const std::vector<std::vector<int64_t>> &input_shapes,
-              const std::vector<DataFormat> &input_data_formats,
-              const std::vector<std::string> &output_names,
-              const std::vector<std::vector<int64_t>> &output_shapes,
-              const std::vector<DataFormat> &output_data_formats) {
-  // load model
-  DeviceType device_type = ParseDeviceType(FLAGS_device);
-  // configuration
-  // Detailed information please see mace.h
-  MaceStatus status;
-  MaceEngineConfig config(device_type);
-  status = config.SetCPUThreadPolicy(
-      FLAGS_omp_num_threads,
-      static_cast<CPUAffinityPolicy>(FLAGS_cpu_affinity_policy));
-  if (status != MaceStatus::MACE_SUCCESS) {
-    std::cerr << "Set openmp or cpu affinity failed." << std::endl;
-  }
-#ifdef MACE_ENABLE_OPENCL
-  std::shared_ptr<GPUContext> gpu_context;
-  if (device_type == DeviceType::GPU) {
-    // DO NOT USE tmp directory.
-    // Please use APP's own directory and make sure the directory exists.
-    const char *storage_path_ptr = getenv("MACE_INTERNAL_STORAGE_PATH");
-    const std::string storage_path =
-        std::string(storage_path_ptr == nullptr ?
-                    "/data/local/tmp/mace_run/interior" : storage_path_ptr);
-    std::vector<std::string> opencl_binary_paths = {FLAGS_opencl_binary_file};
-
-    gpu_context = GPUContextBuilder()
-        .SetStoragePath(storage_path)
-        .SetOpenCLBinaryPaths(opencl_binary_paths)
-        .SetOpenCLBinary(LoadOpenCLBinary(), OpenCLBinarySize())
-        .SetOpenCLParameterPath(FLAGS_opencl_parameter_file)
-        .SetOpenCLParameter(LoadOpenCLParameter(), OpenCLParameterSize())
-        .Finalize();
-
-    config.SetGPUContext(gpu_context);
-    config.SetGPUHints(
-        static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
-        static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
-  }
-#endif  // MACE_ENABLE_OPENCL
-
-  // Create Engine
-  std::shared_ptr<mace::MaceEngine> engine;
-  MaceStatus create_engine_status;
-
-  std::unique_ptr<mace::port::ReadOnlyMemoryRegion> model_graph_data =
-    make_unique<mace::port::ReadOnlyBufferMemoryRegion>();
-  if (FLAGS_model_file != "") {
-    auto fs = GetFileSystem();
-    auto status = fs->NewReadOnlyMemoryRegionFromFile(FLAGS_model_file.c_str(),
-        &model_graph_data);
-    if (status != MaceStatus::MACE_SUCCESS) {
-      LOG(FATAL) << "Failed to read file: " << FLAGS_model_file;
-    }
-  }
-
-  std::unique_ptr<mace::port::ReadOnlyMemoryRegion> model_weights_data =
-    make_unique<mace::port::ReadOnlyBufferMemoryRegion>();
-  if (FLAGS_model_data_file != "") {
-    auto fs = GetFileSystem();
-    auto status = fs->NewReadOnlyMemoryRegionFromFile(
-        FLAGS_model_data_file.c_str(),
-        &model_weights_data);
-    if (status != MaceStatus::MACE_SUCCESS) {
-      LOG(FATAL) << "Failed to read file: " << FLAGS_model_data_file;
-    }
-    MACE_CHECK(model_weights_data->length() > 0);
-  }
-
-  // Only choose one of the two type based on the `model_graph_format`
-  // in model deployment file(.yml).
-#ifdef MODEL_GRAPH_FORMAT_CODE
-  // if model_data_format == code, just pass an empty string("")
-  // to model_data_file parameter.
-  create_engine_status = CreateMaceEngineFromCode(
-      FLAGS_model_name,
-      reinterpret_cast<const unsigned char *>(model_weights_data->data()),
-      model_weights_data->length(),
-      input_names,
-      output_names,
-      config,
-      &engine);
-#else
-  create_engine_status = CreateMaceEngineFromProto(
-      reinterpret_cast<const unsigned char *>(model_graph_data->data()),
-      model_graph_data->length(),
-      reinterpret_cast<const unsigned char *>(model_weights_data->data()),
-      model_weights_data->length(),
-      input_names,
-      output_names,
-      config,
-      &engine);
-#endif
-
-  if (create_engine_status != MaceStatus::MACE_SUCCESS) {
-    std::cerr << "Create engine error, please check the arguments first, "
-              << "if correct, the device may not run the model, "
-              << "please fall back to other strategy."
-              << std::endl;
-    exit(1);
-  }
-
-  const size_t input_count = input_names.size();
-  const size_t output_count = output_names.size();
-
-  std::map<std::string, mace::MaceTensor> inputs;
-  std::map<std::string, mace::MaceTensor> outputs;
-  std::map<std::string, int64_t> inputs_size;
-  for (size_t i = 0; i < input_count; ++i) {
-    int64_t input_size =
-        std::accumulate(input_shapes[i].begin(), input_shapes[i].end(), 1,
-                        std::multiplies<int64_t>());
-    inputs_size[input_names[i]] = input_size;
-    // Only support float and int32 data type
-    auto buffer_in = std::shared_ptr<float>(new float[input_size],
-                                            std::default_delete<float[]>());
-    inputs[input_names[i]] = mace::MaceTensor(input_shapes[i], buffer_in,
-        input_data_formats[i]);
-  }
-
-  for (size_t i = 0; i < output_count; ++i) {
-    int64_t output_size =
-        std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 1,
-                        std::multiplies<int64_t>());
-    // Only support float and int32 data type
-    auto buffer_out = std::shared_ptr<float>(new float[output_size],
-                                             std::default_delete<float[]>());
-    outputs[output_names[i]] = mace::MaceTensor(output_shapes[i], buffer_out,
-        output_data_formats[i]);
-  }
-
-  if (!FLAGS_input_dir.empty()) {
-    DIR *dir_parent;
-    struct dirent *entry;
-    dir_parent = opendir(FLAGS_input_dir.c_str());
-    if (dir_parent) {
-      while ((entry = readdir(dir_parent))) {
-        std::string file_name = std::string(entry->d_name);
-        std::string prefix = FormatName(input_names[0]);
-        if (file_name.find(prefix) == 0) {
-          std::string suffix = file_name.substr(prefix.size());
-
-          for (size_t i = 0; i < input_count; ++i) {
-            file_name = FLAGS_input_dir + "/" + FormatName(input_names[i])
-                + suffix;
-            std::ifstream in_file(file_name, std::ios::in | std::ios::binary);
-            std::cout << "Read " << file_name << std::endl;
-            if (in_file.is_open()) {
-              in_file.read(reinterpret_cast<char *>(
-                               inputs[input_names[i]].data().get()),
-                           inputs_size[input_names[i]] * sizeof(float));
-              in_file.close();
-            } else {
-              std::cerr << "Open input file failed" << std::endl;
-              return -1;
-            }
-          }
-          engine->Run(inputs, &outputs);
-
-          if (!FLAGS_output_dir.empty()) {
-            for (size_t i = 0; i < output_count; ++i) {
-              std::string output_name =
-                  FLAGS_output_dir + "/" + FormatName(output_names[i]) + suffix;
-              std::ofstream out_file(output_name, std::ios::binary);
-              if (out_file.is_open()) {
-                int64_t output_size =
-                    std::accumulate(output_shapes[i].begin(),
-                                    output_shapes[i].end(),
-                                    1,
-                                    std::multiplies<int64_t>());
-                out_file.write(
-                    reinterpret_cast<char *>(
-                        outputs[output_names[i]].data().get()),
-                    output_size * sizeof(float));
-                out_file.flush();
-                out_file.close();
-              } else {
-                std::cerr << "Open output file failed" << std::endl;
-                return -1;
-              }
-            }
-          }
-        }
-      }
-
-      closedir(dir_parent);
-    } else {
-      std::cerr << "Directory " << FLAGS_input_dir << " does not exist."
-                << std::endl;
-    }
-  } else {
-    for (size_t i = 0; i < input_count; ++i) {
-      std::ifstream in_file(FLAGS_input_file + "_" + FormatName(input_names[i]),
-                            std::ios::in | std::ios::binary);
-      if (in_file.is_open()) {
-        in_file.read(reinterpret_cast<char *>(
-                         inputs[input_names[i]].data().get()),
-                     inputs_size[input_names[i]] * sizeof(float));
-        in_file.close();
-      } else {
-        std::cerr << "Open input file failed" << std::endl;
-        return -1;
-      }
-    }
-    engine->Run(inputs, &outputs);
-    for (size_t i = 0; i < output_count; ++i) {
-      std::string output_name =
-          FLAGS_output_file + "_" + FormatName(output_names[i]);
-      std::ofstream out_file(output_name, std::ios::binary);
-      int64_t output_size =
-          std::accumulate(output_shapes[i].begin(), output_shapes[i].end(), 1,
-                          std::multiplies<int64_t>());
-      if (out_file.is_open()) {
-        out_file.write(
-            reinterpret_cast<char *>(outputs[output_names[i]].data().get()),
-            output_size * sizeof(float));
-        out_file.flush();
-        out_file.close();
-      } else {
-        std::cerr << "Open output file failed" << std::endl;
-        return -1;
-      }
-    }
-  }
-
-  std::cout << "Finished" << std::endl;
-
-  return true;
-}
-
-int Main(int argc, char **argv) {
-  std::string usage = "example run\nusage: " + std::string(argv[0])
-      + " [flags]";
-  gflags::SetUsageMessage(usage);
-  gflags::ParseCommandLineFlags(&argc, &argv, true);
-
-  std::cout << "mace version: " << MaceVersion() << std::endl;
-  std::cout << "input node: " << FLAGS_input_node << std::endl;
-  std::cout << "input shape: " << FLAGS_input_shape << std::endl;
-  std::cout << "output node: " << FLAGS_output_node << std::endl;
-  std::cout << "output shape: " << FLAGS_output_shape << std::endl;
-  std::cout << "input_file: " << FLAGS_input_file << std::endl;
-  std::cout << "output_file: " << FLAGS_output_file << std::endl;
-  std::cout << "input_dir: " << FLAGS_input_dir << std::endl;
-  std::cout << "output dir: " << FLAGS_output_dir << std::endl;
-  std::cout << "model_data_file: " << FLAGS_model_data_file << std::endl;
-  std::cout << "model_file: " << FLAGS_model_file << std::endl;
-  std::cout << "device: " << FLAGS_device << std::endl;
-  std::cout << "round: " << FLAGS_round << std::endl;
-  std::cout << "restart_round: " << FLAGS_restart_round << std::endl;
-  std::cout << "gpu_perf_hint: " << FLAGS_gpu_perf_hint << std::endl;
-  std::cout << "gpu_priority_hint: " << FLAGS_gpu_priority_hint << std::endl;
-  std::cout << "omp_num_threads: " << FLAGS_omp_num_threads << std::endl;
-  std::cout << "cpu_affinity_policy: "
-            << FLAGS_cpu_affinity_policy
-            << std::endl;
-
-  std::vector<std::string> input_names = Split(FLAGS_input_node, ',');
-  std::vector<std::string> output_names = Split(FLAGS_output_node, ',');
-  std::vector<std::string> input_shapes = Split(FLAGS_input_shape, ':');
-  std::vector<std::string> output_shapes = Split(FLAGS_output_shape, ':');
-
-  const size_t input_count = input_shapes.size();
-  const size_t output_count = output_shapes.size();
-  std::vector<std::vector<int64_t>> input_shape_vec(input_count);
-  std::vector<std::vector<int64_t>> output_shape_vec(output_count);
-  for (size_t i = 0; i < input_count; ++i) {
-    ParseShape(input_shapes[i], &input_shape_vec[i]);
-  }
-  for (size_t i = 0; i < output_count; ++i) {
-    ParseShape(output_shapes[i], &output_shape_vec[i]);
-  }
-
-  std::vector<std::string> raw_input_data_formats =
-    Split(FLAGS_input_data_format, ',');
-  std::vector<std::string> raw_output_data_formats =
-    Split(FLAGS_output_data_format, ',');
-  std::vector<DataFormat> input_data_formats(input_count);
-  std::vector<DataFormat> output_data_formats(output_count);
-  for (size_t i = 0; i < input_count; ++i) {
-    input_data_formats[i] = ParseDataFormat(raw_input_data_formats[i]);
-  }
-  for (size_t i = 0; i < output_count; ++i) {
-    output_data_formats[i] = ParseDataFormat(raw_output_data_formats[i]);
-  }
-
-  bool ret = false;
-  for (int i = 0; i < FLAGS_restart_round; ++i) {
-    std::cout << "restart round " << i << std::endl;
-    ret =
-        RunModel(input_names, input_shape_vec, input_data_formats,
-                 output_names, output_shape_vec, output_data_formats);
-  }
-  if (ret) {
-    return 0;
-  } else {
-    return -1;
-  }
-}
-
-}  // namespace examples
-}  // namespace mace
-
-int main(int argc, char **argv) { mace::examples::Main(argc, argv); }
diff --git a/mace/core/net.cc b/mace/core/net.cc
index 8c301dc728f0af53137023f4d019e9a89cf3e6ce..78d40dd7f57440055eea4c48c375071db2e6bf13 100644
--- a/mace/core/net.cc
+++ b/mace/core/net.cc
@@ -124,6 +124,11 @@ MaceStatus SerialNet::Init() {
 }
 
 MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
+  // GPU ops are timed with NowMicros() unless MACE_OPENCL_PROFILING is "1".
+  const char *profiling = getenv("MACE_OPENCL_PROFILING");
+  const bool enable_opencl_profiling =
+      profiling != nullptr && strlen(profiling) == 1 && profiling[0] == '1';
+
   MACE_MEMORY_LOGGING_GUARD();
   MACE_LATENCY_LOGGER(1, "Running net");
   OpContext context(ws_, cpu_device_.get());
@@ -146,7 +151,8 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
     if (run_metadata == nullptr) {
       MACE_RETURN_IF_ERROR(op->Run(&context));
     } else {
-      if (device_type == DeviceType::CPU) {
+      if (device_type == DeviceType::CPU
+          || (device_type == DeviceType::GPU && !enable_opencl_profiling)) {
         call_stats.start_micros = NowMicros();
         MACE_RETURN_IF_ERROR(op->Run(&context));
         call_stats.end_micros = NowMicros();
diff --git a/mace/tools/validation/BUILD.bazel b/mace/tools/BUILD.bazel
similarity index 59%
rename from mace/tools/validation/BUILD.bazel
rename to mace/tools/BUILD.bazel
index 476fc15a66ec0792d657b0ad2250730ea0ff05fe..95d1f38662415e68f1a2a5898ac1081a175339db 100644
--- a/mace/tools/validation/BUILD.bazel
+++ b/mace/tools/BUILD.bazel
@@ -1,53 +1,55 @@
+# Benchmark
 # Examples
 load(
     "//mace:mace.bzl",
     "if_android",
-    "if_darwin",
+    "if_hexagon_enabled",
     "if_opencl_enabled",
     "if_openmp_enabled",
 )
 
+licenses(["notice"])  # Apache 2.0
+
 cc_binary(
     name = "mace_run_static",
-    srcs = ["mace_run.cc"],
+    srcs = [
+        "mace_run.cc",
+    ],
     copts = [
         "-Werror",
         "-Wextra",
-    ] + if_opencl_enabled([
-        "-DMACE_ENABLE_OPENCL",
-    ]),
-    linkopts = if_darwin(
-        [],
-        default_value = ["-fuse-ld=gold"],
-    ) + if_openmp_enabled([
-        "-fopenmp",
-    ]),
+        "-Wno-missing-field-initializers",
+    ] + if_opencl_enabled(["-DMACE_ENABLE_OPENCL"]),
     linkstatic = 1,
     deps = [
         "//external:gflags_nothreads",
         "//mace/codegen:generated_mace_engine_factory",
         "//mace/codegen:generated_models",
         "//mace/libmace",
+        "//mace/utils",
     ],
 )
 
 cc_binary(
     name = "mace_run_dynamic",
-    srcs = ["mace_run.cc"],
+    srcs = [
+        "mace_run.cc",
+    ],
     copts = [
         "-Werror",
         "-Wextra",
-    ] + if_opencl_enabled([
-        "-DMACE_ENABLE_OPENCL",
-    ]),
-    linkopts = if_darwin(
-        [],
-        default_value = ["-fuse-ld=gold"],
-    ) + if_openmp_enabled([
-        "-fopenmp",
+        "-Wno-missing-field-initializers",
+    ] + if_opencl_enabled(["-DMACE_ENABLE_OPENCL"]),
+    linkopts = [
+        "-lm",
+    ] + if_android([
+        "-ldl",
+        "-pie",
+        "-llog",
     ]),
     linkstatic = 0,
     deps = [
+        "//mace/utils",
         "//external:gflags_nothreads",
         "//mace/codegen:generated_mace_engine_factory",
         "//mace/codegen:generated_models",
diff --git a/mace/tools/CMakeLists.txt b/mace/tools/CMakeLists.txt
index a23460711dc66033a21c2b84ef6caf1caec7c9cb..a01661430d57c54e85f266f29ff01baac4f65ad1 100644
--- a/mace/tools/CMakeLists.txt
+++ b/mace/tools/CMakeLists.txt
@@ -1,5 +1,5 @@
 file(GLOB MACE_RUN_SRCS
-  validation/mace_run.cc
+  mace_run.cc
 )
 add_executable(mace_run ${MACE_RUN_SRCS})
 target_link_libraries(mace_run PUBLIC
@@ -7,25 +7,4 @@ target_link_libraries(mace_run PUBLIC
   gflags
 )
 
-
-file(GLOB MACE_BENCHMARK_MODEL_SRCS
-  benchmark/benchmark_model.cc
-)
-add_executable(benchmark_model ${MACE_BENCHMARK_MODEL_SRCS})
-target_link_libraries(benchmark_model PUBLIC
-  mace_static
-  gflags
-)
-
-file(GLOB MACE_BENCHMARK_MODEL_THROUGHPUT_SRCS
-  benchmark/benchmark_model_throughput.cc
-)
-add_executable(benchmark_model_throughput ${MACE_BENCHMARK_MODEL_THROUGHPUT_SRCS})
-target_link_libraries(benchmark_model_throughput PUBLIC
-  mace_static
-  gflags
-)
-
 install(TARGETS mace_run RUNTIME DESTINATION bin)
-install(TARGETS benchmark_model RUNTIME DESTINATION bin)
-install(TARGETS benchmark_model_throughput RUNTIME DESTINATION bin)
diff --git a/mace/tools/benchmark/BUILD.bazel b/mace/tools/benchmark/BUILD.bazel
deleted file mode 100644
index b1528e62b2589a5b282646040b56feeea4d8fe0c..0000000000000000000000000000000000000000
--- a/mace/tools/benchmark/BUILD.bazel
+++ /dev/null
@@ -1,86 +0,0 @@
-# Benchmark
-# Examples
-load(
-    "//mace:mace.bzl",
-    "if_hexagon_enabled",
-    "if_openmp_enabled",
-    "if_android",
-    "if_opencl_enabled",
-)
-
-licenses(["notice"])  # Apache 2.0
-
-cc_binary(
-    name = "benchmark_model_static",
-    srcs = [
-        "benchmark_model.cc",
-    ],
-    copts = [
-        "-Werror",
-        "-Wextra",
-        "-Wno-missing-field-initializers",
-    ] + if_opencl_enabled(["-DMACE_ENABLE_OPENCL"]),
-    linkopts = if_openmp_enabled(["-fopenmp"]),
-    linkstatic = 1,
-    deps = [
-        "//external:gflags_nothreads",
-        "//mace/codegen:generated_mace_engine_factory",
-        "//mace/codegen:generated_models",
-        "//mace/libmace",
-        "//mace/utils",
-    ],
-)
-
-cc_binary(
-    name = "benchmark_model_dynamic",
-    srcs = [
-        "benchmark_model.cc",
-    ],
-    copts = [
-        "-Werror",
-        "-Wextra",
-        "-Wno-missing-field-initializers",
-    ] + if_android(["-DMACE_ENABLE_OPENCL"]),
-    linkopts = [
-        "-lm",
-    ] + if_openmp_enabled([
-        "-fopenmp",
-    ]) + if_android([
-        "-ldl",
-        "-pie",
-        "-llog",
-    ]),
-    linkstatic = 0,
-    deps = [
-        ":statistics",
-        "//external:gflags_nothreads",
-        "//mace/codegen:generated_mace_engine_factory",
-        "//mace/codegen:generated_models",
-        "//mace/libmace:libmace_dynamic",
-    ],
-)
-
-cc_library(
-    name = "libmace_merged",
-    srcs = [
-        "libmace_merged.a",
-    ],
-    visibility = ["//visibility:private"],
-)
-
-cc_binary(
-    name = "model_throughput_test",
-    srcs = ["model_throughput_test.cc"],
-    copts = [
-        "-Werror",
-        "-Wextra",
-        "-Wno-missing-field-initializers",
-    ],
-    linkopts = if_openmp_enabled(["-fopenmp"]),
-    linkstatic = 1,
-    deps = [
-        ":libmace_merged",
-        "//external:gflags_nothreads",
-        "//mace/core",
-    ],
-)
diff --git a/mace/tools/benchmark/benchmark_model.cc b/mace/tools/benchmark/benchmark_model.cc
deleted file mode 100644
index a81c74720e92f95eacb5f4b0fe6f60084c54dbb1..0000000000000000000000000000000000000000
--- a/mace/tools/benchmark/benchmark_model.cc
+++ /dev/null
@@ -1,401 +0,0 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <cstdlib>
-#include <fstream>
-#include <memory>
-#include <numeric>
-#include <thread>  // NOLINT(build/c++11)
-
-#include "gflags/gflags.h"
-#include "mace/port/env.h"
-#include "mace/port/file_system.h"
-#include "mace/public/mace.h"
-#include "mace/utils/logging.h"
-#include "mace/utils/memory.h"
-#include "mace/utils/math.h"
-#include "mace/utils/statistics.h"
-#ifdef MODEL_GRAPH_FORMAT_CODE
-#include "mace/codegen/engine/mace_engine_factory.h"
-#endif
-
-namespace mace {
-namespace benchmark {
-
-void ParseShape(const std::string &str, std::vector<int64_t> *shape) {
-  std::string tmp = str;
-  while (!tmp.empty()) {
-    int dim = atoi(tmp.data());
-    shape->push_back(dim);
-    size_t next_offset = tmp.find(",");
-    if (next_offset == std::string::npos) {
-      break;
-    } else {
-      tmp = tmp.substr(next_offset + 1);
-    }
-  }
-}
-
-std::string FormatName(const std::string input) {
-  std::string res = input;
-  for (size_t i = 0; i < input.size(); ++i) {
-    if (!::isalnum(res[i])) res[i] = '_';
-  }
-  return res;
-}
-
-DeviceType ParseDeviceType(const std::string &device_str) {
-  if (device_str.compare("CPU") == 0) {
-    return DeviceType::CPU;
-  } else if (device_str.compare("GPU") == 0) {
-    return DeviceType::GPU;
-  } else if (device_str.compare("HEXAGON") == 0) {
-    return DeviceType::HEXAGON;
-  } else {
-    return DeviceType::CPU;
-  }
-}
-
-DataFormat ParseDataFormat(const std::string &data_format_str) {
-  if (data_format_str == "NHWC") {
-    return DataFormat::NHWC;
-  } else if (data_format_str == "NCHW") {
-    return DataFormat::NCHW;
-  } else if (data_format_str == "OIHW") {
-    return DataFormat::OIHW;
-  } else {
-    return DataFormat::NONE;
-  }
-}
-
-bool RunInference(MaceEngine *engine,
-                  const std::map<std::string, mace::MaceTensor> &input_infos,
-                  std::map<std::string, mace::MaceTensor> *output_infos,
-                  int64_t *inference_time_us,
-                  OpStat *statistician) {
-  MACE_CHECK_NOTNULL(output_infos);
-  RunMetadata run_metadata;
-  RunMetadata *run_metadata_ptr = nullptr;
-  if (statistician) {
-    run_metadata_ptr = &run_metadata;
-  }
-
-  const int64_t start_time = NowMicros();
-  mace::MaceStatus s = engine->Run(input_infos, output_infos, run_metadata_ptr);
-  const int64_t end_time = NowMicros();
-
-  if (s != mace::MaceStatus::MACE_SUCCESS) {
-    LOG(ERROR) << "Error during inference.";
-    return false;
-  }
-  *inference_time_us = end_time - start_time;
-
-  if (statistician != nullptr) {
-    statistician->StatMetadata(run_metadata);
-  }
-
-  return true;
-}
-
-bool Run(const std::string &title,
-         MaceEngine *engine,
-         const std::map<std::string, mace::MaceTensor> &input_infos,
-         std::map<std::string, mace::MaceTensor> *output_infos,
-         int num_runs,
-         double max_time_sec,
-         int64_t *total_time_us,
-         int64_t *actual_num_runs,
-         OpStat *statistician) {
-  MACE_CHECK_NOTNULL(output_infos);
-  *total_time_us = 0;
-
-  TimeInfo<int64_t> time_info;
-
-  bool util_max_time = (num_runs <= 0);
-  for (int i = 0; util_max_time || i < num_runs; ++i) {
-    int64_t inference_time_us = 0;
-    bool s = RunInference(engine, input_infos, output_infos,
-                          &inference_time_us, statistician);
-    time_info.UpdateTime(inference_time_us);
-    (*total_time_us) += inference_time_us;
-    ++(*actual_num_runs);
-
-    if (max_time_sec > 0 && (*total_time_us / 1000000.0) > max_time_sec) {
-      break;
-    }
-
-    if (!s) {
-      LOG(INFO) << "Failed on run " << i;
-      return s;
-    }
-  }
-
-  std::stringstream stream(time_info.ToString(title));
-  stream << std::endl;
-  for (std::string line; std::getline(stream, line);) {
-    LOG(INFO) << line;
-  }
-  return true;
-}
-
-DEFINE_string(model_name, "", "model name in yaml");
-DEFINE_string(device, "CPU", "Device [CPU|GPU|DSP]");
-DEFINE_string(input_node, "input_node0,input_node1",
-              "input nodes, separated by comma");
-DEFINE_string(output_node, "output_node0,output_node1",
-              "output nodes, separated by comma");
-DEFINE_string(input_shape, "", "input shape, separated by colon and comma");
-DEFINE_string(output_shape, "", "output shape, separated by colon and comma");
-DEFINE_string(input_data_format,
-              "NHWC",
-              "input data formats, NONE|NHWC|NCHW");
-DEFINE_string(output_data_format,
-              "NHWC",
-              "output data formats, NONE|NHWC|NCHW");
-DEFINE_string(input_file, "", "input file name");
-DEFINE_int32(max_num_runs, 100, "max number of runs");
-DEFINE_double(max_seconds, 10.0, "max number of seconds to run");
-DEFINE_int32(warmup_runs, 1, "how many runs to initialize model");
-DEFINE_string(opencl_binary_file,
-              "",
-              "compiled opencl binary file path");
-DEFINE_string(opencl_parameter_file,
-              "",
-              "tuned OpenCL parameter file path");
-DEFINE_string(model_data_file, "",
-              "model data file name, used when EMBED_MODEL_DATA set to 0");
-DEFINE_string(model_file, "",
-              "model file name, used when load mace model in pb");
-DEFINE_int32(gpu_perf_hint, 3, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
-DEFINE_int32(gpu_priority_hint, 3, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
-DEFINE_int32(omp_num_threads, -1, "num of openmp threads");
-DEFINE_int32(cpu_affinity_policy, 1,
-             "0:AFFINITY_NONE/1:AFFINITY_BIG_ONLY/2:AFFINITY_LITTLE_ONLY");
-
-int Main(int argc, char **argv) {
-  MACE_CHECK(FLAGS_device != "HEXAGON",
-             "Model benchmark tool do not support DSP.");
-  std::string usage = "benchmark model\nusage: " + std::string(argv[0])
-      + " [flags]";
-  gflags::SetUsageMessage(usage);
-  gflags::ParseCommandLineFlags(&argc, &argv, true);
-
-  LOG(INFO) << "Model name: [" << FLAGS_model_name << "]";
-  LOG(INFO) << "Model_file: " << FLAGS_model_file;
-  LOG(INFO) << "Device: [" << FLAGS_device << "]";
-  LOG(INFO) << "gpu_perf_hint: [" << FLAGS_gpu_perf_hint << "]";
-  LOG(INFO) << "gpu_priority_hint: [" << FLAGS_gpu_priority_hint << "]";
-  LOG(INFO) << "omp_num_threads: [" << FLAGS_omp_num_threads << "]";
-  LOG(INFO) << "cpu_affinity_policy: [" << FLAGS_cpu_affinity_policy << "]";
-  LOG(INFO) << "Input node: [" << FLAGS_input_node<< "]";
-  LOG(INFO) << "Input shapes: [" << FLAGS_input_shape << "]";
-  LOG(INFO) << "Output node: [" << FLAGS_output_node<< "]";
-  LOG(INFO) << "output shapes: [" << FLAGS_output_shape << "]";
-  LOG(INFO) << "Warmup runs: [" << FLAGS_warmup_runs << "]";
-  LOG(INFO) << "Num runs: [" << FLAGS_max_num_runs << "]";
-  LOG(INFO) << "Max run seconds: [" << FLAGS_max_seconds << "]";
-
-  std::unique_ptr<OpStat> statistician(new OpStat());
-
-  std::vector<std::string> input_names = Split(FLAGS_input_node, ',');
-  std::vector<std::string> output_names = Split(FLAGS_output_node, ',');
-  std::vector<std::string> input_shapes = Split(FLAGS_input_shape, ':');
-  std::vector<std::string> output_shapes = Split(FLAGS_output_shape, ':');
-
-  const size_t input_count = input_shapes.size();
-  const size_t output_count = output_shapes.size();
-  std::vector<std::vector<int64_t>> input_shape_vec(input_count);
-  std::vector<std::vector<int64_t>> output_shape_vec(output_count);
-  for (size_t i = 0; i < input_count; ++i) {
-    ParseShape(input_shapes[i], &input_shape_vec[i]);
-  }
-  for (size_t i = 0; i < output_count; ++i) {
-    ParseShape(output_shapes[i], &output_shape_vec[i]);
-  }
-
-  std::vector<std::string> raw_input_data_formats =
-      Split(FLAGS_input_data_format, ',');
-  std::vector<std::string> raw_output_data_formats =
-      Split(FLAGS_output_data_format, ',');
-  std::vector<DataFormat> input_data_formats(input_count);
-  std::vector<DataFormat> output_data_formats(output_count);
-  for (size_t i = 0; i < input_count; ++i) {
-    input_data_formats[i] = ParseDataFormat(raw_input_data_formats[i]);
-  }
-  for (size_t i = 0; i < output_count; ++i) {
-    output_data_formats[i] = ParseDataFormat(raw_output_data_formats[i]);
-  }
-
-  mace::DeviceType device_type = ParseDeviceType(FLAGS_device);
-
-  // configuration
-  MaceStatus mace_status;
-  MaceEngineConfig config(device_type);
-  mace_status = config.SetCPUThreadPolicy(
-      FLAGS_omp_num_threads,
-      static_cast<CPUAffinityPolicy >(FLAGS_cpu_affinity_policy));
-  if (mace_status != MaceStatus::MACE_SUCCESS) {
-    LOG(INFO) << "Set openmp or cpu affinity failed.";
-  }
-#ifdef MACE_ENABLE_OPENCL
-  std::shared_ptr<GPUContext> gpu_context;
-  if (device_type == DeviceType::GPU) {
-    // DO NOT USE tmp directory.
-    // Please use APP's own directory and make sure the directory exists.
-    const char *storage_path_ptr = getenv("MACE_INTERNAL_STORAGE_PATH");
-    const std::string storage_path =
-        std::string(storage_path_ptr == nullptr ?
-                    "/data/local/tmp/mace_run/interior" : storage_path_ptr);
-    std::vector<std::string> opencl_binary_paths = {FLAGS_opencl_binary_file};
-
-    gpu_context = GPUContextBuilder()
-        .SetStoragePath(storage_path)
-        .SetOpenCLBinaryPaths(opencl_binary_paths)
-        .SetOpenCLParameterPath(FLAGS_opencl_parameter_file)
-        .Finalize();
-
-    config.SetGPUContext(gpu_context);
-    config.SetGPUHints(
-        static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
-        static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
-  }
-#endif  // MACE_ENABLE_OPENCL
-
-  // Create Engine
-  std::shared_ptr<mace::MaceEngine> engine;
-  MaceStatus create_engine_status;
-  // Create Engine
-  std::unique_ptr<mace::port::ReadOnlyMemoryRegion> model_graph_data =
-    make_unique<mace::port::ReadOnlyBufferMemoryRegion>();
-  if (FLAGS_model_file != "") {
-    auto fs = GetFileSystem();
-    auto status = fs->NewReadOnlyMemoryRegionFromFile(FLAGS_model_file.c_str(),
-        &model_graph_data);
-    if (status != MaceStatus::MACE_SUCCESS) {
-      LOG(FATAL) << "Failed to read file: " << FLAGS_model_file;
-    }
-  }
-
-  std::unique_ptr<mace::port::ReadOnlyMemoryRegion> model_weights_data =
-    make_unique<mace::port::ReadOnlyBufferMemoryRegion>();
-  if (FLAGS_model_data_file != "") {
-    auto fs = GetFileSystem();
-    auto status = fs->NewReadOnlyMemoryRegionFromFile(
-        FLAGS_model_data_file.c_str(),
-        &model_weights_data);
-    if (status != MaceStatus::MACE_SUCCESS) {
-      LOG(FATAL) << "Failed to read file: " << FLAGS_model_data_file;
-    }
-    MACE_CHECK(model_weights_data->length() > 0);
-  }
-
-#ifdef MODEL_GRAPH_FORMAT_CODE
-  create_engine_status = CreateMaceEngineFromCode(FLAGS_model_name,
-      reinterpret_cast<const unsigned char *>(model_weights_data->data()),
-      model_weights_data->length(),
-      input_names,
-      output_names,
-      config,
-      &engine);
-#else
-  create_engine_status = CreateMaceEngineFromProto(
-      reinterpret_cast<const unsigned char *>(model_graph_data->data()),
-      model_graph_data->length(),
-      reinterpret_cast<const unsigned char *>(model_weights_data->data()),
-      model_weights_data->length(),
-      input_names,
-      output_names,
-      config,
-      &engine);
-#endif
-  if (create_engine_status != MaceStatus::MACE_SUCCESS) {
-    LOG(FATAL) << "Create engine error, please check the arguments";
-  }
-
-  std::map<std::string, mace::MaceTensor> inputs;
-  std::map<std::string, mace::MaceTensor> outputs;
-  for (size_t i = 0; i < input_count; ++i) {
-    // only support float and int32, use char for generalization
-    int64_t input_size =
-        std::accumulate(input_shape_vec[i].begin(), input_shape_vec[i].end(), 4,
-                        std::multiplies<int64_t>());
-    auto buffer_in = std::shared_ptr<char>(new char[input_size],
-                                            std::default_delete<char[]>());
-    // load input
-    std::ifstream in_file(FLAGS_input_file + "_" + FormatName(input_names[i]),
-                          std::ios::in | std::ios::binary);
-    if (in_file.is_open()) {
-      in_file.read(buffer_in.get(), input_size);
-      in_file.close();
-    } else {
-      LOG(INFO) << "Open input file failed";
-      return -1;
-    }
-    inputs[input_names[i]] = mace::MaceTensor(input_shape_vec[i], buffer_in,
-        input_data_formats[i]);
-  }
-
-  for (size_t i = 0; i < output_count; ++i) {
-    // only support float and int32, use char for generalization
-    int64_t output_size =
-        std::accumulate(output_shape_vec[i].begin(),
-                        output_shape_vec[i].end(), 4,
-                        std::multiplies<int64_t>());
-    auto buffer_out = std::shared_ptr<char>(new char[output_size],
-                                            std::default_delete<char[]>());
-    outputs[output_names[i]] = mace::MaceTensor(output_shape_vec[i],
-                                                buffer_out,
-                                                output_data_formats[i]);
-  }
-
-  int64_t warmup_time_us = 0;
-  int64_t num_warmup_runs = 0;
-  if (FLAGS_warmup_runs > 0) {
-    bool status =
-        Run("Warm Up", engine.get(), inputs, &outputs,
-            FLAGS_warmup_runs, -1.0,
-            &warmup_time_us, &num_warmup_runs, nullptr);
-    if (!status) {
-      LOG(ERROR) << "Failed at warm up run";
-    }
-  }
-
-  int64_t no_stat_time_us = 0;
-  int64_t no_stat_runs = 0;
-  bool status =
-      Run("Run without statistics", engine.get(), inputs, &outputs,
-          FLAGS_max_num_runs, FLAGS_max_seconds,
-          &no_stat_time_us, &no_stat_runs, nullptr);
-  if (!status) {
-    LOG(ERROR) << "Failed at normal no-stat run";
-  }
-
-  int64_t stat_time_us = 0;
-  int64_t stat_runs = 0;
-  status = Run("Run with statistics", engine.get(), inputs, &outputs,
-               FLAGS_max_num_runs, FLAGS_max_seconds,
-               &stat_time_us, &stat_runs, statistician.get());
-  if (!status) {
-    LOG(ERROR) << "Failed at normal stat run";
-  }
-
-  statistician->PrintStat();
-
-  return 0;
-}
-
-}  // namespace benchmark
-}  // namespace mace
-
-int main(int argc, char **argv) { mace::benchmark::Main(argc, argv); }
diff --git a/mace/tools/benchmark/benchmark_model_throughput.cc b/mace/tools/benchmark/benchmark_model_throughput.cc
deleted file mode 100644
index a8fcf7596da4600e88d41c6ce9c5d54777a8e91b..0000000000000000000000000000000000000000
--- a/mace/tools/benchmark/benchmark_model_throughput.cc
+++ /dev/null
@@ -1,391 +0,0 @@
-// Copyright 2018 The MACE Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-/**
- * Usage:
- * throughput_test \
- *          --input_shape=1,224,224,3   \
- *          --output_shape=1,224,224,2   \
- *          --input_file=input_data \
- *          --cpu_model_data_file=cpu_model_data.data \
- *          --gpu_model_data_file=gpu_model_data.data \
- *          --dsp_model_data_file=dsp_model_data.data \
- *          --run_seconds=10
- */
-#include <cstdint>
-#include <cstdlib>
-#include <fstream>
-#include <iostream>
-#include <numeric>
-#include <thread>  // NOLINT(build/c++11)
-
-#include "gflags/gflags.h"
-#include "mace/public/mace.h"
-#include "mace/port/env.h"
-#include "mace/utils/logging.h"
-#include "mace/core/types.h"
-
-namespace mace {
-
-#ifdef MACE_CPU_MODEL_TAG
-namespace MACE_CPU_MODEL_TAG {
-
-extern const unsigned char *LoadModelData(const char *model_data_file);
-
-extern void UnloadModelData(const unsigned char *model_data);
-
-extern NetDef CreateNet(const unsigned char *model_data);
-
-extern const std::string ModelChecksum();
-
-}  // namespace MACE_CPU_MODEL_TAG
-#endif
-
-#ifdef MACE_GPU_MODEL_TAG
-namespace MACE_GPU_MODEL_TAG {
-
-extern const unsigned char *LoadModelData(const char *model_data_file);
-
-extern void UnloadModelData(const unsigned char *model_data);
-
-extern NetDef CreateNet(const unsigned char *model_data);
-
-extern const std::string ModelChecksum();
-
-}  // namespace MACE_GPU_MODEL_TAG
-#endif
-
-#ifdef MACE_DSP_MODEL_TAG
-namespace MACE_DSP_MODEL_TAG {
-
-extern const unsigned char *LoadModelData(const char *model_data_file);
-
-extern void UnloadModelData(const unsigned char *model_data);
-
-extern NetDef CreateNet(const unsigned char *model_data);
-
-extern const std::string ModelChecksum();
-
-}  // namespace MACE_DSP_MODEL_TAG
-#endif
-
-namespace benchmark {
-
-void Split(const std::string &str,
-           char delims,
-           std::vector<std::string> *result) {
-  MACE_CHECK_NOTNULL(result);
-  std::string tmp = str;
-  while (!tmp.empty()) {
-    size_t next_offset = tmp.find(delims);
-    result->push_back(tmp.substr(0, next_offset));
-    if (next_offset == std::string::npos) {
-      break;
-    } else {
-      tmp = tmp.substr(next_offset + 1);
-    }
-  }
-}
-
-void SplitAndParseToInts(const std::string &str,
-                         char delims,
-                         std::vector<int64_t> *result) {
-  MACE_CHECK_NOTNULL(result);
-  std::string tmp = str;
-  while (!tmp.empty()) {
-    index_t dim = atoi(tmp.data());
-    result->push_back(dim);
-    size_t next_offset = tmp.find(delims);
-    if (next_offset == std::string::npos) {
-      break;
-    } else {
-      tmp = tmp.substr(next_offset + 1);
-    }
-  }
-}
-
-void ParseShape(const std::string &str, std::vector<int64_t> *shape) {
-  std::string tmp = str;
-  while (!tmp.empty()) {
-    index_t dim = atoi(tmp.data());
-    shape->push_back(dim);
-    size_t next_offset = tmp.find(",");
-    if (next_offset == std::string::npos) {
-      break;
-    } else {
-      tmp = tmp.substr(next_offset + 1);
-    }
-  }
-}
-
-std::string FormatName(const std::string input) {
-  std::string res = input;
-  for (size_t i = 0; i < input.size(); ++i) {
-    if (!::isalnum(res[i])) res[i] = '_';
-  }
-  return res;
-}
-
-DeviceType ParseDeviceType(const std::string &device_str) {
-  if (device_str.compare("CPU") == 0) {
-    return DeviceType::CPU;
-  } else if (device_str.compare("GPU") == 0) {
-    return DeviceType::GPU;
-  } else if (device_str.compare("HEXAGON") == 0) {
-    return DeviceType::HEXAGON;
-  } else {
-    return DeviceType::CPU;
-  }
-}
-
-DEFINE_string(input_node, "input_node0,input_node1",
-              "input nodes, separated by comma");
-DEFINE_string(output_node, "output_node0,output_node1",
-              "output nodes, separated by comma");
-DEFINE_string(input_shape, "1,224,224,3", "input shape, separated by comma");
-DEFINE_string(output_shape, "1,224,224,2", "output shape, separated by comma");
-DEFINE_string(input_file, "", "input file name");
-DEFINE_string(cpu_model_data_file, "", "cpu model data file name");
-DEFINE_string(gpu_model_data_file, "", "gpu model data file name");
-DEFINE_string(dsp_model_data_file, "", "dsp model data file name");
-DEFINE_int32(run_seconds, 10, "run seconds");
-
-int Main(int argc, char **argv) {
-  std::string usage = "model throughput test\nusage: " + std::string(argv[0])
-      + " [flags]";
-  gflags::SetUsageMessage(usage);
-  gflags::ParseCommandLineFlags(&argc, &argv, true);
-
-  LOG(INFO) << "mace version: " << MaceVersion();
-#ifdef MACE_CPU_MODEL_TAG
-  LOG(INFO) << "cpu model checksum: "
-            << mace::MACE_CPU_MODEL_TAG::ModelChecksum();
-#endif
-#ifdef MACE_GPU_MODEL_TAG
-  LOG(INFO) << "gpu model checksum: "
-            << mace::MACE_GPU_MODEL_TAG::ModelChecksum();
-#endif
-#ifdef MACE_DSP_MODEL_TAG
-  LOG(INFO) << "dsp model checksum: "
-            << mace::MACE_DSP_MODEL_TAG::ModelChecksum();
-#endif
-  LOG(INFO) << "Input node: [" << FLAGS_input_node<< "]";
-  LOG(INFO) << "input_shape: " << FLAGS_input_shape;
-  LOG(INFO) << "Output node: [" << FLAGS_output_node<< "]";
-  LOG(INFO) << "output_shape: " << FLAGS_output_shape;
-  LOG(INFO) << "input_file: " << FLAGS_input_file;
-  LOG(INFO) << "cpu_model_data_file: " << FLAGS_cpu_model_data_file;
-  LOG(INFO) << "gpu_model_data_file: " << FLAGS_gpu_model_data_file;
-  LOG(INFO) << "dsp_model_data_file: " << FLAGS_dsp_model_data_file;
-  LOG(INFO) << "run_seconds: " << FLAGS_run_seconds;
-
-  std::vector<std::string> input_names;
-  std::vector<std::string> output_names;
-  std::vector<std::string> input_shapes;
-  std::vector<std::string> output_shapes;
-  Split(FLAGS_input_node, ',', &input_names);
-  Split(FLAGS_output_node, ',', &output_names);
-  Split(FLAGS_input_shape, ':', &input_shapes);
-  Split(FLAGS_output_shape, ':', &output_shapes);
-
-  const size_t input_count = input_shapes.size();
-  const size_t output_count = output_shapes.size();
-  std::vector<std::vector<int64_t>> input_shape_vec(input_count);
-  std::vector<std::vector<int64_t>> output_shape_vec(output_count);
-  for (size_t i = 0; i < input_count; ++i) {
-    ParseShape(input_shapes[i], &input_shape_vec[i]);
-  }
-  for (size_t i = 0; i < output_count; ++i) {
-    ParseShape(output_shapes[i], &output_shape_vec[i]);
-  }
-
-  std::map<std::string, mace::MaceTensor> inputs;
-  std::map<std::string, mace::MaceTensor> cpu_outputs;
-  std::map<std::string, mace::MaceTensor> gpu_outputs;
-  std::map<std::string, mace::MaceTensor> dsp_outputs;
-  for (size_t i = 0; i < input_count; ++i) {
-    // Allocate input and output
-    int64_t input_size =
-        std::accumulate(input_shape_vec[i].begin(), input_shape_vec[i].end(), 1,
-                        std::multiplies<int64_t>());
-    auto buffer_in = std::shared_ptr<float>(new float[input_size],
-                                            std::default_delete<float[]>());
-    // load input
-    std::ifstream in_file(FLAGS_input_file + "_" + FormatName(input_names[i]),
-                          std::ios::in | std::ios::binary);
-    if (in_file.is_open()) {
-      in_file.read(reinterpret_cast<char *>(buffer_in.get()),
-                   input_size * sizeof(float));
-      in_file.close();
-    } else {
-      LOG(FATAL) << "Open input file failed";
-    }
-    inputs[input_names[i]] = mace::MaceTensor(input_shape_vec[i], buffer_in);
-  }
-
-  for (size_t i = 0; i < output_count; ++i) {
-    int64_t output_size =
-        std::accumulate(output_shape_vec[i].begin(),
-                        output_shape_vec[i].end(), 1,
-                        std::multiplies<int64_t>());
-    auto buffer_out = std::shared_ptr<float>(new float[output_size],
-                                             std::default_delete<float[]>());
-    cpu_outputs[output_names[i]] = mace::MaceTensor(output_shape_vec[i],
-                                                    buffer_out);
-    gpu_outputs[output_names[i]] = mace::MaceTensor(output_shape_vec[i],
-                                                    buffer_out);
-    dsp_outputs[output_names[i]] = mace::MaceTensor(output_shape_vec[i],
-                                                    buffer_out);
-  }
-
-#if defined(MACE_CPU_MODEL_TAG) || \
-    defined(MACE_GPU_MODEL_TAG) || \
-    defined(MACE_DSP_MODEL_TAG)
-  int64_t t0, t1, init_micros;
-#endif
-
-#ifdef MACE_CPU_MODEL_TAG
-  /* --------------------- CPU init ----------------------- */
-  LOG(INFO) << "Load & init cpu model and warm up";
-  const unsigned char *cpu_model_data =
-      mace::MACE_CPU_MODEL_TAG::LoadModelData(
-      FLAGS_cpu_model_data_file.c_str());
-  NetDef cpu_net_def = mace::MACE_CPU_MODEL_TAG::CreateNet(cpu_model_data);
-
-  mace::MaceEngine cpu_engine(&cpu_net_def, DeviceType::CPU, input_names,
-                              output_names);
-
-  LOG(INFO) << "CPU Warm up run";
-  t0 = NowMicros();
-  cpu_engine.Run(inputs, &cpu_outputs);
-  t1 = NowMicros();
-  LOG(INFO) << "CPU 1st warm up run latency: " << t1 - t0 << " us";
-#endif
-
-#ifdef MACE_GPU_MODEL_TAG
-  /* --------------------- GPU init ----------------------- */
-  LOG(INFO) << "Load & init gpu model and warm up";
-  const unsigned char *gpu_model_data =
-      mace::MACE_GPU_MODEL_TAG::LoadModelData(
-      FLAGS_gpu_model_data_file.c_str());
-  NetDef gpu_net_def = mace::MACE_GPU_MODEL_TAG::CreateNet(gpu_model_data);
-
-  mace::MaceEngine gpu_engine(&gpu_net_def, DeviceType::GPU, input_names,
-                              output_names);
-  mace::MACE_GPU_MODEL_TAG::UnloadModelData(gpu_model_data);
-
-  LOG(INFO) << "GPU Warm up run";
-  t0 = NowMicros();
-  gpu_engine.Run(inputs, &gpu_outputs);
-  t1 = NowMicros();
-  LOG(INFO) << "GPU 1st warm up run latency: " << t1 - t0 << " us";
-#endif
-
-#ifdef MACE_DSP_MODEL_TAG
-  /* --------------------- DSP init ----------------------- */
-  LOG(INFO) << "Load & init dsp model and warm up";
-  const unsigned char *dsp_model_data =
-      mace::MACE_DSP_MODEL_TAG::LoadModelData(
-      FLAGS_dsp_model_data_file.c_str());
-  NetDef dsp_net_def = mace::MACE_DSP_MODEL_TAG::CreateNet(dsp_model_data);
-
-  mace::MaceEngine dsp_engine(&dsp_net_def, DeviceType::HEXAGON, input_names,
-                              output_names);
-  mace::MACE_DSP_MODEL_TAG::UnloadModelData(dsp_model_data);
-
-  LOG(INFO) << "DSP Warm up run";
-  t0 = NowMicros();
-  dsp_engine.Run(inputs, &dsp_outputs);
-  t1 = NowMicros();
-  LOG(INFO) << "DSP 1st warm up run latency: " << t1 - t0 << " us";
-#endif
-
-#if defined(MACE_CPU_MODEL_TAG) || \
-    defined(MACE_GPU_MODEL_TAG) || \
-    defined(MACE_DSP_MODEL_TAG)
-  double cpu_throughput = 0;
-  double gpu_throughput = 0;
-  double dsp_throughput = 0;
-  int64_t run_micros = FLAGS_run_seconds * 1000000;
-#endif
-
-#ifdef MACE_CPU_MODEL_TAG
-  std::thread cpu_thread([&]() {
-    int64_t frames = 0;
-    int64_t micros = 0;
-    int64_t start = NowMicros();
-    for (; micros < run_micros; ++frames) {
-      cpu_engine.Run(inputs, &cpu_outputs);
-      int64_t end = NowMicros();
-      micros = end - start;
-    }
-    cpu_throughput = frames * 1000000.0 / micros;
-  });
-#endif
-
-#ifdef MACE_GPU_MODEL_TAG
-  std::thread gpu_thread([&]() {
-    int64_t frames = 0;
-    int64_t micros = 0;
-    int64_t start = NowMicros();
-    for (; micros < run_micros; ++frames) {
-      gpu_engine.Run(inputs, &gpu_outputs);
-      int64_t end = NowMicros();
-      micros = end - start;
-    }
-    gpu_throughput = frames * 1000000.0 / micros;
-  });
-#endif
-
-#ifdef MACE_DSP_MODEL_TAG
-  std::thread dsp_thread([&]() {
-    int64_t frames = 0;
-    int64_t micros = 0;
-    int64_t start = NowMicros();
-    for (; micros < run_micros; ++frames) {
-      dsp_engine.Run(inputs, &dsp_outputs);
-      int64_t end = NowMicros();
-      micros = end - start;
-    }
-    dsp_throughput = frames * 1000000.0 / micros;
-  });
-#endif
-
-  double total_throughput = 0;
-
-#ifdef MACE_CPU_MODEL_TAG
-  cpu_thread.join();
-  LOG(INFO) << "CPU throughput: " << cpu_throughput << " f/s";
-  total_throughput += cpu_throughput;
-#endif
-#ifdef MACE_GPU_MODEL_TAG
-  gpu_thread.join();
-  LOG(INFO) << "GPU throughput: " << gpu_throughput << " f/s";
-  total_throughput += gpu_throughput;
-#endif
-#ifdef MACE_DSP_MODEL_TAG
-  dsp_thread.join();
-  LOG(INFO) << "DSP throughput: " << dsp_throughput << " f/s";
-  total_throughput += dsp_throughput;
-#endif
-
-  LOG(INFO) << "Total throughput: " << total_throughput << " f/s";
-
-  return 0;
-}
-
-}  // namespace benchmark
-}  // namespace mace
-
-int main(int argc, char **argv) { mace::benchmark::Main(argc, argv); }
diff --git a/mace/tools/validation/mace_run.cc b/mace/tools/mace_run.cc
similarity index 96%
rename from mace/tools/validation/mace_run.cc
rename to mace/tools/mace_run.cc
index 01ee3fb1d6c1aef548981ac52adc9ab406a1964f..f43e38d8c2657fafaba62cb5fe4a991bb69eaf0f 100644
--- a/mace/tools/validation/mace_run.cc
+++ b/mace/tools/mace_run.cc
@@ -38,6 +38,7 @@
 #include "mace/utils/logging.h"
 #include "mace/utils/memory.h"
 #include "mace/utils/string_util.h"
+#include "mace/utils/statistics.h"
 
 #ifdef MODEL_GRAPH_FORMAT_CODE
 #include "mace/codegen/engine/mace_engine_factory.h"
@@ -45,7 +46,6 @@
 
 namespace mace {
 namespace tools {
-namespace validation {
 
 void ParseShape(const std::string &str, std::vector<int64_t> *shape) {
   std::string tmp = str;
@@ -124,7 +124,6 @@ DEFINE_string(input_file,
 DEFINE_string(output_file,
               "",
               "output file name | output file prefix for multiple outputs");
-// TODO(liyin): support batch validation
 DEFINE_string(input_dir,
               "",
               "input directory name");
@@ -152,6 +151,7 @@ DEFINE_int32(gpu_priority_hint, 3, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
 DEFINE_int32(omp_num_threads, -1, "num of openmp threads");
 DEFINE_int32(cpu_affinity_policy, 1,
              "0:AFFINITY_NONE/1:AFFINITY_BIG_ONLY/2:AFFINITY_LITTLE_ONLY");
+DEFINE_bool(benchmark, false, "enable benchmark op");
 
 bool RunModel(const std::string &model_name,
               const std::vector<std::string> &input_names,
@@ -352,6 +352,7 @@ bool RunModel(const std::string &model_name,
   }
 
   double model_run_millis = -1;
+  benchmark::OpStat op_stat;
   if (FLAGS_round > 0) {
     LOG(INFO) << "Run model";
     int64_t total_run_duration = 0;
@@ -364,9 +365,15 @@ bool RunModel(const std::string &model_name,
             info_log.get(), MakeString(i));
       }
       MaceStatus run_status;
+      RunMetadata metadata;
+      RunMetadata *metadata_ptr = nullptr;
+      if (FLAGS_benchmark) {
+        metadata_ptr = &metadata;
+      }
+
       while (true) {
         int64_t t0 = NowMicros();
-        run_status = engine->Run(inputs, &outputs);
+        run_status = engine->Run(inputs, &outputs, metadata_ptr);
         if (run_status != MaceStatus::MACE_SUCCESS) {
           LOG(ERROR) << "Mace run model runtime error, retry ... errcode: "
                      << run_status.information();
@@ -399,6 +406,9 @@ bool RunModel(const std::string &model_name,
         } else {
           int64_t t1 = NowMicros();
           total_run_duration += (t1 - t0);
+          if (FLAGS_benchmark) {
+            op_stat.StatMetadata(metadata);
+          }
           break;
         }
       }
@@ -407,14 +417,6 @@ bool RunModel(const std::string &model_name,
     LOG(INFO) << "Average latency: " << model_run_millis << " ms";
   }
 
-  // Metrics reporting tools depends on the format, keep in consistent
-  printf("========================================================\n");
-  printf("     capability(CPU)        init      warmup     run_avg\n");
-  printf("========================================================\n");
-  printf("time %15.3f %11.3f %11.3f %11.3f\n",
-         cpu_capability, init_millis, warmup_millis, model_run_millis);
-
-
   for (size_t i = 0; i < output_count; ++i) {
     std::string output_name =
         FLAGS_output_file + "_" + FormatName(output_names[i]);
@@ -431,6 +433,16 @@ bool RunModel(const std::string &model_name,
               << output_size << " done.";
   }
 
+  // Metrics reporting tools depend on this format; keep it consistent.
+  printf("========================================================\n");
+  printf("     capability(CPU)        init      warmup     run_avg\n");
+  printf("========================================================\n");
+  printf("time %15.3f %11.3f %11.3f %11.3f\n",
+         cpu_capability, init_millis, warmup_millis, model_run_millis);
+  if (FLAGS_benchmark) {
+    op_stat.PrintStat();
+  }
+
   return true;
 }
 
@@ -448,6 +460,10 @@ int Main(int argc, char **argv) {
     return 0;
   }
 
+  if (FLAGS_benchmark) {
+    setenv("MACE_OPENCL_PROFILING", "1", 1);
+  }
+
   LOG(INFO) << "model name: " << FLAGS_model_name;
   LOG(INFO) << "mace version: " << MaceVersion();
   LOG(INFO) << "input node: " << FLAGS_input_node;
@@ -517,8 +533,9 @@ int Main(int argc, char **argv) {
   return -1;
 }
 
-}  // namespace validation
 }  // namespace tools
 }  // namespace mace
 
-int main(int argc, char **argv) { mace::tools::validation::Main(argc, argv); }
+int main(int argc, char **argv) {
+  return mace::tools::Main(argc, argv);  // exit code = Main's status
+}
diff --git a/tools/common.py b/tools/common.py
index a7a3cfdb882c662f25aa6006295b585ed655424c..a45bf37a645f4c78a90b16df54d2bc7304044b64 100644
--- a/tools/common.py
+++ b/tools/common.py
@@ -447,14 +447,10 @@ BUILD_TMP_DIR_NAME = '_tmp'
 BUILD_DOWNLOADS_DIR = BUILD_OUTPUT_DIR + '/downloads'
 BUILD_TMP_GENERAL_OUTPUT_DIR_NAME = 'general'
 MODEL_OUTPUT_DIR_NAME = 'model'
-EXAMPLE_STATIC_NAME = "example_static"
-EXAMPLE_DYNAMIC_NAME = "example_dynamic"
-EXAMPLE_STATIC_TARGET = "//examples/cli:" + EXAMPLE_STATIC_NAME
-EXAMPLE_DYNAMIC_TARGET = "//examples/cli:" + EXAMPLE_DYNAMIC_NAME
 MACE_RUN_STATIC_NAME = "mace_run_static"
 MACE_RUN_DYNAMIC_NAME = "mace_run_dynamic"
-MACE_RUN_STATIC_TARGET = "//mace/tools/validation:" + MACE_RUN_STATIC_NAME
-MACE_RUN_DYNAMIC_TARGET = "//mace/tools/validation:" + MACE_RUN_DYNAMIC_NAME
+MACE_RUN_STATIC_TARGET = "//mace/tools:" + MACE_RUN_STATIC_NAME
+MACE_RUN_DYNAMIC_TARGET = "//mace/tools:" + MACE_RUN_DYNAMIC_NAME
 CL_COMPILED_BINARY_FILE_NAME = "mace_cl_compiled_program.bin"
 BUILD_TMP_OPENCL_BIN_DIR = 'opencl_bin'
 LIBMACE_DYNAMIC_PATH = "bazel-bin/mace/libmace/libmace.so"
@@ -474,11 +470,6 @@ LIBMACE_STATIC_TARGET = "//mace/libmace:libmace_static"
 LIBMACE_STATIC_PATH = "bazel-genfiles/mace/libmace/libmace.a"
 MODEL_LIB_TARGET = "//mace/codegen:generated_models"
 MODEL_LIB_PATH = "bazel-bin/mace/codegen/libgenerated_models.a"
-QUANTIZE_STAT_TARGET = "//mace/tools/quantization:quantize_stat"
-BM_MODEL_STATIC_NAME = "benchmark_model_static"
-BM_MODEL_DYNAMIC_NAME = "benchmark_model_dynamic"
-BM_MODEL_STATIC_TARGET = "//mace/tools/benchmark:" + BM_MODEL_STATIC_NAME
-BM_MODEL_DYNAMIC_TARGET = "//mace/tools/benchmark:" + BM_MODEL_DYNAMIC_NAME
 
 
 ################################
@@ -508,7 +499,6 @@ class ModuleName(object):
     YAML_CONFIG = 'YAML CONFIG'
     MODEL_CONVERTER = 'Model Converter'
     RUN = 'RUN'
-    BENCHMARK = 'Benchmark'
 
 
 #################################
diff --git a/tools/converter.py b/tools/converter.py
index aca556aedcc01c7c6bd9f78acb9c883030071918..4a0aed7b4649caac632e19983223653fa1fff6f3 100644
--- a/tools/converter.py
+++ b/tools/converter.py
@@ -891,7 +891,7 @@ def build_mace_run(configs, target_abi, toolchain, enable_openmp,
         mace_check(os.path.exists(ENGINE_CODEGEN_DIR),
                    ModuleName.RUN,
                    "You should convert model first.")
-        build_arg = "--per_file_copt=mace/tools/validation/mace_run.cc@-DMODEL_GRAPH_FORMAT_CODE"  # noqa
+        build_arg = "--per_file_copt=mace/tools/mace_run.cc@-DMODEL_GRAPH_FORMAT_CODE"  # noqa
 
     sh_commands.bazel_build(
         mace_run_target,
@@ -912,86 +912,6 @@ def build_mace_run(configs, target_abi, toolchain, enable_openmp,
                                        mace_lib_type == MACELibType.dynamic)
 
 
-def build_example(configs, target_abi, toolchain, enable_openmp, mace_lib_type,
-                  cl_binary_to_code, device, debug_mode):
-    library_name = configs[YAMLKeyword.library_name]
-
-    build_tmp_binary_dir = get_build_binary_dir(library_name, target_abi)
-    if os.path.exists(build_tmp_binary_dir):
-        sh.rm("-rf", build_tmp_binary_dir)
-    os.makedirs(build_tmp_binary_dir)
-
-    if cl_binary_to_code:
-        sh_commands.gen_opencl_binary_cpps(
-            get_opencl_binary_output_path(
-                library_name, target_abi, device),
-            get_opencl_parameter_output_path(
-                library_name, target_abi, device),
-            OPENCL_CODEGEN_DIR + '/opencl_binary.cc',
-            OPENCL_CODEGEN_DIR + '/opencl_parameter.cc')
-    else:
-        sh_commands.gen_opencl_binary_cpps(
-            "", "",
-            OPENCL_CODEGEN_DIR + '/opencl_binary.cc',
-            OPENCL_CODEGEN_DIR + '/opencl_parameter.cc')
-
-    libmace_target = LIBMACE_STATIC_TARGET
-    if mace_lib_type == MACELibType.dynamic:
-        libmace_target = LIBMACE_SO_TARGET
-
-    sh_commands.bazel_build(libmace_target,
-                            abi=target_abi,
-                            toolchain=toolchain,
-                            enable_openmp=enable_openmp,
-                            enable_opencl=get_opencl_mode(configs),
-                            enable_quantize=get_quantize_mode(configs),
-                            enable_hexagon=get_hexagon_mode(configs),
-                            enable_hta=get_hta_mode(configs),
-                            enable_apu=get_apu_mode(configs),
-                            address_sanitizer=flags.address_sanitizer,
-                            symbol_hidden=get_symbol_hidden_mode(debug_mode, mace_lib_type),  # noqa
-                            debug_mode=debug_mode)
-
-    if os.path.exists(LIB_CODEGEN_DIR):
-        sh.rm("-rf", LIB_CODEGEN_DIR)
-    sh.mkdir("-p", LIB_CODEGEN_DIR)
-
-    build_arg = ""
-    if configs[YAMLKeyword.model_graph_format] == ModelFormat.code:
-        mace_check(os.path.exists(ENGINE_CODEGEN_DIR),
-                   ModuleName.RUN,
-                   "You should convert model first.")
-        model_lib_path = get_model_lib_output_path(library_name,
-                                                   target_abi)
-        sh.cp("-f", model_lib_path, LIB_CODEGEN_DIR)
-        build_arg = "--per_file_copt=examples/cli/example.cc@-DMODEL_GRAPH_FORMAT_CODE"  # noqa
-
-    if mace_lib_type == MACELibType.dynamic:
-        example_target = EXAMPLE_DYNAMIC_TARGET
-        sh.cp("-f", LIBMACE_DYNAMIC_PATH, LIB_CODEGEN_DIR)
-    else:
-        example_target = EXAMPLE_STATIC_TARGET
-        sh.cp("-f", LIBMACE_STATIC_PATH, LIB_CODEGEN_DIR)
-
-    sh_commands.bazel_build(example_target,
-                            abi=target_abi,
-                            toolchain=toolchain,
-                            enable_openmp=enable_openmp,
-                            enable_opencl=get_opencl_mode(configs),
-                            enable_quantize=get_quantize_mode(configs),
-                            enable_hexagon=get_hexagon_mode(configs),
-                            enable_hta=get_hta_mode(configs),
-                            enable_apu=get_apu_mode(configs),
-                            address_sanitizer=flags.address_sanitizer,
-                            debug_mode=debug_mode,
-                            extra_args=build_arg)
-
-    target_bin = "/".join(sh_commands.bazel_target_to_bin(example_target))
-    sh.cp("-f", target_bin, build_tmp_binary_dir)
-    if os.path.exists(LIB_CODEGEN_DIR):
-        sh.rm("-rf", LIB_CODEGEN_DIR)
-
-
 def print_package_summary(package_path):
     title = "Library"
     header = ["key", "value"]
@@ -1024,23 +944,13 @@ def run_mace(flags):
                 # get toolchain
                 toolchain = infer_toolchain(target_abi)
                 device = DeviceWrapper(dev)
-                if flags.example:
-                    build_example(configs,
-                                  target_abi,
-                                  toolchain,
-                                  flags.enable_openmp,
-                                  flags.mace_lib_type,
-                                  flags.cl_binary_to_code,
-                                  device,
-                                  flags.debug_mode)
-                else:
-                    build_mace_run(configs,
-                                   target_abi,
-                                   toolchain,
-                                   flags.enable_openmp,
-                                   flags.address_sanitizer,
-                                   flags.mace_lib_type,
-                                   flags.debug_mode)
+                build_mace_run(configs,
+                               target_abi,
+                               toolchain,
+                               flags.enable_openmp,
+                               flags.address_sanitizer,
+                               flags.mace_lib_type,
+                               flags.debug_mode)
                 # run
                 start_time = time.time()
                 with device.lock():
@@ -1058,90 +968,6 @@ def run_mace(flags):
     print_package_summary(package_path)
 
 
-################################
-#  benchmark model
-################################
-def build_benchmark_model(configs,
-                          target_abi,
-                          toolchain,
-                          enable_openmp,
-                          mace_lib_type,
-                          debug_mode):
-    library_name = configs[YAMLKeyword.library_name]
-
-    link_dynamic = mace_lib_type == MACELibType.dynamic
-    if link_dynamic:
-        benchmark_target = BM_MODEL_DYNAMIC_TARGET
-    else:
-        benchmark_target = BM_MODEL_STATIC_TARGET
-
-    build_arg = ""
-    if configs[YAMLKeyword.model_graph_format] == ModelFormat.code:
-        mace_check(os.path.exists(ENGINE_CODEGEN_DIR),
-                   ModuleName.BENCHMARK,
-                   "You should convert model first.")
-        build_arg = "--per_file_copt=mace/tools/benchmark/benchmark_model.cc@-DMODEL_GRAPH_FORMAT_CODE"  # noqa
-
-    sh_commands.bazel_build(benchmark_target,
-                            abi=target_abi,
-                            toolchain=toolchain,
-                            enable_openmp=enable_openmp,
-                            enable_opencl=get_opencl_mode(configs),
-                            enable_quantize=get_quantize_mode(configs),
-                            enable_hexagon=get_hexagon_mode(configs),
-                            enable_hta=get_hta_mode(configs),
-                            enable_apu=get_apu_mode(configs),
-                            symbol_hidden=get_symbol_hidden_mode(debug_mode, mace_lib_type),  # noqa
-                            debug_mode=debug_mode,
-                            extra_args=build_arg)
-    # clear tmp binary dir
-    build_tmp_binary_dir = get_build_binary_dir(library_name, target_abi)
-    if os.path.exists(build_tmp_binary_dir):
-        sh.rm("-rf", build_tmp_binary_dir)
-    os.makedirs(build_tmp_binary_dir)
-
-    target_bin = "/".join(sh_commands.bazel_target_to_bin(benchmark_target))
-    sh.cp("-f", target_bin, build_tmp_binary_dir)
-
-
-def benchmark_model(flags):
-    configs = format_model_config(flags)
-
-    clear_build_dirs(configs[YAMLKeyword.library_name])
-
-    target_socs = configs[YAMLKeyword.target_socs]
-    device_list = DeviceManager.list_devices(flags.device_yml)
-    if target_socs and TargetSOCTag.all not in target_socs:
-        device_list = [dev for dev in device_list
-                       if dev[YAMLKeyword.target_socs].lower() in target_socs]
-    for target_abi in configs[YAMLKeyword.target_abis]:
-        if flags.target_socs == TargetSOCTag.random:
-            target_devices = sh_commands.choose_a_random_device(
-                device_list, target_abi)
-        else:
-            target_devices = device_list
-        # build benchmark_model binary
-        for dev in target_devices:
-            if target_abi in dev[YAMLKeyword.target_abis]:
-                toolchain = infer_toolchain(target_abi)
-                build_benchmark_model(configs,
-                                      target_abi,
-                                      toolchain,
-                                      flags.enable_openmp,
-                                      flags.mace_lib_type,
-                                      flags.debug_mode)
-                device = DeviceWrapper(dev)
-                start_time = time.time()
-                with device.lock():
-                    device.bm_specific_target(flags, configs, target_abi)
-                elapse_minutes = (time.time() - start_time) / 60
-                print("Elapse time: %f minutes." % elapse_minutes)
-            else:
-                six.print_('There is no abi %s with soc %s' %
-                           (target_abi, dev[YAMLKeyword.target_socs]),
-                           file=sys.stderr)
-
-
 ################################
 # parsing arguments
 ################################
@@ -1210,60 +1036,61 @@ def parse_args():
         '--address_sanitizer',
         action="store_true",
         help="Whether to use address sanitizer to check memory error")
-    run_bm_parent_parser = argparse.ArgumentParser(add_help=False)
-    run_bm_parent_parser.add_argument(
+
+    parser = argparse.ArgumentParser()
+    subparsers = parser.add_subparsers()
+    convert = subparsers.add_parser(
+        'convert',
+        parents=[all_type_parent_parser, convert_run_parent_parser],
+        help='convert to mace model (file or code)')
+    convert.add_argument(
+        "--cl_mem_type",
+        type=str,
+        default=None,
+        help="Which type of OpenCL memory type to use [image | buffer].")
+    convert.set_defaults(func=convert_func)
+
+    run = subparsers.add_parser(
+        'run',
+        parents=[all_type_parent_parser,
+                 convert_run_parent_parser],
+        help='run model in command line')
+    run.set_defaults(func=run_mace)
+    run.add_argument(
         "--mace_lib_type",
         type=str_to_mace_lib_type,
         default=DefaultValues.mace_lib_type,
         help="[static | dynamic], Which type MACE library to use.")
-    run_bm_parent_parser.add_argument(
+    run.add_argument(
         "--enable_openmp",
         action="store_true",
         help="Enable openmp for multiple thread.")
-    run_bm_parent_parser.add_argument(
+    run.add_argument(
         "--omp_num_threads",
         type=int,
         default=DefaultValues.omp_num_threads,
         help="num of openmp threads")
-    run_bm_parent_parser.add_argument(
+    run.add_argument(
         "--cpu_affinity_policy",
         type=int,
         default=DefaultValues.cpu_affinity_policy,
         help="0:AFFINITY_NONE/1:AFFINITY_BIG_ONLY/2:AFFINITY_LITTLE_ONLY")
-    run_bm_parent_parser.add_argument(
+    run.add_argument(
         "--gpu_perf_hint",
         type=int,
         default=DefaultValues.gpu_perf_hint,
         help="0:DEFAULT/1:LOW/2:NORMAL/3:HIGH")
-    run_bm_parent_parser.add_argument(
+    run.add_argument(
         "--gpu_priority_hint",
         type=int,
         default=DefaultValues.gpu_priority_hint,
         help="0:DEFAULT/1:LOW/2:NORMAL/3:HIGH")
-    run_bm_parent_parser.add_argument(
+    run.add_argument(
         "--device_yml",
         type=str,
         default='',
         help='embedded linux device config yml file'
     )
-    parser = argparse.ArgumentParser()
-    subparsers = parser.add_subparsers()
-    convert = subparsers.add_parser(
-        'convert',
-        parents=[all_type_parent_parser, convert_run_parent_parser],
-        help='convert to mace model (file or code)')
-    convert.add_argument(
-        "--cl_mem_type",
-        type=str,
-        default=None,
-        help="Which type of OpenCL memory type to use [image | buffer].")
-    convert.set_defaults(func=convert_func)
-    run = subparsers.add_parser(
-        'run',
-        parents=[all_type_parent_parser, run_bm_parent_parser,
-                 convert_run_parent_parser],
-        help='run model in command line')
-    run.set_defaults(func=run_mace)
     run.add_argument(
         "--disable_tuning",
         action="store_true",
@@ -1318,10 +1145,6 @@ def parse_args():
         type=float,
         default=0.0,
         help="[mock runtime failure ratio].")
-    run.add_argument(
-        "--example",
-        action="store_true",
-        help="whether to run example.")
     run.add_argument(
         "--quantize_stat",
         action="store_true",
@@ -1340,21 +1163,10 @@ def parse_args():
         "--cl_binary_to_code",
         action="store_true",
         help="convert OpenCL binaries to cpp.")
-    benchmark = subparsers.add_parser(
-        'benchmark',
-        parents=[all_type_parent_parser, run_bm_parent_parser],
-        help='benchmark model for detail information')
-    benchmark.set_defaults(func=benchmark_model)
-    benchmark.add_argument(
-        "--max_num_runs",
-        type=int,
-        default=100,
-        help="max number of runs.")
-    benchmark.add_argument(
-        "--max_seconds",
-        type=float,
-        default=10.0,
-        help="max number of seconds to run.")
+    run.add_argument(
+        "--benchmark",
+        action="store_true",
+        help="enable op benchmark.")
     return parser.parse_known_args()
 
 
diff --git a/tools/device.py b/tools/device.py
index 5706196990826dfc04a47896ac65a9105ce4c699..66d8d1e6adc7dcf288f81d03be920f7cf40e5213 100644
--- a/tools/device.py
+++ b/tools/device.py
@@ -186,6 +186,7 @@ class DeviceWrapper:
                    link_dynamic=False,
                    quantize_stat=False,
                    layers_validate_file="",
+                   benchmark=False,
                    ):
         six.print_("* Run '%s' with round=%s, restart_round=%s, tuning=%s, "
                    "out_of_range_check=%s, omp_num_threads=%s, "
@@ -343,6 +344,9 @@ class DeviceWrapper:
                 "--opencl_parameter_file=%s/%s" %
                 (self.data_dir, os.path.basename(opencl_parameter_file)),
             ])
+            if benchmark:
+                cmd.append("--benchmark=%s" % benchmark)
+
             cmd = ' '.join(cmd)
             cmd_file_name = "%s-%s-%s" % ('cmd_file',
                                           model_tag,
@@ -473,16 +477,10 @@ class DeviceWrapper:
         build_tmp_binary_dir = get_build_binary_dir(library_name, target_abi)
         # get target name for run
         mace_lib_type = flags.mace_lib_type
-        if flags.example:
-            if mace_lib_type == MACELibType.static:
-                target_name = EXAMPLE_STATIC_NAME
-            else:
-                target_name = EXAMPLE_DYNAMIC_NAME
+        if mace_lib_type == MACELibType.static:
+            target_name = MACE_RUN_STATIC_NAME
         else:
-            if mace_lib_type == MACELibType.static:
-                target_name = MACE_RUN_STATIC_NAME
-            else:
-                target_name = MACE_RUN_DYNAMIC_NAME
+            target_name = MACE_RUN_DYNAMIC_NAME
         link_dynamic = mace_lib_type == MACELibType.dynamic
 
         if target_abi != ABIType.host:
@@ -557,7 +555,8 @@ class DeviceWrapper:
             input_dir=flags.input_dir,
             output_dir=flags.output_dir,
             layers_validate_file=output_config[
-                YAMLKeyword.model_file_path]
+                YAMLKeyword.model_file_path],
+            benchmark=flags.benchmark,
         )
 
     def get_output_map(self,
@@ -621,7 +620,6 @@ class DeviceWrapper:
 
             tuning = False
             if not flags.address_sanitizer \
-                    and not flags.example \
                     and target_abi != ABIType.host \
                     and (configs[YAMLKeyword.target_socs]
                          or flags.target_socs) \
@@ -859,254 +857,6 @@ class DeviceWrapper:
         with open(report_filename, 'a') as f:
             f.write(data_str)
 
-    def benchmark_model(self,
-                        abi,
-                        benchmark_binary_dir,
-                        benchmark_binary_name,
-                        vlog_level,
-                        embed_model_data,
-                        model_output_dir,
-                        mace_model_dir,
-                        input_nodes,
-                        output_nodes,
-                        input_shapes,
-                        output_shapes,
-                        input_data_formats,
-                        output_data_formats,
-                        max_num_runs,
-                        max_seconds,
-                        model_tag,
-                        device_type,
-                        model_graph_format,
-                        opencl_binary_file,
-                        opencl_parameter_file,
-                        libmace_dynamic_library_path,
-                        omp_num_threads=-1,
-                        cpu_affinity_policy=1,
-                        gpu_perf_hint=3,
-                        gpu_priority_hint=3,
-                        input_file_name='model_input',
-                        link_dynamic=False):
-        six.print_('* Benchmark for %s' % model_tag)
-        mace_model_path = ''
-        if model_graph_format == ModelFormat.file:
-            mace_model_path = '%s/%s.pb' % (mace_model_dir, model_tag)
-
-        model_data_file = ""
-        if not embed_model_data:
-            if self.system == SystemType.host:
-                model_data_file = "%s/%s.data" % (mace_model_dir, model_tag)
-            else:
-                model_data_file = "%s/%s.data" % (self.data_dir, model_tag)
-
-        if abi == ABIType.host:
-            libmace_dynamic_lib_dir_path = \
-                os.path.dirname(libmace_dynamic_library_path)
-            p = subprocess.Popen(
-                [
-                    'env',
-                    'LD_LIBRARY_PATH=%s' % libmace_dynamic_lib_dir_path,
-                    'MACE_CPP_MIN_VLOG_LEVEL=%s' % vlog_level,
-                    '%s/%s' % (benchmark_binary_dir, benchmark_binary_name),
-                    '--model_name=%s' % model_tag,
-                    '--input_node=%s' % ','.join(input_nodes),
-                    '--output_node=%s' % ','.join(output_nodes),
-                    '--input_shape=%s' % ':'.join(input_shapes),
-                    '--output_shape=%s' % ':'.join(output_shapes),
-                    "--input_data_format=%s" % ",".join(input_data_formats),
-                    "--output_data_format=%s" % ",".join(output_data_formats),
-                    '--input_file=%s/%s' % (model_output_dir, input_file_name),
-                    "--model_data_file=%s" % model_data_file,
-                    '--max_num_runs=%d' % max_num_runs,
-                    '--max_seconds=%f' % max_seconds,
-                    '--device=%s' % device_type,
-                    '--omp_num_threads=%s' % omp_num_threads,
-                    '--cpu_affinity_policy=%s' % cpu_affinity_policy,
-                    '--gpu_perf_hint=%s' % gpu_perf_hint,
-                    '--gpu_priority_hint=%s' % gpu_priority_hint,
-                    '--model_file=%s' % mace_model_path
-                ])
-            p.wait()
-        elif self.system in [SystemType.android, SystemType.arm_linux]:
-            self.exec_command('mkdir -p %s' % self.data_dir)
-            internal_storage_dir = self.create_internal_storage_dir()
-            for input_name in input_nodes:
-                formatted_name = formatted_file_name(input_file_name,
-                                                     input_name)
-                self.push('%s/%s' % (model_output_dir, formatted_name),
-                          self.data_dir)
-            if not embed_model_data:
-                self.push('%s/%s.data' % (mace_model_dir, model_tag),
-                          self.data_dir)
-            if device_type == common.DeviceType.GPU:
-                if os.path.exists(opencl_binary_file):
-                    self.push(opencl_binary_file, self.data_dir)
-                if os.path.exists(opencl_parameter_file):
-                    self.push(opencl_parameter_file, self.data_dir)
-            mace_model_device_path = ''
-            if model_graph_format == ModelFormat.file:
-                mace_model_device_path = '%s/%s.pb' % \
-                                         (self.data_dir, model_tag)
-                self.push(mace_model_path, mace_model_device_path)
-            if link_dynamic:
-                self.push(libmace_dynamic_library_path, self.data_dir)
-                if self.system == SystemType.android:
-                    sh_commands.push_depended_so_libs(
-                        libmace_dynamic_library_path, abi, self.data_dir,
-                        self.address)
-            self.rm('%s/%s' % (self.data_dir, benchmark_binary_name))
-            self.push('%s/%s' % (benchmark_binary_dir, benchmark_binary_name),
-                      self.data_dir)
-
-            cmd = [
-                'LD_LIBRARY_PATH=%s' % self.data_dir,
-                'MACE_CPP_MIN_VLOG_LEVEL=%s' % vlog_level,
-                'MACE_RUN_PARAMETER_PATH=%s/mace_run.config' % self.data_dir,
-                'MACE_INTERNAL_STORAGE_PATH=%s' % internal_storage_dir,
-                'MACE_OPENCL_PROFILING=1',
-                '%s/%s' % (self.data_dir, benchmark_binary_name),
-                '--model_name=%s' % model_tag,
-                '--input_node=%s' % ','.join(input_nodes),
-                '--output_node=%s' % ','.join(output_nodes),
-                '--input_shape=%s' % ':'.join(input_shapes),
-                '--output_shape=%s' % ':'.join(output_shapes),
-                "--input_data_format=%s" % ",".join(input_data_formats),
-                "--output_data_format=%s" % ",".join(output_data_formats),
-                '--input_file=%s/%s' % (self.data_dir, input_file_name),
-                "--model_data_file=%s" % model_data_file,
-                '--max_num_runs=%d' % max_num_runs,
-                '--max_seconds=%f' % max_seconds,
-                '--device=%s' % device_type,
-                '--omp_num_threads=%s' % omp_num_threads,
-                '--cpu_affinity_policy=%s' % cpu_affinity_policy,
-                '--gpu_perf_hint=%s' % gpu_perf_hint,
-                '--gpu_priority_hint=%s' % gpu_priority_hint,
-                '--model_file=%s' % mace_model_device_path,
-                '--opencl_binary_file=%s/%s' %
-                (self.data_dir, os.path.basename(opencl_binary_file)),
-                '--opencl_parameter_file=%s/%s' %
-                (self.data_dir, os.path.basename(opencl_parameter_file))
-            ]
-
-            cmd = ' '.join(cmd)
-            cmd_file_name = '%s-%s-%s' % \
-                            ('cmd_file', model_tag, str(time.time()))
-
-            cmd_file_path = '%s/%s' % (self.data_dir, cmd_file_name)
-            tmp_cmd_file = '%s/%s' % ('/tmp', cmd_file_name)
-            with open(tmp_cmd_file, 'w') as f:
-                f.write(cmd)
-            self.push(tmp_cmd_file, cmd_file_path)
-            os.remove(tmp_cmd_file)
-
-            if self.system == SystemType.android:
-                sh.adb('-s', self.address, 'shell', 'sh', cmd_file_path,
-                       _fg=True)
-            elif self.system == SystemType.arm_linux:
-                sh.ssh('%s@%s' % (self.username, self.address),
-                       'sh', cmd_file_path, _fg=True)
-            self.rm(cmd_file_path)
-            six.print_('Benchmark done! \n')
-
-    def bm_specific_target(self, flags, configs, target_abi):
-        library_name = configs[YAMLKeyword.library_name]
-        embed_model_data = \
-            configs[YAMLKeyword.model_data_format] == ModelFormat.code
-        opencl_output_bin_path = ''
-        opencl_parameter_path = ''
-        link_dynamic = flags.mace_lib_type == MACELibType.dynamic
-
-        if link_dynamic:
-            bm_model_binary_name = BM_MODEL_DYNAMIC_NAME
-        else:
-            bm_model_binary_name = BM_MODEL_STATIC_NAME
-        build_tmp_binary_dir = get_build_binary_dir(library_name, target_abi)
-        if (configs[YAMLKeyword.target_socs] or flags.target_socs)\
-                and target_abi != ABIType.host:
-            opencl_output_bin_path = get_opencl_binary_output_path(
-                library_name, target_abi, self
-            )
-            opencl_parameter_path = get_opencl_parameter_output_path(
-                library_name, target_abi, self
-            )
-
-        for model_name in configs[YAMLKeyword.models]:
-            check_model_converted(library_name,
-                                  model_name,
-                                  configs[YAMLKeyword.model_graph_format],
-                                  configs[YAMLKeyword.model_data_format],
-                                  target_abi)
-            MaceLogger.header(
-                StringFormatter.block(
-                    'Benchmark model %s on %s' % (model_name,
-                                                  self.device_name)))
-            model_config = configs[YAMLKeyword.models][model_name]
-            model_runtime = model_config[YAMLKeyword.runtime]
-            subgraphs = model_config[YAMLKeyword.subgraphs]
-
-            model_output_base_dir, model_output_dir, mace_model_dir = \
-                get_build_model_dirs(library_name, model_name,
-                                     target_abi, self,
-                                     model_config[YAMLKeyword.model_file_path])
-            if os.path.exists(model_output_dir):
-                sh.rm('-rf', model_output_dir)
-            os.makedirs(model_output_dir)
-
-            if target_abi != ABIType.host:
-                self.clear_data_dir()
-            sh_commands.gen_input(
-                model_output_dir,
-                subgraphs[0][YAMLKeyword.input_tensors],
-                subgraphs[0][YAMLKeyword.input_shapes],
-                subgraphs[0][YAMLKeyword.validation_inputs_data],
-                input_ranges=subgraphs[0][YAMLKeyword.input_ranges],
-                input_data_types=subgraphs[0][YAMLKeyword.input_data_types]
-            )
-            runtime_list = []
-            if target_abi == ABIType.host:
-                runtime_list.append(RuntimeType.cpu)
-            elif model_runtime == RuntimeType.cpu_gpu:
-                runtime_list.extend([RuntimeType.cpu, RuntimeType.gpu])
-            else:
-                runtime_list.append(model_runtime)
-            for runtime in runtime_list:
-                device_type = parse_device_type(runtime)
-                if not subgraphs[0][YAMLKeyword.check_tensors]:
-                    output_nodes = subgraphs[0][YAMLKeyword.output_tensors]
-                    output_shapes = subgraphs[0][YAMLKeyword.output_shapes]
-                else:
-                    output_nodes = subgraphs[0][YAMLKeyword.check_tensors]
-                    output_shapes = subgraphs[0][YAMLKeyword.check_shapes]
-                self.benchmark_model(
-                    abi=target_abi,
-                    benchmark_binary_dir=build_tmp_binary_dir,
-                    benchmark_binary_name=bm_model_binary_name,
-                    vlog_level=0,
-                    embed_model_data=embed_model_data,
-                    model_output_dir=model_output_dir,
-                    input_nodes=subgraphs[0][YAMLKeyword.input_tensors],
-                    output_nodes=output_nodes,
-                    input_shapes=subgraphs[0][YAMLKeyword.input_shapes],
-                    output_shapes=output_shapes,
-                    input_data_formats=subgraphs[0][
-                        YAMLKeyword.input_data_formats],
-                    output_data_formats=subgraphs[0][
-                        YAMLKeyword.output_data_formats],
-                    max_num_runs=flags.max_num_runs,
-                    max_seconds=flags.max_seconds,
-                    mace_model_dir=mace_model_dir,
-                    model_tag=model_name,
-                    device_type=device_type,
-                    model_graph_format=configs[YAMLKeyword.model_graph_format],
-                    omp_num_threads=flags.omp_num_threads,
-                    cpu_affinity_policy=flags.cpu_affinity_policy,
-                    gpu_perf_hint=flags.gpu_perf_hint,
-                    gpu_priority_hint=flags.gpu_priority_hint,
-                    opencl_binary_file=opencl_output_bin_path,
-                    opencl_parameter_file=opencl_parameter_path,
-                    libmace_dynamic_library_path=LIBMACE_DYNAMIC_PATH,
-                    link_dynamic=link_dynamic)
-
     def run(self,
             abi,
             host_bin_path,
diff --git a/tools/sh_commands.py b/tools/sh_commands.py
index e9d051b3c718e1621e00c7160567944c29b940ae..1b69feb9bb8e190ea7a082e58313534cc2902403 100644
--- a/tools/sh_commands.py
+++ b/tools/sh_commands.py
@@ -661,10 +661,10 @@ def update_mace_run_binary(build_tmp_binary_dir, link_dynamic=False):
     if os.path.exists(mace_run_filepath):
         sh.rm("-rf", mace_run_filepath)
     if link_dynamic:
-        sh.cp("-f", "bazel-bin/mace/tools/validation/mace_run_dynamic",
+        sh.cp("-f", "bazel-bin/mace/tools/mace_run_dynamic",
               build_tmp_binary_dir)
     else:
-        sh.cp("-f", "bazel-bin/mace/tools/validation/mace_run_static",
+        sh.cp("-f", "bazel-bin/mace/tools/mace_run_static",
               build_tmp_binary_dir)
 
 
@@ -865,120 +865,3 @@ def packaging_lib(libmace_output_dir, project_name):
             _fg=True)
     six.print_("Packaging Done!\n")
     return tar_package_path
-
-
-################################
-# benchmark
-################################
-def build_run_throughput_test(abi,
-                              serialno,
-                              vlog_level,
-                              run_seconds,
-                              merged_lib_file,
-                              model_input_dir,
-                              embed_model_data,
-                              input_nodes,
-                              output_nodes,
-                              input_shapes,
-                              output_shapes,
-                              cpu_model_tag,
-                              gpu_model_tag,
-                              dsp_model_tag,
-                              apu_model_tag,
-                              phone_data_dir,
-                              strip="always",
-                              input_file_name="model_input"):
-    six.print_("* Build and run throughput_test")
-
-    model_tag_build_flag = ""
-    if cpu_model_tag:
-        model_tag_build_flag += "--copt=-DMACE_CPU_MODEL_TAG=%s " % \
-                                cpu_model_tag
-    if gpu_model_tag:
-        model_tag_build_flag += "--copt=-DMACE_GPU_MODEL_TAG=%s " % \
-                                gpu_model_tag
-    if dsp_model_tag:
-        model_tag_build_flag += "--copt=-DMACE_DSP_MODEL_TAG=%s " % \
-                                dsp_model_tag
-    if apu_model_tag:
-        model_tag_build_flag += "--copt=-DMACE_APU_MODEL_TAG=%s " % \
-                                apu_model_tag
-    sh.cp("-f", merged_lib_file, "mace/benchmark/libmace_merged.a")
-    sh.bazel(
-        "build",
-        "-c",
-        "opt",
-        "--strip",
-        strip,
-        "--verbose_failures",
-        "//mace/benchmark:model_throughput_test",
-        "--crosstool_top=//external:android/crosstool",
-        "--host_crosstool_top=@bazel_tools//tools/cpp:toolchain",
-        "--cpu=%s" % abi,
-        "--copt=-std=c++11",
-        "--copt=-D_GLIBCXX_USE_C99_MATH_TR1",
-        "--copt=-Werror=return-type",
-        "--copt=-O3",
-        "--define",
-        "neon=true",
-        "--define",
-        "openmp=true",
-        model_tag_build_flag,
-        _fg=True)
-
-    sh.rm("mace/benchmark/libmace_merged.a")
-    sh.adb("-s",
-           serialno,
-           "shell",
-           "mkdir",
-           "-p",
-           phone_data_dir)
-    adb_push("%s/%s_%s" % (model_input_dir, input_file_name,
-                           ",".join(input_nodes)),
-             phone_data_dir,
-             serialno)
-    adb_push("bazel-bin/mace/benchmark/model_throughput_test",
-             phone_data_dir,
-             serialno)
-    if not embed_model_data:
-        adb_push("codegen/models/%s/%s.data" % cpu_model_tag,
-                 phone_data_dir,
-                 serialno)
-        adb_push("codegen/models/%s/%s.data" % gpu_model_tag,
-                 phone_data_dir,
-                 serialno)
-        adb_push("codegen/models/%s/%s.data" % dsp_model_tag,
-                 phone_data_dir,
-                 serialno)
-
-    adb_push("third_party/nnlib/%s/libhexagon_controller.so" % abi,
-             phone_data_dir,
-             serialno)
-    if apu_model_tag:
-        adb_push("third_party/apu/libapu-frontend.so",
-                 phone_data_dir,
-                 serialno)
-    sh.adb(
-        "-s",
-        serialno,
-        "shell",
-        "LD_LIBRARY_PATH=%s" % phone_data_dir,
-        "MACE_CPP_MIN_VLOG_LEVEL=%s" % vlog_level,
-        "MACE_RUN_PARAMETER_PATH=%s/mace_run.config" %
-        phone_data_dir,
-        "%s/model_throughput_test" % phone_data_dir,
-        "--input_node=%s" % ",".join(input_nodes),
-        "--output_node=%s" % ",".join(output_nodes),
-        "--input_shape=%s" % ":".join(input_shapes),
-        "--output_shape=%s" % ":".join(output_shapes),
-        "--input_file=%s/%s" % (phone_data_dir, input_file_name),
-        "--cpu_model_data_file=%s/%s.data" % (phone_data_dir,
-                                              cpu_model_tag),
-        "--gpu_model_data_file=%s/%s.data" % (phone_data_dir,
-                                              gpu_model_tag),
-        "--dsp_model_data_file=%s/%s.data" % (phone_data_dir,
-                                              dsp_model_tag),
-        "--run_seconds=%s" % run_seconds,
-        _fg=True)
-
-    six.print_("throughput_test done!\n")