diff --git a/mace/core/BUILD b/mace/core/BUILD
index 19b6ecc166dda756bf9c00c53bf5fb24f93c2f72..417c1374d901af30154ae242a8c525cd73f9e8f6 100644
--- a/mace/core/BUILD
+++ b/mace/core/BUILD
@@ -42,7 +42,10 @@ cc_library(
             "runtime/opencl/*.h",
         ],
     )) + if_hexagon_enabled(glob(["runtime/hexagon/*.h"])),
-    copts = if_openmp_enabled(["-fopenmp"]) + if_android([
+    copts = if_openmp_enabled([
+        "-fopenmp",
+        "-DMACE_ENABLE_OPENMP",
+    ]) + if_android([
         "-DMACE_ENABLE_OPENCL",
     ]) + if_hexagon_enabled(["-DMACE_ENABLE_HEXAGON"]),
     linkopts = ["-ldl"] + if_android([
diff --git a/mace/core/runtime/cpu/cpu_runtime.cc b/mace/core/runtime/cpu/cpu_runtime.cc
index 39c7c12547fdfec4c5fb28ab6c238cc527d49e59..88754f2378a6b35e554a29b107928046f626e8a5 100644
--- a/mace/core/runtime/cpu/cpu_runtime.cc
+++ b/mace/core/runtime/cpu/cpu_runtime.cc
@@ -14,7 +14,10 @@
 
 #include "mace/core/runtime/cpu/cpu_runtime.h"
 
+#ifdef MACE_ENABLE_OPENMP
 #include <omp.h>
+#endif
+
 #include <unistd.h>
 #include <sys/syscall.h>
 #include <sys/types.h>
@@ -30,6 +33,24 @@ namespace mace {
 
 namespace {
 
+int GetCPUCount() {
+  char path[32];
+  int cpu_count = 0;
+  int result = 0;
+
+  while (true) {
+    snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d", cpu_count);
+    result = access(path, F_OK);
+    if (result != 0) {
+      if (errno != ENOENT) {
+        LOG(ERROR) << "Access " << path << " failed, errno: " << errno;
+      }
+      return cpu_count;
+    }
+    cpu_count++;
+  }
+}
+
 int GetCPUMaxFreq(int cpu_id) {
   char path[64];
   snprintf(path, sizeof(path),
@@ -99,7 +120,11 @@ MaceStatus GetCPUBigLittleCoreIDs(std::vector<int> *big_core_ids,
                                   std::vector<int> *little_core_ids) {
   MACE_CHECK_NOTNULL(big_core_ids);
   MACE_CHECK_NOTNULL(little_core_ids);
+#ifdef MACE_ENABLE_OPENMP
   int cpu_count = omp_get_num_procs();
+#else
+  int cpu_count = GetCPUCount();
+#endif
   std::vector<int> cpu_max_freq(cpu_count);
   std::vector<int> cpu_ids(cpu_count);
 
@@ -141,7 +166,11 @@ void SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
   VLOG(1) << "Set OpenMP threads number: " << omp_num_threads
           << ", CPU core IDs: " << MakeString(cpu_ids);
 
+#ifdef MACE_ENABLE_OPENMP
   omp_set_num_threads(omp_num_threads);
+#else
+  LOG(WARNING) << "OpenMP not enabled. Set OpenMP threads number failed.";
+#endif
 
   // compute mask
   cpu_set_t mask;
@@ -150,18 +179,27 @@ void SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
     CPU_SET(cpu_id, &mask);
   }
 
+#ifdef MACE_ENABLE_OPENMP
 #pragma omp parallel for
   for (int i = 0; i < omp_num_threads; ++i) {
     SetThreadAffinity(mask);
   }
+#else
+  SetThreadAffinity(mask);
+  LOG(INFO) << "SetThreadAffinity: " << mask.__bits[0];
+#endif
 }
 
 MaceStatus SetOpenMPThreadsAndAffinityPolicy(int omp_num_threads_hint,
                                              CPUAffinityPolicy policy) {
   if (policy == CPUAffinityPolicy::AFFINITY_NONE) {
+#ifdef MACE_ENABLE_OPENMP
     if (omp_num_threads_hint > 0) {
       omp_set_num_threads(std::min(omp_num_threads_hint, omp_get_num_procs()));
     }
+#else
+    LOG(WARNING) << "OpenMP not enabled. Set OpenMP threads number failed.";
+#endif
     return MACE_SUCCESS;
   }
 
diff --git a/tools/mace_tools.py b/tools/mace_tools.py
index 851b6be104fda1d2dc673065d443bfac6b5c7796..5ed245344b2265e7bab25c9e277d0ee0df63c038 100644
--- a/tools/mace_tools.py
+++ b/tools/mace_tools.py
@@ -170,26 +170,33 @@ def tuning_run(runtime,
                phone_data_dir,
                tuning=False,
                limit_opencl_kernel_time=0,
-               option_args=""):
+               omp_num_threads=-1,
+               cpu_affinity_policy=1,
+               gpu_perf_hint=3,
+               gpu_priority_hint=3):
     stdout = sh_commands.tuning_run(
-            target_abi,
-            serialno,
-            vlog_level,
-            embed_model_data,
-            model_output_dir,
-            input_nodes,
-            output_nodes,
-            input_shapes,
-            output_shapes,
-            model_name,
-            device_type,
-            running_round,
-            restart_round,
-            limit_opencl_kernel_time,
-            tuning,
-            out_of_range_check,
-            phone_data_dir,
-            option_args)
+        target_abi,
+        serialno,
+        vlog_level,
+        embed_model_data,
+        model_output_dir,
+        input_nodes,
+        output_nodes,
+        input_shapes,
+        output_shapes,
+        model_name,
+        device_type,
+        running_round,
+        restart_round,
+        limit_opencl_kernel_time,
+        tuning,
+        out_of_range_check,
+        phone_data_dir,
+        omp_num_threads,
+        cpu_affinity_policy,
+        gpu_perf_hint,
+        gpu_priority_hint
+    )
 
     if running_round > 0 and FLAGS.collect_report:
         model_benchmark_stdout_processor(
@@ -201,16 +208,19 @@ def build_mace_run_prod(hexagon_mode, runtime, target_abi,
                         model_output_dir, input_nodes, output_nodes,
                         input_shapes, output_shapes, model_name, device_type,
                         running_round, restart_round, tuning,
-                        limit_opencl_kernel_time, phone_data_dir):
+                        limit_opencl_kernel_time, phone_data_dir,
+                        enable_openmp):
     mace_run_target = "//mace/tools/validation:mace_run"
     if runtime == "gpu":
         gen_opencl_and_tuning_code(target_abi, serialno, [], False)
         sh_commands.bazel_build(
-                mace_run_target,
-                abi=target_abi,
-                model_tag=model_name,
-                production_mode=False,
-                hexagon_mode=hexagon_mode)
+            mace_run_target,
+            abi=target_abi,
+            model_tag=model_name,
+            production_mode=False,
+            hexagon_mode=hexagon_mode,
+            enable_openmp=enable_openmp
+        )
         sh_commands.update_mace_run_lib(model_output_dir, target_abi,
                                         model_name, embed_model_data)
 
@@ -230,21 +240,25 @@ def build_mace_run_prod(hexagon_mode, runtime, target_abi,
         gen_opencl_and_tuning_code(target_abi, serialno, [model_output_dir],
                                    True)
         sh_commands.bazel_build(
-                mace_run_target,
-                abi=target_abi,
-                model_tag=model_name,
-                production_mode=True,
-                hexagon_mode=hexagon_mode)
+            mace_run_target,
+            abi=target_abi,
+            model_tag=model_name,
+            production_mode=True,
+            hexagon_mode=hexagon_mode,
+            enable_openmp=enable_openmp
+        )
         sh_commands.update_mace_run_lib(model_output_dir, target_abi,
                                         model_name, embed_model_data)
     else:
         gen_opencl_and_tuning_code(target_abi, serialno, [], False)
         sh_commands.bazel_build(
-                mace_run_target,
-                abi=target_abi,
-                model_tag=model_name,
-                production_mode=True,
-                hexagon_mode=hexagon_mode)
+            mace_run_target,
+            abi=target_abi,
+            model_tag=model_name,
+            production_mode=True,
+            hexagon_mode=hexagon_mode,
+            enable_openmp=enable_openmp
+        )
         sh_commands.update_mace_run_lib(model_output_dir, target_abi,
                                         model_name, embed_model_data)
 
@@ -344,6 +358,31 @@ def parse_args():
         type="bool",
         default="false",
         help="Enable out of range check for opencl.")
+    parser.add_argument(
+        "--enable_openmp",
+        type="bool",
+        default="true",
+        help="Enable openmp.")
+    parser.add_argument(
+        "--omp_num_threads",
+        type=int,
+        default=-1,
+        help="num of openmp threads")
+    parser.add_argument(
+        "--cpu_affinity_policy",
+        type=int,
+        default=1,
+        help="0:AFFINITY_NONE/1:AFFINITY_BIG_ONLY/2:AFFINITY_LITTLE_ONLY")
+    parser.add_argument(
+        "--gpu_perf_hint",
+        type=int,
+        default=3,
+        help="0:DEFAULT/1:LOW/2:NORMAL/3:HIGH")
+    parser.add_argument(
+        "--gpu_priority_hint",
+        type=int,
+        default=3,
+        help="0:DEFAULT/1:LOW/2:NORMAL/3:HIGH")
     parser.add_argument(
         "--collect_report",
         type="bool",
@@ -358,8 +397,7 @@ def parse_args():
 
 
 def process_models(project_name, configs, embed_model_data, vlog_level,
-                   target_abi, phone_data_dir, option_args,
-                   target_soc="", serialno=""):
+                   target_abi, phone_data_dir, target_soc="", serialno=""):
     hexagon_mode = get_hexagon_mode(configs)
     model_output_dirs = []
     for model_name in configs["models"]:
@@ -450,7 +488,8 @@ def process_models(project_name, configs, embed_model_data, vlog_level,
                                 FLAGS.restart_round,
                                 FLAGS.tuning,
                                 model_config["limit_opencl_kernel_time"],
-                                phone_data_dir)
+                                phone_data_dir,
+                                FLAGS.enable_openmp)
 
         if FLAGS.mode == "run" or FLAGS.mode == "validate" or \
                 FLAGS.mode == "all":
@@ -469,7 +508,11 @@ def process_models(project_name, configs, embed_model_data, vlog_level,
                        FLAGS.round,
                        FLAGS.restart_round,
                        FLAGS.out_of_range_check,
-                       phone_data_dir)
+                       phone_data_dir,
+                       omp_num_threads=FLAGS.omp_num_threads,
+                       cpu_affinity_policy=FLAGS.cpu_affinity_policy,
+                       gpu_perf_hint=FLAGS.gpu_perf_hint,
+                       gpu_priority_hint=FLAGS.gpu_priority_hint)
 
         if FLAGS.mode == "benchmark":
             gen_opencl_and_tuning_code(
@@ -487,7 +530,10 @@ def process_models(project_name, configs, embed_model_data, vlog_level,
                                         device_type,
                                         hexagon_mode,
                                         phone_data_dir,
-                                        option_args)
+                                        FLAGS.omp_num_threads,
+                                        FLAGS.cpu_affinity_policy,
+                                        FLAGS.gpu_perf_hint,
+                                        FLAGS.gpu_priority_hint)
 
         if FLAGS.mode == "validate" or FLAGS.mode == "all":
             sh_commands.validate_model(target_abi,
@@ -573,9 +619,6 @@ def main(unused_args):
         sh_commands.gen_mace_version()
         sh_commands.gen_encrypted_opencl_source()
 
-    option_args = ' '.join(
-        [arg for arg in unused_args if arg.startswith('--')])
-
     target_socs = get_target_socs(configs)
 
     embed_model_data = configs.get("embed_model_data", 1)
@@ -597,13 +640,12 @@ def main(unused_args):
                               props["ro.product.model"]))
                         process_models(project_name, configs, embed_model_data,
                                        vlog_level, target_abi, phone_data_dir,
-                                       option_args, target_soc, serialno)
+                                       target_soc, serialno)
             else:
                 print("====================================================")
                 print("Run on host")
                 process_models(project_name, configs, embed_model_data,
-                               vlog_level, target_abi, phone_data_dir,
-                               option_args)
+                               vlog_level, target_abi, phone_data_dir)
 
     if FLAGS.mode == "build" or FLAGS.mode == "all":
         sh_commands.packaging_lib(FLAGS.output_dir, project_name)
diff --git a/tools/sh_commands.py b/tools/sh_commands.py
index b4027f66db7dd8113c8267177d7d380fc90583e0..a4aa06656cea32fdfed05d66a6bd9c2baaf0a745 100644
--- a/tools/sh_commands.py
+++ b/tools/sh_commands.py
@@ -273,7 +273,8 @@ def bazel_build(target,
                 production_mode=False,
                 hexagon_mode=False,
                 disable_no_tuning_warning=False,
-                debug=False):
+                debug=False,
+                enable_openmp=True):
     print("* Build %s with ABI %s" % (target, abi))
     stdout_buff = []
     process_output = make_output_processor(stdout_buff)
@@ -292,7 +293,7 @@ def bazel_build(target,
             "--copt=-DMACE_MODEL_TAG=%s" % model_tag,
             "--copt=-O3",
             "--define",
-            "openmp=true",
+            "openmp=%s" % str(enable_openmp).lower(),
             "--define",
             "production=%s" % str(production_mode).lower(),
             _out=process_output,
@@ -320,7 +321,7 @@ def bazel_build(target,
             "--define",
             "neon=true",
             "--define",
-            "openmp=true",
+            "openmp=%s" % str(enable_openmp).lower(),
             "--define",
             "production=%s" % str(production_mode).lower(),
             "--define",
@@ -576,15 +577,21 @@ def tuning_run(abi,
                tuning,
                out_of_range_check,
                phone_data_dir,
-               option_args="",
+               omp_num_threads=-1,
+               cpu_affinity_policy=1,
+               gpu_perf_hint=3,
+               gpu_priority_hint=3,
                input_file_name="model_input",
                output_file_name="model_out"):
     print("* Run '%s' with round=%s, restart_round=%s, tuning=%s, "
-          "out_of_range_check=%s" %
+          "out_of_range_check=%s, omp_num_threads=%s, cpu_affinity_policy=%s, "
+          "gpu_perf_hint=%s, gpu_priority_hint=%s" %
           (model_tag, running_round, restart_round, str(tuning),
-              str(out_of_range_check)))
+           str(out_of_range_check), omp_num_threads, cpu_affinity_policy,
+           gpu_perf_hint, gpu_priority_hint))
     if abi == "host":
-        p = subprocess.Popen([
+        p = subprocess.Popen(
+            [
                 "env",
                 "MACE_CPP_MIN_VLOG_LEVEL=%s" % vlog_level,
                 "%s/mace_run" % model_output_dir,
@@ -598,9 +605,13 @@ def tuning_run(abi,
                 "--device=%s" % device_type,
                 "--round=%s" % running_round,
                 "--restart_round=%s" % restart_round,
-                "%s" % option_args],
-                stderr=subprocess.PIPE,
-                stdout=subprocess.PIPE)
+                "--omp_num_threads=%s" % omp_num_threads,
+                "--cpu_affinity_policy=%s" % cpu_affinity_policy,
+                "--gpu_perf_hint=%s" % gpu_perf_hint,
+                "--gpu_priority_hint=%s" % gpu_priority_hint,
+            ],
+            stderr=subprocess.PIPE,
+            stdout=subprocess.PIPE)
         out, err = p.communicate()
         stdout = err + out
         print stdout
@@ -627,33 +638,36 @@ def tuning_run(abi,
         stdout_buff = []
         process_output = make_output_processor(stdout_buff)
         p = sh.adb(
-                "-s",
-                serialno,
-                "shell",
-                "LD_LIBRARY_PATH=%s" % phone_data_dir,
-                "MACE_TUNING=%s" % int(tuning),
-                "MACE_OUT_OF_RANGE_CHECK=%s" % int(out_of_range_check),
-                "MACE_CPP_MIN_VLOG_LEVEL=%s" % vlog_level,
-                "MACE_RUN_PARAMETER_PATH=%s/mace_run.config" %
-                phone_data_dir,
-                "MACE_CL_PROGRAM_PATH=%s/cl_program" % phone_data_dir,
-                "MACE_LIMIT_OPENCL_KERNEL_TIME=%s" %
-                limit_opencl_kernel_time,
-                "%s/mace_run" % phone_data_dir,
-                "--input_node=%s" % ",".join(input_nodes),
-                "--output_node=%s" % ",".join(output_nodes),
-                "--input_shape=%s" % ":".join(input_shapes),
-                "--output_shape=%s" % ":".join(output_shapes),
-                "--input_file=%s/%s" % (phone_data_dir, input_file_name),
-                "--output_file=%s/%s" % (phone_data_dir, output_file_name),
-                "--model_data_file=%s/%s.data" % (phone_data_dir, model_tag),
-                "--device=%s" % device_type,
-                "--round=%s" % running_round,
-                "--restart_round=%s" % restart_round,
-                "%s" % option_args,
-                _out=process_output,
-                _bg=True,
-                _err_to_out=True)
+            "-s",
+            serialno,
+            "shell",
+            "LD_LIBRARY_PATH=%s" % phone_data_dir,
+            "MACE_TUNING=%s" % int(tuning),
+            "MACE_OUT_OF_RANGE_CHECK=%s" % int(out_of_range_check),
+            "MACE_CPP_MIN_VLOG_LEVEL=%s" % vlog_level,
+            "MACE_RUN_PARAMETER_PATH=%s/mace_run.config" %
+            phone_data_dir,
+            "MACE_CL_PROGRAM_PATH=%s/cl_program" % phone_data_dir,
+            "MACE_LIMIT_OPENCL_KERNEL_TIME=%s" %
+            limit_opencl_kernel_time,
+            "%s/mace_run" % phone_data_dir,
+            "--input_node=%s" % ",".join(input_nodes),
+            "--output_node=%s" % ",".join(output_nodes),
+            "--input_shape=%s" % ":".join(input_shapes),
+            "--output_shape=%s" % ":".join(output_shapes),
+            "--input_file=%s/%s" % (phone_data_dir, input_file_name),
+            "--output_file=%s/%s" % (phone_data_dir, output_file_name),
+            "--model_data_file=%s/%s.data" % (phone_data_dir, model_tag),
+            "--device=%s" % device_type,
+            "--round=%s" % running_round,
+            "--restart_round=%s" % restart_round,
+            "--omp_num_threads=%s" % omp_num_threads,
+            "--cpu_affinity_policy=%s" % cpu_affinity_policy,
+            "--gpu_perf_hint=%s" % gpu_perf_hint,
+            "--gpu_priority_hint=%s" % gpu_priority_hint,
+            _out=process_output,
+            _bg=True,
+            _err_to_out=True)
         p.wait()
         print("Running finished!\n")
         return "".join(stdout_buff)
@@ -900,7 +914,10 @@ def benchmark_model(abi,
                     device_type,
                     hexagon_mode,
                     phone_data_dir,
-                    option_args="",
+                    omp_num_threads=-1,
+                    cpu_affinity_policy=1,
+                    gpu_perf_hint=3,
+                    gpu_priority_hint=3,
                     input_file_name="model_input",
                     output_file_name="model_out"):
     print("* Benchmark for %s" % model_tag)
@@ -924,7 +941,8 @@ def benchmark_model(abi,
     stdout_buff = []
     process_output = make_output_processor(stdout_buff)
     if abi == "host":
-        p = subprocess.Popen([
+        p = subprocess.Popen(
+            [
                 "env",
                 "MACE_CPP_MIN_VLOG_LEVEL=%s" % vlog_level,
                 "%s/benchmark_model" % model_output_dir,
@@ -935,7 +953,11 @@ def benchmark_model(abi,
                 "--input_file=%s/%s" % (model_output_dir, input_file_name),
                 "--model_data_file=%s/%s.data" % (model_output_dir, model_tag),
                 "--device=%s" % device_type,
-                "%s" % option_args])
+                "--omp_num_threads=%s" % omp_num_threads,
+                "--cpu_affinity_policy=%s" % cpu_affinity_policy,
+                "--gpu_perf_hint=%s" % gpu_perf_hint,
+                "--gpu_priority_hint=%s" % gpu_priority_hint,
+            ])
         p.wait()
     else:
         sh.adb("-s", serialno, "shell", "mkdir", "-p", phone_data_dir)
@@ -951,26 +973,29 @@ def benchmark_model(abi,
             adb_push("%s/%s.data" % (model_output_dir, model_tag),
                      phone_data_dir, serialno)
         p = sh.adb(
-                "-s",
-                serialno,
-                "shell",
-                "LD_LIBRARY_PATH=%s" % phone_data_dir,
-                "MACE_CPP_MIN_VLOG_LEVEL=%s" % vlog_level,
-                "MACE_RUN_PARAMETER_PATH=%s/mace_run.config" %
-                phone_data_dir,
-                "MACE_OPENCL_PROFILING=1",
-                "%s/benchmark_model" % phone_data_dir,
-                "--input_node=%s" % ",".join(input_nodes),
-                "--output_node=%s" % ",".join(output_nodes),
-                "--input_shape=%s" % ":".join(input_shapes),
-                "--output_shape=%s" % ":".join(output_shapes),
-                "--input_file=%s/%s" % (phone_data_dir, input_file_name),
-                "--model_data_file=%s/%s.data" % (phone_data_dir, model_tag),
-                "--device=%s" % device_type,
-                "%s" % option_args,
-                _out=process_output,
-                _bg=True,
-                _err_to_out=True)
+            "-s",
+            serialno,
+            "shell",
+            "LD_LIBRARY_PATH=%s" % phone_data_dir,
+            "MACE_CPP_MIN_VLOG_LEVEL=%s" % vlog_level,
+            "MACE_RUN_PARAMETER_PATH=%s/mace_run.config" %
+            phone_data_dir,
+            "MACE_OPENCL_PROFILING=1",
+            "%s/benchmark_model" % phone_data_dir,
+            "--input_node=%s" % ",".join(input_nodes),
+            "--output_node=%s" % ",".join(output_nodes),
+            "--input_shape=%s" % ":".join(input_shapes),
+            "--output_shape=%s" % ":".join(output_shapes),
+            "--input_file=%s/%s" % (phone_data_dir, input_file_name),
+            "--model_data_file=%s/%s.data" % (phone_data_dir, model_tag),
+            "--device=%s" % device_type,
+            "--omp_num_threads=%s" % omp_num_threads,
+            "--cpu_affinity_policy=%s" % cpu_affinity_policy,
+            "--gpu_perf_hint=%s" % gpu_perf_hint,
+            "--gpu_priority_hint=%s" % gpu_priority_hint,
+            _out=process_output,
+            _bg=True,
+            _err_to_out=True)
         p.wait()
 
     print("Benchmark done!\n")