Merge branch 'runtime' into 'master'

remove gpu/dsp openmp dependency See merge request !446

Merge branch 'runtime' into 'master'
remove gpu/dsp openmp dependency See merge request !446
c9490fd3 · Liangliang He · db100927 · 31fe19f4 · c9490fd3 · c9490fd3
4 changed file
--- a/mace/core/BUILD
+++ b/mace/core/BUILD
@@ -42,7 +42,10 @@ cc_library(
            "runtime/opencl/*.h",
        ],
    )) + if_hexagon_enabled(glob(["runtime/hexagon/*.h"])),
-    copts = if_openmp_enabled(["-fopenmp"]) + if_android([
+    copts = if_openmp_enabled([
+        "-fopenmp",
+        "-DMACE_ENABLE_OPENMP",
+    ]) + if_android([
        "-DMACE_ENABLE_OPENCL",
    ]) + if_hexagon_enabled(["-DMACE_ENABLE_HEXAGON"]),
    linkopts = ["-ldl"] + if_android([

--- a/mace/core/runtime/cpu/cpu_runtime.cc
+++ b/mace/core/runtime/cpu/cpu_runtime.cc
@@ -14,7 +14,10 @@

 #include "mace/core/runtime/cpu/cpu_runtime.h"

+#ifdef MACE_ENABLE_OPENMP
 #include <omp.h>
+#endif
+
 #include <unistd.h>
 #include <sys/syscall.h>
 #include <sys/types.h>
@@ -30,6 +33,24 @@ namespace mace {

 namespace {

+int GetCPUCount() {
+  char path[32];
+  int cpu_count = 0;
+  int result = 0;
+
+  while (true) {
+    snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d", cpu_count);
+    result = access(path, F_OK);
+    if (result != 0) {
+      if (errno != ENOENT) {
+        LOG(ERROR) << "Access " << path << " failed, errno: " << errno;
+      }
+      return cpu_count;
+    }
+    cpu_count++;
+  }
+}
+
 int GetCPUMaxFreq(int cpu_id) {
  char path[64];
  snprintf(path, sizeof(path),
@@ -99,7 +120,11 @@ MaceStatus GetCPUBigLittleCoreIDs(std::vector<int> *big_core_ids,
                                  std::vector<int> *little_core_ids) {
  MACE_CHECK_NOTNULL(big_core_ids);
  MACE_CHECK_NOTNULL(little_core_ids);
+#ifdef MACE_ENABLE_OPENMP
  int cpu_count = omp_get_num_procs();
+#else
+  int cpu_count = GetCPUCount();
+#endif
  std::vector<int> cpu_max_freq(cpu_count);
  std::vector<int> cpu_ids(cpu_count);

@@ -141,7 +166,11 @@ void SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
  VLOG(1) << "Set OpenMP threads number: " << omp_num_threads
          << ", CPU core IDs: " << MakeString(cpu_ids);

+#ifdef MACE_ENABLE_OPENMP
  omp_set_num_threads(omp_num_threads);
+#else
+  LOG(WARNING) << "OpenMP not enabled. Set OpenMP threads number failed.";
+#endif

  // compute mask
  cpu_set_t mask;
@@ -150,18 +179,27 @@ void SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
    CPU_SET(cpu_id, &mask);
  }

+#ifdef MACE_ENABLE_OPENMP
 #pragma omp parallel for
  for (int i = 0; i < omp_num_threads; ++i) {
    SetThreadAffinity(mask);
  }
+#else
+  SetThreadAffinity(mask);
+  LOG(INFO) << "SetThreadAffinity: " << mask.__bits[0];
+#endif
 }

 MaceStatus SetOpenMPThreadsAndAffinityPolicy(int omp_num_threads_hint,
                                             CPUAffinityPolicy policy) {
  if (policy == CPUAffinityPolicy::AFFINITY_NONE) {
+#ifdef MACE_ENABLE_OPENMP
    if (omp_num_threads_hint > 0) {
      omp_set_num_threads(std::min(omp_num_threads_hint, omp_get_num_procs()));
    }
+#else
+    LOG(WARNING) << "OpenMP not enabled. Set OpenMP threads number failed.";
+#endif
    return MACE_SUCCESS;
  }


--- a/tools/mace_tools.py
+++ b/tools/mace_tools.py
@@ -170,26 +170,33 @@ def tuning_run(runtime,
               phone_data_dir,
               tuning=False,
               limit_opencl_kernel_time=0,
-               option_args=""):
+               omp_num_threads=-1,
+               cpu_affinity_policy=1,
+               gpu_perf_hint=3,
+               gpu_priority_hint=3):
    stdout = sh_commands.tuning_run(
-            target_abi,
-            serialno,
-            vlog_level,
-            embed_model_data,
-            model_output_dir,
-            input_nodes,
-            output_nodes,
-            input_shapes,
-            output_shapes,
-            model_name,
-            device_type,
-            running_round,
-            restart_round,
-            limit_opencl_kernel_time,
-            tuning,
-            out_of_range_check,
-            phone_data_dir,
-            option_args)
+        target_abi,
+        serialno,
+        vlog_level,
+        embed_model_data,
+        model_output_dir,
+        input_nodes,
+        output_nodes,
+        input_shapes,
+        output_shapes,
+        model_name,
+        device_type,
+        running_round,
+        restart_round,
+        limit_opencl_kernel_time,
+        tuning,
+        out_of_range_check,
+        phone_data_dir,
+        omp_num_threads,
+        cpu_affinity_policy,
+        gpu_perf_hint,
+        gpu_priority_hint
+    )

    if running_round > 0 and FLAGS.collect_report:
        model_benchmark_stdout_processor(
@@ -201,16 +208,19 @@ def build_mace_run_prod(hexagon_mode, runtime, target_abi,
                        model_output_dir, input_nodes, output_nodes,
                        input_shapes, output_shapes, model_name, device_type,
                        running_round, restart_round, tuning,
-                        limit_opencl_kernel_time, phone_data_dir):
+                        limit_opencl_kernel_time, phone_data_dir,
+                        enable_openmp):
    mace_run_target = "//mace/tools/validation:mace_run"
    if runtime == "gpu":
        gen_opencl_and_tuning_code(target_abi, serialno, [], False)
        sh_commands.bazel_build(
-                mace_run_target,
-                abi=target_abi,
-                model_tag=model_name,
-                production_mode=False,
-                hexagon_mode=hexagon_mode)
+            mace_run_target,
+            abi=target_abi,
+            model_tag=model_name,
+            production_mode=False,
+            hexagon_mode=hexagon_mode,
+            enable_openmp=enable_openmp
+        )
        sh_commands.update_mace_run_lib(model_output_dir, target_abi,
                                        model_name, embed_model_data)

@@ -230,21 +240,25 @@ def build_mace_run_prod(hexagon_mode, runtime, target_abi,
        gen_opencl_and_tuning_code(target_abi, serialno, [model_output_dir],
                                   True)
        sh_commands.bazel_build(
-                mace_run_target,
-                abi=target_abi,
-                model_tag=model_name,
-                production_mode=True,
-                hexagon_mode=hexagon_mode)
+            mace_run_target,
+            abi=target_abi,
+            model_tag=model_name,
+            production_mode=True,
+            hexagon_mode=hexagon_mode,
+            enable_openmp=enable_openmp
+        )
        sh_commands.update_mace_run_lib(model_output_dir, target_abi,
                                        model_name, embed_model_data)
    else:
        gen_opencl_and_tuning_code(target_abi, serialno, [], False)
        sh_commands.bazel_build(
-                mace_run_target,
-                abi=target_abi,
-                model_tag=model_name,
-                production_mode=True,
-                hexagon_mode=hexagon_mode)
+            mace_run_target,
+            abi=target_abi,
+            model_tag=model_name,
+            production_mode=True,
+            hexagon_mode=hexagon_mode,
+            enable_openmp=enable_openmp
+        )
        sh_commands.update_mace_run_lib(model_output_dir, target_abi,
                                        model_name, embed_model_data)

@@ -344,6 +358,31 @@ def parse_args():
        type="bool",
        default="false",
        help="Enable out of range check for opencl.")
+    parser.add_argument(
+        "--enable_openmp",
+        type="bool",
+        default="true",
+        help="Enable openmp.")
+    parser.add_argument(
+        "--omp_num_threads",
+        type=int,
+        default=-1,
+        help="num of openmp threads")
+    parser.add_argument(
+        "--cpu_affinity_policy",
+        type=int,
+        default=1,
+        help="0:AFFINITY_NONE/1:AFFINITY_BIG_ONLY/2:AFFINITY_LITTLE_ONLY")
+    parser.add_argument(
+        "--gpu_perf_hint",
+        type=int,
+        default=3,
+        help="0:DEFAULT/1:LOW/2:NORMAL/3:HIGH")
+    parser.add_argument(
+        "--gpu_priority_hint",
+        type=int,
+        default=3,
+        help="0:DEFAULT/1:LOW/2:NORMAL/3:HIGH")
    parser.add_argument(
        "--collect_report",
        type="bool",
@@ -358,8 +397,7 @@ def parse_args():


 def process_models(project_name, configs, embed_model_data, vlog_level,
-                   target_abi, phone_data_dir, option_args,
-                   target_soc="", serialno=""):
+                   target_abi, phone_data_dir, target_soc="", serialno=""):
    hexagon_mode = get_hexagon_mode(configs)
    model_output_dirs = []
    for model_name in configs["models"]:
@@ -450,7 +488,8 @@ def process_models(project_name, configs, embed_model_data, vlog_level,
                                FLAGS.restart_round,
                                FLAGS.tuning,
                                model_config["limit_opencl_kernel_time"],
-                                phone_data_dir)
+                                phone_data_dir,
+                                FLAGS.enable_openmp)

        if FLAGS.mode == "run" or FLAGS.mode == "validate" or \
                FLAGS.mode == "all":
@@ -469,7 +508,11 @@ def process_models(project_name, configs, embed_model_data, vlog_level,
                       FLAGS.round,
                       FLAGS.restart_round,
                       FLAGS.out_of_range_check,
-                       phone_data_dir)
+                       phone_data_dir,
+                       omp_num_threads=FLAGS.omp_num_threads,
+                       cpu_affinity_policy=FLAGS.cpu_affinity_policy,
+                       gpu_perf_hint=FLAGS.gpu_perf_hint,
+                       gpu_priority_hint=FLAGS.gpu_priority_hint)

        if FLAGS.mode == "benchmark":
            gen_opencl_and_tuning_code(
@@ -487,7 +530,10 @@ def process_models(project_name, configs, embed_model_data, vlog_level,
                                        device_type,
                                        hexagon_mode,
                                        phone_data_dir,
-                                        option_args)
+                                        FLAGS.omp_num_threads,
+                                        FLAGS.cpu_affinity_policy,
+                                        FLAGS.gpu_perf_hint,
+                                        FLAGS.gpu_priority_hint)

        if FLAGS.mode == "validate" or FLAGS.mode == "all":
            sh_commands.validate_model(target_abi,
@@ -573,9 +619,6 @@ def main(unused_args):
        sh_commands.gen_mace_version()
        sh_commands.gen_encrypted_opencl_source()

-    option_args = ' '.join(
-        [arg for arg in unused_args if arg.startswith('--')])
-
    target_socs = get_target_socs(configs)

    embed_model_data = configs.get("embed_model_data", 1)
@@ -597,13 +640,12 @@ def main(unused_args):
                              props["ro.product.model"]))
                        process_models(project_name, configs, embed_model_data,
                                       vlog_level, target_abi, phone_data_dir,
-                                       option_args, target_soc, serialno)
+                                       target_soc, serialno)
            else:
                print("====================================================")
                print("Run on host")
                process_models(project_name, configs, embed_model_data,
-                               vlog_level, target_abi, phone_data_dir,
-                               option_args)
+                               vlog_level, target_abi, phone_data_dir)

    if FLAGS.mode == "build" or FLAGS.mode == "all":
        sh_commands.packaging_lib(FLAGS.output_dir, project_name)

--- a/tools/sh_commands.py
+++ b/tools/sh_commands.py
@@ -273,7 +273,8 @@ def bazel_build(target,
                production_mode=False,
                hexagon_mode=False,
                disable_no_tuning_warning=False,
-                debug=False):
+                debug=False,
+                enable_openmp=True):
    print("* Build %s with ABI %s" % (target, abi))
    stdout_buff = []
    process_output = make_output_processor(stdout_buff)
@@ -292,7 +293,7 @@ def bazel_build(target,
            "--copt=-DMACE_MODEL_TAG=%s" % model_tag,
            "--copt=-O3",
            "--define",
-            "openmp=true",
+            "openmp=%s" % str(enable_openmp).lower(),
            "--define",
            "production=%s" % str(production_mode).lower(),
            _out=process_output,
@@ -320,7 +321,7 @@ def bazel_build(target,
            "--define",
            "neon=true",
            "--define",
-            "openmp=true",
+            "openmp=%s" % str(enable_openmp).lower(),
            "--define",
            "production=%s" % str(production_mode).lower(),
            "--define",
@@ -576,15 +577,21 @@ def tuning_run(abi,
               tuning,
               out_of_range_check,
               phone_data_dir,
-               option_args="",
+               omp_num_threads=-1,
+               cpu_affinity_policy=1,
+               gpu_perf_hint=3,
+               gpu_priority_hint=3,
               input_file_name="model_input",
               output_file_name="model_out"):
    print("* Run '%s' with round=%s, restart_round=%s, tuning=%s, "
-          "out_of_range_check=%s" %
+          "out_of_range_check=%s, omp_num_threads=%s, cpu_affinity_policy=%s, "
+          "gpu_perf_hint=%s, gpu_priority_hint=%s" %
          (model_tag, running_round, restart_round, str(tuning),
-              str(out_of_range_check)))
+           str(out_of_range_check), omp_num_threads, cpu_affinity_policy,
+           gpu_perf_hint, gpu_priority_hint))
    if abi == "host":
-        p = subprocess.Popen([
+        p = subprocess.Popen(
+            [
                "env",
                "MACE_CPP_MIN_VLOG_LEVEL=%s" % vlog_level,
                "%s/mace_run" % model_output_dir,
@@ -598,9 +605,13 @@ def tuning_run(abi,
                "--device=%s" % device_type,
                "--round=%s" % running_round,
                "--restart_round=%s" % restart_round,
-                "%s" % option_args],
-                stderr=subprocess.PIPE,
-                stdout=subprocess.PIPE)
+                "--omp_num_threads=%s" % omp_num_threads,
+                "--cpu_affinity_policy=%s" % cpu_affinity_policy,
+                "--gpu_perf_hint=%s" % gpu_perf_hint,
+                "--gpu_priority_hint=%s" % gpu_priority_hint,
+            ],
+            stderr=subprocess.PIPE,
+            stdout=subprocess.PIPE)
        out, err = p.communicate()
        stdout = err + out
        print stdout
@@ -627,33 +638,36 @@ def tuning_run(abi,
        stdout_buff = []
        process_output = make_output_processor(stdout_buff)
        p = sh.adb(
-                "-s",
-                serialno,
-                "shell",
-                "LD_LIBRARY_PATH=%s" % phone_data_dir,
-                "MACE_TUNING=%s" % int(tuning),
-                "MACE_OUT_OF_RANGE_CHECK=%s" % int(out_of_range_check),
-                "MACE_CPP_MIN_VLOG_LEVEL=%s" % vlog_level,
-                "MACE_RUN_PARAMETER_PATH=%s/mace_run.config" %
-                phone_data_dir,
-                "MACE_CL_PROGRAM_PATH=%s/cl_program" % phone_data_dir,
-                "MACE_LIMIT_OPENCL_KERNEL_TIME=%s" %
-                limit_opencl_kernel_time,
-                "%s/mace_run" % phone_data_dir,
-                "--input_node=%s" % ",".join(input_nodes),
-                "--output_node=%s" % ",".join(output_nodes),
-                "--input_shape=%s" % ":".join(input_shapes),
-                "--output_shape=%s" % ":".join(output_shapes),
-                "--input_file=%s/%s" % (phone_data_dir, input_file_name),
-                "--output_file=%s/%s" % (phone_data_dir, output_file_name),
-                "--model_data_file=%s/%s.data" % (phone_data_dir, model_tag),
-                "--device=%s" % device_type,
-                "--round=%s" % running_round,
-                "--restart_round=%s" % restart_round,
-                "%s" % option_args,
-                _out=process_output,
-                _bg=True,
-                _err_to_out=True)
+            "-s",
+            serialno,
+            "shell",
+            "LD_LIBRARY_PATH=%s" % phone_data_dir,
+            "MACE_TUNING=%s" % int(tuning),
+            "MACE_OUT_OF_RANGE_CHECK=%s" % int(out_of_range_check),
+            "MACE_CPP_MIN_VLOG_LEVEL=%s" % vlog_level,
+            "MACE_RUN_PARAMETER_PATH=%s/mace_run.config" %
+            phone_data_dir,
+            "MACE_CL_PROGRAM_PATH=%s/cl_program" % phone_data_dir,
+            "MACE_LIMIT_OPENCL_KERNEL_TIME=%s" %
+            limit_opencl_kernel_time,
+            "%s/mace_run" % phone_data_dir,
+            "--input_node=%s" % ",".join(input_nodes),
+            "--output_node=%s" % ",".join(output_nodes),
+            "--input_shape=%s" % ":".join(input_shapes),
+            "--output_shape=%s" % ":".join(output_shapes),
+            "--input_file=%s/%s" % (phone_data_dir, input_file_name),
+            "--output_file=%s/%s" % (phone_data_dir, output_file_name),
+            "--model_data_file=%s/%s.data" % (phone_data_dir, model_tag),
+            "--device=%s" % device_type,
+            "--round=%s" % running_round,
+            "--restart_round=%s" % restart_round,
+            "--omp_num_threads=%s" % omp_num_threads,
+            "--cpu_affinity_policy=%s" % cpu_affinity_policy,
+            "--gpu_perf_hint=%s" % gpu_perf_hint,
+            "--gpu_priority_hint=%s" % gpu_priority_hint,
+            _out=process_output,
+            _bg=True,
+            _err_to_out=True)
        p.wait()
        print("Running finished!\n")
        return "".join(stdout_buff)
@@ -900,7 +914,10 @@ def benchmark_model(abi,
                    device_type,
                    hexagon_mode,
                    phone_data_dir,
-                    option_args="",
+                    omp_num_threads=-1,
+                    cpu_affinity_policy=1,
+                    gpu_perf_hint=3,
+                    gpu_priority_hint=3,
                    input_file_name="model_input",
                    output_file_name="model_out"):
    print("* Benchmark for %s" % model_tag)
@@ -924,7 +941,8 @@ def benchmark_model(abi,
    stdout_buff = []
    process_output = make_output_processor(stdout_buff)
    if abi == "host":
-        p = subprocess.Popen([
+        p = subprocess.Popen(
+            [
                "env",
                "MACE_CPP_MIN_VLOG_LEVEL=%s" % vlog_level,
                "%s/benchmark_model" % model_output_dir,
@@ -935,7 +953,11 @@ def benchmark_model(abi,
                "--input_file=%s/%s" % (model_output_dir, input_file_name),
                "--model_data_file=%s/%s.data" % (model_output_dir, model_tag),
                "--device=%s" % device_type,
-                "%s" % option_args])
+                "--omp_num_threads=%s" % omp_num_threads,
+                "--cpu_affinity_policy=%s" % cpu_affinity_policy,
+                "--gpu_perf_hint=%s" % gpu_perf_hint,
+                "--gpu_priority_hint=%s" % gpu_priority_hint,
+            ])
        p.wait()
    else:
        sh.adb("-s", serialno, "shell", "mkdir", "-p", phone_data_dir)
@@ -951,26 +973,29 @@ def benchmark_model(abi,
            adb_push("%s/%s.data" % (model_output_dir, model_tag),
                     phone_data_dir, serialno)
        p = sh.adb(
-                "-s",
-                serialno,
-                "shell",
-                "LD_LIBRARY_PATH=%s" % phone_data_dir,
-                "MACE_CPP_MIN_VLOG_LEVEL=%s" % vlog_level,
-                "MACE_RUN_PARAMETER_PATH=%s/mace_run.config" %
-                phone_data_dir,
-                "MACE_OPENCL_PROFILING=1",
-                "%s/benchmark_model" % phone_data_dir,
-                "--input_node=%s" % ",".join(input_nodes),
-                "--output_node=%s" % ",".join(output_nodes),
-                "--input_shape=%s" % ":".join(input_shapes),
-                "--output_shape=%s" % ":".join(output_shapes),
-                "--input_file=%s/%s" % (phone_data_dir, input_file_name),
-                "--model_data_file=%s/%s.data" % (phone_data_dir, model_tag),
-                "--device=%s" % device_type,
-                "%s" % option_args,
-                _out=process_output,
-                _bg=True,
-                _err_to_out=True)
+            "-s",
+            serialno,
+            "shell",
+            "LD_LIBRARY_PATH=%s" % phone_data_dir,
+            "MACE_CPP_MIN_VLOG_LEVEL=%s" % vlog_level,
+            "MACE_RUN_PARAMETER_PATH=%s/mace_run.config" %
+            phone_data_dir,
+            "MACE_OPENCL_PROFILING=1",
+            "%s/benchmark_model" % phone_data_dir,
+            "--input_node=%s" % ",".join(input_nodes),
+            "--output_node=%s" % ",".join(output_nodes),
+            "--input_shape=%s" % ":".join(input_shapes),
+            "--output_shape=%s" % ":".join(output_shapes),
+            "--input_file=%s/%s" % (phone_data_dir, input_file_name),
+            "--model_data_file=%s/%s.data" % (phone_data_dir, model_tag),
+            "--device=%s" % device_type,
+            "--omp_num_threads=%s" % omp_num_threads,
+            "--cpu_affinity_policy=%s" % cpu_affinity_policy,
+            "--gpu_perf_hint=%s" % gpu_perf_hint,
+            "--gpu_priority_hint=%s" % gpu_priority_hint,
+            _out=process_output,
+            _bg=True,
+            _err_to_out=True)
        p.wait()

    print("Benchmark done!\n")