diff --git a/mace/core/BUILD b/mace/core/BUILD index 19b6ecc166dda756bf9c00c53bf5fb24f93c2f72..417c1374d901af30154ae242a8c525cd73f9e8f6 100644 --- a/mace/core/BUILD +++ b/mace/core/BUILD @@ -42,7 +42,10 @@ cc_library( "runtime/opencl/*.h", ], )) + if_hexagon_enabled(glob(["runtime/hexagon/*.h"])), - copts = if_openmp_enabled(["-fopenmp"]) + if_android([ + copts = if_openmp_enabled([ + "-fopenmp", + "-DMACE_ENABLE_OPENMP", + ]) + if_android([ "-DMACE_ENABLE_OPENCL", ]) + if_hexagon_enabled(["-DMACE_ENABLE_HEXAGON"]), linkopts = ["-ldl"] + if_android([ diff --git a/mace/core/runtime/cpu/cpu_runtime.cc b/mace/core/runtime/cpu/cpu_runtime.cc index 39c7c12547fdfec4c5fb28ab6c238cc527d49e59..88754f2378a6b35e554a29b107928046f626e8a5 100644 --- a/mace/core/runtime/cpu/cpu_runtime.cc +++ b/mace/core/runtime/cpu/cpu_runtime.cc @@ -14,7 +14,10 @@ #include "mace/core/runtime/cpu/cpu_runtime.h" +#ifdef MACE_ENABLE_OPENMP #include +#endif + #include #include #include @@ -30,6 +33,24 @@ namespace mace { namespace { +int GetCPUCount() { + char path[32]; + int cpu_count = 0; + int result = 0; + + while (true) { + snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d", cpu_count); + result = access(path, F_OK); + if (result != 0) { + if (errno != ENOENT) { + LOG(ERROR) << "Access " << path << " failed, errno: " << errno; + } + return cpu_count; + } + cpu_count++; + } +} + int GetCPUMaxFreq(int cpu_id) { char path[64]; snprintf(path, sizeof(path), @@ -99,7 +120,11 @@ MaceStatus GetCPUBigLittleCoreIDs(std::vector *big_core_ids, std::vector *little_core_ids) { MACE_CHECK_NOTNULL(big_core_ids); MACE_CHECK_NOTNULL(little_core_ids); +#ifdef MACE_ENABLE_OPENMP int cpu_count = omp_get_num_procs(); +#else + int cpu_count = GetCPUCount(); +#endif std::vector cpu_max_freq(cpu_count); std::vector cpu_ids(cpu_count); @@ -141,7 +166,11 @@ void SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads, VLOG(1) << "Set OpenMP threads number: " << omp_num_threads << ", CPU core IDs: " << MakeString(cpu_ids); +#ifdef MACE_ENABLE_OPENMP omp_set_num_threads(omp_num_threads); +#else + LOG(WARNING) << "OpenMP not enabled. Set OpenMP threads number failed."; +#endif // compute mask cpu_set_t mask; @@ -150,18 +179,27 @@ void SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads, CPU_SET(cpu_id, &mask); } +#ifdef MACE_ENABLE_OPENMP #pragma omp parallel for for (int i = 0; i < omp_num_threads; ++i) { SetThreadAffinity(mask); } +#else + SetThreadAffinity(mask); + LOG(INFO) << "SetThreadAffinity: " << mask.__bits[0]; +#endif } MaceStatus SetOpenMPThreadsAndAffinityPolicy(int omp_num_threads_hint, CPUAffinityPolicy policy) { if (policy == CPUAffinityPolicy::AFFINITY_NONE) { +#ifdef MACE_ENABLE_OPENMP if (omp_num_threads_hint > 0) { omp_set_num_threads(std::min(omp_num_threads_hint, omp_get_num_procs())); } +#else + LOG(WARNING) << "OpenMP not enabled. Set OpenMP threads number failed."; +#endif return MACE_SUCCESS; } diff --git a/tools/mace_tools.py b/tools/mace_tools.py index 851b6be104fda1d2dc673065d443bfac6b5c7796..5ed245344b2265e7bab25c9e277d0ee0df63c038 100644 --- a/tools/mace_tools.py +++ b/tools/mace_tools.py @@ -170,26 +170,33 @@ def tuning_run(runtime, phone_data_dir, tuning=False, limit_opencl_kernel_time=0, - option_args=""): + omp_num_threads=-1, + cpu_affinity_policy=1, + gpu_perf_hint=3, + gpu_priority_hint=3): stdout = sh_commands.tuning_run( - target_abi, - serialno, - vlog_level, - embed_model_data, - model_output_dir, - input_nodes, - output_nodes, - input_shapes, - output_shapes, - model_name, - device_type, - running_round, - restart_round, - limit_opencl_kernel_time, - tuning, - out_of_range_check, - phone_data_dir, - option_args) + target_abi, + serialno, + vlog_level, + embed_model_data, + model_output_dir, + input_nodes, + output_nodes, + input_shapes, + output_shapes, + model_name, + device_type, + running_round, + restart_round, + limit_opencl_kernel_time, + tuning, + out_of_range_check, + phone_data_dir, + omp_num_threads, + cpu_affinity_policy, + gpu_perf_hint, + gpu_priority_hint + ) if running_round > 0 and FLAGS.collect_report: model_benchmark_stdout_processor( @@ -201,16 +208,19 @@ def build_mace_run_prod(hexagon_mode, runtime, target_abi, model_output_dir, input_nodes, output_nodes, input_shapes, output_shapes, model_name, device_type, running_round, restart_round, tuning, - limit_opencl_kernel_time, phone_data_dir): + limit_opencl_kernel_time, phone_data_dir, + enable_openmp): mace_run_target = "//mace/tools/validation:mace_run" if runtime == "gpu": gen_opencl_and_tuning_code(target_abi, serialno, [], False) sh_commands.bazel_build( - mace_run_target, - abi=target_abi, - model_tag=model_name, - production_mode=False, - hexagon_mode=hexagon_mode) + mace_run_target, + abi=target_abi, + model_tag=model_name, + production_mode=False, + hexagon_mode=hexagon_mode, + enable_openmp=enable_openmp + ) sh_commands.update_mace_run_lib(model_output_dir, target_abi, model_name, embed_model_data) @@ -230,21 +240,25 @@ def build_mace_run_prod(hexagon_mode, runtime, target_abi, gen_opencl_and_tuning_code(target_abi, serialno, [model_output_dir], True) sh_commands.bazel_build( - mace_run_target, - abi=target_abi, - model_tag=model_name, - production_mode=True, - hexagon_mode=hexagon_mode) + mace_run_target, + abi=target_abi, + model_tag=model_name, + production_mode=True, + hexagon_mode=hexagon_mode, + enable_openmp=enable_openmp + ) sh_commands.update_mace_run_lib(model_output_dir, target_abi, model_name, embed_model_data) else: gen_opencl_and_tuning_code(target_abi, serialno, [], False) sh_commands.bazel_build( - mace_run_target, - abi=target_abi, - model_tag=model_name, - production_mode=True, - hexagon_mode=hexagon_mode) + mace_run_target, + abi=target_abi, + model_tag=model_name, + production_mode=True, + hexagon_mode=hexagon_mode, + enable_openmp=enable_openmp + ) sh_commands.update_mace_run_lib(model_output_dir, target_abi, model_name, embed_model_data) @@ -344,6 +358,31 @@ def parse_args(): type="bool", default="false", help="Enable out of range check for opencl.") + parser.add_argument( + "--enable_openmp", + type="bool", + default="true", + help="Enable openmp.") + parser.add_argument( + "--omp_num_threads", + type=int, + default=-1, + help="num of openmp threads") + parser.add_argument( + "--cpu_affinity_policy", + type=int, + default=1, + help="0:AFFINITY_NONE/1:AFFINITY_BIG_ONLY/2:AFFINITY_LITTLE_ONLY") + parser.add_argument( + "--gpu_perf_hint", + type=int, + default=3, + help="0:DEFAULT/1:LOW/2:NORMAL/3:HIGH") + parser.add_argument( + "--gpu_priority_hint", + type=int, + default=3, + help="0:DEFAULT/1:LOW/2:NORMAL/3:HIGH") parser.add_argument( "--collect_report", type="bool", @@ -358,8 +397,7 @@ def parse_args(): def process_models(project_name, configs, embed_model_data, vlog_level, - target_abi, phone_data_dir, option_args, - target_soc="", serialno=""): + target_abi, phone_data_dir, target_soc="", serialno=""): hexagon_mode = get_hexagon_mode(configs) model_output_dirs = [] for model_name in configs["models"]: @@ -450,7 +488,8 @@ def process_models(project_name, configs, embed_model_data, vlog_level, FLAGS.restart_round, FLAGS.tuning, model_config["limit_opencl_kernel_time"], - phone_data_dir) + phone_data_dir, + FLAGS.enable_openmp) if FLAGS.mode == "run" or FLAGS.mode == "validate" or \ FLAGS.mode == "all": @@ -469,7 +508,11 @@ def process_models(project_name, configs, embed_model_data, vlog_level, FLAGS.round, FLAGS.restart_round, FLAGS.out_of_range_check, - phone_data_dir) + phone_data_dir, + omp_num_threads=FLAGS.omp_num_threads, + cpu_affinity_policy=FLAGS.cpu_affinity_policy, + gpu_perf_hint=FLAGS.gpu_perf_hint, + gpu_priority_hint=FLAGS.gpu_priority_hint) if FLAGS.mode == "benchmark": gen_opencl_and_tuning_code( @@ -487,7 +530,10 @@ def process_models(project_name, configs, embed_model_data, vlog_level, device_type, hexagon_mode, phone_data_dir, - option_args) + FLAGS.omp_num_threads, + FLAGS.cpu_affinity_policy, + FLAGS.gpu_perf_hint, + FLAGS.gpu_priority_hint) if FLAGS.mode == "validate" or FLAGS.mode == "all": sh_commands.validate_model(target_abi, @@ -573,9 +619,6 @@ def main(unused_args): sh_commands.gen_mace_version() sh_commands.gen_encrypted_opencl_source() - option_args = ' '.join( - [arg for arg in unused_args if arg.startswith('--')]) - target_socs = get_target_socs(configs) embed_model_data = configs.get("embed_model_data", 1) @@ -597,13 +640,12 @@ def main(unused_args): props["ro.product.model"])) process_models(project_name, configs, embed_model_data, vlog_level, target_abi, phone_data_dir, - option_args, target_soc, serialno) + target_soc, serialno) else: print("====================================================") print("Run on host") process_models(project_name, configs, embed_model_data, - vlog_level, target_abi, phone_data_dir, - option_args) + vlog_level, target_abi, phone_data_dir) if FLAGS.mode == "build" or FLAGS.mode == "all": sh_commands.packaging_lib(FLAGS.output_dir, project_name) diff --git a/tools/sh_commands.py b/tools/sh_commands.py index b4027f66db7dd8113c8267177d7d380fc90583e0..a4aa06656cea32fdfed05d66a6bd9c2baaf0a745 100644 --- a/tools/sh_commands.py +++ b/tools/sh_commands.py @@ -273,7 +273,8 @@ def bazel_build(target, production_mode=False, hexagon_mode=False, disable_no_tuning_warning=False, - debug=False): + debug=False, + enable_openmp=True): print("* Build %s with ABI %s" % (target, abi)) stdout_buff = [] process_output = make_output_processor(stdout_buff) @@ -292,7 +293,7 @@ def bazel_build(target, "--copt=-DMACE_MODEL_TAG=%s" % model_tag, "--copt=-O3", "--define", - "openmp=true", + "openmp=%s" % str(enable_openmp).lower(), "--define", "production=%s" % str(production_mode).lower(), _out=process_output, @@ -320,7 +321,7 @@ def bazel_build(target, "--define", "neon=true", "--define", - "openmp=true", + "openmp=%s" % str(enable_openmp).lower(), "--define", "production=%s" % str(production_mode).lower(), "--define", @@ -576,15 +577,21 @@ def tuning_run(abi, tuning, out_of_range_check, phone_data_dir, - option_args="", + omp_num_threads=-1, + cpu_affinity_policy=1, + gpu_perf_hint=3, + gpu_priority_hint=3, input_file_name="model_input", output_file_name="model_out"): print("* Run '%s' with round=%s, restart_round=%s, tuning=%s, " - "out_of_range_check=%s" % + "out_of_range_check=%s, omp_num_threads=%s, cpu_affinity_policy=%s, " + "gpu_perf_hint=%s, gpu_priority_hint=%s" % (model_tag, running_round, restart_round, str(tuning), - str(out_of_range_check))) + str(out_of_range_check), omp_num_threads, cpu_affinity_policy, + gpu_perf_hint, gpu_priority_hint)) if abi == "host": - p = subprocess.Popen([ + p = subprocess.Popen( + [ "env", "MACE_CPP_MIN_VLOG_LEVEL=%s" % vlog_level, "%s/mace_run" % model_output_dir, @@ -598,9 +605,13 @@ def tuning_run(abi, "--device=%s" % device_type, "--round=%s" % running_round, "--restart_round=%s" % restart_round, - "%s" % option_args], - stderr=subprocess.PIPE, - stdout=subprocess.PIPE) + "--omp_num_threads=%s" % omp_num_threads, + "--cpu_affinity_policy=%s" % cpu_affinity_policy, + "--gpu_perf_hint=%s" % gpu_perf_hint, + "--gpu_priority_hint=%s" % gpu_priority_hint, + ], + stderr=subprocess.PIPE, + stdout=subprocess.PIPE) out, err = p.communicate() stdout = err + out print stdout @@ -627,33 +638,36 @@ def tuning_run(abi, stdout_buff = [] process_output = make_output_processor(stdout_buff) p = sh.adb( - "-s", - serialno, - "shell", - "LD_LIBRARY_PATH=%s" % phone_data_dir, - "MACE_TUNING=%s" % int(tuning), - "MACE_OUT_OF_RANGE_CHECK=%s" % int(out_of_range_check), - "MACE_CPP_MIN_VLOG_LEVEL=%s" % vlog_level, - "MACE_RUN_PARAMETER_PATH=%s/mace_run.config" % - phone_data_dir, - "MACE_CL_PROGRAM_PATH=%s/cl_program" % phone_data_dir, - "MACE_LIMIT_OPENCL_KERNEL_TIME=%s" % - limit_opencl_kernel_time, - "%s/mace_run" % phone_data_dir, - "--input_node=%s" % ",".join(input_nodes), - "--output_node=%s" % ",".join(output_nodes), - "--input_shape=%s" % ":".join(input_shapes), - "--output_shape=%s" % ":".join(output_shapes), - "--input_file=%s/%s" % (phone_data_dir, input_file_name), - "--output_file=%s/%s" % (phone_data_dir, output_file_name), - "--model_data_file=%s/%s.data" % (phone_data_dir, model_tag), - "--device=%s" % device_type, - "--round=%s" % running_round, - "--restart_round=%s" % restart_round, - "%s" % option_args, - _out=process_output, - _bg=True, - _err_to_out=True) + "-s", + serialno, + "shell", + "LD_LIBRARY_PATH=%s" % phone_data_dir, + "MACE_TUNING=%s" % int(tuning), + "MACE_OUT_OF_RANGE_CHECK=%s" % int(out_of_range_check), + "MACE_CPP_MIN_VLOG_LEVEL=%s" % vlog_level, + "MACE_RUN_PARAMETER_PATH=%s/mace_run.config" % + phone_data_dir, + "MACE_CL_PROGRAM_PATH=%s/cl_program" % phone_data_dir, + "MACE_LIMIT_OPENCL_KERNEL_TIME=%s" % + limit_opencl_kernel_time, + "%s/mace_run" % phone_data_dir, + "--input_node=%s" % ",".join(input_nodes), + "--output_node=%s" % ",".join(output_nodes), + "--input_shape=%s" % ":".join(input_shapes), + "--output_shape=%s" % ":".join(output_shapes), + "--input_file=%s/%s" % (phone_data_dir, input_file_name), + "--output_file=%s/%s" % (phone_data_dir, output_file_name), + "--model_data_file=%s/%s.data" % (phone_data_dir, model_tag), + "--device=%s" % device_type, + "--round=%s" % running_round, + "--restart_round=%s" % restart_round, + "--omp_num_threads=%s" % omp_num_threads, + "--cpu_affinity_policy=%s" % cpu_affinity_policy, + "--gpu_perf_hint=%s" % gpu_perf_hint, + "--gpu_priority_hint=%s" % gpu_priority_hint, + _out=process_output, + _bg=True, + _err_to_out=True) p.wait() print("Running finished!\n") return "".join(stdout_buff) @@ -900,7 +914,10 @@ def benchmark_model(abi, device_type, hexagon_mode, phone_data_dir, - option_args="", + omp_num_threads=-1, + cpu_affinity_policy=1, + gpu_perf_hint=3, + gpu_priority_hint=3, input_file_name="model_input", output_file_name="model_out"): print("* Benchmark for %s" % model_tag) @@ -924,7 +941,8 @@ def benchmark_model(abi, stdout_buff = [] process_output = make_output_processor(stdout_buff) if abi == "host": - p = subprocess.Popen([ + p = subprocess.Popen( + [ "env", "MACE_CPP_MIN_VLOG_LEVEL=%s" % vlog_level, "%s/benchmark_model" % model_output_dir, @@ -935,7 +953,11 @@ def benchmark_model(abi, "--input_file=%s/%s" % (model_output_dir, input_file_name), "--model_data_file=%s/%s.data" % (model_output_dir, model_tag), "--device=%s" % device_type, - "%s" % option_args]) + "--omp_num_threads=%s" % omp_num_threads, + "--cpu_affinity_policy=%s" % cpu_affinity_policy, + "--gpu_perf_hint=%s" % gpu_perf_hint, + "--gpu_priority_hint=%s" % gpu_priority_hint, + ]) p.wait() else: sh.adb("-s", serialno, "shell", "mkdir", "-p", phone_data_dir) @@ -951,26 +973,29 @@ def benchmark_model(abi, adb_push("%s/%s.data" % (model_output_dir, model_tag), phone_data_dir, serialno) p = sh.adb( - "-s", - serialno, - "shell", - "LD_LIBRARY_PATH=%s" % phone_data_dir, - "MACE_CPP_MIN_VLOG_LEVEL=%s" % vlog_level, - "MACE_RUN_PARAMETER_PATH=%s/mace_run.config" % - phone_data_dir, - "MACE_OPENCL_PROFILING=1", - "%s/benchmark_model" % phone_data_dir, - "--input_node=%s" % ",".join(input_nodes), - "--output_node=%s" % ",".join(output_nodes), - "--input_shape=%s" % ":".join(input_shapes), - "--output_shape=%s" % ":".join(output_shapes), - "--input_file=%s/%s" % (phone_data_dir, input_file_name), - "--model_data_file=%s/%s.data" % (phone_data_dir, model_tag), - "--device=%s" % device_type, - "%s" % option_args, - _out=process_output, - _bg=True, - _err_to_out=True) + "-s", + serialno, + "shell", + "LD_LIBRARY_PATH=%s" % phone_data_dir, + "MACE_CPP_MIN_VLOG_LEVEL=%s" % vlog_level, + "MACE_RUN_PARAMETER_PATH=%s/mace_run.config" % + phone_data_dir, + "MACE_OPENCL_PROFILING=1", + "%s/benchmark_model" % phone_data_dir, + "--input_node=%s" % ",".join(input_nodes), + "--output_node=%s" % ",".join(output_nodes), + "--input_shape=%s" % ":".join(input_shapes), + "--output_shape=%s" % ":".join(output_shapes), + "--input_file=%s/%s" % (phone_data_dir, input_file_name), + "--model_data_file=%s/%s.data" % (phone_data_dir, model_tag), + "--device=%s" % device_type, + "--omp_num_threads=%s" % omp_num_threads, + "--cpu_affinity_policy=%s" % cpu_affinity_policy, + "--gpu_perf_hint=%s" % gpu_perf_hint, + "--gpu_priority_hint=%s" % gpu_priority_hint, + _out=process_output, + _bg=True, + _err_to_out=True) p.wait() print("Benchmark done!\n")