diff --git a/.gitignore b/.gitignore index 894e0a843ad7e71d30cce504be890cf4755dea4c..b3a51dd83dbcba3ce393e112b6df5fb02310ba4a 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ mace/codegen/models/ mace/codegen/opencl/ mace/codegen/opencl_bin/ mace/codegen/version/ +build/ diff --git a/mace/benchmark/BUILD b/mace/benchmark/BUILD index 457922879b8b4c630c70ed852e8a4e3138bd6793..2cdc00e180175be1237545a8b1d4e7cbfb876633 100644 --- a/mace/benchmark/BUILD +++ b/mace/benchmark/BUILD @@ -1,7 +1,7 @@ # Benchmark # Examples load( - "//:mace.bzl", + "//mace:mace.bzl", "if_production_mode", "if_not_production_mode", "if_hexagon_enabled", @@ -16,7 +16,7 @@ cc_library( hdrs = ["stat_summarizer.h"], linkstatic = 1, deps = [ - "@mace//:mace_headers", + "//mace/core", ], ) @@ -29,17 +29,9 @@ cc_binary( linkstatic = 1, deps = [ ":stat_summarizer", - "//mace/codegen:generated_models", "//external:gflags_nothreads", - ] + if_hexagon_enabled([ - "//lib/hexagon:hexagon", - ]) + if_production_mode([ - "@mace//:mace_prod", - "//codegen:generated_opencl_prod", - "//codegen:generated_tuning_params", - ]) + if_not_production_mode([ - "@mace//:mace_dev", - ]), + "//mace/codegen:generated_models", + ], ) cc_library( @@ -58,9 +50,6 @@ cc_binary( deps = [ ":libmace_merged", "//external:gflags_nothreads", - "//lib/hexagon", - "@mace//:mace", - "@mace//:mace_headers", - "@mace//:mace_prod", + "//mace/core", ], ) diff --git a/mace/benchmark/benchmark_model.cc b/mace/benchmark/benchmark_model.cc index 26bfeb618bdeff76c56e9ceb2cb33930a43dc461..cf33586af0a6e0fcfc553d42ac9b6b349f63e404 100644 --- a/mace/benchmark/benchmark_model.cc +++ b/mace/benchmark/benchmark_model.cc @@ -5,7 +5,7 @@ #include "gflags/gflags.h" #include "mace/public/mace.h" #include "mace/utils/logging.h" -#include "benchmark/stat_summarizer.h" +#include "mace/benchmark/stat_summarizer.h" #include #include @@ -204,6 +204,11 @@ DEFINE_bool(show_summary, true, "whether to show a summary of the stats"); DEFINE_bool(show_flops, true, "whether to estimate the model's FLOPs"); DEFINE_int32(warmup_runs, 1, "how many runs to initialize model"); DEFINE_string(model_data_file, "", "model data file name, used when EMBED_MODEL_DATA set to 0"); +DEFINE_string(gpu_type, "ADRENO", "ADRENO/MALI"); +DEFINE_int32(gpu_perf_hint, 2, "0:NA/1:LOW/2:NORMAL/3:HIGH"); +DEFINE_int32(gpu_priority_hint, 1, "0:NA/1:LOW/2:NORMAL/3:HIGH"); +DEFINE_int32(omp_num_threads, 8, "num of openmp threads"); +DEFINE_int32(cpu_power_option, 0, "0:DEFAULT/1:HIGH_PERFORMANCE/2:BATTERY_SAVE"); int Main(int argc, char **argv) { MACE_CHECK(FLAGS_device != "HEXAGON", "Model benchmark tool do not support DSP."); @@ -212,6 +217,11 @@ int Main(int argc, char **argv) { LOG(INFO) << "Benchmark name: [" << FLAGS_benchmark_name << "]"; LOG(INFO) << "Device: [" << FLAGS_device << "]"; + LOG(INFO) << "gpu_type: [" << FLAGS_gpu_type << "]"; + LOG(INFO) << "gpu_perf_hint: [" << FLAGS_gpu_perf_hint << "]"; + LOG(INFO) << "gpu_priority_hint: [" << FLAGS_gpu_priority_hint << "]"; + LOG(INFO) << "omp_num_threads: [" << FLAGS_omp_num_threads << "]"; + LOG(INFO) << "cpu_power_option: [" << FLAGS_cpu_power_option << "]"; LOG(INFO) << "Input node: [" << FLAGS_input_node<< "]"; LOG(INFO) << "Input shapes: [" << FLAGS_input_shape << "]"; LOG(INFO) << "Output node: [" << FLAGS_output_node<< "]"; @@ -246,6 +256,21 @@ int Main(int argc, char **argv) { device_type = OPENCL; } + // config runtime + if (device_type == OPENCL) { + GPUType gpu_type = ADRENO; + if (FLAGS_gpu_type == "MALI") gpu_type = MALI; + mace::ConfigOpenCLRuntime( + gpu_type, + static_cast(FLAGS_gpu_perf_hint), + static_cast(FLAGS_gpu_priority_hint)); + } + else if (device_type == CPU) { + mace::ConfigCPURuntime( + FLAGS_omp_num_threads, + static_cast(FLAGS_cpu_power_option)); + } + std::vector input_names = str_util::Split(FLAGS_input_node, ','); std::vector output_names = str_util::Split(FLAGS_output_node, ','); std::vector input_shapes = str_util::Split(FLAGS_input_shape, ':'); diff --git a/mace/benchmark/stat_summarizer.cc b/mace/benchmark/stat_summarizer.cc index 68801fb26aef120b74a23cf561056f9f57398c6f..ca8c5ec3db112ff4f6b3ea571b7b3a01c327bb2e 100644 --- a/mace/benchmark/stat_summarizer.cc +++ b/mace/benchmark/stat_summarizer.cc @@ -2,7 +2,7 @@ // Copyright (c) 2017 XiaoMi All rights reserved. // -#include "benchmark/stat_summarizer.h" +#include "mace/benchmark/stat_summarizer.h" #include "mace/public/mace.h" #include "mace/utils/logging.h" diff --git a/mace/core/BUILD b/mace/core/BUILD index 55eb4b63af667fdc83bf64e238ce5169095744fa..8114e6d8b8b47a2936454412626ed679e918d186 100644 --- a/mace/core/BUILD +++ b/mace/core/BUILD @@ -14,6 +14,7 @@ load( "if_not_hexagon_enabled", "if_production_mode", "if_not_production_mode", + "if_openmp_enabled", ) cc_library( @@ -21,6 +22,7 @@ cc_library( srcs = glob( [ "*.cc", + "runtime/cpu/*.cc", "runtime/opencl/*.cc", "runtime/hexagon/*.cc", ], @@ -37,9 +39,11 @@ cc_library( ]), hdrs = glob([ "*.h", + "runtime/cpu/*.h", "runtime/opencl/*.h", "runtime/hexagon/*.h", ]), + copts = if_openmp_enabled(["-fopenmp"]), linkopts = ["-ldl"] + if_android([ "-pie", "-lm", diff --git a/mace/core/mace.cc b/mace/core/mace.cc index be34f532443f6560ade7dfd13154c608d525d02c..e4d25c7d114dc5c9a28e457a6bb0888a887aa92a 100644 --- a/mace/core/mace.cc +++ b/mace/core/mace.cc @@ -5,6 +5,8 @@ #include "mace/public/mace.h" #include "mace/core/net.h" #include "mace/core/runtime/hexagon/hexagon_control_wrapper.h" +#include "mace/core/runtime/opencl/opencl_runtime.h" +#include "mace/core/runtime/cpu/cpu_runtime.h" #include "mace/core/types.h" namespace mace { @@ -349,6 +351,21 @@ const OperatorDef &NetDef::op(const int idx) const { return op_[idx]; } +void ConfigOpenCLRuntime(GPUType gpu_type, + GPUPerfHint gpu_perf_hint, + GPUPriorityHint gpu_priority_hint) { + LOG(INFO) << "Config OpenCL Runtime: gpu_type: " << gpu_type + << ", gpu_perf_hint: " << gpu_perf_hint + << ", gpu_priority_hint: " << gpu_priority_hint; + OpenCLRuntime::CreateGlobal(gpu_type, gpu_perf_hint, gpu_priority_hint); +} + +void ConfigCPURuntime(int omp_num_threads, CPUPowerOption power_option) { + LOG(INFO) << "Config CPU Runtime: omp_num_threads: " << omp_num_threads + << ", cpu_power_option: " << power_option; + SetCPURuntime(omp_num_threads, power_option); +} + // Mace Engine MaceEngine::MaceEngine(const NetDef *net_def, DeviceType device_type) : op_registry_(new OperatorRegistry()), diff --git a/mace/core/runtime/cpu/cpu_runtime.cc b/mace/core/runtime/cpu/cpu_runtime.cc new file mode 100644 index 0000000000000000000000000000000000000000..4c2cd851d4bd502a1096b8523a85d9e3cf61fcfd --- /dev/null +++ b/mace/core/runtime/cpu/cpu_runtime.cc @@ -0,0 +1,107 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#include "mace/public/mace.h" +#include "mace/utils/logging.h" +#include +#include +#include + +namespace mace { + +namespace { + +static int GetCPUMaxFreq(int cpu_id) { + char path[64]; + sprintf(path, + "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", + cpu_id); + FILE *fp = fopen(path, "rb"); + if (!fp) return 0; + + int freq = 0; + fscanf(fp, "%d", &freq); + fclose(fp); + return freq; +} + +static void SortCPUIdsByMaxFreqAsc(std::vector &cpu_ids) { + int cpu_count = cpu_ids.size(); + std::vector cpu_max_freq; + cpu_max_freq.resize(cpu_count); + + // set cpu max frequency + for (int i = 0; i < cpu_count; ++i) { + cpu_max_freq[i] = GetCPUMaxFreq(i); + cpu_ids[i] = i; + } + + // sort cpu ids by max frequency asc, bubble sort + for (int i = 0; i < cpu_count - 1; ++i) { + for (int j = i + 1; j < cpu_count; ++j) { + if (cpu_max_freq[i] > cpu_max_freq[j]) { + int tmp = cpu_ids[i]; + cpu_ids[i] = cpu_ids[j]; + cpu_ids[j] = tmp; + + tmp = cpu_max_freq[i]; + cpu_max_freq[i] = cpu_max_freq[j]; + cpu_max_freq[j] = tmp; + } + } + } +} + +static void SetThreadAffinity(cpu_set_t mask) { + int sys_call_res; + pid_t pid = gettid(); + + // TODO: when set omp num threads to 1, sometiomes return EINVAL(22) error + // https://linux.die.net/man/2/sched_setaffinity + sys_call_res = syscall(__NR_sched_setaffinity, pid, sizeof(mask), &mask); + if (sys_call_res != 0) { + LOG(FATAL) << "syscall setaffinity error: " << sys_call_res << ' ' << errno; + } +} + +} // namespace + +void SetCPURuntime(int omp_num_threads, CPUPowerOption power_option) { + + int cpu_count = omp_get_num_procs(); + LOG(INFO) << "cpu_count: " << cpu_count; + std::vector sorted_cpu_ids; + sorted_cpu_ids.resize(cpu_count); + SortCPUIdsByMaxFreqAsc(sorted_cpu_ids); + + std::vector use_cpu_ids; + if (power_option == CPUPowerOption::DEFAULT || omp_num_threads >= cpu_count) { + use_cpu_ids = sorted_cpu_ids; + omp_num_threads = cpu_count; + } else if (power_option == CPUPowerOption::HIGH_PERFORMANCE) { + use_cpu_ids = + std::vector(sorted_cpu_ids.begin() + cpu_count - omp_num_threads, + sorted_cpu_ids.end()); + } else { + use_cpu_ids = std::vector(sorted_cpu_ids.begin(), + sorted_cpu_ids.begin() + omp_num_threads); + } + + omp_set_num_threads(omp_num_threads); + // compute mask + cpu_set_t mask; + CPU_ZERO(&mask); + for (auto cpu_id: use_cpu_ids) { + CPU_SET(cpu_id, &mask); + } + LOG(INFO) << "use cpus mask: " << mask.__bits[0]; + +#pragma omp parallel for + for (int i = 0; i < omp_num_threads; ++i) { + SetThreadAffinity(mask); + } +} + +} // namespace mace + diff --git a/mace/core/runtime/cpu/cpu_runtime.h b/mace/core/runtime/cpu/cpu_runtime.h new file mode 100644 index 0000000000000000000000000000000000000000..13dfd680236663e5af516b004306f864013fe3e4 --- /dev/null +++ b/mace/core/runtime/cpu/cpu_runtime.h @@ -0,0 +1,17 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + + +#ifndef MACE_CORE_RUNTIME_CPU_CPU_RUNTIME_H +#define MACE_CORE_RUNTIME_CPU_CPU_RUNTIME_H + +#include "mace/public/mace.h" + +namespace mace { + +void SetCPURuntime(int omp_num_threads, CPUPowerOption power_option); + +} + +#endif //MACE_CORE_RUNTIME_CPU_CPU_RUNTIME_H diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc index bf85e6014e0bcd36ba5d667615f89094b8e062d1..d3c2c2cd585cb8bcb7a20e40ba57b27f2fd03f22 100644 --- a/mace/core/runtime/opencl/opencl_runtime.cc +++ b/mace/core/runtime/opencl/opencl_runtime.cc @@ -63,11 +63,67 @@ void OpenCLProfilingTimer::ClearTiming() { } OpenCLRuntime *OpenCLRuntime::Global() { - static OpenCLRuntime instance; - return &instance; + if (opencl_runtime_instance == nullptr) { + return CreateGlobal(GPUType::ADRENO, GPUPerfHint::PERF_NORMAL, + GPUPriorityHint::PRIORITY_LOW); + } + return opencl_runtime_instance; +} + +OpenCLRuntime *OpenCLRuntime::CreateGlobal(GPUType gpu_type, + GPUPerfHint gpu_perf_hint, + GPUPriorityHint gpu_priority_hint) { + opencl_runtime_instance = new OpenCLRuntime(gpu_type, gpu_perf_hint, + gpu_priority_hint); + return opencl_runtime_instance; +} + +void ParseOpenCLRuntimeConfig(cl_context_properties *properties, + GPUType gpu_type, + GPUPerfHint gpu_perf_hint, + GPUPriorityHint gpu_priority_hint) { + int index = 0; + if (gpu_type == GPUType::ADRENO) { + switch (gpu_perf_hint) { + case GPUPerfHint::PERF_LOW: + properties[index++] = CL_CONTEXT_PERF_HINT_QCOM; + properties[index++] = CL_PERF_HINT_LOW_QCOM; + break; + case GPUPerfHint::PERF_NORMAL: + properties[index++] = CL_CONTEXT_PERF_HINT_QCOM; + properties[index++] = CL_PERF_HINT_NORMAL_QCOM; + break; + case GPUPerfHint::PERF_HIGH: + properties[index++] = CL_CONTEXT_PERF_HINT_QCOM; + properties[index++] = CL_PERF_HINT_HIGH_QCOM; + break; + default:break; + } + switch (gpu_priority_hint) { + case GPUPriorityHint::PRIORITY_LOW: + properties[index++] = CL_CONTEXT_PRIORITY_HINT_QCOM; + properties[index++] = CL_PRIORITY_HINT_LOW_QCOM; + break; + case GPUPriorityHint::PRIORITY_NORMAL: + properties[index++] = CL_CONTEXT_PRIORITY_HINT_QCOM; + properties[index++] = CL_PRIORITY_HINT_NORMAL_QCOM; + break; + case GPUPriorityHint::PRIORITY_HIGH: + properties[index++] = CL_CONTEXT_PRIORITY_HINT_QCOM; + properties[index++] = CL_PRIORITY_HINT_HIGH_QCOM; + break; + default:break; + } + } else { + // TODO: support Mali GPU context properties + } + + // The properties list should be terminated with 0 + properties[index] = 0; } -OpenCLRuntime::OpenCLRuntime() { +OpenCLRuntime::OpenCLRuntime(GPUType gpu_type, GPUPerfHint gpu_perf_hint, + GPUPriorityHint gpu_priority_hint) { LoadOpenCLLibrary(); std::vector all_platforms; @@ -109,15 +165,12 @@ OpenCLRuntime::OpenCLRuntime() { properties |= CL_QUEUE_PROFILING_ENABLE; } - // TODO (heliangliang) Make this configurable (e.g.HIGH for benchmark, - // disabled for Mali) - cl_context_properties context_properties[] = { - // Set context perf hint to normal - CL_CONTEXT_PERF_HINT_QCOM, CL_PERF_HINT_NORMAL_QCOM, - // Set context priority hint to low - CL_CONTEXT_PRIORITY_HINT_QCOM, CL_PRIORITY_HINT_LOW_QCOM, 0}; + std::unique_ptr context_properties( + new cl_context_properties[5]); + ParseOpenCLRuntimeConfig(context_properties.get(), gpu_type, gpu_perf_hint, + gpu_priority_hint); - cl::Context context({gpu_device}, context_properties); + cl::Context context({gpu_device}, context_properties.get()); cl::CommandQueue command_queue(context, gpu_device, properties); const char *kernel_path = getenv("MACE_KERNEL_PATH"); diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h index 1f5ab2a169505f3156b47f94a47a52f856cfede4..e816439dcdc8777a567374716cef310c3c0f245f 100644 --- a/mace/core/runtime/opencl/opencl_runtime.h +++ b/mace/core/runtime/opencl/opencl_runtime.h @@ -20,7 +20,7 @@ namespace mace { class OpenCLProfilingTimer : public Timer { public: explicit OpenCLProfilingTimer(const cl::Event *event) - : event_(event), accumulated_micros_(0){}; + : event_(event), accumulated_micros_(0) {}; void StartTiming() override; void StopTiming() override; void AccumulateTiming() override; @@ -38,6 +38,7 @@ class OpenCLProfilingTimer : public Timer { class OpenCLRuntime { public: static OpenCLRuntime *Global(); + static OpenCLRuntime *CreateGlobal(GPUType, GPUPerfHint, GPUPriorityHint); cl::Context &context(); cl::Device &device(); @@ -51,7 +52,7 @@ class OpenCLRuntime { const std::set &build_options); private: - OpenCLRuntime(); + OpenCLRuntime(GPUType, GPUPerfHint, GPUPriorityHint); ~OpenCLRuntime(); OpenCLRuntime(const OpenCLRuntime &) = delete; OpenCLRuntime &operator=(const OpenCLRuntime &) = delete; @@ -73,6 +74,7 @@ class OpenCLRuntime { std::string kernel_path_; }; +static OpenCLRuntime *opencl_runtime_instance = nullptr; } // namespace mace #endif // MACE_CORE_RUNTIME_OPENCL_OPENCL_RUNTIME_H_ diff --git a/mace/core/testing/test_benchmark_main.cc b/mace/core/testing/test_benchmark_main.cc index ae8a7a2ebcfc1b066a0197245351a4b1438e1aa9..91302d7832f983f9801ccc129060176885d9fdda 100644 --- a/mace/core/testing/test_benchmark_main.cc +++ b/mace/core/testing/test_benchmark_main.cc @@ -5,10 +5,15 @@ #include #include "mace/core/testing/test_benchmark.h" +#include "mace/public/mace.h" int main(int argc, char **argv) { std::cout << "Running main() from test_main.cc\n"; + mace::ConfigCPURuntime(4, mace::CPUPowerOption::HIGH_PERFORMANCE); + mace::ConfigOpenCLRuntime(mace::GPUType::ADRENO, mace::GPUPerfHint::PERF_HIGH, + mace::GPUPriorityHint::PRIORITY_HIGH); + // TODO Use gflags if (argc == 2) { mace::testing::Benchmark::Run(argv[1]); diff --git a/mace/examples/mace_run.cc b/mace/examples/mace_run.cc index 4c188c274b8fddb3fddb9c6007d2245a3cdebd06..db4378bff1bebd69c1e699bf08bdc935b07008de 100644 --- a/mace/examples/mace_run.cc +++ b/mace/examples/mace_run.cc @@ -101,6 +101,16 @@ DeviceType ParseDeviceType(const string &device_str) { } } +GPUType ParseGPUType(const string &gpu_type_str) { + if (gpu_type_str.compare("ADRENO") == 0) { + return GPUType::ADRENO; + } else if (gpu_type_str.compare("MALI") == 0) { + return GPUType::MALI; + } else { + return GPUType::ADRENO; + } +} + struct mallinfo LogMallinfoChange(struct mallinfo prev) { struct mallinfo curr = mallinfo(); if (prev.arena != curr.arena) { @@ -160,6 +170,11 @@ DEFINE_string(device, "OPENCL", "CPU/NEON/OPENCL/HEXAGON"); DEFINE_int32(round, 1, "round"); DEFINE_int32(restart_round, 1, "restart round"); DEFINE_int32(malloc_check_cycle, -1, "malloc debug check cycle, -1 to disable"); +DEFINE_string(gpu_type, "ADRENO", "ADRENO/MALI"); +DEFINE_int32(gpu_perf_hint, 2, "0:NA/1:LOW/2:NORMAL/3:HIGH"); +DEFINE_int32(gpu_priority_hint, 1, "0:NA/1:LOW/2:NORMAL/3:HIGH"); +DEFINE_int32(omp_num_threads, 8, "num of openmp threads"); +DEFINE_int32(cpu_power_option, 0, "0:DEFAULT/1:HIGH_PERFORMANCE/2:BATTERY_SAVE"); bool SingleInputAndOutput(const std::vector &input_shape, const std::vector &output_shape) { @@ -175,6 +190,20 @@ bool SingleInputAndOutput(const std::vector &input_shape, DeviceType device_type = ParseDeviceType(FLAGS_device); LOG(INFO) << "Runing with device type: " << device_type; + // config runtime + if (device_type == DeviceType::OPENCL) { + GPUType gpu_type = ParseGPUType(FLAGS_gpu_type); + mace::ConfigOpenCLRuntime( + gpu_type, + static_cast(FLAGS_gpu_perf_hint), + static_cast(FLAGS_gpu_priority_hint)); + } + else if (device_type == DeviceType::CPU) { + mace::ConfigCPURuntime( + FLAGS_omp_num_threads, + static_cast(FLAGS_cpu_power_option)); + } + // Init model LOG(INFO) << "Run init"; t0 = NowMicros(); @@ -266,6 +295,20 @@ bool MultipleInputOrOutput(const std::vector &input_names, DeviceType device_type = ParseDeviceType(FLAGS_device); LOG(INFO) << "Runing with device type: " << device_type; + // config runtime + if (device_type == DeviceType::OPENCL) { + GPUType gpu_type = ParseGPUType(FLAGS_gpu_type); + mace::ConfigOpenCLRuntime( + gpu_type, + static_cast(FLAGS_gpu_perf_hint), + static_cast(FLAGS_gpu_priority_hint)); + } + else if (device_type == DeviceType::CPU) { + mace::ConfigCPURuntime( + FLAGS_omp_num_threads, + static_cast(FLAGS_cpu_power_option)); + } + // Init model LOG(INFO) << "Run init"; t0 = NowMicros(); @@ -367,8 +410,13 @@ int main(int argc, char **argv) { LOG(INFO) << "output_file: " << FLAGS_output_file; LOG(INFO) << "model_data_file: " << FLAGS_model_data_file; LOG(INFO) << "device: " << FLAGS_device; - LOG(INFO) << "round: " << FLAGS_restart_round; - LOG(INFO) << "restart_round: " << FLAGS_round; + LOG(INFO) << "round: " << FLAGS_round; + LOG(INFO) << "restart_round: " << FLAGS_restart_round; + LOG(INFO) << "gpu_type: " << FLAGS_gpu_type; + LOG(INFO) << "gpu_perf_hint: " << FLAGS_gpu_perf_hint; + LOG(INFO) << "gpu_priority_hint: " << FLAGS_gpu_priority_hint; + LOG(INFO) << "omp_num_threads: " << FLAGS_omp_num_threads; + LOG(INFO) << "cpu_power_option: " << FLAGS_cpu_power_option; std::vector input_names = str_util::Split(FLAGS_input_node, ','); std::vector output_names = str_util::Split(FLAGS_output_node, ','); diff --git a/mace/public/mace.h b/mace/public/mace.h index 3c8fc778a709d228d369e3321ca730dc8f086f94..ec0873d13f6435560693d6f547afd01b7bb1cb4b 100644 --- a/mace/public/mace.h +++ b/mace/public/mace.h @@ -61,6 +61,17 @@ enum DataType { DT_UINT32 = 22 }; +enum GPUType { ADRENO = 0, MALI = 1 }; +enum GPUPerfHint { PERF_NA = 0, PERF_LOW = 1, PERF_NORMAL = 2, PERF_HIGH = 3 }; +enum GPUPriorityHint { + PRIORITY_NA = 0, + PRIORITY_LOW = 1, + PRIORITY_NORMAL = 2, + PRIORITY_HIGH = 3 +}; + +enum CPUPowerOption { DEFAULT = 0, HIGH_PERFORMANCE = 1, BATTERY_SAVE = 2}; + class ConstTensor { public: ConstTensor(const std::string &name, @@ -369,6 +380,9 @@ struct MaceInputInfo { const float *data; }; +void ConfigOpenCLRuntime(GPUType, GPUPerfHint, GPUPriorityHint); +void ConfigCPURuntime(int omp_num_threads, CPUPowerOption power_option); + class MaceEngine { public: // Single input and output diff --git a/tools/benchmark.sh b/tools/benchmark.sh index c327825546af73ed8cc8214a94b94b5d82cb46b0..a37627c4f091c76658959a0a793788f2f7db102f 100644 --- a/tools/benchmark.sh +++ b/tools/benchmark.sh @@ -13,6 +13,9 @@ CURRENT_DIR=`dirname $0` source ${CURRENT_DIR}/env.sh MODEL_OUTPUT_DIR=$1 +OPTION_ARGS=$2 + +echo $OPTION_ARGS if [ -f "$MODEL_OUTPUT_DIR/benchmark_model" ]; then rm -rf $MODEL_OUTPUT_DIR/benchmark_model @@ -23,7 +26,7 @@ if [ "$EMBED_MODEL_DATA" = 0 ]; then fi if [ x"$TARGET_ABI" == x"host" ]; then - bazel build --verbose_failures -c opt --strip always benchmark:benchmark_model \ + bazel build --verbose_failures -c opt --strip always //mace/benchmark:benchmark_model \ --copt="-std=c++11" \ --copt="-D_GLIBCXX_USE_C99_MATH_TR1" \ --copt="-Werror=return-type" \ @@ -42,10 +45,11 @@ if [ x"$TARGET_ABI" == x"host" ]; then --input_shape="${INPUT_SHAPES}"\ --output_node="${OUTPUT_NODES}" \ --output_shape="${OUTPUT_SHAPES}"\ - --input_file=${MODEL_OUTPUT_DIR}/${INPUT_FILE_NAME} || exit 1 + --input_file=${MODEL_OUTPUT_DIR}/${INPUT_FILE_NAME}_${INPUT_NODES} \ + $OPTION_ARGS || exit 1 else - bazel build --verbose_failures -c opt --strip always benchmark:benchmark_model \ + bazel build --verbose_failures -c opt --strip always //mace/benchmark:benchmark_model \ --crosstool_top=//external:android/crosstool \ --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \ --cpu=${TARGET_ABI} \ @@ -57,7 +61,7 @@ else --define openmp=true \ --define production=true || exit 1 - cp bazel-bin/benchmark/benchmark_model $MODEL_OUTPUT_DIR + cp bazel-bin/mace/benchmark/benchmark_model $MODEL_OUTPUT_DIR adb shell "mkdir -p ${PHONE_DATA_DIR}" || exit 1 IFS=',' read -r -a INPUT_NAMES <<< "${INPUT_NODES}" @@ -83,5 +87,6 @@ else --input_shape="${INPUT_SHAPES}"\ --output_node="${OUTPUT_NODES}" \ --output_shape="${OUTPUT_SHAPES}"\ - --input_file=${PHONE_DATA_DIR}/${INPUT_FILE_NAME} || exit 1 + --input_file=${PHONE_DATA_DIR}/${INPUT_FILE_NAME} \ + $OPTION_ARGS || exit 1 fi diff --git a/tools/build_run_throughput_test.sh b/tools/build_run_throughput_test.sh index e87e2ff8b066e733fb7773583ea132e07c86b3bd..ec38684f93b06935e7804c93377cd54d50135af3 100644 --- a/tools/build_run_throughput_test.sh +++ b/tools/build_run_throughput_test.sh @@ -29,9 +29,9 @@ if [ "$DSP_MODEL_TAG" != '' ]; then DSP_MODEL_TAG_BUILD_FLAGS="--copt=-DMACE_DSP_MODEL_TAG=${DSP_MODEL_TAG}" fi -cp $MERGED_LIB_FILE benchmark/libmace_merged.a +cp $MERGED_LIB_FILE mace/benchmark/libmace_merged.a -bazel build --verbose_failures -c opt --strip always benchmark:model_throughput_test \ +bazel build --verbose_failures -c opt --strip always //mace/benchmark:model_throughput_test \ --crosstool_top=//external:android/crosstool \ --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \ --cpu=${TARGET_ABI} \ @@ -44,18 +44,18 @@ bazel build --verbose_failures -c opt --strip always benchmark:model_throughput_ --define openmp=true \ --copt="-O3" || exit 1 -rm benchmark/libmace_merged.a +rm mace/benchmark/libmace_merged.a adb shell "mkdir -p ${PHONE_DATA_DIR}" || exit 1 -adb push ${MODEL_INPUT_DIR}/${INPUT_FILE_NAME} ${PHONE_DATA_DIR} || exit 1 -adb push bazel-bin/benchmark/model_throughput_test ${PHONE_DATA_DIR} || exit 1 +adb push ${MODEL_INPUT_DIR}/${INPUT_FILE_NAME}_${INPUT_NODES} ${PHONE_DATA_DIR} || exit 1 +adb push bazel-bin/mace/benchmark/model_throughput_test ${PHONE_DATA_DIR} || exit 1 if [ "$EMBED_MODEL_DATA" = 0 ]; then adb push codegen/models/${CPU_MODEL_TAG}/${CPU_MODEL_TAG}.data ${PHONE_DATA_DIR} || exit 1 adb push codegen/models/${GPU_MODEL_TAG}/${GPU_MODEL_TAG}.data ${PHONE_DATA_DIR} || exit 1 adb push codegen/models/${DSP_MODEL_TAG}/${DSP_MODEL_TAG}.data ${PHONE_DATA_DIR} || exit 1 fi -adb push lib/hexagon/libhexagon_controller.so ${PHONE_DATA_DIR} || exit 1 +adb push mace/core/runtime/hexagon/libhexagon_controller.so ${PHONE_DATA_DIR} || exit 1 adb /dev/null || exit 1 fi adb push mace/core/runtime/hexagon/libhexagon_controller.so ${PHONE_DATA_DIR} > /dev/null || exit 1 - - mace_adb_output=`adb