diff --git a/include/mace/public/mace.h b/include/mace/public/mace.h index 1898f4c3a98347151d89e9f70182cc8c2bb9b3ef..d2c768665d57fa138d0622cd92d3d8702da3de67 100644 --- a/include/mace/public/mace.h +++ b/include/mace/public/mace.h @@ -90,6 +90,18 @@ enum CPUAffinityPolicy { AFFINITY_POWER_SAVE = 4, }; +// Voltage corners for clock frequencies, please refer to +// docs/Hap_power_set_dcvs_2.html in Hexagon SDK for more detailed information. +enum HexagonNNCornerType { + HEXAGON_NN_CORNER_RELEASE, + HEXAGON_NN_CORNER_TURBO, + HEXAGON_NN_CORNER_NOMPLUS, + HEXAGON_NN_CORNER_NOMINAL, + HEXAGON_NN_CORNER_SVSPLUS, + HEXAGON_NN_CORNER_SVS, + HEXAGON_NN_CORNER_SVS2, +}; + struct CallStats { int64_t start_micros; int64_t end_micros; @@ -281,7 +293,7 @@ class MACE_API MaceEngineConfig { /// /// Just use one GPUContext for multiple models run on GPU. /// \param context created use GPUContextBuilder - /// \return MaceStatus::MACE_SUCCESS for success, other for failed. + /// \return MaceStatus::MACE_SUCCESS for success, other for failure. MaceStatus SetGPUContext(std::shared_ptr context); /// \brief Set GPU hints, currently only supports Adreno GPU. @@ -291,7 +303,7 @@ class MACE_API MaceEngineConfig { /// /// \param perf_hint performance hint /// \param priority_hint priority hint - /// \return MaceStatus::MACE_SUCCESS for success, other for failed. + /// \return MaceStatus::MACE_SUCCESS for success, other for failure. MaceStatus SetGPUHints(GPUPerfHint perf_hint, GPUPriorityHint priority_hint); @@ -312,10 +324,27 @@ class MACE_API MaceEngineConfig { /// \param status MACE_SUCCESS for successful, or it can't reliabley /// detect big-LITTLE cores (see GetBigLittleCoreIDs). In such cases, it's /// suggested to use AFFINITY_NONE to use all cores. - /// \return MaceStatus::MACE_SUCCESS for success, other for failed. + /// \return MaceStatus::MACE_SUCCESS for success, other for failure. 
MaceStatus SetCPUThreadPolicy(int num_threads_hint, CPUAffinityPolicy policy); + /// \brief Set Hexagon DSP power parameters + /// + /// Caution: this function may hurt performance if improper parameters + /// are provided. For most performance-critical applications, setting + /// HexagonNNCornerType to HEXAGON_NN_CORNER_TURBO, enabling dynamic clock + /// and voltage scaling (DCVS), and a sleep latency of 100us work just fine. + /// If a more balanced scheme between performance and power consumption + /// is needed, these three parameters may be tweaked to achieve that. + /// \param corner DCVS voltage target corner, can be set even when DCVS + /// is disabled. + /// \param dcvs_enable enable or disable DCVS. + /// \param latency sleep latency, in microseconds. + /// \return MaceStatus::MACE_SUCCESS for success, other for failure. + MaceStatus SetHexagonPower(HexagonNNCornerType corner, + bool dcvs_enable, + int latency); + private: class Impl; std::unique_ptr impl_; diff --git a/mace/core/CMakeLists.txt b/mace/core/CMakeLists.txt index c9b18826ee5ee409db0da8c76837a2dcea76b7b7..25dd168023fede667d4323ecd4c8cd62d3052b7a 100644 --- a/mace/core/CMakeLists.txt +++ b/mace/core/CMakeLists.txt @@ -37,7 +37,7 @@ endif(MACE_ENABLE_HEXAGON_DSP) if(MACE_ENABLE_HEXAGON_HTA) set(CORE_SRCS ${CORE_SRCS} runtime/hexagon/hexagon_hta_wrapper.cc) - set(EXTRA_LINK_LIBS ${EXTRA_LINK_LIBS} hta_controller hta_hexagon_runtime npu) + set(EXTRA_LINK_LIBS ${EXTRA_LINK_LIBS} hta_hexagon_runtime) endif(MACE_ENABLE_HEXAGON_HTA) if(MACE_ENABLE_MTK_APU) diff --git a/mace/core/runtime/hexagon/hexagon_dsp_wrapper.cc b/mace/core/runtime/hexagon/hexagon_dsp_wrapper.cc index 0a12bcc484a56e55132a1f240247bb784f047b76..cdde27866c7713828dbf119ba7f0772753a889a3 100644 --- a/mace/core/runtime/hexagon/hexagon_dsp_wrapper.cc +++ b/mace/core/runtime/hexagon/hexagon_dsp_wrapper.cc @@ -87,17 +87,53 @@ std::string FloatToString(const FloatType v, const int32_t precision) { stream << std::fixed << std::setprecision(precision) 
<< v; return stream.str(); } + +hexagon_nn_corner_type TransformCornerType(HexagonNNCornerType corner) { + switch (corner) { + case HEXAGON_NN_CORNER_RELEASE: return NN_CORNER_RELEASE; + case HEXAGON_NN_CORNER_TURBO: return NN_CORNER_TURBO; + case HEXAGON_NN_CORNER_NOMPLUS: return NN_CORNER_NOMPLUS; + case HEXAGON_NN_CORNER_NOMINAL: return NN_CORNER_NOMINAL; + case HEXAGON_NN_CORNER_SVSPLUS: return NN_CORNER_SVSPLUS; + case HEXAGON_NN_CORNER_SVS: return NN_CORNER_SVS; + case HEXAGON_NN_CORNER_SVS2: return NN_CORNER_SVS2; + default: + LOG(FATAL) << "Wrong Hexagon NN corner type: " << corner; + return NN_CORNER_TURBO; + } +} + } // namespace +HexagonDSPWrapper::HexagonDSPWrapper() { + std::string env_log_execute_time_str; + GetEnv("MACE_DSP_LOG_EXECUTE_TIME", &env_log_execute_time_str); + if (env_log_execute_time_str.empty()) { + log_execute_time_ = false; + } else { + log_execute_time_ = static_cast(std::stoi(env_log_execute_time_str)); + } +} + + int HexagonDSPWrapper::GetVersion() { int version; MACE_CHECK(hexagon_nn_version(&version) == 0, "get version error"); return version; } +bool HexagonDSPWrapper::SetPower(HexagonNNCornerType corner, + bool dcvs_enable, + int latency) { + int ret = hexagon_nn_set_clocks(TransformCornerType(corner), + dcvs_enable ? 
NN_DCVS_ENABLE + : NN_DCVS_DISABLE, + static_cast(std::max(0, latency))); + return ret == 0; +} + bool HexagonDSPWrapper::Config() { LOG(INFO) << "Hexagon config"; - MACE_CHECK(hexagon_nn_set_powersave_level(0) == 0, "hexagon power error"); MACE_CHECK(hexagon_nn_config() == 0, "hexagon config error"); return true; } @@ -111,7 +147,7 @@ bool HexagonDSPWrapper::Init() { bool HexagonDSPWrapper::Finalize() { LOG(INFO) << "Hexagon finalize"; - return hexagon_nn_set_powersave_level(1) == 0; + return hexagon_nn_remove_clocks() == 0; } bool HexagonDSPWrapper::SetupGraph(const NetDef &net_def, @@ -432,6 +468,11 @@ bool HexagonDSPWrapper::ExecuteGraph(const Tensor &input_tensor, } MACE_CHECK(output_bytes == output_tensor->raw_size(), "wrong output bytes inferred."); + + if (log_execute_time_) { + LOG(INFO) << "dsp cycles: " << GetLastExecuteCycles(); + } + return true; } @@ -439,8 +480,8 @@ bool HexagonDSPWrapper::ExecuteGraphNew( const std::map &input_tensors, std::map *output_tensors) { VLOG(2) << "Execute graph new: " << nn_id_; - uint32_t num_inputs = static_cast(input_tensors.size()); - uint32_t num_outputs = static_cast(output_tensors->size()); + auto num_inputs = static_cast(input_tensors.size()); + auto num_outputs = static_cast(output_tensors->size()); MACE_CHECK(num_inputs_ == static_cast(num_inputs), "Wrong inputs num"); MACE_CHECK(num_outputs_ == static_cast(num_outputs), "Wrong outputs num"); @@ -519,7 +560,18 @@ bool HexagonDSPWrapper::ExecuteGraphNew( " wrong output bytes inferred."); } + if (log_execute_time_) { + LOG(INFO) << "dsp cycles: " << GetLastExecuteCycles(); + } + return true; } +uint64_t HexagonDSPWrapper::GetLastExecuteCycles() { + uint32_t cycle_lo; + uint32_t cycle_hi; + hexagon_nn_last_execution_cycles(nn_id_, &cycle_lo, &cycle_hi); + return (static_cast(cycle_hi) << 32) | cycle_lo; +} + } // namespace mace diff --git a/mace/core/runtime/hexagon/hexagon_dsp_wrapper.h b/mace/core/runtime/hexagon/hexagon_dsp_wrapper.h index 
f0877592df35b08d9cd7e1cb5263d5dbabbfd49a..1c0c4635b459f67394ddf5ec7341d9dc754b81a4 100644 --- a/mace/core/runtime/hexagon/hexagon_dsp_wrapper.h +++ b/mace/core/runtime/hexagon/hexagon_dsp_wrapper.h @@ -27,7 +27,7 @@ namespace mace { class HexagonDSPWrapper : public HexagonControlWrapper { public: - HexagonDSPWrapper() = default; + HexagonDSPWrapper(); int GetVersion() override; bool Config() override; @@ -46,6 +46,15 @@ class HexagonDSPWrapper : public HexagonControlWrapper { void ResetPerfInfo() override; void SetDebugLevel(int level) override; + static bool SetPower(HexagonNNCornerType corner, + bool dcvs_enable, + int latency); + + private: + uint64_t GetLastExecuteCycles(); + + bool log_execute_time_; + MACE_DISABLE_COPY_AND_ASSIGN(HexagonDSPWrapper); }; } // namespace mace diff --git a/mace/libmace/mace.cc b/mace/libmace/mace.cc index aae830892c498a3670d0cefd05e869b8a3e30723..51c53097f5afad9ffce95e66d4db29e0b4ba8069 100644 --- a/mace/libmace/mace.cc +++ b/mace/libmace/mace.cc @@ -34,7 +34,10 @@ #include "mace/core/runtime/opencl/opencl_runtime.h" #endif // MACE_ENABLE_OPENCL -#if defined(MACE_ENABLE_HEXAGON) || defined(MACE_ENABLE_HTA) +#if defined(MACE_ENABLE_HEXAGON) +#include "mace/core/runtime/hexagon/hexagon_device.h" +#include "mace/core/runtime/hexagon/hexagon_dsp_wrapper.h" +#elif defined(MACE_ENABLE_HTA) #include "mace/core/runtime/hexagon/hexagon_device.h" #endif @@ -189,6 +192,10 @@ class MaceEngineConfig::Impl { MaceStatus SetCPUThreadPolicy(int num_threads_hint, CPUAffinityPolicy policy); + MaceStatus SetHexagonPower(HexagonNNCornerType corner, + bool dcvs_enable, + int latency); + inline DeviceType device_type() const { return device_type_; } @@ -228,7 +235,13 @@ MaceEngineConfig::Impl::Impl(const DeviceType device_type) cpu_affinity_policy_(CPUAffinityPolicy::AFFINITY_NONE), gpu_context_(nullptr), gpu_priority_hint_(GPUPriorityHint::PRIORITY_LOW), - gpu_perf_hint_(GPUPerfHint::PERF_NORMAL) {} + gpu_perf_hint_(GPUPerfHint::PERF_NORMAL) { +#ifdef 
MACE_ENABLE_HEXAGON + if (!HexagonDSPWrapper::SetPower(HEXAGON_NN_CORNER_TURBO, true, 100)) { + LOG(WARNING) << "Hexagon set default clocks failed!"; + } +#endif +} MaceStatus MaceEngineConfig::Impl::SetGPUContext( std::shared_ptr context) { @@ -252,6 +265,20 @@ MaceStatus MaceEngineConfig::Impl::SetCPUThreadPolicy( return MaceStatus::MACE_SUCCESS; } +MaceStatus MaceEngineConfig::Impl::SetHexagonPower( + HexagonNNCornerType corner, + bool dcvs_enable, + int latency) { + MACE_UNUSED(corner); + MACE_UNUSED(dcvs_enable); + MACE_UNUSED(latency); + bool ret = false; +#ifdef MACE_ENABLE_HEXAGON + ret = HexagonDSPWrapper::SetPower(corner, dcvs_enable, latency); +#endif + return ret ? MaceStatus::MACE_SUCCESS : MaceStatus::MACE_RUNTIME_ERROR; +} + MaceEngineConfig::MaceEngineConfig( const DeviceType device_type) : impl_(new MaceEngineConfig::Impl(device_type)) {} @@ -275,6 +302,13 @@ MaceStatus MaceEngineConfig::SetCPUThreadPolicy( return impl_->SetCPUThreadPolicy(num_threads_hint, policy); } +MaceStatus MaceEngineConfig::SetHexagonPower( + HexagonNNCornerType corner, + bool dcvs_enable, + int latency) { + return impl_->SetHexagonPower(corner, dcvs_enable, latency); +} + // Mace Tensor class MaceTensor::Impl { public: diff --git a/mace/tools/BUILD.bazel b/mace/tools/BUILD.bazel index df620200b2c108388a6ff6f005f9c4149986796e..de2b9a35884cb0019631bebc12fb698af9f2ff73 100644 --- a/mace/tools/BUILD.bazel +++ b/mace/tools/BUILD.bazel @@ -19,7 +19,11 @@ cc_binary( "-Werror", "-Wextra", "-Wno-missing-field-initializers", - ] + if_opencl_enabled(["-DMACE_ENABLE_OPENCL"]), + ] + if_opencl_enabled([ + "-DMACE_ENABLE_OPENCL", + ]) + if_hexagon_enabled([ + "-DMACE_ENABLE_HEXAGON", + ]), linkstatic = 1, deps = [ "//external:gflags_nothreads", @@ -39,7 +43,11 @@ cc_binary( "-Werror", "-Wextra", "-Wno-missing-field-initializers", - ] + if_opencl_enabled(["-DMACE_ENABLE_OPENCL"]), + ] + if_opencl_enabled([ + "-DMACE_ENABLE_OPENCL", + ]) + if_hexagon_enabled([ + "-DMACE_ENABLE_HEXAGON", + 
]), linkopts = [ "-lm", ] + if_android([ diff --git a/mace/tools/mace_run.cc b/mace/tools/mace_run.cc index 45ec2f338e5e380ea61441ae0f63df3bf93548b1..e135e5edf1c2f8880afb7bfd2b578ebdf5f8fbcc 100644 --- a/mace/tools/mace_run.cc +++ b/mace/tools/mace_run.cc @@ -196,7 +196,9 @@ bool RunModel(const std::string &model_name, static_cast(FLAGS_gpu_priority_hint)); } #endif // MACE_ENABLE_OPENCL - +#ifdef MACE_ENABLE_HEXAGON + config.SetHexagonPower(HEXAGON_NN_CORNER_TURBO, true, 100); +#endif std::unique_ptr model_graph_data = make_unique(); if (FLAGS_model_file != "") { diff --git a/third_party/nnlib/arm64-v8a/libhexagon_controller.so b/third_party/nnlib/arm64-v8a/libhexagon_controller.so index 4043dd9fc3c8a08f2cc23c4f9a2c5fbc65d00157..2509a78001d9b4490051617ffa4e895f4da85ca8 100755 Binary files a/third_party/nnlib/arm64-v8a/libhexagon_controller.so and b/third_party/nnlib/arm64-v8a/libhexagon_controller.so differ diff --git a/third_party/nnlib/armeabi-v7a/libhexagon_controller.so b/third_party/nnlib/armeabi-v7a/libhexagon_controller.so index 9301f080568b7d6af08c2b41ec0e3c6c05294883..13d6f92188f3118773db0014976d22c45110ced8 100755 Binary files a/third_party/nnlib/armeabi-v7a/libhexagon_controller.so and b/third_party/nnlib/armeabi-v7a/libhexagon_controller.so differ diff --git a/third_party/nnlib/hexagon_nn.h b/third_party/nnlib/hexagon_nn.h index 5a059c5cfd1fd3bbe90ffe99b8bfa41debffa212..d225d6b64f37258c10db110d0b434e4a76299829 100644 --- a/third_party/nnlib/hexagon_nn.h +++ b/third_party/nnlib/hexagon_nn.h @@ -165,6 +165,8 @@ __QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_execute)(hexagon_nn_nn_id id, __QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_teardown)(hexagon_nn_nn_id id) __QAIC_HEADER_ATTRIBUTE; __QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_powersave_level)(unsigned int level) __QAIC_HEADER_ATTRIBUTE; __QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_powersave_details)(hexagon_nn_corner_type corner, hexagon_nn_dcvs_type dcvs, unsigned int 
latency) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_clocks)(hexagon_nn_corner_type corner, hexagon_nn_dcvs_type dcvs, unsigned int latency) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_remove_clocks)() __QAIC_HEADER_ATTRIBUTE; __QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_get_perfinfo)(hexagon_nn_nn_id id, hexagon_nn_perfinfo* info_out, int info_outLen, unsigned int* n_items) __QAIC_HEADER_ATTRIBUTE; __QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_reset_perfinfo)(hexagon_nn_nn_id id, unsigned int event) __QAIC_HEADER_ATTRIBUTE; __QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_last_execution_cycles)(hexagon_nn_nn_id id, unsigned int* cycles_lo, unsigned int* cycles_hi) __QAIC_HEADER_ATTRIBUTE; diff --git a/third_party/nnlib/v60/libhexagon_nn_skel.so b/third_party/nnlib/v60/libhexagon_nn_skel.so index 973e7df0fbf95a2101fb58b723f1b531fde16141..1e41798ae5f15106dfea7f6244d1d65865802482 100755 Binary files a/third_party/nnlib/v60/libhexagon_nn_skel.so and b/third_party/nnlib/v60/libhexagon_nn_skel.so differ diff --git a/third_party/nnlib/v65/libhexagon_nn_skel.so b/third_party/nnlib/v65/libhexagon_nn_skel.so index 29bc877b227fffa9da2d3a1400266d0cafc1b7e3..23a8e04b541df5ff0aca64881c3491ad9a39a367 100755 Binary files a/third_party/nnlib/v65/libhexagon_nn_skel.so and b/third_party/nnlib/v65/libhexagon_nn_skel.so differ diff --git a/third_party/nnlib/v66/libhexagon_nn_skel.so b/third_party/nnlib/v66/libhexagon_nn_skel.so index 14ed068ecf3956610e68f5b6ca366aaaa3502590..8aee4d7d2da59b107dacbdec81efdff34de1a4c0 100644 Binary files a/third_party/nnlib/v66/libhexagon_nn_skel.so and b/third_party/nnlib/v66/libhexagon_nn_skel.so differ