diff --git a/docs/faq.md b/docs/faq.md index 3b3ccf580b4d2425613cda1e2d90b9638aa189f7..fe0cbcf1467bfbd5f4993b61d97ac3c84ad589ca 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -55,6 +55,13 @@ Try to set `limit_opencl_kernel_time` to `1`. If still not resolved, try to modify the source code to use even smaller time intervals or changed to CPU or DSP runtime. +For GPUs such as Arm Mali, sometimes even setting `limit_opencl_kernel_time` to +a small time interval cannot solve the problem. In that case, you can try to +set `opencl_queue_window_size`, such as 16. This parameter means that the GPU +command queue will contain only `opencl_queue_window_size` commands at most. +You can adjust this parameter to achieve a balance between performance and UI +response. You should not use this parameter unless you have to. + Why is MACE not working on DSP? ------------------------------------------------------------------------------ Running models on Hexagon DSP need a few prerequisites for DSP developers: diff --git a/docs/user_guide/advanced_usage.rst b/docs/user_guide/advanced_usage.rst index b86b37ef09215712bf2c5a5a20405658034b55c2..f58eac7ece52060cbe8ded444906e80cdd719739 100644 --- a/docs/user_guide/advanced_usage.rst +++ b/docs/user_guide/advanced_usage.rst @@ -94,6 +94,8 @@ in one deployment file. - [optional] The format of the output tensors, one of [NONE, NHWC, NCHW]. If there is no format of the output, please use NONE. If only one single format is specified, all inputs will use that format, default is NHWC order. * - limit_opencl_kernel_time - [optional] Whether splitting the OpenCL kernel within 1 ms to keep UI responsiveness, default is 0. + * - opencl_queue_window_size + - [optional] Limit the max commands in OpenCL command queue to keep UI responsiveness, default is 0. * - obfuscate - [optional] Whether to obfuscate the model operator name, default to 0. 
* - winograd diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc index 4875cc228c00effeff3d12d676df410103ae16d2..6d5f82e105365c5cc38482b8bfe97ea0d0aacf5f 100644 --- a/mace/core/runtime/opencl/opencl_runtime.cc +++ b/mace/core/runtime/opencl/opencl_runtime.cc @@ -323,7 +323,7 @@ OpenCLRuntime::OpenCLRuntime( cl_command_queue_properties properties = 0; const char *profiling = getenv("MACE_OPENCL_PROFILING"); - if (IsTuning() || + if (tuner_->IsTuning() || (profiling != nullptr && strlen(profiling) == 1 && profiling[0] == '1')) { properties |= CL_QUEUE_PROFILING_ENABLE; is_profiling_enabled_ = true; diff --git a/mace/ops/opencl/helper.cc b/mace/ops/opencl/helper.cc index 9729555a5ce246a1cb4277c61bf5d5de9f16bbd1..16acafb5a244583c3d9b34df25755eb3d50284f7 100644 --- a/mace/ops/opencl/helper.cc +++ b/mace/ops/opencl/helper.cc @@ -96,6 +96,29 @@ std::vector Default3DLocalWS(OpenCLRuntime *runtime, return lws; } +/** + * For GPUs like Arm Mali, when too many commands are added in the command + * queue, the UI responsiveness may be poor. This function limits the number of + * commands in the command queue to no more than kQueueWndSize. When + * opencl_commands >= kQueueWndSize, it will wait for the completion of GPU + * command queue's execution. + * + * If kQueueWndSize <= 0, this function does nothing. 
+ */ +inline void WaitForQueueExecution(OpenCLRuntime *runtime, + const cl::Event &event) { + static const unsigned int kQueueWndSize = + runtime->tuner()->GetOpenclQueueWindowSize(); + static thread_local unsigned int opencl_commands = 0; + if (kQueueWndSize > 0) { + opencl_commands++; + if (opencl_commands >= kQueueWndSize) { + event.wait(); + opencl_commands = 0; + } + } +} + MaceStatus TuningOrRun3DKernel(OpenCLRuntime *runtime, const cl::Kernel &kernel, const std::string tuning_key, @@ -176,6 +199,7 @@ MaceStatus TuningOrRun3DKernel(OpenCLRuntime *runtime, cl::NDRange(internal_gws[0], internal_gws[1], gws2), cl::NDRange(params[0], params[1], params[2]), nullptr, &event); MACE_CL_RET_ERROR(error); + WaitForQueueExecution(runtime, event); } } else { timer->ClearTiming(); @@ -285,6 +309,7 @@ MaceStatus TuningOrRun2DKernel(OpenCLRuntime *runtime, cl::NDRange(internal_gws[0], gws1), cl::NDRange(params[0], params[1]), nullptr, &event); MACE_CL_RET_ERROR(error); + WaitForQueueExecution(runtime, event); } } else { timer->ClearTiming(); diff --git a/mace/tools/mace_run.cc b/mace/tools/mace_run.cc index 7d6e1b2cef9bf96b88b144abb6d59a256f187c32..3529dfa850ceb641840ee36f67337afb8d420384 100644 --- a/mace/tools/mace_run.cc +++ b/mace/tools/mace_run.cc @@ -548,6 +548,10 @@ int Main(int argc, char **argv) { LOG(INFO) << "gpu_priority_hint: " << FLAGS_gpu_priority_hint; LOG(INFO) << "omp_num_threads: " << FLAGS_omp_num_threads; LOG(INFO) << "cpu_affinity_policy: " << FLAGS_cpu_affinity_policy; + LOG(INFO) << "limit_opencl_kernel_time: " + << getenv("MACE_LIMIT_OPENCL_KERNEL_TIME"); + LOG(INFO) << "opencl_queue_window_size: " + << getenv("MACE_OPENCL_QUEUE_WINDOW_SIZE"); std::vector input_shapes = Split(FLAGS_input_shape, ':'); std::vector output_shapes = Split(FLAGS_output_shape, ':'); diff --git a/mace/utils/tuner.h b/mace/utils/tuner.h index 45485234f04ee25cd10054f16bbc9341e3c0887b..da796ec08fa758cf161565c39d0f3e3373985aa7 100644 --- a/mace/utils/tuner.h +++ 
b/mace/utils/tuner.h @@ -35,20 +35,38 @@ namespace mace { -inline bool IsTuning() { +constexpr const char *kOpenClWindowSize = "MACE_OPENCL_QUEUE_WINDOW_SIZE"; + +inline bool GetTuningFromEnv() { std::string tuning; GetEnv("MACE_TUNING", &tuning); return tuning.size() == 1 && tuning[0] == '1'; } +inline unsigned int GetOpenclQueueWindowSizeFromEnv() { + std::string str_size; + GetEnv(kOpenClWindowSize, &str_size); + unsigned int window_size = 0; + if (str_size.size() > 0) { + window_size = std::stoi(str_size); + } + return window_size; +} + template class Tuner { public: explicit Tuner(const std::string tuned_param_file_path = "", const unsigned char *param_byte_stream = nullptr, - const size_t param_byte_stream_size = 0): + const size_t param_byte_stream_size = 0) : tuned_param_file_path_(tuned_param_file_path) { GetEnv("MACE_RUN_PARAMETER_PATH", &path_); + is_tuning_ = GetTuningFromEnv(); + if (is_tuning_) { + unsigned int wnd_size = GetOpenclQueueWindowSizeFromEnv(); + param_table_[kOpenClWindowSize] = {wnd_size}; + } + if (param_byte_stream != nullptr && param_byte_stream_size != 0) { ParseData(param_byte_stream, param_byte_stream_size); } else { @@ -92,6 +110,19 @@ class Tuner { } } + unsigned int GetOpenclQueueWindowSize() { + unsigned int window_size = 0; + if (!IsTuning() + && param_table_.find(kOpenClWindowSize) != param_table_.end()) { + window_size = param_table_[kOpenClWindowSize][0]; + } + return window_size; + } + + bool IsTuning() { + return is_tuning_; + } + private: void WriteRunParameters() { if (!path_.empty()) { @@ -239,6 +270,8 @@ class Tuner { std::string tuned_param_file_path_; std::string path_; std::unordered_map> param_table_; + unsigned int opencl_queue_window_size_; + bool is_tuning_; }; } // namespace mace diff --git a/tools/common.py b/tools/common.py index a45bf37a645f4c78a90b16df54d2bc7304044b64..8642ad4a99810e102257997b552957670ec907d7 100644 --- a/tools/common.py +++ b/tools/common.py @@ -407,6 +407,7 @@ class YAMLKeyword(object): 
input_data_formats = 'input_data_formats' output_data_formats = 'output_data_formats' limit_opencl_kernel_time = 'limit_opencl_kernel_time' + opencl_queue_window_size = 'opencl_queue_window_size' nnlib_graph_mode = 'nnlib_graph_mode' obfuscate = 'obfuscate' winograd = 'winograd' diff --git a/tools/converter.py b/tools/converter.py index a6a6919418f93d4c93b08d3763edc47d832d4cd8..126ef215fa490a0f7714e134ce0b537d46fd4b76 100644 --- a/tools/converter.py +++ b/tools/converter.py @@ -656,6 +656,7 @@ def format_model_config(flags): accuracy_validation_script for key in [YAMLKeyword.limit_opencl_kernel_time, + YAMLKeyword.opencl_queue_window_size, YAMLKeyword.nnlib_graph_mode, YAMLKeyword.obfuscate, YAMLKeyword.winograd, diff --git a/tools/device.py b/tools/device.py index d428756af67dcaa89242d1e9d1f2bb6c5c52f24a..341ba505142690156ee05f2ece153763d74d00a2 100644 --- a/tools/device.py +++ b/tools/device.py @@ -171,6 +171,7 @@ class DeviceWrapper: running_round, restart_round, limit_opencl_kernel_time, + opencl_queue_window_size, tuning, out_of_range_check, model_graph_format, @@ -312,6 +313,7 @@ class DeviceWrapper: "MACE_RUN_PARAMETER_PATH=%s/mace_run.config" % self.data_dir, "MACE_INTERNAL_STORAGE_PATH=%s" % internal_storage_dir, "MACE_LIMIT_OPENCL_KERNEL_TIME=%s" % limit_opencl_kernel_time, + "MACE_OPENCL_QUEUE_WINDOW_SIZE=%s" % opencl_queue_window_size, "MACE_RUNTIME_FAILURE_RATIO=%f" % runtime_failure_ratio, "MACE_LOG_TENSOR_RANGE=%d" % (1 if quantize_stat else 0), ] @@ -429,6 +431,8 @@ class DeviceWrapper: restart_round=1, limit_opencl_kernel_time=model_config[ YAMLKeyword.limit_opencl_kernel_time], + opencl_queue_window_size=model_config[ + YAMLKeyword.opencl_queue_window_size], tuning=True, out_of_range_check=False, model_graph_format=model_graph_format, @@ -541,6 +545,8 @@ class DeviceWrapper: restart_round=flags.restart_round, limit_opencl_kernel_time=model_config[ YAMLKeyword.limit_opencl_kernel_time], + opencl_queue_window_size=model_config[ + 
YAMLKeyword.opencl_queue_window_size], tuning=False, out_of_range_check=flags.gpu_out_of_range_check, model_graph_format=configs[