Commit 7645d8d6 authored by Liangliang He

Merge branch 'gpu_queue_window' into 'master'

Add option `opencl_queue_window_size` to keep UI responsiveness.

See merge request !1185
@@ -55,6 +55,13 @@ Try to set `limit_opencl_kernel_time` to `1`. If still not resolved, try to
modify the source code to use even smaller time intervals or change to the CPU
or DSP runtime.
For GPUs such as Arm Mali, sometimes even setting `limit_opencl_kernel_time` to
a small time interval cannot solve the problem. In that case, you can try
setting `opencl_queue_window_size`, for example to 16. This parameter means the
GPU command queue will contain at most `opencl_queue_window_size` commands. You
can adjust this parameter to balance performance against UI responsiveness.
Do not use this parameter unless you have to.
Why is MACE not working on DSP?
------------------------------------------------------------------------------
Running models on the Hexagon DSP requires a few prerequisites for DSP developers:
......
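For illustration, a minimal sketch of how the two options might appear in a model deployment file. Only `limit_opencl_kernel_time` and `opencl_queue_window_size` are documented by this merge request; the model name and the surrounding keys are assumptions.

```yaml
# Hypothetical deployment-file excerpt; keys other than the two OpenCL
# options below are illustrative only.
models:
  mobilenet_v1:                    # hypothetical model name
    runtime: gpu
    limit_opencl_kernel_time: 1    # split kernels to roughly 1 ms each
    opencl_queue_window_size: 16   # keep at most 16 commands queued on the GPU
```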
@@ -94,6 +94,8 @@ in one deployment file.
- [optional] The format of the output tensors, one of [NONE, NHWC, NCHW]. If the output has no data format, use NONE. If only a single format is specified, all outputs will use that format; default is NHWC.
* - limit_opencl_kernel_time
- [optional] Whether to split the OpenCL kernel so that each piece runs within 1 ms to keep UI responsiveness, default is 0.
* - opencl_queue_window_size
- [optional] Limit the maximum number of commands in the OpenCL command queue to keep UI responsiveness, default is 0 (no limit).
* - obfuscate
- [optional] Whether to obfuscate the model operator names, default is 0.
* - winograd
......
@@ -323,7 +323,7 @@ OpenCLRuntime::OpenCLRuntime(
cl_command_queue_properties properties = 0;
const char *profiling = getenv("MACE_OPENCL_PROFILING");
if (IsTuning() ||
if (tuner_->IsTuning() ||
(profiling != nullptr && strlen(profiling) == 1 && profiling[0] == '1')) {
properties |= CL_QUEUE_PROFILING_ENABLE;
is_profiling_enabled_ = true;
......
@@ -96,6 +96,29 @@ std::vector<uint32_t> Default3DLocalWS(OpenCLRuntime *runtime,
return lws;
}
/**
 * For GPUs like Arm Mali, when too many commands are added to the command
 * queue, UI responsiveness may become poor. This function limits the number of
 * commands in the command queue to no more than kQueueWndSize. When
 * opencl_commands >= kQueueWndSize, it waits for the GPU command queue to
 * finish executing.
 *
 * If kQueueWndSize is 0, this function does nothing.
 */
inline void WaitForQueueExecution(OpenCLRuntime *runtime,
const cl::Event &event) {
static const unsigned int kQueueWndSize =
runtime->tuner()->GetOpenclQueueWindowSize();
static thread_local unsigned int opencl_commands = 0;
if (kQueueWndSize > 0) {
opencl_commands++;
if (opencl_commands >= kQueueWndSize) {
event.wait();
opencl_commands = 0;
}
}
}
MaceStatus TuningOrRun3DKernel(OpenCLRuntime *runtime,
const cl::Kernel &kernel,
const std::string tuning_key,
@@ -176,6 +199,7 @@ MaceStatus TuningOrRun3DKernel(OpenCLRuntime *runtime,
cl::NDRange(internal_gws[0], internal_gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CL_RET_ERROR(error);
WaitForQueueExecution(runtime, event);
}
} else {
timer->ClearTiming();
@@ -285,6 +309,7 @@ MaceStatus TuningOrRun2DKernel(OpenCLRuntime *runtime,
cl::NDRange(internal_gws[0], gws1),
cl::NDRange(params[0], params[1]), nullptr, &event);
MACE_CL_RET_ERROR(error);
WaitForQueueExecution(runtime, event);
}
} else {
timer->ClearTiming();
......
@@ -548,6 +548,10 @@ int Main(int argc, char **argv) {
LOG(INFO) << "gpu_priority_hint: " << FLAGS_gpu_priority_hint;
LOG(INFO) << "omp_num_threads: " << FLAGS_omp_num_threads;
LOG(INFO) << "cpu_affinity_policy: " << FLAGS_cpu_affinity_policy;
LOG(INFO) << "limit_opencl_kernel_time: "
<< getenv("MACE_LIMIT_OPENCL_KERNEL_TIME");
LOG(INFO) << "opencl_queue_window_size: "
<< getenv("MACE_OPENCL_QUEUE_WINDOW_SIZE");
std::vector<std::string> input_shapes = Split(FLAGS_input_shape, ':');
std::vector<std::string> output_shapes = Split(FLAGS_output_shape, ':');
......
@@ -35,20 +35,38 @@
namespace mace {
inline bool IsTuning() {
constexpr const char *kOpenClWindowSize = "MACE_OPENCL_QUEUE_WINDOW_SIZE";
inline bool GetTuningFromEnv() {
std::string tuning;
GetEnv("MACE_TUNING", &tuning);
return tuning.size() == 1 && tuning[0] == '1';
}
inline unsigned int GetOpenclQueueWindowSizeFromEnv() {
std::string str_size;
GetEnv(kOpenClWindowSize, &str_size);
unsigned int window_size = 0;
if (str_size.size() > 0) {
window_size = std::stoi(str_size);
}
return window_size;
}
template <typename param_type>
class Tuner {
public:
explicit Tuner(const std::string tuned_param_file_path = "",
const unsigned char *param_byte_stream = nullptr,
const size_t param_byte_stream_size = 0):
const size_t param_byte_stream_size = 0) :
tuned_param_file_path_(tuned_param_file_path) {
GetEnv("MACE_RUN_PARAMETER_PATH", &path_);
is_tuning_ = GetTuningFromEnv();
if (is_tuning_) {
unsigned int wnd_size = GetOpenclQueueWindowSizeFromEnv();
param_table_[kOpenClWindowSize] = {wnd_size};
}
if (param_byte_stream != nullptr && param_byte_stream_size != 0) {
ParseData(param_byte_stream, param_byte_stream_size);
} else {
@@ -92,6 +110,19 @@ class Tuner {
}
}
unsigned int GetOpenclQueueWindowSize() {
unsigned int window_size = 0;
if (!IsTuning()
&& param_table_.find(kOpenClWindowSize) != param_table_.end()) {
window_size = param_table_[kOpenClWindowSize][0];
}
return window_size;
}
bool IsTuning() {
return is_tuning_;
}
private:
void WriteRunParameters() {
if (!path_.empty()) {
@@ -239,6 +270,8 @@ class Tuner {
std::string tuned_param_file_path_;
std::string path_;
std::unordered_map<std::string, std::vector<param_type>> param_table_;
unsigned int opencl_queue_window_size_;
bool is_tuning_;
};
} // namespace mace
......
@@ -407,6 +407,7 @@ class YAMLKeyword(object):
input_data_formats = 'input_data_formats'
output_data_formats = 'output_data_formats'
limit_opencl_kernel_time = 'limit_opencl_kernel_time'
opencl_queue_window_size = 'opencl_queue_window_size'
nnlib_graph_mode = 'nnlib_graph_mode'
obfuscate = 'obfuscate'
winograd = 'winograd'
......
@@ -656,6 +656,7 @@ def format_model_config(flags):
accuracy_validation_script
for key in [YAMLKeyword.limit_opencl_kernel_time,
YAMLKeyword.opencl_queue_window_size,
YAMLKeyword.nnlib_graph_mode,
YAMLKeyword.obfuscate,
YAMLKeyword.winograd,
......
@@ -171,6 +171,7 @@ class DeviceWrapper:
running_round,
restart_round,
limit_opencl_kernel_time,
opencl_queue_window_size,
tuning,
out_of_range_check,
model_graph_format,
@@ -312,6 +313,7 @@ class DeviceWrapper:
"MACE_RUN_PARAMETER_PATH=%s/mace_run.config" % self.data_dir,
"MACE_INTERNAL_STORAGE_PATH=%s" % internal_storage_dir,
"MACE_LIMIT_OPENCL_KERNEL_TIME=%s" % limit_opencl_kernel_time,
"MACE_OPENCL_QUEUE_WINDOW_SIZE=%s" % opencl_queue_window_size,
"MACE_RUNTIME_FAILURE_RATIO=%f" % runtime_failure_ratio,
"MACE_LOG_TENSOR_RANGE=%d" % (1 if quantize_stat else 0),
]
@@ -429,6 +431,8 @@ class DeviceWrapper:
restart_round=1,
limit_opencl_kernel_time=model_config[
YAMLKeyword.limit_opencl_kernel_time],
opencl_queue_window_size=model_config[
YAMLKeyword.opencl_queue_window_size],
tuning=True,
out_of_range_check=False,
model_graph_format=model_graph_format,
@@ -541,6 +545,8 @@ class DeviceWrapper:
restart_round=flags.restart_round,
limit_opencl_kernel_time=model_config[
YAMLKeyword.limit_opencl_kernel_time],
opencl_queue_window_size=model_config[
YAMLKeyword.opencl_queue_window_size],
tuning=False,
out_of_range_check=flags.gpu_out_of_range_check,
model_graph_format=configs[
......