diff --git a/docs/faq.md b/docs/faq.md index 3b3ccf580b4d2425613cda1e2d90b9638aa189f7..fe0cbcf1467bfbd5f4993b61d97ac3c84ad589ca 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -55,6 +55,13 @@ Try to set `limit_opencl_kernel_time` to `1`. If still not resolved, try to modify the source code to use even smaller time intervals or changed to CPU or DSP runtime. +For GPUs such as Arm Mali, sometimes even setting `limit_opencl_kernel_time` to +a small time interval cannot solve the problem. In that case, you can try to +set `opencl_queue_window_size`, such as 16. This parameter means that the GPU +command queue will contain only `opencl_queue_window_size` commands at most. +You can adjust this parameter to achieve a balance between performance and UI +response. You should not use this parameter unless you have to. + Why is MACE not working on DSP? ------------------------------------------------------------------------------ Running models on Hexagon DSP need a few prerequisites for DSP developers: diff --git a/docs/user_guide/advanced_usage.rst b/docs/user_guide/advanced_usage.rst index b86b37ef09215712bf2c5a5a20405658034b55c2..f58eac7ece52060cbe8ded444906e80cdd719739 100644 --- a/docs/user_guide/advanced_usage.rst +++ b/docs/user_guide/advanced_usage.rst @@ -94,6 +94,8 @@ in one deployment file. - [optional] The format of the output tensors, one of [NONE, NHWC, NCHW]. If there is no format of the output, please use NONE. If only one single format is specified, all inputs will use that format, default is NHWC order. * - limit_opencl_kernel_time - [optional] Whether splitting the OpenCL kernel within 1 ms to keep UI responsiveness, default is 0. + * - opencl_queue_window_size + - [optional] Limit the max commands in OpenCL command queue to keep UI responsiveness, default is 0. * - obfuscate - [optional] Whether to obfuscate the model operator name, default to 0. 
* - winograd diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc index 4875cc228c00effeff3d12d676df410103ae16d2..6d5f82e105365c5cc38482b8bfe97ea0d0aacf5f 100644 --- a/mace/core/runtime/opencl/opencl_runtime.cc +++ b/mace/core/runtime/opencl/opencl_runtime.cc @@ -323,7 +323,7 @@ OpenCLRuntime::OpenCLRuntime( cl_command_queue_properties properties = 0; const char *profiling = getenv("MACE_OPENCL_PROFILING"); - if (IsTuning() || + if (tuner_->IsTuning() || (profiling != nullptr && strlen(profiling) == 1 && profiling[0] == '1')) { properties |= CL_QUEUE_PROFILING_ENABLE; is_profiling_enabled_ = true; diff --git a/mace/ops/opencl/helper.cc b/mace/ops/opencl/helper.cc index 9729555a5ce246a1cb4277c61bf5d5de9f16bbd1..16acafb5a244583c3d9b34df25755eb3d50284f7 100644 --- a/mace/ops/opencl/helper.cc +++ b/mace/ops/opencl/helper.cc @@ -96,6 +96,29 @@ std::vector Default3DLocalWS(OpenCLRuntime *runtime, return lws; } +/** + * For GPUs like Arm Mali, when too many commands are added in the command + * queue, the UI responsiveness may be poor. This function limits the number of + * commands in the command queue to no more than kQueueWndSize. When + * opencl_commands >= kQueueWndSize, it will wait for the completion of GPU + * command queue's execution. + * + * If kQueueWndSize <= 0, this function does nothing. 
+ */ +inline void WaitForQueueExecution(OpenCLRuntime *runtime, + const cl::Event &event) { + static const unsigned int kQueueWndSize = + runtime->tuner()->GetOpenclQueueWindowSize(); + static thread_local unsigned int opencl_commands = 0; + if (kQueueWndSize > 0) { + opencl_commands++; + if (opencl_commands >= kQueueWndSize) { + event.wait(); + opencl_commands = 0; + } + } +} + MaceStatus TuningOrRun3DKernel(OpenCLRuntime *runtime, const cl::Kernel &kernel, const std::string tuning_key, @@ -176,6 +199,7 @@ MaceStatus TuningOrRun3DKernel(OpenCLRuntime *runtime, cl::NDRange(internal_gws[0], internal_gws[1], gws2), cl::NDRange(params[0], params[1], params[2]), nullptr, &event); MACE_CL_RET_ERROR(error); + WaitForQueueExecution(runtime, event); } } else { timer->ClearTiming(); @@ -285,6 +309,7 @@ MaceStatus TuningOrRun2DKernel(OpenCLRuntime *runtime, cl::NDRange(internal_gws[0], gws1), cl::NDRange(params[0], params[1]), nullptr, &event); MACE_CL_RET_ERROR(error); + WaitForQueueExecution(runtime, event); } } else { timer->ClearTiming(); diff --git a/mace/tools/mace_run.cc b/mace/tools/mace_run.cc index 7d6e1b2cef9bf96b88b144abb6d59a256f187c32..3529dfa850ceb641840ee36f67337afb8d420384 100644 --- a/mace/tools/mace_run.cc +++ b/mace/tools/mace_run.cc @@ -548,6 +548,10 @@ int Main(int argc, char **argv) { LOG(INFO) << "gpu_priority_hint: " << FLAGS_gpu_priority_hint; LOG(INFO) << "omp_num_threads: " << FLAGS_omp_num_threads; LOG(INFO) << "cpu_affinity_policy: " << FLAGS_cpu_affinity_policy; + LOG(INFO) << "limit_opencl_kernel_time: " + << getenv("MACE_LIMIT_OPENCL_KERNEL_TIME"); + LOG(INFO) << "opencl_queue_window_size: " + << getenv("MACE_OPENCL_QUEUE_WINDOW_SIZE"); std::vector input_shapes = Split(FLAGS_input_shape, ':'); std::vector output_shapes = Split(FLAGS_output_shape, ':'); diff --git a/mace/utils/tuner.h b/mace/utils/tuner.h index 45485234f04ee25cd10054f16bbc9341e3c0887b..da796ec08fa758cf161565c39d0f3e3373985aa7 100644 --- a/mace/utils/tuner.h +++ 
b/mace/utils/tuner.h @@ -35,20 +35,38 @@ namespace mace { -inline bool IsTuning() { +constexpr const char *kOpenClWindowSize = "MACE_OPENCL_QUEUE_WINDOW_SIZE"; + +inline bool GetTuningFromEnv() { std::string tuning; GetEnv("MACE_TUNING", &tuning); return tuning.size() == 1 && tuning[0] == '1'; } +inline unsigned int GetOpenclQueueWindowSizeFromEnv() { + std::string str_size; + GetEnv(kOpenClWindowSize, &str_size); + unsigned int window_size = 0; + if (str_size.size() > 0) { + window_size = std::stoi(str_size); + } + return window_size; +} + template class Tuner { public: explicit Tuner(const std::string tuned_param_file_path = "", const unsigned char *param_byte_stream = nullptr, - const size_t param_byte_stream_size = 0): + const size_t param_byte_stream_size = 0) : tuned_param_file_path_(tuned_param_file_path) { GetEnv("MACE_RUN_PARAMETER_PATH", &path_); + is_tuning_ = GetTuningFromEnv(); + if (is_tuning_) { + unsigned int wnd_size = GetOpenclQueueWindowSizeFromEnv(); + param_table_[kOpenClWindowSize] = {wnd_size}; + } + if (param_byte_stream != nullptr && param_byte_stream_size != 0) { ParseData(param_byte_stream, param_byte_stream_size); } else { @@ -92,6 +110,19 @@ class Tuner { } } + unsigned int GetOpenclQueueWindowSize() { + unsigned int window_size = 0; + if (!IsTuning() + && param_table_.find(kOpenClWindowSize) != param_table_.end()) { + window_size = param_table_[kOpenClWindowSize][0]; + } + return window_size; + } + + bool IsTuning() { + return is_tuning_; + } + private: void WriteRunParameters() { if (!path_.empty()) { @@ -239,6 +270,8 @@ class Tuner { std::string tuned_param_file_path_; std::string path_; std::unordered_map> param_table_; + unsigned int opencl_queue_window_size_; + bool is_tuning_; }; } // namespace mace diff --git a/tools/common.py b/tools/common.py index a45bf37a645f4c78a90b16df54d2bc7304044b64..8642ad4a99810e102257997b552957670ec907d7 100644 --- a/tools/common.py +++ b/tools/common.py @@ -407,6 +407,7 @@ class YAMLKeyword(object): 
input_data_formats = 'input_data_formats' output_data_formats = 'output_data_formats' limit_opencl_kernel_time = 'limit_opencl_kernel_time' + opencl_queue_window_size = 'opencl_queue_window_size' nnlib_graph_mode = 'nnlib_graph_mode' obfuscate = 'obfuscate' winograd = 'winograd' diff --git a/tools/converter.py b/tools/converter.py index a6a6919418f93d4c93b08d3763edc47d832d4cd8..126ef215fa490a0f7714e134ce0b537d46fd4b76 100644 --- a/tools/converter.py +++ b/tools/converter.py @@ -656,6 +656,7 @@ def format_model_config(flags): accuracy_validation_script for key in [YAMLKeyword.limit_opencl_kernel_time, + YAMLKeyword.opencl_queue_window_size, YAMLKeyword.nnlib_graph_mode, YAMLKeyword.obfuscate, YAMLKeyword.winograd, diff --git a/tools/device.py b/tools/device.py index d428756af67dcaa89242d1e9d1f2bb6c5c52f24a..341ba505142690156ee05f2ece153763d74d00a2 100644 --- a/tools/device.py +++ b/tools/device.py @@ -171,6 +171,7 @@ class DeviceWrapper: running_round, restart_round, limit_opencl_kernel_time, + opencl_queue_window_size, tuning, out_of_range_check, model_graph_format, @@ -312,6 +313,7 @@ class DeviceWrapper: "MACE_RUN_PARAMETER_PATH=%s/mace_run.config" % self.data_dir, "MACE_INTERNAL_STORAGE_PATH=%s" % internal_storage_dir, "MACE_LIMIT_OPENCL_KERNEL_TIME=%s" % limit_opencl_kernel_time, + "MACE_OPENCL_QUEUE_WINDOW_SIZE=%s" % opencl_queue_window_size, "MACE_RUNTIME_FAILURE_RATIO=%f" % runtime_failure_ratio, "MACE_LOG_TENSOR_RANGE=%d" % (1 if quantize_stat else 0), ] @@ -429,6 +431,8 @@ class DeviceWrapper: restart_round=1, limit_opencl_kernel_time=model_config[ YAMLKeyword.limit_opencl_kernel_time], + opencl_queue_window_size=model_config[ + YAMLKeyword.opencl_queue_window_size], tuning=True, out_of_range_check=False, model_graph_format=model_graph_format, @@ -541,6 +545,8 @@ class DeviceWrapper: restart_round=flags.restart_round, limit_opencl_kernel_time=model_config[ YAMLKeyword.limit_opencl_kernel_time], + opencl_queue_window_size=model_config[ + 
YAMLKeyword.opencl_queue_window_size], tuning=False, out_of_range_check=flags.gpu_out_of_range_check, model_graph_format=configs[