From aa866acb04c58455a5a09343874cc779494da066 Mon Sep 17 00:00:00 2001
From: yejianwu <yejianwu@xiaomi.com>
Date: Wed, 28 Mar 2018 10:51:26 +0800
Subject: [PATCH] no roundup for qualcomm opencl2.0

---
 mace/core/runtime/opencl/opencl_runtime.cc    |  21 +++-
 mace/core/runtime/opencl/opencl_runtime.h     |  10 ++
 mace/kernels/opencl/activation_opencl.cc      |   6 +-
 mace/kernels/opencl/addn.cc                   |   6 +
 mace/kernels/opencl/batch_norm_opencl.cc      |   5 +
 mace/kernels/opencl/bias_add_opencl.cc        |  29 +++--
 mace/kernels/opencl/buffer_to_image.cc        |  32 +++--
 mace/kernels/opencl/channel_shuffle.cc        |   5 +
 mace/kernels/opencl/cl/activation.cl          |  11 +-
 mace/kernels/opencl/cl/addn.cl                |   8 ++
 mace/kernels/opencl/cl/batch_norm.cl          |  11 +-
 mace/kernels/opencl/cl/bias_add.cl            |  10 +-
 mace/kernels/opencl/cl/buffer_to_image.cl     | 101 +++++++++++++++-
 mace/kernels/opencl/cl/channel_shuffle.cl     |  18 ++-
 mace/kernels/opencl/cl/concat.cl              |  24 +++-
 mace/kernels/opencl/cl/conv_2d.cl             |  11 +-
 mace/kernels/opencl/cl/conv_2d_1x1.cl         |  10 +-
 mace/kernels/opencl/cl/conv_2d_3x3.cl         |  11 +-
 mace/kernels/opencl/cl/depthwise_conv2d.cl    |  20 +++-
 mace/kernels/opencl/cl/eltwise.cl             |   8 ++
 mace/kernels/opencl/cl/fully_connected.cl     |  19 ++-
 mace/kernels/opencl/cl/matmul.cl              |   8 ++
 mace/kernels/opencl/cl/pooling.cl             |  12 +-
 mace/kernels/opencl/cl/resize_bilinear.cl     |  11 ++
 mace/kernels/opencl/cl/slice.cl               |  11 +-
 mace/kernels/opencl/cl/softmax.cl             |  12 +-
 mace/kernels/opencl/cl/space_to_batch.cl      |  16 +++
 mace/kernels/opencl/cl/winograd_transform.cl  |  23 +++-
 mace/kernels/opencl/concat.cc                 |  11 ++
 mace/kernels/opencl/conv_2d_opencl_1x1.cc     |   6 +
 mace/kernels/opencl/conv_2d_opencl_3x3.cc     |   5 +
 mace/kernels/opencl/conv_2d_opencl_general.cc |   5 +
 mace/kernels/opencl/depthwise_conv_opencl.cc  |   5 +
 mace/kernels/opencl/eltwise_opencl.cc         |   5 +
 mace/kernels/opencl/fully_connected_opencl.cc |  15 ++-
 mace/kernels/opencl/helper.cc                 | 113 +++++++++++++-----
 mace/kernels/opencl/helper.h                  |   2 +
 mace/kernels/opencl/matmul.cc                 |   5 +
 mace/kernels/opencl/pooling_opencl.cc         |   6 +
 mace/kernels/opencl/resize_bilinear_opencl.cc |   5 +
 mace/kernels/opencl/slice.cc                  |   5 +
 mace/kernels/opencl/softmax_opencl.cc         |   6 +
 mace/kernels/opencl/space_to_batch_opencl.cc  |   5 +
 mace/kernels/opencl/winograd_transform.cc     |  10 ++
 tools/bazel-adb-run.sh                        |   2 +-
 45 files changed, 609 insertions(+), 71 deletions(-)

diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc
index 798409ab..02fa29a0 100644
--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -142,7 +142,6 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
   }
 
   bool gpu_detected = false;
-  bool is_adreno_gpu = false;
   device_ = std::make_shared<cl::Device>();
   for (auto device : all_devices) {
     if (device.getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_GPU) {
@@ -150,10 +149,18 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
       gpu_detected = true;
       const std::string device_name = device.getInfo<CL_DEVICE_NAME>();
       constexpr const char *kQualcommAdrenoGPUStr = "QUALCOMM Adreno(TM)";
+      constexpr const char *kMaliGPUStr = "Mali";
       if (device_name == kQualcommAdrenoGPUStr) {
-        is_adreno_gpu = true;
+        gpu_type_ = GPU_TYPE::QUALCOMM_ADRENO;
+      } else if (device_name.find(kMaliGPUStr) != std::string::npos) {
+        gpu_type_ = GPU_TYPE::MALI;
+      } else {
+        gpu_type_ = GPU_TYPE::UNKNOWN;
       }
 
+      // "OpenCL <major>.<minor> ..."; npos + 1 wraps to 0, so substr is safe.
+      const std::string device_version = device.getInfo<CL_DEVICE_VERSION>();
+      opencl_version_ = device_version.substr(device_version.find(' ') + 1, 3);
       VLOG(1) << "Using device: " << device_name;
       break;
     }
@@ -171,7 +178,7 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
   }
 
   cl_int err;
-  if (is_adreno_gpu) {
+  if (gpu_type_ == GPU_TYPE::QUALCOMM_ADRENO) {
     std::vector<cl_context_properties> context_properties;
     context_properties.reserve(5);
     GetAdrenoContextProperties(&context_properties, gpu_perf_hint,
@@ -350,4 +357,12 @@ uint64_t OpenCLRuntime::GetKernelWaveSize(const cl::Kernel &kernel) {
   return size;
 }
 
+// Reports the GPU family the constructor recorded for the selected device
+// (QUALCOMM_ADRENO, MALI or UNKNOWN).
+const GPU_TYPE OpenCLRuntime::GetGPUType() const { return gpu_type_; }
+
+// Returns the version text the constructor extracted from the device's
+// CL_DEVICE_VERSION string.
+const std::string &OpenCLRuntime::GetOpenclVersion() { return opencl_version_; }
+
 }  // namespace mace
diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h
index ce375b9a..1b257e6b 100644
--- a/mace/core/runtime/opencl/opencl_runtime.h
+++ b/mace/core/runtime/opencl/opencl_runtime.h
@@ -18,6 +18,12 @@
 
 namespace mace {
 
+enum GPU_TYPE {
+  QUALCOMM_ADRENO,  // CL_DEVICE_NAME is exactly "QUALCOMM Adreno(TM)"
+  MALI,  // CL_DEVICE_NAME contains "Mali"
+  UNKNOWN,  // any other GPU device name
+};
+
 class OpenCLProfilingTimer : public Timer {
  public:
   explicit OpenCLProfilingTimer(const cl::Event *event)
@@ -49,6 +55,8 @@ class OpenCLRuntime {
   uint64_t GetDeviceMaxWorkGroupSize();
   uint64_t GetKernelMaxWorkGroupSize(const cl::Kernel &kernel);
   uint64_t GetKernelWaveSize(const cl::Kernel &kernel);
+  const GPU_TYPE GetGPUType() const;
+  const std::string &GetOpenclVersion();
   cl::Kernel BuildKernel(const std::string &program_name,
                          const std::string &kernel_name,
                          const std::set<std::string> &build_options);
@@ -74,6 +82,8 @@ class OpenCLRuntime {
   std::map<std::string, cl::Program> built_program_map_;
   std::mutex program_build_mutex_;
   std::string kernel_path_;
+  GPU_TYPE gpu_type_;
+  std::string opencl_version_;
 
   static GPUPerfHint gpu_perf_hint_;
   static GPUPriorityHint gpu_priority_hint_;
diff --git a/mace/kernels/opencl/activation_opencl.cc b/mace/kernels/opencl/activation_opencl.cc
index dfe703dd..d7b89336 100644
--- a/mace/kernels/opencl/activation_opencl.cc
+++ b/mace/kernels/opencl/activation_opencl.cc
@@ -26,14 +26,18 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
 
   auto runtime = OpenCLRuntime::Global();
 
-  if (kernel_.get() == nullptr) {
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
 
+  if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation");
     built_options.emplace("-Dactivation=" + kernel_name);
     auto dt = DataTypeToEnum<T>::value;
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
     switch (activation_) {
       case RELU:
         tuning_key_prefix_ = "relu_opencl_kernel_";
diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc
index 94538fc2..37e6062a 100644
--- a/mace/kernels/opencl/addn.cc
+++ b/mace/kernels/opencl/addn.cc
@@ -26,6 +26,8 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
 
   auto runtime = OpenCLRuntime::Global();
 
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   for (int i = 1; i < size; ++i) {
     MACE_CHECK_NOTNULL(input_tensors[i]);
     MACE_CHECK(batch == input_tensors[i]->dim(0));
@@ -45,6 +47,10 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
     built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
+
     kernel_ = runtime->BuildKernel("addn", kernel_name, built_options);
   }
 
diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc
index d79b5c18..10b956de 100644
--- a/mace/kernels/opencl/batch_norm_opencl.cc
+++ b/mace/kernels/opencl/batch_norm_opencl.cc
@@ -36,6 +36,8 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
 
   auto runtime = OpenCLRuntime::Global();
 
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
     auto dt = DataTypeToEnum<T>::value;
@@ -43,6 +45,9 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     built_options.emplace("-Dbatch_norm=" + kernel_name);
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
     if (folded_constant_) {
       built_options.emplace("-DFOLDED_CONSTANT");
     }
diff --git a/mace/kernels/opencl/bias_add_opencl.cc b/mace/kernels/opencl/bias_add_opencl.cc
index 69327995..ce0e5965 100644
--- a/mace/kernels/opencl/bias_add_opencl.cc
+++ b/mace/kernels/opencl/bias_add_opencl.cc
@@ -28,6 +28,9 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
                            static_cast<uint32_t>(height * batch)};
 
   auto runtime = OpenCLRuntime::Global();
+
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
     auto dt = DataTypeToEnum<T>::value;
@@ -35,6 +38,9 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     built_options.emplace("-Dbias_add=" + kernel_name);
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
     kernel_ = runtime->BuildKernel("bias_add", kernel_name, built_options);
   }
   if (!IsVecEqual(input_shape_, input->shape())) {
@@ -52,15 +58,22 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
       static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   const std::vector<uint32_t> lws = {8, kwg_size / 64, 8};
 
-  std::vector<uint32_t> roundup_gws(lws.size());
-  for (size_t i = 0; i < lws.size(); ++i) {
-    roundup_gws[i] = RoundUp(gws[i], lws[i]);
-  }
-
   cl::Event event;
-  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-      kernel_, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
-      cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
+  cl_int error;
+  if (is_qualcomm_opencl200) {
+    error = runtime->command_queue().enqueueNDRangeKernel(
+        kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
+  } else {
+    std::vector<uint32_t> roundup_gws(lws.size());
+    for (size_t i = 0; i < lws.size(); ++i) {
+      roundup_gws[i] = RoundUp(gws[i], lws[i]);
+    }
+
+    error = runtime->command_queue().enqueueNDRangeKernel(
+        kernel_, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
+        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
+  }
   MACE_CHECK(error == CL_SUCCESS);
   if (future != nullptr) {
     future->wait_fn = [runtime, event](CallStats *stats) {
diff --git a/mace/kernels/opencl/buffer_to_image.cc b/mace/kernels/opencl/buffer_to_image.cc
index 9fee7a95..0cec970a 100644
--- a/mace/kernels/opencl/buffer_to_image.cc
+++ b/mace/kernels/opencl/buffer_to_image.cc
@@ -59,11 +59,19 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
                          : "winograd_filter_buffer_to_image";
       break;
   }
+
+  auto runtime = OpenCLRuntime::Global();
+
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
   std::set<std::string> built_options;
   std::stringstream kernel_name_ss;
   kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
   built_options.emplace(kernel_name_ss.str());
+  if (is_qualcomm_opencl200) {
+    built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+  }
   if (buffer->dtype() == image->dtype()) {
     built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
     built_options.emplace("-DCMD_DATA_TYPE=" +
@@ -74,7 +82,6 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
     built_options.emplace("-DCMD_DATA_TYPE=" +
                           DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
   }
-  auto runtime = OpenCLRuntime::Global();
   auto b2f_kernel = runtime->BuildKernel("buffer_to_image",
                                          obfuscated_kernel_name, built_options);
 
@@ -105,17 +112,24 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
   const uint32_t kwg_size =
       static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(b2f_kernel));
   const std::vector<uint32_t> lws = {16, kwg_size / 16};
-  std::vector<uint32_t> roundup_gws(lws.size());
-  for (size_t i = 0; i < lws.size(); ++i) {
-    roundup_gws[i] = RoundUp(gws[i], lws[i]);
-  }
 
   cl::Event event;
-  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-      b2f_kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
-      cl::NDRange(lws[0], lws[1]), nullptr, &event);
-  MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+  cl_int error;
+  if (is_qualcomm_opencl200) {
+    error = runtime->command_queue().enqueueNDRangeKernel(
+        b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
+        cl::NDRange(lws[0], lws[1]), nullptr, &event);
+  } else {
+    std::vector<uint32_t> roundup_gws(lws.size());
+    for (size_t i = 0; i < lws.size(); ++i) {
+      roundup_gws[i] = RoundUp(gws[i], lws[i]);
+    }
 
+    error = runtime->command_queue().enqueueNDRangeKernel(
+        b2f_kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
+        cl::NDRange(lws[0], lws[1]), nullptr, &event);
+  }
+  MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
   if (future != nullptr) {
     future->wait_fn = [runtime, event](CallStats *stats) {
       event.wait();
diff --git a/mace/kernels/opencl/channel_shuffle.cc b/mace/kernels/opencl/channel_shuffle.cc
index 34bc5784..9d566477 100644
--- a/mace/kernels/opencl/channel_shuffle.cc
+++ b/mace/kernels/opencl/channel_shuffle.cc
@@ -36,6 +36,8 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
 
   auto runtime = OpenCLRuntime::Global();
 
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle");
@@ -43,6 +45,9 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
     auto dt = DataTypeToEnum<T>::value;
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
     kernel_ = runtime->BuildKernel("channel_shuffle", kernel_name,
                                    built_options);
   }
diff --git a/mace/kernels/opencl/cl/activation.cl b/mace/kernels/opencl/cl/activation.cl
index 23e6d60e..a02b0e35 100644
--- a/mace/kernels/opencl/cl/activation.cl
+++ b/mace/kernels/opencl/cl/activation.cl
@@ -5,19 +5,28 @@ __kernel void activation(__read_only image2d_t input,
                          __read_only image2d_t alpha,
 #endif
                          __private const float relux_max_limit,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                          __write_only image2d_t output,
                          __private const int global_size_dim0,
                          __private const int global_size_dim1,
                          __private const int global_size_dim2) {
+#else
+                         __write_only image2d_t output) {
+#endif
+
   const int ch_blk = get_global_id(0);
   const int w = get_global_id(1);
   const int hb = get_global_id(2);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (ch_blk >= global_size_dim0 || w >= global_size_dim1
       || hb >= global_size_dim2) {
     return;
   }
-
   const int width = global_size_dim1;
+#else
+  const int width = get_global_size(1);
+#endif
 
   const int pos = mad24(ch_blk, width, w);
   DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
diff --git a/mace/kernels/opencl/cl/addn.cl b/mace/kernels/opencl/cl/addn.cl
index 4279fc23..23e47e50 100644
--- a/mace/kernels/opencl/cl/addn.cl
+++ b/mace/kernels/opencl/cl/addn.cl
@@ -8,12 +8,20 @@ __kernel void addn(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
 #if INPUT_NUM > 3
                    __read_only image2d_t input3,
 #endif
+#ifndef USE_QUALCOMM_OPENCL_2_0
                    __write_only image2d_t output,
                    __private const int global_size_dim0,
                    __private const int global_size_dim1) {
+#else
+                   __write_only image2d_t output) {
+#endif
+
   const int w = get_global_id(0);
   const int hb = get_global_id(1);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (w >= global_size_dim0 || hb >= global_size_dim1) return;
+#endif
 
   DATA_TYPE4 in0 = READ_IMAGET(input0, SAMPLER, (int2)(w, hb));
   DATA_TYPE4 in1 = READ_IMAGET(input1, SAMPLER, (int2)(w, hb));
diff --git a/mace/kernels/opencl/cl/batch_norm.cl b/mace/kernels/opencl/cl/batch_norm.cl
index 5899fb00..d36c1e8b 100644
--- a/mace/kernels/opencl/cl/batch_norm.cl
+++ b/mace/kernels/opencl/cl/batch_norm.cl
@@ -9,19 +9,28 @@ __kernel void batch_norm(__read_only image2d_t input,
                          __private const float epsilon,
 #endif
                          __write_only image2d_t output,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                          __private const float relux_max_limit,
                          __private const int global_size_dim0,
                          __private const int global_size_dim1,
                          __private const int global_size_dim2) {
+#else
+                         __private const float relux_max_limit) {
+#endif
+
   const int ch_blk = get_global_id(0);
   const int w = get_global_id(1);
   const int hb = get_global_id(2);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (ch_blk >= global_size_dim0 || w >= global_size_dim1
       || hb >= global_size_dim2) {
     return;
   }
-
   const int width = global_size_dim1;
+#else
+  const int width = get_global_size(1);
+#endif
 
 #ifdef FOLDED_CONSTANT
   DATA_TYPE4 bn_scale = READ_IMAGET(scale, SAMPLER, (int2)(ch_blk, 0));
diff --git a/mace/kernels/opencl/cl/bias_add.cl b/mace/kernels/opencl/cl/bias_add.cl
index d139652b..594528ce 100644
--- a/mace/kernels/opencl/cl/bias_add.cl
+++ b/mace/kernels/opencl/cl/bias_add.cl
@@ -2,19 +2,27 @@
 // Supported data types: half/float
 __kernel void bias_add(__read_only image2d_t input,
                        __read_only image2d_t bias,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                        __write_only image2d_t output,
                        __private const int global_size_dim0,
                        __private const int global_size_dim1,
                        __private const int global_size_dim2) {
+#else
+                       __write_only image2d_t output) {
+#endif
   const int ch_blk = get_global_id(0);
   const int w = get_global_id(1);
   const int hb = get_global_id(2);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (ch_blk >= global_size_dim0 || w >= global_size_dim1
       || hb >= global_size_dim2) {
     return;
   }
-
   const int width = global_size_dim1;
+#else
+  const int width = get_global_size(1);
+#endif
 
   const int pos = mad24(ch_blk, width, w);
   DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
diff --git a/mace/kernels/opencl/cl/buffer_to_image.cl b/mace/kernels/opencl/cl/buffer_to_image.cl
index faf1f091..8e2f7184 100644
--- a/mace/kernels/opencl/cl/buffer_to_image.cl
+++ b/mace/kernels/opencl/cl/buffer_to_image.cl
@@ -5,14 +5,22 @@ __kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, o
                                      __private const int filter_w,
                                      __private const int out_channel,
                                      __private const int in_channel,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                                      __write_only image2d_t output,
                                      __private const int global_size_dim0,
                                      __private const int global_size_dim1) {
+#else
+                                     __write_only image2d_t output) {
+#endif
+
   int w = get_global_id(0);
   int h = get_global_id(1);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (w >= global_size_dim0 || h >= global_size_dim1) {
     return;
   }
+#endif
 
   const int out_channel_idx = h * 4;
   const int rounded_in_channel = ((in_channel + 3) / 4) * 4;
@@ -51,14 +59,22 @@ __kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, oc, ic
                                      __private const int filter_w,
                                      __private const int out_channel,
                                      __private const int in_channel,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                                      __read_only image2d_t input,
                                      __private const int global_size_dim0,
                                      __private const int global_size_dim1) {
+#else
+                                     __read_only image2d_t input) {
+#endif
+
   int w = get_global_id(0);
   int h = get_global_id(1);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (w >= global_size_dim0 || h >= global_size_dim1) {
     return;
   }
+#endif
 
   const int out_channel_idx = h * 4;
   const int rounded_in_channel = ((in_channel + 3) / 4) * 4;
@@ -96,14 +112,22 @@ __kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w
                                         __private const int filter_w,
                                         __private const int in_channel,
                                         __private const int multiplier,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                                         __write_only image2d_t output,
                                         __private const int global_size_dim0,
                                         __private const int global_size_dim1) { /* ic%4 * kh * kw * m, ic/4 */
+#else
+                                        __write_only image2d_t output) {
+#endif
+
   const int w = get_global_id(0);
   const int h = get_global_id(1);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (w >= global_size_dim0 || h >= global_size_dim1) {
     return;
   }
+#endif
 
   DATA_TYPE4 values = 0;
   if (multiplier == 1) {
@@ -151,14 +175,22 @@ __kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
                                      __private const int height,
                                      __private const int width,
                                      __private const int channels,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                                      __write_only image2d_t output,
                                      __private const int global_size_dim0,
                                      __private const int global_size_dim1) {
+#else
+                                     __write_only image2d_t output) {
+#endif
+
   int w = get_global_id(0);
   int h = get_global_id(1);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (w >= global_size_dim0 || h >= global_size_dim1) {
     return;
   }
+#endif
 
   const int batch_idx = h / height;
   const int height_idx = h % height;
@@ -189,14 +221,22 @@ __kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
                                      __private const int height,
                                      __private const int width,
                                      __private const int channels,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                                      __read_only image2d_t input,
                                      __private const int global_size_dim0,
                                      __private const int global_size_dim1) {
+#else
+                                     __read_only image2d_t input) {
+#endif
+
   int w = get_global_id(0);
   int h = get_global_id(1);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (w >= global_size_dim0 || h >= global_size_dim1) {
     return;
   }
+#endif
 
   const int batch_idx = h / height;
   const int height_idx = h % height;
@@ -225,14 +265,22 @@ __kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
 __kernel void arg_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
                                   __private const int input_offset,
                                   __private const int count,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                                   __write_only image2d_t output,
                                   __private const int global_size_dim0,
                                   __private const int global_size_dim1) {
+#else
+                                  __write_only image2d_t output) {
+#endif
+
   int w = get_global_id(0);
   int h = get_global_id(1);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (w >= global_size_dim0 || h >= global_size_dim1) {
     return;
   }
+#endif
 
   const int offset = input_offset + w * 4;
   const int size = count - w * 4;
@@ -257,14 +305,23 @@ __kernel void arg_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
 
 __kernel void arg_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
                                   __private const int count,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                                   __read_only image2d_t input,
                                   __private const int global_size_dim0,
                                   __private const int global_size_dim1) {
+#else
+                                  __read_only image2d_t input) {
+#endif
+
   int w = get_global_id(0);
   int h = get_global_id(1);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (w >= global_size_dim0 || h >= global_size_dim1) {
     return;
   }
+#endif
+
   const int offset = w * 4;
 
   int2 coord = (int2)(w, h);
@@ -290,14 +347,22 @@ __kernel void in_out_height_buffer_to_image(__global const DATA_TYPE *input, //n
                                             __private const int height,
                                             __private const int width,
                                             __private const int channels,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                                             __write_only image2d_t output,
                                             __private const int global_size_dim0,
                                             __private const int global_size_dim1) {
+#else
+                                            __write_only image2d_t output) {
+#endif
+
   int w = get_global_id(0);
   int h = get_global_id(1);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (w >= global_size_dim0 || h >= global_size_dim1) {
     return;
   }
+#endif
 
   const int wc = width * channels;
   const int height_blks = (height + 3) / 4;
@@ -329,14 +394,22 @@ __kernel void in_out_height_image_to_buffer(__global DATA_TYPE *output, //nhwc
                                             __private const int height,
                                             __private const int width,
                                             __private const int channels,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                                             __read_only image2d_t input,
                                             __private const int global_size_dim0,
                                             __private const int global_size_dim1) {
+#else
+                                            __read_only image2d_t input) {
+#endif
+
   int w = get_global_id(0);
   int h = get_global_id(1);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (w >= global_size_dim0 || h >= global_size_dim1) {
     return;
   }
+#endif
 
   const int height_blks = (height + 3) / 4;
   const int batch_idx = h / height_blks;
@@ -366,14 +439,22 @@ __kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* n
                                            __private const int height,
                                            __private const int width,
                                            __private const int channels,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                                            __write_only image2d_t output,
                                            __private const int global_size_dim0,
                                            __private const int global_size_dim1) {
+#else
+                                           __write_only image2d_t output) {
+#endif
+
   int w = get_global_id(0);
   int h = get_global_id(1);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (w >= global_size_dim0 || h >= global_size_dim1) {
     return;
   }
+#endif
 
   const int width_blks = (width + 3) / 4;
   const int batch_idx = h / height;
@@ -406,16 +487,26 @@ __kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, /
                                               __private const int in_channels,
                                               __private const int height,
                                               __private const int width,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                                               __write_only image2d_t output,
                                               __private const int global_size_dim0,
                                               __private const int global_size_dim1) {
+#else
+                                              __write_only image2d_t output) {
+#endif
+
   int w = get_global_id(0);
   int h = get_global_id(1);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (w >= global_size_dim0 || h >= global_size_dim1) {
     return;
   }
-
   const int out_channels = global_size_dim1;
+#else
+  const int out_channels = get_global_size(1);
+#endif
+
   const int out_channel_idx = h;
   const int in_channel_idx = w << 2;
   const int offset = input_offset + (out_channel_idx * in_channels + in_channel_idx) * height * width;
@@ -492,14 +583,22 @@ __kernel void winograd_filter_image_to_buffer(__global DATA_TYPE *output, //Oc,
                                               __private const int height,
                                               __private const int width,
                                               __private const int channel,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                                               __read_only image2d_t input,
                                               __private const int global_size_dim0,
                                               __private const int global_size_dim1) {
+#else
+                                              __read_only image2d_t input) {
+#endif
+
   const int w = get_global_id(0);
   const int h = get_global_id(1);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (w >= global_size_dim0 || h >= global_size_dim1) {
     return;
   }
+#endif
 
   const int width_idx = w << 2;
   const int size = width - width_idx;
diff --git a/mace/kernels/opencl/cl/channel_shuffle.cl b/mace/kernels/opencl/cl/channel_shuffle.cl
index 6437ee7f..87159784 100644
--- a/mace/kernels/opencl/cl/channel_shuffle.cl
+++ b/mace/kernels/opencl/cl/channel_shuffle.cl
@@ -4,19 +4,29 @@
 __kernel void channel_shuffle(__read_only image2d_t input,
                       __private const int groups,
                       __private const int channels_per_group,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                       __write_only image2d_t output,
-                       __private const int global_size_dim0,
-                       __private const int global_size_dim1,
-                       __private const int global_size_dim2) {
+                      __private const int global_size_dim0,
+                      __private const int global_size_dim1,
+                      __private const int global_size_dim2) {
+#else
+                      __write_only image2d_t output) {
+#endif
+
   const int group_chan_blk_idx = get_global_id(0);
   const int width_idx = get_global_id(1);
   const int hb_idx = get_global_id(2);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (group_chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
       || hb_idx >= global_size_dim2) {
     return;
   }
-
   const int width = global_size_dim1;
+#else
+  const int width = get_global_size(1);
+#endif
+
   const int group_blks = groups / 4;
   const int groups_blks_width = group_blks * width;
   const int channels_per_group_blks = channels_per_group / 4;
diff --git a/mace/kernels/opencl/cl/concat.cl b/mace/kernels/opencl/cl/concat.cl
index ac74f0f2..c8bfebaa 100644
--- a/mace/kernels/opencl/cl/concat.cl
+++ b/mace/kernels/opencl/cl/concat.cl
@@ -25,19 +25,29 @@ DATA_TYPE4 stitch_vector(DATA_TYPE4 left,
 __kernel void concat_channel(__read_only image2d_t input0,
                              __read_only image2d_t input1,
                              __private const int input0_chan,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                              __write_only image2d_t output,
                              __private const int global_size_dim0,
                              __private const int global_size_dim1,
                              __private const int global_size_dim2) {
+#else
+                             __write_only image2d_t output) {
+#endif
+
   const int chan_blk_idx = get_global_id(0);
   const int width_idx = get_global_id(1);
   const int hb_idx = get_global_id(2);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
       || hb_idx >= global_size_dim2) {
     return;
   }
-
   const int width = global_size_dim1;
+#else
+  const int width = get_global_size(1);
+#endif
+
   const int input0_chan_blk = (input0_chan + 3) >> 2;
 
   DATA_TYPE4 data = 0;
@@ -82,19 +92,29 @@ __kernel void concat_channel(__read_only image2d_t input0,
 // Required: All input channels are divisible by 4
 __kernel void concat_channel_multi(__read_only image2d_t input,
                                    __private const int chan_blk_offset,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                                    __write_only image2d_t output,
                                    __private const int global_size_dim0,
                                    __private const int global_size_dim1,
                                    __private const int global_size_dim2) {
+#else
+                                   __write_only image2d_t output) {
+#endif
+
   const int chan_blk_idx = get_global_id(0);
   const int width_idx = get_global_id(1);
   const int hb_idx = get_global_id(2);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
       || hb_idx >= global_size_dim2) {
     return;
   }
-
   const int width = global_size_dim1;
+#else
+  const int width = get_global_size(1);
+#endif
+
   DATA_TYPE4 data = 0;
   data = READ_IMAGET(input,
                      SAMPLER,
diff --git a/mace/kernels/opencl/cl/conv_2d.cl b/mace/kernels/opencl/cl/conv_2d.cl
index 75be47f1..f85bf108 100644
--- a/mace/kernels/opencl/cl/conv_2d.cl
+++ b/mace/kernels/opencl/cl/conv_2d.cl
@@ -18,20 +18,29 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                       __private const int padding_top,
                       __private const int padding_left,
                       __private const int dilation_h,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                       __private const int dilation_w,
                       __private const int global_size_dim0,
                       __private const int global_size_dim1,
                       __private const int global_size_dim2) {
+#else
+                      __private const int dilation_w) {
+#endif
+
   const int out_ch_blk = get_global_id(0);
   const int out_w_blk = get_global_id(1);
   const int out_hb = get_global_id(2);
 
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1
       || out_hb >= global_size_dim2) {
     return;
   }
-
   const int out_w_blks = global_size_dim1;
+#else
+  const int out_w_blks = get_global_size(1);
+#endif
+
   const int rounded_in_ch = in_ch_blks << 2;
 
 #ifdef BIAS
diff --git a/mace/kernels/opencl/cl/conv_2d_1x1.cl b/mace/kernels/opencl/cl/conv_2d_1x1.cl
index a9e4f95f..70d88867 100644
--- a/mace/kernels/opencl/cl/conv_2d_1x1.cl
+++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl
@@ -12,20 +12,28 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
                           __private const int in_ch_blks,
                           __private const int height,
                           __private const int width,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                           __private const int stride,
                           __private const int global_size_dim0,
                           __private const int global_size_dim1,
                           __private const int global_size_dim2) {
+#else
+                          __private const int stride) {
+#endif
+
   const int out_ch_blk = get_global_id(0);
   const int out_w_blk = get_global_id(1);
   const int out_hb = get_global_id(2);
 
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1
       || out_hb >= global_size_dim2) {
     return;
   }
-
   const int out_w_blks = global_size_dim1;
+#else
+  const int out_w_blks = get_global_size(1);
+#endif
 
 #ifdef BIAS
   DATA_TYPE4 out0 = READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0));
diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl
index b2d8eaa4..8f58255a 100644
--- a/mace/kernels/opencl/cl/conv_2d_3x3.cl
+++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl
@@ -16,20 +16,29 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
                           __private const int padding_top,
                           __private const int padding_left,
                           __private const int dilation_h,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                           __private const int dilation_w,
                           __private const int global_size_dim0,
                           __private const int global_size_dim1,
                           __private const int global_size_dim2) {
+#else
+                          __private const int dilation_w) {
+#endif
+
   const int out_ch_blk = get_global_id(0);
   const int out_w_blk = get_global_id(1);
   const int out_hb = get_global_id(2);
 
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1
       || out_hb >= global_size_dim2) {
     return;
   }
-
   const int out_w_blks = global_size_dim1;
+#else
+  const int out_w_blks = get_global_size(1);
+#endif
+
   const int rounded_in_ch = in_ch_blks << 2;
 
 #ifdef BIAS
diff --git a/mace/kernels/opencl/cl/depthwise_conv2d.cl b/mace/kernels/opencl/cl/depthwise_conv2d.cl
index 28125a8d..7d39d3c1 100644
--- a/mace/kernels/opencl/cl/depthwise_conv2d.cl
+++ b/mace/kernels/opencl/cl/depthwise_conv2d.cl
@@ -18,19 +18,29 @@ __kernel void depthwise_conv2d(__read_only image2d_t input, /* [c%4 * w * c/4, h
                                __private const short padding_top,
                                __private const short padding_left,
                                __private const short dilation_h,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                                __private const short dilation_w,
                                __private const int global_size_dim0,
                                __private const int global_size_dim1,
                                __private const int global_size_dim2) {
+#else
+                               __private const short dilation_w) {
+#endif
+
   const short out_ch_blk = get_global_id(0);
   const short out_w_blk = get_global_id(1);
   const short out_hb = get_global_id(2);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1
       || out_hb >= global_size_dim2) {
     return;
   }
-
   const short out_w_blks = global_size_dim1;
+#else
+  const short out_w_blks = get_global_size(1);
+#endif
+
   const short rounded_in_ch = in_ch_blks << 2;
   const short in_ch_blk = out_ch_blk; // multiplier = 1
 
@@ -149,17 +159,25 @@ __kernel void depthwise_conv2d_s1(__read_only image2d_t input, /* [c%4 * w * c/4
                                   __private const short filter_height,
                                   __private const short filter_width,
                                   __private const short padding_top,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                                   __private const short padding_left,
                                   __private const int global_size_dim0,
                                   __private const int global_size_dim1,
                                   __private const int global_size_dim2) {
+#else
+                                  __private const short padding_left) {
+#endif
+
   const short out_ch_blk = get_global_id(0);
   const short out_w_blk = get_global_id(1) << 2;
   const short out_hb = get_global_id(2);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (out_ch_blk >= global_size_dim0 || get_global_id(1) >= global_size_dim1
       || out_hb >= global_size_dim2) {
     return;
   }
+#endif
 
   const short rounded_in_ch = in_ch_blks << 2;
   const short in_ch_blk = out_ch_blk; // multiplier = 1
diff --git a/mace/kernels/opencl/cl/eltwise.cl b/mace/kernels/opencl/cl/eltwise.cl
index edfb777d..d7c90e03 100644
--- a/mace/kernels/opencl/cl/eltwise.cl
+++ b/mace/kernels/opencl/cl/eltwise.cl
@@ -6,12 +6,20 @@ __kernel void eltwise(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
                       __private const float coeff0,
                       __private const float coeff1,
 #endif
+#ifndef USE_QUALCOMM_OPENCL_2_0
                       __write_only image2d_t output,
                       __private const int global_size_dim0,
                       __private const int global_size_dim1) {
+#else
+                      __write_only image2d_t output) {
+#endif
+
   const int w = get_global_id(0);
   const int hb = get_global_id(1);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (w >= global_size_dim0 || hb >= global_size_dim1) return;
+#endif
 
   DATA_TYPE4 in0 = READ_IMAGET(input0, SAMPLER, (int2)(w, hb));
   DATA_TYPE4 in1 = READ_IMAGET(input1, SAMPLER, (int2)(w, hb));
diff --git a/mace/kernels/opencl/cl/fully_connected.cl b/mace/kernels/opencl/cl/fully_connected.cl
index 90d84c11..a474c8ca 100644
--- a/mace/kernels/opencl/cl/fully_connected.cl
+++ b/mace/kernels/opencl/cl/fully_connected.cl
@@ -10,14 +10,22 @@ __kernel void fully_connected(__read_only image2d_t input,
                               __private const int input_height,
                               __private const int input_width,
                               __private const int input_channel,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                               __private const float relux_max_limit,
                               __private const int global_size_dim0,
                               __private const int global_size_dim1) {
+#else
+                              __private const float relux_max_limit) {
+#endif
+
   const int batch_idx = get_global_id(0);
   const int out_blk_idx = get_global_id(1);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (batch_idx >= global_size_dim0 || out_blk_idx >= global_size_dim1) {
     return;
   }
+#endif
 
   const int input_chan_blk = (input_channel + 3) >> 2;
 
@@ -74,19 +82,28 @@ __kernel void fully_connected_width(__read_only image2d_t input,
                                     __private const int input_width,
                                     __private const int in_chan_blks,
                                     __private const int out_blks,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                                     __private const float relux_max_limit,
                                     __private const int global_size_dim0,
                                     __private const int global_size_dim1,
                                     __private const int global_size_dim2) {
+#else
+                                    __private const float relux_max_limit) {
+#endif
+
   const int inter_out_idx = get_global_id(0);
   const int width_blk_idx = get_global_id(1);
   const int batch_out_blk_idx = get_global_id(2);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (inter_out_idx >= global_size_dim0 || width_blk_idx >= global_size_dim1
       || batch_out_blk_idx >= global_size_dim2) {
     return;
   }
-
   const int width_blk_count = global_size_dim1;
+#else
+  const int width_blk_count = get_global_size(1);
+#endif
 
   const int batch_idx = batch_out_blk_idx / out_blks;
   const int out_blk_idx = batch_out_blk_idx % out_blks;
diff --git a/mace/kernels/opencl/cl/matmul.cl b/mace/kernels/opencl/cl/matmul.cl
index f0c2ee0e..7107838c 100644
--- a/mace/kernels/opencl/cl/matmul.cl
+++ b/mace/kernels/opencl/cl/matmul.cl
@@ -8,12 +8,20 @@ __kernel void matmul(__read_only image2d_t A,
                      __private const int N,
                      __private const int K,
                      __private const int height_blocks,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                      __private const int k_blocks,
                      __private const int global_size_dim0,
                      __private const int global_size_dim1) {
+#else
+                     __private const int k_blocks) {
+#endif
+
   const int gx = get_global_id(0) << 2;
   const int hb = get_global_id(1);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (get_global_id(0) >= global_size_dim0 || hb >= global_size_dim1) return;
+#endif
 
   const int batch = hb / height_blocks;
   const int ty = (hb % height_blocks);
diff --git a/mace/kernels/opencl/cl/pooling.cl b/mace/kernels/opencl/cl/pooling.cl
index dad48824..8cdc4e46 100644
--- a/mace/kernels/opencl/cl/pooling.cl
+++ b/mace/kernels/opencl/cl/pooling.cl
@@ -27,19 +27,29 @@ __kernel void pooling(__read_only image2d_t input,
                       __private const int pad_left,
                       __private const int stride,
                       __private const int pooling_size,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                       __write_only image2d_t output,
                       __private const int global_size_dim0,
                       __private const int global_size_dim1,
                       __private const int global_size_dim2) {
+#else
+                      __write_only image2d_t output) {
+#endif
+
   const int out_chan_idx = get_global_id(0);
   const int out_width_idx = get_global_id(1);
   const int out_hb_idx = get_global_id(2);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (out_chan_idx >= global_size_dim0 || out_width_idx >= global_size_dim1
       || out_hb_idx >= global_size_dim2) {
     return;
   }
-
   const int out_width = global_size_dim1;
+#else
+  const int out_width = get_global_size(1);
+#endif
+
   const int batch_idx = mul24((out_hb_idx / out_height), in_height);
   const int in_height_start = mul24((out_hb_idx % out_height), stride) - pad_top;
   const int in_width_start = mul24(out_width_idx, stride) - pad_left;
diff --git a/mace/kernels/opencl/cl/resize_bilinear.cl b/mace/kernels/opencl/cl/resize_bilinear.cl
index b3778cb2..5369c762 100644
--- a/mace/kernels/opencl/cl/resize_bilinear.cl
+++ b/mace/kernels/opencl/cl/resize_bilinear.cl
@@ -6,19 +6,30 @@ __kernel void resize_bilinear_nocache(__read_only image2d_t input, /* [c%4 * w *
                                       __private const float width_scale,
                                       __private const int in_height,
                                       __private const int in_width,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                                       __private const int out_height,
                                       __private const int global_size_dim0,
                                       __private const int global_size_dim1,
                                       __private const int global_size_dim2) {
+#else
+                                      __private const int out_height) {
+#endif
+
   const int ch_blk = get_global_id(0);
   const int w = get_global_id(1);
   const int hb = get_global_id(2);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (ch_blk >= global_size_dim0 || w >= global_size_dim1
       || hb >= global_size_dim2) {
     return;
   }
   const int ch_blks = global_size_dim0;
   const int out_width = global_size_dim1;
+#else
+  const int ch_blks = get_global_size(0);
+  const int out_width = get_global_size(1);
+#endif
 
   const int b = hb / out_height;
   const int h = hb % out_height;
diff --git a/mace/kernels/opencl/cl/slice.cl b/mace/kernels/opencl/cl/slice.cl
index a626c0de..bb5f40cd 100644
--- a/mace/kernels/opencl/cl/slice.cl
+++ b/mace/kernels/opencl/cl/slice.cl
@@ -2,19 +2,28 @@
 
 __kernel void slice(__read_only image2d_t input,
                     __private const int chan_blk_offset,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                     __write_only image2d_t output,
                     __private const int global_size_dim0,
                     __private const int global_size_dim1,
                     __private const int global_size_dim2) {
+#else
+                    __write_only image2d_t output) {
+#endif
+
   const int chan_blk_idx = get_global_id(0);
   const int width_idx = get_global_id(1);
   const int hb_idx = get_global_id(2);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
       || hb_idx >= global_size_dim2) {
     return;
   }
-
   const int width = global_size_dim1;
+#else
+  const int width = get_global_size(1);
+#endif
 
   DATA_TYPE4 data = READ_IMAGET(input, SAMPLER,
                                 (int2)(mad24(chan_blk_idx + chan_blk_offset,
diff --git a/mace/kernels/opencl/cl/softmax.cl b/mace/kernels/opencl/cl/softmax.cl
index e7027394..3fadd18e 100644
--- a/mace/kernels/opencl/cl/softmax.cl
+++ b/mace/kernels/opencl/cl/softmax.cl
@@ -3,20 +3,30 @@
 __kernel void softmax(__read_only image2d_t input,
                       __private const int channels,
                       __private const int remain_channels,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                       __write_only image2d_t output,
                       __private const int global_size_dim0,
                       __private const int global_size_dim1,
                       __private const int global_size_dim2) {
+#else
+                      __write_only image2d_t output) {
+#endif
+
   const int chan_blk_idx = get_global_id(0);
   const int width_idx = get_global_id(1);
   const int hb_idx = get_global_id(2);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
       || hb_idx >= global_size_dim2) {
     return;
   }
-
   const int chan_blks = global_size_dim0 - 1;
   const int width = global_size_dim1;
+#else
+  const int chan_blks = get_global_size(0) - 1;
+  const int width = get_global_size(1);
+#endif
 
   int pos = width_idx;
   DATA_TYPE max_value = -FLT_MAX;
diff --git a/mace/kernels/opencl/cl/space_to_batch.cl b/mace/kernels/opencl/cl/space_to_batch.cl
index e36313fe..822d0906 100644
--- a/mace/kernels/opencl/cl/space_to_batch.cl
+++ b/mace/kernels/opencl/cl/space_to_batch.cl
@@ -9,17 +9,25 @@ __kernel void space_to_batch(__read_only image2d_t space_data,
                              __private const int space_height,
                              __private const int space_width,
                              __private const int batch_height,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                              __private const int batch_width,
                              __private const int global_size_dim0,
                              __private const int global_size_dim1,
                              __private const int global_size_dim2) {
+#else
+                             __private const int batch_width) {
+#endif
+
   const int chan_idx = get_global_id(0);
   const int batch_w_idx = get_global_id(1);
   const int batch_hb_idx = get_global_id(2);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (chan_idx >= global_size_dim0 || batch_w_idx >= global_size_dim1
       || batch_hb_idx >= global_size_dim2) {
     return;
   }
+#endif
 
   const int batch_b_idx = batch_hb_idx / batch_height;
   const int batch_h_idx = batch_hb_idx % batch_height;
@@ -55,17 +63,25 @@ __kernel void batch_to_space(__read_only image2d_t batch_data,
                              __private const int space_height,
                              __private const int space_width,
                              __private const int batch_height,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                              __private const int batch_width,
                              __private const int global_size_dim0,
                              __private const int global_size_dim1,
                              __private const int global_size_dim2) {
+#else
+                             __private const int batch_width) {
+#endif
+
   const int chan_idx = get_global_id(0);
   const int batch_w_idx = get_global_id(1);
   const int batch_hb_idx = get_global_id(2);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (chan_idx >= global_size_dim0 || batch_w_idx >= global_size_dim1
       || batch_hb_idx >= global_size_dim2) {
     return;
   }
+#endif
 
   const int batch_b_idx = batch_hb_idx / batch_height;
   const int batch_h_idx = batch_hb_idx % batch_height;
diff --git a/mace/kernels/opencl/cl/winograd_transform.cl b/mace/kernels/opencl/cl/winograd_transform.cl
index 3acfc902..098c8e3b 100644
--- a/mace/kernels/opencl/cl/winograd_transform.cl
+++ b/mace/kernels/opencl/cl/winograd_transform.cl
@@ -8,16 +8,25 @@ __kernel void winograd_transform_2x2(__read_only image2d_t input,
                                      __private const int round_hw,
                                      __private const int round_w,
                                      __private const int padding_top,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                                      __private const int padding_left,
                                      __private const int global_size_dim0,
                                      __private const int global_size_dim1) {
+#else
+                                     __private const int padding_left) {
+#endif
+
   int out_width_idx = get_global_id(0);
   int chan_blk_idx = get_global_id(1);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (out_width_idx >= global_size_dim0 || chan_blk_idx >= global_size_dim1) {
     return;
   }
-
   const int chan_blk_size = global_size_dim1;
+#else
+  const int chan_blk_size = get_global_size(1);
+#endif
 
   const int batch_idx = out_width_idx / round_hw;
   const int t_idx = out_width_idx % round_hw;
@@ -121,16 +130,26 @@ __kernel void winograd_inverse_transform_2x2(__read_only image2d_t input,
                                              __private const int out_width,
                                              __private const int round_hw,
                                              __private const int round_w,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                                              __private const float relux_max_limit,
                                              __private const int global_size_dim0,
                                              __private const int global_size_dim1) {
+#else
+                                             __private const float relux_max_limit) {
+#endif
+
   const int width_idx = get_global_id(0);
   const int height_idx = get_global_id(1);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (width_idx >= global_size_dim0 || height_idx >= global_size_dim1) {
     return;
   }
-
   const int out_channel = global_size_dim1;
+#else
+  const int out_channel = get_global_size(1);
+#endif
+
   int width = width_idx;
   int height = height_idx;
 
diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc
index ccb5b6c2..1ddf37bc 100644
--- a/mace/kernels/opencl/concat.cc
+++ b/mace/kernels/opencl/concat.cc
@@ -31,10 +31,15 @@ static void Concat2(cl::Kernel *kernel,
 
   auto runtime = OpenCLRuntime::Global();
 
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   if (kernel->get() == nullptr) {
     std::set<std::string> built_options;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel");
     built_options.emplace("-Dconcat_channel=" + kernel_name);
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
     if (input0->dtype() == output->dtype()) {
       built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
       built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
@@ -83,12 +88,18 @@ static void ConcatN(cl::Kernel *kernel,
   const index_t channel = output->dim(3);
 
   auto runtime = OpenCLRuntime::Global();
+
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   if (kernel->get() == nullptr) {
     std::set<std::string> built_options;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi");
     built_options.emplace("-Dconcat_channel_multi=" + kernel_name);
     built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
     *kernel = runtime->BuildKernel("concat", kernel_name, built_options);
   }
 
diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
index 4bfa9ac7..ad2af5a7 100644
--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -37,6 +37,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
   const index_t input_channel_blocks = RoundUpDiv4(input_channels);
 
   auto runtime = OpenCLRuntime::Global();
+
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   if (kernel->get() == nullptr) {
     MACE_CHECK(input_batch == batch);
 
@@ -45,6 +48,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
     built_options.emplace("-Dconv_2d_1x1=" + kernel_name);
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
     if (bias != nullptr) {
       built_options.emplace("-DBIAS");
     }
diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
index 97db8ab9..6ac0fa56 100644
--- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
@@ -37,12 +37,17 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
 
   auto runtime = OpenCLRuntime::Global();
 
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   if (kernel->get() == nullptr) {
     std::set<std::string> built_options;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3");
     built_options.emplace("-Dconv_2d_3x3=" + kernel_name);
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
     built_options.emplace(bias != nullptr ? "-DBIAS" : "");
     switch (activation) {
       case NOOP:
diff --git a/mace/kernels/opencl/conv_2d_opencl_general.cc b/mace/kernels/opencl/conv_2d_opencl_general.cc
index 4f1b67f6..0fc94442 100644
--- a/mace/kernels/opencl/conv_2d_opencl_general.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_general.cc
@@ -37,12 +37,17 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
 
   auto runtime = OpenCLRuntime::Global();
 
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   if (kernel->get() == nullptr) {
     std::set<std::string> built_options;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d");
     built_options.emplace("-Dconv_2d=" + kernel_name);
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
     built_options.emplace(bias != nullptr ? "-DBIAS" : "");
     switch (activation) {
       case NOOP:
diff --git a/mace/kernels/opencl/depthwise_conv_opencl.cc b/mace/kernels/opencl/depthwise_conv_opencl.cc
index 18b53853..c43799db 100644
--- a/mace/kernels/opencl/depthwise_conv_opencl.cc
+++ b/mace/kernels/opencl/depthwise_conv_opencl.cc
@@ -42,6 +42,8 @@ void DepthwiseConv2d(cl::Kernel *kernel,
 
   auto runtime = OpenCLRuntime::Global();
 
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   if (kernel->get() == nullptr) {
     std::set<std::string> built_options;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d");
@@ -51,6 +53,9 @@ void DepthwiseConv2d(cl::Kernel *kernel,
     } else {
       built_options.emplace("-Ddepthwise_conv2d=" + kernel_name);
     }
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
     built_options.emplace(bias != nullptr ? "-DBIAS" : "");
diff --git a/mace/kernels/opencl/eltwise_opencl.cc b/mace/kernels/opencl/eltwise_opencl.cc
index a2e4e8f1..e2a68396 100644
--- a/mace/kernels/opencl/eltwise_opencl.cc
+++ b/mace/kernels/opencl/eltwise_opencl.cc
@@ -29,6 +29,8 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
 
   auto runtime = OpenCLRuntime::Global();
 
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
     auto dt = DataTypeToEnum<T>::value;
@@ -37,6 +39,9 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
     built_options.emplace(MakeString("-DELTWISE_TYPE=", type_));
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
     if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM");
     kernel_ = runtime->BuildKernel("eltwise", kernel_name, built_options);
   }
diff --git a/mace/kernels/opencl/fully_connected_opencl.cc b/mace/kernels/opencl/fully_connected_opencl.cc
index 3e17f98f..208f4025 100644
--- a/mace/kernels/opencl/fully_connected_opencl.cc
+++ b/mace/kernels/opencl/fully_connected_opencl.cc
@@ -24,8 +24,11 @@ void FCWXKernel(cl::Kernel *kernel,
     << "FC width kernel only support input with 4x channel.";
   MACE_CHECK_NOTNULL(gws);
   MACE_CHECK_NOTNULL(lws);
+
   auto runtime = OpenCLRuntime::Global();
 
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   if (kernel->get() == nullptr) {
     std::set<std::string> built_options;
     auto dt = DataTypeToEnum<T>::value;
@@ -34,6 +37,9 @@ void FCWXKernel(cl::Kernel *kernel,
     built_options.emplace("-Dfully_connected_width=" + kernel_name);
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
     if (bias != nullptr) {
       built_options.emplace("-DBIAS");
     }
@@ -133,14 +139,21 @@ void FCWTXKernel(cl::Kernel *kernel,
                  StatsFuture *future) {
   MACE_CHECK_NOTNULL(gws);
   MACE_CHECK_NOTNULL(lws);
+
+  auto runtime = OpenCLRuntime::Global();
+
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   if (kernel->get() == nullptr) {
-    auto runtime = OpenCLRuntime::Global();
     std::set<std::string> built_options;
     auto dt = DataTypeToEnum<T>::value;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected");
     built_options.emplace("-Dfully_connected=" + kernel_name);
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
     if (bias != nullptr) {
       built_options.emplace("-DBIAS");
     }
diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc
index 2141c65e..641abd66 100644
--- a/mace/kernels/opencl/helper.cc
+++ b/mace/kernels/opencl/helper.cc
@@ -194,12 +194,25 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) {
   }
 }
 
+// Returns true only when the global OpenCL runtime reports a Qualcomm
+// Adreno GPU and its GetOpenclVersion() compares equal to "2.0".
+const bool IsQualcommOpenCL200() {
+  auto opencl_runtime = OpenCLRuntime::Global();
+
+  const bool on_adreno =
+      opencl_runtime->GetGPUType() == GPU_TYPE::QUALCOMM_ADRENO;
+  // Short-circuit keeps the version query off non-Adreno devices.
+  return on_adreno && opencl_runtime->GetOpenclVersion() == "2.0";
+}
+
 void TuningOrRun3DKernel(const cl::Kernel &kernel,
                          const std::string tuning_key,
                          const uint32_t *gws,
                          const std::vector<uint32_t> &lws,
                          StatsFuture *future) {
   auto runtime = OpenCLRuntime::Global();
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
     const uint32_t kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel));
@@ -236,8 +249,10 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
         << "Tuning parameters of 3D kernel must be 4D";
     cl_int error = CL_SUCCESS;
     std::vector<uint32_t> roundup_gws(3);
-    for (size_t i = 0; i < 3; ++i) {
-      roundup_gws[i] = RoundUp(gws[i], params[i]);
+    if (!is_qualcomm_opencl200) {  // Qualcomm CL 2.0 enqueues raw gws
+      for (size_t i = 0; i < 3; ++i) {
+        roundup_gws[i] = RoundUp(gws[i], params[i]);
+      }
     }
 
     if (timer == nullptr) {
@@ -247,18 +262,31 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
       for (uint32_t i = 0; i < num_blocks; ++i) {
         uint32_t gws2 =
             (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        uint32_t roundup_gws2 = RoundUp(gws2, params[2]);
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            kernel, cl::NDRange(0, 0, i * block_size),
-            cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws2),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        if (is_qualcomm_opencl200) {
+          error = runtime->command_queue().enqueueNDRangeKernel(
+              kernel, cl::NDRange(0, 0, i * block_size),
+              cl::NDRange(gws[0], gws[1], gws2),
+              cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        } else {
+          uint32_t roundup_gws2 = RoundUp(gws2, params[2]);
+          error = runtime->command_queue().enqueueNDRangeKernel(
+              kernel, cl::NDRange(0, 0, i * block_size),
+              cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws2),
+              cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        }
         MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
       }
     } else {
       timer->ClearTiming();
-      error = runtime->command_queue().enqueueNDRangeKernel(
-          kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
-          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      if (is_qualcomm_opencl200) {
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      } else {
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      }
       MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
       timer->AccumulateTiming();
       tuning_result->assign(params.begin(), params.end());
@@ -274,11 +302,18 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
         for (uint32_t i = 0; i < num_blocks; ++i) {
           uint32_t gws2 =
               (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-          uint32_t roundup_gws2 = RoundUp(gws2, params[2]);
-          error = runtime->command_queue().enqueueNDRangeKernel(
-              kernel, cl::NDRange(0, 0, i * block_size),
-              cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws2),
-              cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+          if (is_qualcomm_opencl200) {
+            error = runtime->command_queue().enqueueNDRangeKernel(
+                kernel, cl::NDRange(0, 0, i * block_size),
+                cl::NDRange(gws[0], gws[1], gws2),
+                cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+          } else {
+            uint32_t roundup_gws2 = RoundUp(gws2, params[2]);
+            error = runtime->command_queue().enqueueNDRangeKernel(
+                kernel, cl::NDRange(0, 0, i * block_size),
+                cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws2),
+                cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+          }
           MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
           timer->AccumulateTiming();
         }
@@ -306,6 +341,8 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
                          const std::vector<uint32_t> &lws,
                          StatsFuture *future) {
   auto runtime = OpenCLRuntime::Global();
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
     const uint32_t kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel));
@@ -330,8 +367,10 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
         << "Tuning parameters of 2D kernel must be 3d";
     cl_int error = CL_SUCCESS;
     std::vector<uint32_t> roundup_gws(2);
-    for (size_t i = 0; i < 2; ++i) {
-      roundup_gws[i] = RoundUp(gws[i], params[i]);
+    if (!is_qualcomm_opencl200) {
+      for (size_t i = 0; i < 2; ++i) {
+        roundup_gws[i] = RoundUp(gws[i], params[i]);
+      }
     }
 
     if (timer == nullptr) {
@@ -341,17 +380,29 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
       for (uint32_t i = 0; i < num_blocks; ++i) {
         uint32_t gws1 =
             (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
-        uint32_t roundup_gws1 = RoundUp(gws1, params[1]);
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            kernel, cl::NDRange(0, i * block_size), cl::NDRange(roundup_gws[0], roundup_gws1),
-            cl::NDRange(params[0], params[1]), nullptr, &event);
+        if (is_qualcomm_opencl200) {
+          error = runtime->command_queue().enqueueNDRangeKernel(
+              kernel, cl::NDRange(0, i * block_size), cl::NDRange(gws[0], gws1),
+              cl::NDRange(params[0], params[1]), nullptr, &event);
+        } else {
+          uint32_t roundup_gws1 = RoundUp(gws1, params[1]);
+          error = runtime->command_queue().enqueueNDRangeKernel(
+              kernel, cl::NDRange(0, i * block_size), cl::NDRange(roundup_gws[0], roundup_gws1),
+              cl::NDRange(params[0], params[1]), nullptr, &event);
+        }
         MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
       }
     } else {
       timer->ClearTiming();
-      error = runtime->command_queue().enqueueNDRangeKernel(
-          kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
-          cl::NDRange(params[0], params[1]), nullptr, &event);
+      if (is_qualcomm_opencl200) {
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
+            cl::NDRange(params[0], params[1]), nullptr, &event);
+      } else {
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
+            cl::NDRange(params[0], params[1]), nullptr, &event);
+      }
       MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
       timer->AccumulateTiming();
       tuning_result->assign(params.begin(), params.end());
@@ -367,10 +418,16 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
         for (uint32_t i = 0; i < num_blocks; ++i) {
           uint32_t gws1 =
               (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
-          uint32_t roundup_gws1 = RoundUp(gws1, params[1]);
-          error = runtime->command_queue().enqueueNDRangeKernel(
-              kernel, cl::NDRange(0, i * block_size), cl::NDRange(roundup_gws[0], roundup_gws1),
-              cl::NDRange(params[0], params[1]), nullptr, &event);
+          if (is_qualcomm_opencl200) {
+            error = runtime->command_queue().enqueueNDRangeKernel(
+                kernel, cl::NDRange(0, i * block_size), cl::NDRange(gws[0], gws1),
+                cl::NDRange(params[0], params[1]), nullptr, &event);
+          } else {
+            uint32_t roundup_gws1 = RoundUp(gws1, params[1]);
+            error = runtime->command_queue().enqueueNDRangeKernel(
+                kernel, cl::NDRange(0, i * block_size), cl::NDRange(roundup_gws[0], roundup_gws1),
+                cl::NDRange(params[0], params[1]), nullptr, &event);
+          }
           MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
           timer->AccumulateTiming();
         }
diff --git a/mace/kernels/opencl/helper.h b/mace/kernels/opencl/helper.h
index 89712c9b..5b4e0283 100644
--- a/mace/kernels/opencl/helper.h
+++ b/mace/kernels/opencl/helper.h
@@ -102,6 +102,8 @@ std::string Concat(Args... args) {
   return ss.str();
 }
 
+const bool IsQualcommOpenCL200();  // Qualcomm Adreno GPU with OpenCL "2.0"
+
 }  // namespace kernels
 }  // namespace mace
 #endif  // MACE_KERNELS_OPENCL_HELPER_H_
diff --git a/mace/kernels/opencl/matmul.cc b/mace/kernels/opencl/matmul.cc
index 3609b1a6..9e293061 100644
--- a/mace/kernels/opencl/matmul.cc
+++ b/mace/kernels/opencl/matmul.cc
@@ -33,6 +33,8 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
 
   auto runtime = OpenCLRuntime::Global();
 
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
     auto dt = DataTypeToEnum<T>::value;
@@ -40,6 +42,9 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
     built_options.emplace("-Dmatmul=" + kernel_name);
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
     kernel_ = runtime->BuildKernel("matmul", kernel_name, built_options);
   }
   uint32_t idx = 0;
diff --git a/mace/kernels/opencl/pooling_opencl.cc b/mace/kernels/opencl/pooling_opencl.cc
index 4e97174e..f3d4714c 100644
--- a/mace/kernels/opencl/pooling_opencl.cc
+++ b/mace/kernels/opencl/pooling_opencl.cc
@@ -20,11 +20,14 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
 
   auto runtime = OpenCLRuntime::Global();
 
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   if (kernel_.get() == nullptr) {
     const DataType dt = DataTypeToEnum<T>::value;
     std::set<std::string> built_options;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
     built_options.emplace("-Dpooling=" + kernel_name);
+
     if (pooling_type_ == MAX && input->dtype() == output->dtype()) {
       built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
       built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
@@ -36,6 +39,9 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     if (pooling_type_ == AVG) {
       built_options.emplace("-DPOOL_AVG");
     }
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
     kernel_ = runtime->BuildKernel("pooling", kernel_name, built_options);
   }
 
diff --git a/mace/kernels/opencl/resize_bilinear_opencl.cc b/mace/kernels/opencl/resize_bilinear_opencl.cc
index d6a18519..63c71ea7 100644
--- a/mace/kernels/opencl/resize_bilinear_opencl.cc
+++ b/mace/kernels/opencl/resize_bilinear_opencl.cc
@@ -30,6 +30,8 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
 
   auto runtime = OpenCLRuntime::Global();
 
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache");
@@ -37,6 +39,9 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
     auto dt = DataTypeToEnum<T>::value;
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
     kernel_ =
         runtime->BuildKernel("resize_bilinear", kernel_name, built_options);
   }
diff --git a/mace/kernels/opencl/slice.cc b/mace/kernels/opencl/slice.cc
index f4e39089..55773a52 100644
--- a/mace/kernels/opencl/slice.cc
+++ b/mace/kernels/opencl/slice.cc
@@ -31,6 +31,8 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
 
   auto runtime = OpenCLRuntime::Global();
 
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("slice");
@@ -38,6 +40,9 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
     built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
     built_options.emplace("-DCMD_DATA_TYPE="
                            + DtToCLCMDDt(DataTypeToEnum<T>::value));
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
     kernel_ = runtime->BuildKernel("slice", kernel_name, built_options);
   }
   const index_t channel_blk = RoundUpDiv4(output_channels);
diff --git a/mace/kernels/opencl/softmax_opencl.cc b/mace/kernels/opencl/softmax_opencl.cc
index 3ec6447a..321d7c29 100644
--- a/mace/kernels/opencl/softmax_opencl.cc
+++ b/mace/kernels/opencl/softmax_opencl.cc
@@ -28,6 +28,9 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
                            static_cast<uint32_t>(height * batch)};
 
   auto runtime = OpenCLRuntime::Global();
+
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
@@ -35,6 +38,9 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
     auto dt = DataTypeToEnum<T>::value;
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
     kernel_ = runtime->BuildKernel("softmax", kernel_name, built_options);
   }
   if (!IsVecEqual(input_shape_, logits->shape())) {
diff --git a/mace/kernels/opencl/space_to_batch_opencl.cc b/mace/kernels/opencl/space_to_batch_opencl.cc
index b2de2748..128164f9 100644
--- a/mace/kernels/opencl/space_to_batch_opencl.cc
+++ b/mace/kernels/opencl/space_to_batch_opencl.cc
@@ -38,6 +38,8 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
 
   auto runtime = OpenCLRuntime::Global();
 
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   if (kernel_.get() == nullptr) {
     std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
     std::set<std::string> built_options;
@@ -47,6 +49,9 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
     built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
     built_options.emplace("-DCMD_DATA_TYPE=" +
                           DtToCLCMDDt(DataTypeToEnum<T>::value));
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
     kernel_ =
         runtime->BuildKernel("space_to_batch", kernel_name, built_options);
   }
diff --git a/mace/kernels/opencl/winograd_transform.cc b/mace/kernels/opencl/winograd_transform.cc
index b3f4889b..c4a20a03 100644
--- a/mace/kernels/opencl/winograd_transform.cc
+++ b/mace/kernels/opencl/winograd_transform.cc
@@ -17,6 +17,8 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
 
   auto runtime = OpenCLRuntime::Global();
 
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   if (kernel_.get() == nullptr) {
     std::string obfuscated_kernel_name =
         MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
@@ -26,6 +28,9 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
                           DtToUpstreamCLDt(DataTypeToEnum<T>::value));
     built_options.emplace("-DCMD_DATA_TYPE=" +
                           DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
     kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
                                    built_options);
   }
@@ -90,6 +95,8 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
 
   auto runtime = OpenCLRuntime::Global();
 
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   if (kernel_.get() == nullptr) {
     std::string obfuscated_kernel_name =
         MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2");
@@ -100,6 +107,9 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
                           DtToUpstreamCLDt(DataTypeToEnum<T>::value));
     built_options.emplace("-DCMD_DATA_TYPE=" +
                           DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
     built_options.emplace(bias != nullptr ? "-DBIAS" : "");
     switch (activation_) {
       case NOOP:
diff --git a/tools/bazel-adb-run.sh b/tools/bazel-adb-run.sh
index cf82cbf6..2e3dede0 100755
--- a/tools/bazel-adb-run.sh
+++ b/tools/bazel-adb-run.sh
@@ -18,8 +18,8 @@ BAZEL_BIN_PATH=${BAZEL_BIN_PATH#//}
 BAZEL_BIN_PATH=bazel-bin/$BAZEL_BIN_PATH
 BIN_NAME=`echo $BAZEL_TARGET | cut -d: -f2`
 
-ANDROID_ABI=armeabi-v7a
 ANDROID_ABI=arm64-v8a
+ANDROID_ABI=armeabi-v7a
 STRIP="--strip always"
 VLOG_LEVEL=0
 PROFILING="1"
-- 
GitLab