diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc
index 798409abf683c489104a96a80bf9938b906e3d4f..02fa29a0c64ab3faa2ced1f6a9ada0d9ff0fc44d 100644
--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -142,7 +142,6 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
   }
 
   bool gpu_detected = false;
-  bool is_adreno_gpu = false;
   device_ = std::make_shared<cl::Device>();
   for (auto device : all_devices) {
     if (device.getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_GPU) {
@@ -150,10 +149,18 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
       gpu_detected = true;
       const std::string device_name = device.getInfo<CL_DEVICE_NAME>();
       constexpr const char *kQualcommAdrenoGPUStr = "QUALCOMM Adreno(TM)";
+      constexpr const char *kMaliGPUStr = "Mali";
       if (device_name == kQualcommAdrenoGPUStr) {
-        is_adreno_gpu = true;
+        gpu_type_ = GPU_TYPE::QUALCOMM_ADRENO;
+      } else if (device_name.find(kMaliGPUStr) != std::string::npos) {
+        gpu_type_ = GPU_TYPE::MALI;
+      } else {
+        gpu_type_ = GPU_TYPE::UNKNOWN;
       }
 
+      const std::string device_version = device.getInfo<CL_DEVICE_VERSION>();
+      opencl_version_ = device_version.substr(7, 3);
+
       VLOG(1) << "Using device: " << device_name;
       break;
     }
@@ -171,7 +178,7 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
   }
 
   cl_int err;
-  if (is_adreno_gpu) {
+  if (gpu_type_ == GPU_TYPE::QUALCOMM_ADRENO) {
     std::vector<cl_context_properties> context_properties;
     context_properties.reserve(5);
     GetAdrenoContextProperties(&context_properties, gpu_perf_hint,
@@ -350,4 +357,12 @@ uint64_t OpenCLRuntime::GetKernelWaveSize(const cl::Kernel &kernel) {
   return size;
 }
 
+const GPU_TYPE OpenCLRuntime::GetGPUType() const {
+  return gpu_type_;
+}
+
+const std::string &OpenCLRuntime::GetOpenclVersion() {
+  return opencl_version_;
+}
+
 }  // namespace mace
diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h
index ce375b9aed7bafe47ce7cd97310a76e496008544..1b257e6bdbb06bbe7fccc1e9646ee674228f7e23 100644
--- a/mace/core/runtime/opencl/opencl_runtime.h
+++ b/mace/core/runtime/opencl/opencl_runtime.h
@@ -18,6 +18,12 @@
 
 namespace mace {
 
+enum GPU_TYPE {
+  QUALCOMM_ADRENO,
+  MALI,
+  UNKNOWN,
+};
+
 class OpenCLProfilingTimer : public Timer {
  public:
   explicit OpenCLProfilingTimer(const cl::Event *event)
@@ -49,6 +55,8 @@ class OpenCLRuntime {
   uint64_t GetDeviceMaxWorkGroupSize();
   uint64_t GetKernelMaxWorkGroupSize(const cl::Kernel &kernel);
   uint64_t GetKernelWaveSize(const cl::Kernel &kernel);
+  const GPU_TYPE GetGPUType() const;
+  const std::string &GetOpenclVersion();
   cl::Kernel BuildKernel(const std::string &program_name,
                          const std::string &kernel_name,
                          const std::set<std::string> &build_options);
@@ -74,6 +82,8 @@ class OpenCLRuntime {
   std::map<std::string, cl::Program> built_program_map_;
   std::mutex program_build_mutex_;
   std::string kernel_path_;
+  GPU_TYPE gpu_type_;
+  std::string opencl_version_;
 
   static GPUPerfHint gpu_perf_hint_;
   static GPUPriorityHint gpu_priority_hint_;
diff --git a/mace/kernels/opencl/activation_opencl.cc b/mace/kernels/opencl/activation_opencl.cc
index dfe703dde3bf9610f39c855897b7f38742ec2cb6..d7b89336d196ac701572a76ac23b3eedba4c46a8 100644
--- a/mace/kernels/opencl/activation_opencl.cc
+++ b/mace/kernels/opencl/activation_opencl.cc
@@ -26,14 +26,18 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
 
   auto runtime = OpenCLRuntime::Global();
 
-  if (kernel_.get() == nullptr) {
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
 
+  if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation");
     built_options.emplace("-Dactivation=" + kernel_name);
     auto dt = DataTypeToEnum<T>::value;
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
     switch (activation_) {
       case RELU:
         tuning_key_prefix_ = "relu_opencl_kernel_";
diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc
index 94538fc246427f953c2d12a4fe110be36837a0df..37e6062a989f47baf8e613a9e1847c233d3061dc 100644
--- a/mace/kernels/opencl/addn.cc
+++ b/mace/kernels/opencl/addn.cc
@@ -26,6 +26,8 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
 
   auto runtime = OpenCLRuntime::Global();
 
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   for (int i = 1; i < size; ++i) {
     MACE_CHECK_NOTNULL(input_tensors[i]);
     MACE_CHECK(batch == input_tensors[i]->dim(0));
@@ -45,6 +47,10 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
     built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
+
     kernel_ = runtime->BuildKernel("addn", kernel_name, built_options);
   }
 
diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc
index d79b5c18f0a107eabe63563f347a349bc5544449..10b956de57e9715de1008940ead6a48d60a362f8 100644
--- a/mace/kernels/opencl/batch_norm_opencl.cc
+++ b/mace/kernels/opencl/batch_norm_opencl.cc
@@ -36,6 +36,8 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
 
   auto runtime = OpenCLRuntime::Global();
 
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
     auto dt = DataTypeToEnum<T>::value;
@@ -43,6 +45,9 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     built_options.emplace("-Dbatch_norm=" + kernel_name);
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
     if (folded_constant_) {
       built_options.emplace("-DFOLDED_CONSTANT");
     }
diff --git a/mace/kernels/opencl/bias_add_opencl.cc b/mace/kernels/opencl/bias_add_opencl.cc
index 6932799523da6876a0597cd27ced9e0f97d53d24..ce0e596558ed76ee36dd84e40082aec718555948 100644
--- a/mace/kernels/opencl/bias_add_opencl.cc
+++ b/mace/kernels/opencl/bias_add_opencl.cc
@@ -28,6 +28,9 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
                            static_cast<uint32_t>(height * batch)};
 
   auto runtime = OpenCLRuntime::Global();
+
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
     auto dt = DataTypeToEnum<T>::value;
@@ -35,6 +38,9 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     built_options.emplace("-Dbias_add=" + kernel_name);
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
     kernel_ = runtime->BuildKernel("bias_add", kernel_name, built_options);
   }
   if (!IsVecEqual(input_shape_, input->shape())) {
@@ -52,15 +58,22 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
       static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   const std::vector<uint32_t> lws = {8, kwg_size / 64, 8};
 
-  std::vector<uint32_t> roundup_gws(lws.size());
-  for (size_t i = 0; i < lws.size(); ++i) {
-    roundup_gws[i] = RoundUp(gws[i], lws[i]);
-  }
-
   cl::Event event;
-  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-      kernel_, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
-      cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
+  cl_int error;
+  if (is_qualcomm_opencl200) {
+    error = runtime->command_queue().enqueueNDRangeKernel(
+        kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
+  } else {
+    std::vector<uint32_t> roundup_gws(lws.size());
+    for (size_t i = 0; i < lws.size(); ++i) {
+      roundup_gws[i] = RoundUp(gws[i], lws[i]);
+    }
+
+    error = runtime->command_queue().enqueueNDRangeKernel(
+        kernel_, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
+        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
+  }
   MACE_CHECK(error == CL_SUCCESS);
   if (future != nullptr) {
     future->wait_fn = [runtime, event](CallStats *stats) {
diff --git a/mace/kernels/opencl/buffer_to_image.cc b/mace/kernels/opencl/buffer_to_image.cc
index 9fee7a957662d853b748d68d4f40dd9bb51671e3..0cec970aa48989d4f263c999d1f7da3ad83c7201 100644
--- a/mace/kernels/opencl/buffer_to_image.cc
+++ b/mace/kernels/opencl/buffer_to_image.cc
@@ -59,11 +59,19 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
                          : "winograd_filter_buffer_to_image";
       break;
   }
+
+  auto runtime = OpenCLRuntime::Global();
+
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
   std::set<std::string> built_options;
   std::stringstream kernel_name_ss;
   kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
   built_options.emplace(kernel_name_ss.str());
+  if (is_qualcomm_opencl200) {
+    built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+  }
   if (buffer->dtype() == image->dtype()) {
     built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
     built_options.emplace("-DCMD_DATA_TYPE=" +
@@ -74,7 +82,6 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
     built_options.emplace("-DCMD_DATA_TYPE=" +
                           DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
   }
-  auto runtime = OpenCLRuntime::Global();
   auto b2f_kernel = runtime->BuildKernel("buffer_to_image",
                                          obfuscated_kernel_name, built_options);
 
@@ -105,17 +112,24 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
   const uint32_t kwg_size =
       static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(b2f_kernel));
   const std::vector<uint32_t> lws = {16, kwg_size / 16};
-  std::vector<uint32_t> roundup_gws(lws.size());
-  for (size_t i = 0; i < lws.size(); ++i) {
-    roundup_gws[i] = RoundUp(gws[i], lws[i]);
-  }
 
   cl::Event event;
-  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-      b2f_kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
-      cl::NDRange(lws[0], lws[1]), nullptr, &event);
-  MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+  cl_int error;
+  if (is_qualcomm_opencl200) {
+    error = runtime->command_queue().enqueueNDRangeKernel(
+        b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
+        cl::NDRange(lws[0], lws[1]), nullptr, &event);
+  } else {
+    std::vector<uint32_t> roundup_gws(lws.size());
+    for (size_t i = 0; i < lws.size(); ++i) {
+      roundup_gws[i] = RoundUp(gws[i], lws[i]);
+    }
 
+    error = runtime->command_queue().enqueueNDRangeKernel(
+        b2f_kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
+        cl::NDRange(lws[0], lws[1]), nullptr, &event);
+  }
+  MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
   if (future != nullptr) {
     future->wait_fn = [runtime, event](CallStats *stats) {
       event.wait();
diff --git a/mace/kernels/opencl/channel_shuffle.cc b/mace/kernels/opencl/channel_shuffle.cc
index 34bc57848fd09d98b4fd6a43dddadf7cc87ab9b4..9d566477eccd4b0349b2a27d3233a1d39518f030 100644
--- a/mace/kernels/opencl/channel_shuffle.cc
+++ b/mace/kernels/opencl/channel_shuffle.cc
@@ -36,6 +36,8 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
 
   auto runtime = OpenCLRuntime::Global();
 
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle");
@@ -43,6 +45,9 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
     auto dt = DataTypeToEnum<T>::value;
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
     kernel_ = runtime->BuildKernel("channel_shuffle", kernel_name,
                                    built_options);
   }
diff --git a/mace/kernels/opencl/cl/activation.cl b/mace/kernels/opencl/cl/activation.cl
index 23e6d60e1658694d4ae46706452989a6c827f3e0..a02b0e3570e7e27a96d6afab00843e3a6b648060 100644
--- a/mace/kernels/opencl/cl/activation.cl
+++ b/mace/kernels/opencl/cl/activation.cl
@@ -5,19 +5,28 @@ __kernel void activation(__read_only image2d_t input,
                          __read_only image2d_t alpha,
 #endif
                          __private const float relux_max_limit,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                          __write_only image2d_t output,
                          __private const int global_size_dim0,
                          __private const int global_size_dim1,
                          __private const int global_size_dim2) {
+#else
+                         __write_only image2d_t output) {
+#endif
+
   const int ch_blk = get_global_id(0);
   const int w = get_global_id(1);
   const int hb = get_global_id(2);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (ch_blk >= global_size_dim0 || w >= global_size_dim1
       || hb >= global_size_dim2) {
     return;
   }
-
   const int width = global_size_dim1;
+#else
+  const int width = get_global_size(1);
+#endif
 
   const int pos = mad24(ch_blk, width, w);
   DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
diff --git a/mace/kernels/opencl/cl/addn.cl b/mace/kernels/opencl/cl/addn.cl
index 4279fc23694a9e2ce36993a00c05cf7179d580ff..23e47e50157a8fe2ae1d8f12d3d16e5d7284ed6f 100644
--- a/mace/kernels/opencl/cl/addn.cl
+++ b/mace/kernels/opencl/cl/addn.cl
@@ -8,12 +8,20 @@ __kernel void addn(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
 #if INPUT_NUM > 3
                    __read_only image2d_t input3,
 #endif
+#ifndef USE_QUALCOMM_OPENCL_2_0
                    __write_only image2d_t output,
                    __private const int global_size_dim0,
                    __private const int global_size_dim1) {
+#else
+                   __write_only image2d_t output) {
+#endif
+
   const int w = get_global_id(0);
   const int hb = get_global_id(1);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (w >= global_size_dim0 || hb >= global_size_dim1) return;
+#endif
 
   DATA_TYPE4 in0 = READ_IMAGET(input0, SAMPLER, (int2)(w, hb));
   DATA_TYPE4 in1 = READ_IMAGET(input1, SAMPLER, (int2)(w, hb));
diff --git a/mace/kernels/opencl/cl/batch_norm.cl b/mace/kernels/opencl/cl/batch_norm.cl
index 5899fb006130c5347ce27a180d8b75022207dca1..d36c1e8bc054afeca6c0bda30740c54040bd906b 100644
--- a/mace/kernels/opencl/cl/batch_norm.cl
+++ b/mace/kernels/opencl/cl/batch_norm.cl
@@ -9,19 +9,28 @@ __kernel void batch_norm(__read_only image2d_t input,
                          __private const float epsilon,
 #endif
                          __write_only image2d_t output,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                          __private const float relux_max_limit,
                          __private const int global_size_dim0,
                          __private const int global_size_dim1,
                          __private const int global_size_dim2) {
+#else
+                         __private const float relux_max_limit) {
+#endif
+
   const int ch_blk = get_global_id(0);
   const int w = get_global_id(1);
   const int hb = get_global_id(2);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (ch_blk >= global_size_dim0 || w >= global_size_dim1
       || hb >= global_size_dim2) {
     return;
   }
-
   const int width = global_size_dim1;
+#else
+  const int width = get_global_size(1);
+#endif
 
 #ifdef FOLDED_CONSTANT
   DATA_TYPE4 bn_scale = READ_IMAGET(scale, SAMPLER, (int2)(ch_blk, 0));
diff --git a/mace/kernels/opencl/cl/bias_add.cl b/mace/kernels/opencl/cl/bias_add.cl
index d139652bb2833145b3ab17d64584d01a7b5ca45a..594528ce30ffb1164f1e02fc22ee12e708206ccd 100644
--- a/mace/kernels/opencl/cl/bias_add.cl
+++ b/mace/kernels/opencl/cl/bias_add.cl
@@ -2,19 +2,27 @@
 // Supported data types: half/float
 __kernel void bias_add(__read_only image2d_t input,
                        __read_only image2d_t bias,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                        __write_only image2d_t output,
                        __private const int global_size_dim0,
                        __private const int global_size_dim1,
                        __private const int global_size_dim2) {
+#else
+                       __write_only image2d_t output) {
+#endif
   const int ch_blk = get_global_id(0);
   const int w = get_global_id(1);
   const int hb = get_global_id(2);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (ch_blk >= global_size_dim0 || w >= global_size_dim1
       || hb >= global_size_dim2) {
     return;
   }
-
   const int width = global_size_dim1;
+#else
+  const int width = get_global_size(1);
+#endif
 
   const int pos = mad24(ch_blk, width, w);
   DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
diff --git a/mace/kernels/opencl/cl/buffer_to_image.cl b/mace/kernels/opencl/cl/buffer_to_image.cl
index faf1f091f2bddcbddd2e2c862c3113f71525f554..8e2f7184d63a3bed64d47aaaf66cc3b01b62943d 100644
--- a/mace/kernels/opencl/cl/buffer_to_image.cl
+++ b/mace/kernels/opencl/cl/buffer_to_image.cl
@@ -5,14 +5,22 @@ __kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, o
                                      __private const int filter_w,
                                      __private const int out_channel,
                                      __private const int in_channel,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                                      __write_only image2d_t output,
                                      __private const int global_size_dim0,
                                      __private const int global_size_dim1) {
+#else
+                                     __write_only image2d_t output) {
+#endif
+
   int w = get_global_id(0);
   int h = get_global_id(1);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (w >= global_size_dim0 || h >= global_size_dim1) {
     return;
   }
+#endif
 
   const int out_channel_idx = h * 4;
   const int rounded_in_channel = ((in_channel + 3) / 4) * 4;
@@ -51,14 +59,22 @@ __kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, oc, ic
                                      __private const int filter_w,
                                      __private const int out_channel,
                                      __private const int in_channel,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                                      __read_only image2d_t input,
                                      __private const int global_size_dim0,
                                      __private const int global_size_dim1) {
+#else
+                                     __read_only image2d_t input) {
+#endif
+
   int w = get_global_id(0);
   int h = get_global_id(1);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (w >= global_size_dim0 || h >= global_size_dim1) {
     return;
   }
+#endif
 
   const int out_channel_idx = h * 4;
   const int rounded_in_channel = ((in_channel + 3) / 4) * 4;
@@ -96,14 +112,22 @@ __kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w
                                         __private const int filter_w,
                                         __private const int in_channel,
                                         __private const int multiplier,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                                         __write_only image2d_t output,
                                         __private const int global_size_dim0,
                                         __private const int global_size_dim1) { /* ic%4 * kh * kw * m, ic/4 */
+#else
+                                        __write_only image2d_t output) {
+#endif
+
   const int w = get_global_id(0);
   const int h = get_global_id(1);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (w >= global_size_dim0 || h >= global_size_dim1) {
     return;
   }
+#endif
 
   DATA_TYPE4 values = 0;
   if (multiplier == 1) {
@@ -151,14 +175,22 @@ __kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
                                      __private const int height,
                                      __private const int width,
                                      __private const int channels,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                                      __write_only image2d_t output,
                                      __private const int global_size_dim0,
                                      __private const int global_size_dim1) {
+#else
+                                     __write_only image2d_t output) {
+#endif
+
   int w = get_global_id(0);
   int h = get_global_id(1);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (w >= global_size_dim0 || h >= global_size_dim1) {
     return;
   }
+#endif
 
   const int batch_idx = h / height;
   const int height_idx = h % height;
@@ -189,14 +221,22 @@ __kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
                                      __private const int height,
                                      __private const int width,
                                      __private const int channels,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                                      __read_only image2d_t input,
                                      __private const int global_size_dim0,
                                      __private const int global_size_dim1) {
+#else
+                                     __read_only image2d_t input) {
+#endif
+
   int w = get_global_id(0);
   int h = get_global_id(1);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (w >= global_size_dim0 || h >= global_size_dim1) {
     return;
   }
+#endif
 
   const int batch_idx = h / height;
   const int height_idx = h % height;
@@ -225,14 +265,22 @@ __kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
 __kernel void arg_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
                                   __private const int input_offset,
                                   __private const int count,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                                   __write_only image2d_t output,
                                   __private const int global_size_dim0,
                                   __private const int global_size_dim1) {
+#else
+                                  __write_only image2d_t output) {
+#endif
+
   int w = get_global_id(0);
   int h = get_global_id(1);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (w >= global_size_dim0 || h >= global_size_dim1) {
     return;
   }
+#endif
 
   const int offset = input_offset + w * 4;
   const int size = count - w * 4;
@@ -257,14 +305,23 @@ __kernel void arg_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
 
 __kernel void arg_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
                                   __private const int count,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                                   __read_only image2d_t input,
                                   __private const int global_size_dim0,
                                   __private const int global_size_dim1) {
+#else
+                                  __read_only image2d_t input) {
+#endif
+
   int w = get_global_id(0);
   int h = get_global_id(1);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (w >= global_size_dim0 || h >= global_size_dim1) {
     return;
   }
+#endif
+
   const int offset = w * 4;
 
   int2 coord = (int2)(w, h);
@@ -290,14 +347,22 @@ __kernel void in_out_height_buffer_to_image(__global const DATA_TYPE *input, //n
                                             __private const int height,
                                             __private const int width,
                                             __private const int channels,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                                             __write_only image2d_t output,
                                             __private const int global_size_dim0,
                                             __private const int global_size_dim1) {
+#else
+                                            __write_only image2d_t output) {
+#endif
+
   int w = get_global_id(0);
   int h = get_global_id(1);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (w >= global_size_dim0 || h >= global_size_dim1) {
     return;
   }
+#endif
 
   const int wc = width * channels;
   const int height_blks = (height + 3) / 4;
@@ -329,14 +394,22 @@ __kernel void in_out_height_image_to_buffer(__global DATA_TYPE *output, //nhwc
                                             __private const int height,
                                             __private const int width,
                                             __private const int channels,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                                             __read_only image2d_t input,
                                             __private const int global_size_dim0,
                                             __private const int global_size_dim1) {
+#else
+                                            __read_only image2d_t input) {
+#endif
+
   int w = get_global_id(0);
   int h = get_global_id(1);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (w >= global_size_dim0 || h >= global_size_dim1) {
     return;
   }
+#endif
 
   const int height_blks = (height + 3) / 4;
   const int batch_idx = h / height_blks;
@@ -366,14 +439,22 @@ __kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* n
                                            __private const int height,
                                            __private const int width,
                                            __private const int channels,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                                            __write_only image2d_t output,
                                            __private const int global_size_dim0,
                                            __private const int global_size_dim1) {
+#else
+                                           __write_only image2d_t output) {
+#endif
+
   int w = get_global_id(0);
   int h = get_global_id(1);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (w >= global_size_dim0 || h >= global_size_dim1) {
     return;
   }
+#endif
 
   const int width_blks = (width + 3) / 4;
   const int batch_idx = h / height;
@@ -406,16 +487,26 @@ __kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, /
                                               __private const int in_channels,
                                               __private const int height,
                                               __private const int width,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                                               __write_only image2d_t output,
                                               __private const int global_size_dim0,
                                               __private const int global_size_dim1) {
+#else
+                                              __write_only image2d_t output) {
+#endif
+
   int w = get_global_id(0);
   int h = get_global_id(1);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (w >= global_size_dim0 || h >= global_size_dim1) {
     return;
   }
-
   const int out_channels = global_size_dim1;
+#else
+  const int out_channels = get_global_size(1);
+#endif
+
   const int out_channel_idx = h;
   const int in_channel_idx = w << 2;
   const int offset = input_offset + (out_channel_idx * in_channels + in_channel_idx) * height * width;
@@ -492,14 +583,22 @@ __kernel void winograd_filter_image_to_buffer(__global DATA_TYPE *output, //Oc,
                                               __private const int height,
                                               __private const int width,
                                               __private const int channel,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                                               __read_only image2d_t input,
                                               __private const int global_size_dim0,
                                               __private const int global_size_dim1) {
+#else
+                                              __read_only image2d_t input) {
+#endif
+
   const int w = get_global_id(0);
   const int h = get_global_id(1);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (w >= global_size_dim0 || h >= global_size_dim1) {
     return;
   }
+#endif
 
   const int width_idx = w << 2;
   const int size = width - width_idx;
diff --git a/mace/kernels/opencl/cl/channel_shuffle.cl b/mace/kernels/opencl/cl/channel_shuffle.cl
index 6437ee7fb5266563c4b95e9bf4946c1fd1c3ec78..87159784467f4e118f5aced5640d6185ff50f14f 100644
--- a/mace/kernels/opencl/cl/channel_shuffle.cl
+++ b/mace/kernels/opencl/cl/channel_shuffle.cl
@@ -4,19 +4,29 @@
 __kernel void channel_shuffle(__read_only image2d_t input,
                       __private const int groups,
                       __private const int channels_per_group,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                       __write_only image2d_t output,
-                       __private const int global_size_dim0,
-                       __private const int global_size_dim1,
-                       __private const int global_size_dim2) {
+                      __private const int global_size_dim0,
+                      __private const int global_size_dim1,
+                      __private const int global_size_dim2) {
+#else
+                      __write_only image2d_t output) {
+#endif
+
   const int group_chan_blk_idx = get_global_id(0);
   const int width_idx = get_global_id(1);
   const int hb_idx = get_global_id(2);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (group_chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
       || hb_idx >= global_size_dim2) {
     return;
   }
-
   const int width = global_size_dim1;
+#else
+  const int width = get_global_size(1);
+#endif
+
   const int group_blks = groups / 4;
   const int groups_blks_width = group_blks * width;
   const int channels_per_group_blks = channels_per_group / 4;
diff --git a/mace/kernels/opencl/cl/concat.cl b/mace/kernels/opencl/cl/concat.cl
index ac74f0f22c7e5add65957e9d3fb54a71f33cb161..c8bfebaa1765afed605ee6561fa96fddb24f4bd8 100644
--- a/mace/kernels/opencl/cl/concat.cl
+++ b/mace/kernels/opencl/cl/concat.cl
@@ -25,19 +25,29 @@ DATA_TYPE4 stitch_vector(DATA_TYPE4 left,
 __kernel void concat_channel(__read_only image2d_t input0,
                              __read_only image2d_t input1,
                              __private const int input0_chan,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                              __write_only image2d_t output,
                              __private const int global_size_dim0,
                              __private const int global_size_dim1,
                              __private const int global_size_dim2) {
+#else
+                             __write_only image2d_t output) {
+#endif
+
   const int chan_blk_idx = get_global_id(0);
   const int width_idx = get_global_id(1);
   const int hb_idx = get_global_id(2);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
       || hb_idx >= global_size_dim2) {
     return;
   }
-
   const int width = global_size_dim1;
+#else
+  const int width = get_global_size(1);
+#endif
+
   const int input0_chan_blk = (input0_chan + 3) >> 2;
 
   DATA_TYPE4 data = 0;
@@ -82,19 +92,29 @@ __kernel void concat_channel(__read_only image2d_t input0,
 // Required: All input channels are divisible by 4
 __kernel void concat_channel_multi(__read_only image2d_t input,
                                    __private const int chan_blk_offset,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                                    __write_only image2d_t output,
                                    __private const int global_size_dim0,
                                    __private const int global_size_dim1,
                                    __private const int global_size_dim2) {
+#else
+                                   __write_only image2d_t output) {
+#endif
+
   const int chan_blk_idx = get_global_id(0);
   const int width_idx = get_global_id(1);
   const int hb_idx = get_global_id(2);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
       || hb_idx >= global_size_dim2) {
     return;
   }
-
   const int width = global_size_dim1;
+#else
+  const int width = get_global_size(1);
+#endif
+
   DATA_TYPE4 data = 0;
   data = READ_IMAGET(input,
                      SAMPLER,
diff --git a/mace/kernels/opencl/cl/conv_2d.cl b/mace/kernels/opencl/cl/conv_2d.cl
index 75be47f1eb68bd8f8a5f433d37dd554509852aa8..f85bf1080dae99432d1786fd9828dcf32c3b6d37 100644
--- a/mace/kernels/opencl/cl/conv_2d.cl
+++ b/mace/kernels/opencl/cl/conv_2d.cl
@@ -18,20 +18,29 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                       __private const int padding_top,
                       __private const int padding_left,
                       __private const int dilation_h,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                       __private const int dilation_w,
                       __private const int global_size_dim0,
                       __private const int global_size_dim1,
                       __private const int global_size_dim2) {
+#else
+                      __private const int dilation_w) {
+#endif
+
   const int out_ch_blk = get_global_id(0);
   const int out_w_blk = get_global_id(1);
   const int out_hb = get_global_id(2);
 
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1
       || out_hb >= global_size_dim2) {
     return;
   }
-
   const int out_w_blks = global_size_dim1;
+#else
+  const int out_w_blks = get_global_size(1);
+#endif
+
   const int rounded_in_ch = in_ch_blks << 2;
 
 #ifdef BIAS
diff --git a/mace/kernels/opencl/cl/conv_2d_1x1.cl b/mace/kernels/opencl/cl/conv_2d_1x1.cl
index a9e4f95f8a5ad30b4b6589e3df21314182e3ac47..70d888670b26f536c451d2852f2b1652f8661a25 100644
--- a/mace/kernels/opencl/cl/conv_2d_1x1.cl
+++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl
@@ -12,20 +12,28 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
                           __private const int in_ch_blks,
                           __private const int height,
                           __private const int width,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                           __private const int stride,
                           __private const int global_size_dim0,
                           __private const int global_size_dim1,
                           __private const int global_size_dim2) {
+#else
+                          __private const int stride) {
+#endif
+
   const int out_ch_blk = get_global_id(0);
   const int out_w_blk = get_global_id(1);
   const int out_hb = get_global_id(2);
 
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1
       || out_hb >= global_size_dim2) {
     return;
   }
-
   const int out_w_blks = global_size_dim1;
+#else
+  const int out_w_blks = get_global_size(1);
+#endif
 
 #ifdef BIAS
   DATA_TYPE4 out0 = READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0));
diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl
index b2d8eaa474a6d6b9d665cd855af838d02016352f..8f58255ab8100c6597bdb8bb701adbc8406e0537 100644
--- a/mace/kernels/opencl/cl/conv_2d_3x3.cl
+++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl
@@ -16,20 +16,29 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
                           __private const int padding_top,
                           __private const int padding_left,
                           __private const int dilation_h,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                           __private const int dilation_w,
                           __private const int global_size_dim0,
                           __private const int global_size_dim1,
                           __private const int global_size_dim2) {
+#else
+                          __private const int dilation_w) {
+#endif
+
   const int out_ch_blk = get_global_id(0);
   const int out_w_blk = get_global_id(1);
   const int out_hb = get_global_id(2);
 
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1
       || out_hb >= global_size_dim2) {
     return;
   }
-
   const int out_w_blks = global_size_dim1;
+#else
+  const int out_w_blks = get_global_size(1);
+#endif
+
   const int rounded_in_ch = in_ch_blks << 2;
 
 #ifdef BIAS
diff --git a/mace/kernels/opencl/cl/depthwise_conv2d.cl b/mace/kernels/opencl/cl/depthwise_conv2d.cl
index 28125a8decebee2a94a6a637a75f6989c6ea4b0c..7d39d3c1375c961039d933c3aa598bb1306fcdfc 100644
--- a/mace/kernels/opencl/cl/depthwise_conv2d.cl
+++ b/mace/kernels/opencl/cl/depthwise_conv2d.cl
@@ -18,19 +18,29 @@ __kernel void depthwise_conv2d(__read_only image2d_t input, /* [c%4 * w * c/4, h
                                __private const short padding_top,
                                __private const short padding_left,
                                __private const short dilation_h,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                                __private const short dilation_w,
                                __private const int global_size_dim0,
                                __private const int global_size_dim1,
                                __private const int global_size_dim2) {
+#else
+                               __private const short dilation_w) {
+#endif
+
   const short out_ch_blk = get_global_id(0);
   const short out_w_blk = get_global_id(1);
   const short out_hb = get_global_id(2);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1
       || out_hb >= global_size_dim2) {
     return;
   }
-
   const short out_w_blks = global_size_dim1;
+#else
+  const short out_w_blks = get_global_size(1);
+#endif
+
   const short rounded_in_ch = in_ch_blks << 2;
   const short in_ch_blk = out_ch_blk; // multiplier = 1
 
@@ -149,17 +159,25 @@ __kernel void depthwise_conv2d_s1(__read_only image2d_t input, /* [c%4 * w * c/4
                                   __private const short filter_height,
                                   __private const short filter_width,
                                   __private const short padding_top,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                                   __private const short padding_left,
                                   __private const int global_size_dim0,
                                   __private const int global_size_dim1,
                                   __private const int global_size_dim2) {
+#else
+                                  __private const short padding_left) {
+#endif
+
   const short out_ch_blk = get_global_id(0);
   const short out_w_blk = get_global_id(1) << 2;
   const short out_hb = get_global_id(2);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (out_ch_blk >= global_size_dim0 || get_global_id(1) >= global_size_dim1
       || out_hb >= global_size_dim2) {
     return;
   }
+#endif
 
   const short rounded_in_ch = in_ch_blks << 2;
   const short in_ch_blk = out_ch_blk; // multiplier = 1
diff --git a/mace/kernels/opencl/cl/eltwise.cl b/mace/kernels/opencl/cl/eltwise.cl
index edfb777d0e222adad8fb6a07965276989e3794e7..d7c90e03924566165fd3665f36cb113d7f52b7fe 100644
--- a/mace/kernels/opencl/cl/eltwise.cl
+++ b/mace/kernels/opencl/cl/eltwise.cl
@@ -6,12 +6,20 @@ __kernel void eltwise(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
                       __private const float coeff0,
                       __private const float coeff1,
 #endif
+#ifndef USE_QUALCOMM_OPENCL_2_0
                       __write_only image2d_t output,
                       __private const int global_size_dim0,
                       __private const int global_size_dim1) {
+#else
+                      __write_only image2d_t output) {
+#endif
+
   const int w = get_global_id(0);
   const int hb = get_global_id(1);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (w >= global_size_dim0 || hb >= global_size_dim1) return;
+#endif
 
   DATA_TYPE4 in0 = READ_IMAGET(input0, SAMPLER, (int2)(w, hb));
   DATA_TYPE4 in1 = READ_IMAGET(input1, SAMPLER, (int2)(w, hb));
diff --git a/mace/kernels/opencl/cl/fully_connected.cl b/mace/kernels/opencl/cl/fully_connected.cl
index 90d84c11360bc7ef61c4921df5366b545fb83dac..a474c8cacc6fb44b68ec716a87556ebae94e019f 100644
--- a/mace/kernels/opencl/cl/fully_connected.cl
+++ b/mace/kernels/opencl/cl/fully_connected.cl
@@ -10,14 +10,22 @@ __kernel void fully_connected(__read_only image2d_t input,
                               __private const int input_height,
                               __private const int input_width,
                               __private const int input_channel,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                               __private const float relux_max_limit,
                               __private const int global_size_dim0,
                               __private const int global_size_dim1) {
+#else
+                              __private const float relux_max_limit) {
+#endif
+
   const int batch_idx = get_global_id(0);
   const int out_blk_idx = get_global_id(1);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (batch_idx >= global_size_dim0 || out_blk_idx >= global_size_dim1) {
     return;
   }
+#endif
 
   const int input_chan_blk = (input_channel + 3) >> 2;
 
@@ -74,19 +82,28 @@ __kernel void fully_connected_width(__read_only image2d_t input,
                                     __private const int input_width,
                                     __private const int in_chan_blks,
                                     __private const int out_blks,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                                     __private const float relux_max_limit,
                                     __private const int global_size_dim0,
                                     __private const int global_size_dim1,
                                     __private const int global_size_dim2) {
+#else
+                                    __private const float relux_max_limit) {
+#endif
+
   const int inter_out_idx = get_global_id(0);
   const int width_blk_idx = get_global_id(1);
   const int batch_out_blk_idx = get_global_id(2);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (inter_out_idx >= global_size_dim0 || width_blk_idx >= global_size_dim1
       || batch_out_blk_idx >= global_size_dim2) {
     return;
   }
-
   const int width_blk_count = global_size_dim1;
+#else
+  const int width_blk_count = get_global_size(1);
+#endif
 
   const int batch_idx = batch_out_blk_idx / out_blks;
   const int out_blk_idx = batch_out_blk_idx % out_blks;
diff --git a/mace/kernels/opencl/cl/matmul.cl b/mace/kernels/opencl/cl/matmul.cl
index f0c2ee0ebf7446c14067d22dbc0c890b9fd72f41..7107838c894b7050e2d6b3b02bd7e85712a87d4c 100644
--- a/mace/kernels/opencl/cl/matmul.cl
+++ b/mace/kernels/opencl/cl/matmul.cl
@@ -8,12 +8,20 @@ __kernel void matmul(__read_only image2d_t A,
                      __private const int N,
                      __private const int K,
                      __private const int height_blocks,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                      __private const int k_blocks,
                      __private const int global_size_dim0,
                      __private const int global_size_dim1) {
+#else
+                     __private const int k_blocks) {
+#endif
+
   const int gx = get_global_id(0) << 2;
   const int hb = get_global_id(1);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (get_global_id(0) >= global_size_dim0 || hb >= global_size_dim1) return;
+#endif
 
   const int batch = hb / height_blocks;
   const int ty = (hb % height_blocks);
diff --git a/mace/kernels/opencl/cl/pooling.cl b/mace/kernels/opencl/cl/pooling.cl
index dad48824dfb6cddfb2db2e01a56600294b3b85e5..8cdc4e4625a24e409c38697d9e40a39e57007ab7 100644
--- a/mace/kernels/opencl/cl/pooling.cl
+++ b/mace/kernels/opencl/cl/pooling.cl
@@ -27,19 +27,29 @@ __kernel void pooling(__read_only image2d_t input,
                       __private const int pad_left,
                       __private const int stride,
                       __private const int pooling_size,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                       __write_only image2d_t output,
                       __private const int global_size_dim0,
                       __private const int global_size_dim1,
                       __private const int global_size_dim2) {
+#else
+                      __write_only image2d_t output) {
+#endif
+
   const int out_chan_idx = get_global_id(0);
   const int out_width_idx = get_global_id(1);
   const int out_hb_idx = get_global_id(2);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (out_chan_idx >= global_size_dim0 || out_width_idx >= global_size_dim1
       || out_hb_idx >= global_size_dim2) {
     return;
   }
-
   const int out_width = global_size_dim1;
+#else
+  const int out_width = get_global_size(1);
+#endif
+
   const int batch_idx = mul24((out_hb_idx / out_height), in_height);
   const int in_height_start = mul24((out_hb_idx % out_height), stride) - pad_top;
   const int in_width_start = mul24(out_width_idx, stride) - pad_left;
diff --git a/mace/kernels/opencl/cl/resize_bilinear.cl b/mace/kernels/opencl/cl/resize_bilinear.cl
index b3778cb236aafe40a348d87bd27131407245b258..5369c7625846757556debce41a2835b9fa71651f 100644
--- a/mace/kernels/opencl/cl/resize_bilinear.cl
+++ b/mace/kernels/opencl/cl/resize_bilinear.cl
@@ -6,19 +6,30 @@ __kernel void resize_bilinear_nocache(__read_only image2d_t input, /* [c%4 * w *
                                       __private const float width_scale,
                                       __private const int in_height,
                                       __private const int in_width,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                                       __private const int out_height,
                                       __private const int global_size_dim0,
                                       __private const int global_size_dim1,
                                       __private const int global_size_dim2) {
+#else
+                                      __private const int out_height) {
+#endif
+
   const int ch_blk = get_global_id(0);
   const int w = get_global_id(1);
   const int hb = get_global_id(2);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (ch_blk >= global_size_dim0 || w >= global_size_dim1
       || hb >= global_size_dim2) {
     return;
   }
   const int ch_blks = global_size_dim0;
   const int out_width = global_size_dim1;
+#else
+  const int ch_blks = get_global_size(0);
+  const int out_width = get_global_size(1);
+#endif
 
   const int b = hb / out_height;
   const int h = hb % out_height;
diff --git a/mace/kernels/opencl/cl/slice.cl b/mace/kernels/opencl/cl/slice.cl
index a626c0de63311614fa1d7c6a9d226e797af18b64..bb5f40cda7f1fdafecab81d0e48c3b3b410b2388 100644
--- a/mace/kernels/opencl/cl/slice.cl
+++ b/mace/kernels/opencl/cl/slice.cl
@@ -2,19 +2,28 @@
 
 __kernel void slice(__read_only image2d_t input,
                     __private const int chan_blk_offset,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                     __write_only image2d_t output,
                     __private const int global_size_dim0,
                     __private const int global_size_dim1,
                     __private const int global_size_dim2) {
+#else
+                    __write_only image2d_t output) {
+#endif
+
   const int chan_blk_idx = get_global_id(0);
   const int width_idx = get_global_id(1);
   const int hb_idx = get_global_id(2);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
       || hb_idx >= global_size_dim2) {
     return;
   }
-
   const int width = global_size_dim1;
+#else
+  const int width = get_global_size(1);
+#endif
 
   DATA_TYPE4 data = READ_IMAGET(input, SAMPLER,
                                 (int2)(mad24(chan_blk_idx + chan_blk_offset,
diff --git a/mace/kernels/opencl/cl/softmax.cl b/mace/kernels/opencl/cl/softmax.cl
index e702739440e4609511cc240daf568126505792db..3fadd18e1b814e58ef0437e65aef7ea5820c9ca4 100644
--- a/mace/kernels/opencl/cl/softmax.cl
+++ b/mace/kernels/opencl/cl/softmax.cl
@@ -3,20 +3,30 @@
 __kernel void softmax(__read_only image2d_t input,
                       __private const int channels,
                       __private const int remain_channels,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                       __write_only image2d_t output,
                       __private const int global_size_dim0,
                       __private const int global_size_dim1,
                       __private const int global_size_dim2) {
+#else
+                      __write_only image2d_t output) {
+#endif
+
   const int chan_blk_idx = get_global_id(0);
   const int width_idx = get_global_id(1);
   const int hb_idx = get_global_id(2);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
       || hb_idx >= global_size_dim2) {
     return;
   }
-
   const int chan_blks = global_size_dim0 - 1;
   const int width = global_size_dim1;
+#else
+  const int chan_blks = get_global_size(0) - 1;
+  const int width = get_global_size(1);
+#endif
 
   int pos = width_idx;
   DATA_TYPE max_value = -FLT_MAX;
diff --git a/mace/kernels/opencl/cl/space_to_batch.cl b/mace/kernels/opencl/cl/space_to_batch.cl
index e36313fe651d9555253bef02e2e875824a4c5bb9..822d09063e2ff3d40bdf4ad954325db9be90cf14 100644
--- a/mace/kernels/opencl/cl/space_to_batch.cl
+++ b/mace/kernels/opencl/cl/space_to_batch.cl
@@ -9,17 +9,25 @@ __kernel void space_to_batch(__read_only image2d_t space_data,
                              __private const int space_height,
                              __private const int space_width,
                              __private const int batch_height,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                              __private const int batch_width,
                              __private const int global_size_dim0,
                              __private const int global_size_dim1,
                              __private const int global_size_dim2) {
+#else
+                             __private const int batch_width) {
+#endif
+
   const int chan_idx = get_global_id(0);
   const int batch_w_idx = get_global_id(1);
   const int batch_hb_idx = get_global_id(2);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (chan_idx >= global_size_dim0 || batch_w_idx >= global_size_dim1
       || batch_hb_idx >= global_size_dim2) {
     return;
   }
+#endif
 
   const int batch_b_idx = batch_hb_idx / batch_height;
   const int batch_h_idx = batch_hb_idx % batch_height;
@@ -55,17 +63,25 @@ __kernel void batch_to_space(__read_only image2d_t batch_data,
                              __private const int space_height,
                              __private const int space_width,
                              __private const int batch_height,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                              __private const int batch_width,
                              __private const int global_size_dim0,
                              __private const int global_size_dim1,
                              __private const int global_size_dim2) {
+#else
+                             __private const int batch_width) {
+#endif
+
   const int chan_idx = get_global_id(0);
   const int batch_w_idx = get_global_id(1);
   const int batch_hb_idx = get_global_id(2);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (chan_idx >= global_size_dim0 || batch_w_idx >= global_size_dim1
       || batch_hb_idx >= global_size_dim2) {
     return;
   }
+#endif
 
   const int batch_b_idx = batch_hb_idx / batch_height;
   const int batch_h_idx = batch_hb_idx % batch_height;
diff --git a/mace/kernels/opencl/cl/winograd_transform.cl b/mace/kernels/opencl/cl/winograd_transform.cl
index 3acfc9028b6e92b5af1d9eec3c469fe69dec3fea..098c8e3b2f119f2d55dfe5d0250ddf70086a8554 100644
--- a/mace/kernels/opencl/cl/winograd_transform.cl
+++ b/mace/kernels/opencl/cl/winograd_transform.cl
@@ -8,16 +8,25 @@ __kernel void winograd_transform_2x2(__read_only image2d_t input,
                                      __private const int round_hw,
                                      __private const int round_w,
                                      __private const int padding_top,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                                      __private const int padding_left,
                                      __private const int global_size_dim0,
                                      __private const int global_size_dim1) {
+#else
+                                     __private const int padding_left) {
+#endif
+
   int out_width_idx = get_global_id(0);
   int chan_blk_idx = get_global_id(1);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (out_width_idx >= global_size_dim0 || chan_blk_idx >= global_size_dim1) {
     return;
   }
-
   const int chan_blk_size = global_size_dim1;
+#else
+  const int chan_blk_size = get_global_size(1);
+#endif
 
   const int batch_idx = out_width_idx / round_hw;
   const int t_idx = out_width_idx % round_hw;
@@ -121,16 +130,26 @@ __kernel void winograd_inverse_transform_2x2(__read_only image2d_t input,
                                              __private const int out_width,
                                              __private const int round_hw,
                                              __private const int round_w,
+#ifndef USE_QUALCOMM_OPENCL_2_0
                                              __private const float relux_max_limit,
                                              __private const int global_size_dim0,
                                              __private const int global_size_dim1) {
+#else
+                                             __private const float relux_max_limit) {
+#endif
+
   const int width_idx = get_global_id(0);
   const int height_idx = get_global_id(1);
+
+#ifndef USE_QUALCOMM_OPENCL_2_0
   if (width_idx >= global_size_dim0 || height_idx >= global_size_dim1) {
     return;
   }
-
   const int out_channel = global_size_dim1;
+#else
+  const int out_channel = get_global_size(1);
+#endif
+
   int width = width_idx;
   int height = height_idx;
 
diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc
index ccb5b6c20041d2b1744ab625dd144eba67560d99..1ddf37bc901e6456e8bc5ace3742fd07f5ee788c 100644
--- a/mace/kernels/opencl/concat.cc
+++ b/mace/kernels/opencl/concat.cc
@@ -31,10 +31,15 @@ static void Concat2(cl::Kernel *kernel,
 
   auto runtime = OpenCLRuntime::Global();
 
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   if (kernel->get() == nullptr) {
     std::set<std::string> built_options;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel");
     built_options.emplace("-Dconcat_channel=" + kernel_name);
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
     if (input0->dtype() == output->dtype()) {
       built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
       built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
@@ -83,12 +88,18 @@ static void ConcatN(cl::Kernel *kernel,
   const index_t channel = output->dim(3);
 
   auto runtime = OpenCLRuntime::Global();
+
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   if (kernel->get() == nullptr) {
     std::set<std::string> built_options;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi");
     built_options.emplace("-Dconcat_channel_multi=" + kernel_name);
     built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
     *kernel = runtime->BuildKernel("concat", kernel_name, built_options);
   }
 
diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
index 4bfa9ac7f1260cf86920fbc876f5cfb45276e83e..ad2af5a73a1a3e682c1334bbaa92945c0d49df97 100644
--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -37,6 +37,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
   const index_t input_channel_blocks = RoundUpDiv4(input_channels);
 
   auto runtime = OpenCLRuntime::Global();
+
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   if (kernel->get() == nullptr) {
     MACE_CHECK(input_batch == batch);
 
@@ -45,6 +48,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
     built_options.emplace("-Dconv_2d_1x1=" + kernel_name);
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
     if (bias != nullptr) {
       built_options.emplace("-DBIAS");
     }
diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
index 97db8ab998e28f2152fdc9ebb1d9bc9bcd37e8d1..6ac0fa569ebe4ab1d58ca8a9a87cd1cc56564f44 100644
--- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
@@ -37,12 +37,17 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
 
   auto runtime = OpenCLRuntime::Global();
 
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   if (kernel->get() == nullptr) {
     std::set<std::string> built_options;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3");
     built_options.emplace("-Dconv_2d_3x3=" + kernel_name);
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
     built_options.emplace(bias != nullptr ? "-DBIAS" : "");
     switch (activation) {
       case NOOP:
diff --git a/mace/kernels/opencl/conv_2d_opencl_general.cc b/mace/kernels/opencl/conv_2d_opencl_general.cc
index 4f1b67f631c07bc76670ad10f7b42f4b2a1cd9fa..0fc944422fd1a22c4b37a3cce0123158b7bee1f3 100644
--- a/mace/kernels/opencl/conv_2d_opencl_general.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_general.cc
@@ -37,12 +37,17 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
 
   auto runtime = OpenCLRuntime::Global();
 
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   if (kernel->get() == nullptr) {
     std::set<std::string> built_options;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d");
     built_options.emplace("-Dconv_2d=" + kernel_name);
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
     built_options.emplace(bias != nullptr ? "-DBIAS" : "");
     switch (activation) {
       case NOOP:
diff --git a/mace/kernels/opencl/depthwise_conv_opencl.cc b/mace/kernels/opencl/depthwise_conv_opencl.cc
index 18b53853151a5e6c03d0b1abe14df69cd5c5b2f1..c43799db2d96312a63898904f5266bc8528ea810 100644
--- a/mace/kernels/opencl/depthwise_conv_opencl.cc
+++ b/mace/kernels/opencl/depthwise_conv_opencl.cc
@@ -42,6 +42,8 @@ void DepthwiseConv2d(cl::Kernel *kernel,
 
   auto runtime = OpenCLRuntime::Global();
 
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   if (kernel->get() == nullptr) {
     std::set<std::string> built_options;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d");
@@ -51,6 +53,9 @@ void DepthwiseConv2d(cl::Kernel *kernel,
     } else {
       built_options.emplace("-Ddepthwise_conv2d=" + kernel_name);
     }
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
     built_options.emplace(bias != nullptr ? "-DBIAS" : "");
diff --git a/mace/kernels/opencl/eltwise_opencl.cc b/mace/kernels/opencl/eltwise_opencl.cc
index a2e4e8f1f3a4ef0987a46c4779b8afa51498c423..e2a68396d18045e94c4697295f3f1f6c8e1ec691 100644
--- a/mace/kernels/opencl/eltwise_opencl.cc
+++ b/mace/kernels/opencl/eltwise_opencl.cc
@@ -29,6 +29,8 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
 
   auto runtime = OpenCLRuntime::Global();
 
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
     auto dt = DataTypeToEnum<T>::value;
@@ -37,6 +39,9 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
     built_options.emplace(MakeString("-DELTWISE_TYPE=", type_));
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
     if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM");
     kernel_ = runtime->BuildKernel("eltwise", kernel_name, built_options);
   }
diff --git a/mace/kernels/opencl/fully_connected_opencl.cc b/mace/kernels/opencl/fully_connected_opencl.cc
index 3e17f98fe24ba2bc16e881b221abf830b308ae31..208f402595d638dde161d90159c526615b473f54 100644
--- a/mace/kernels/opencl/fully_connected_opencl.cc
+++ b/mace/kernels/opencl/fully_connected_opencl.cc
@@ -24,8 +24,11 @@ void FCWXKernel(cl::Kernel *kernel,
     << "FC width kernel only support input with 4x channel.";
   MACE_CHECK_NOTNULL(gws);
   MACE_CHECK_NOTNULL(lws);
+
   auto runtime = OpenCLRuntime::Global();
 
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   if (kernel->get() == nullptr) {
     std::set<std::string> built_options;
     auto dt = DataTypeToEnum<T>::value;
@@ -34,6 +37,9 @@ void FCWXKernel(cl::Kernel *kernel,
     built_options.emplace("-Dfully_connected_width=" + kernel_name);
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
     if (bias != nullptr) {
       built_options.emplace("-DBIAS");
     }
@@ -133,14 +139,21 @@ void FCWTXKernel(cl::Kernel *kernel,
                  StatsFuture *future) {
   MACE_CHECK_NOTNULL(gws);
   MACE_CHECK_NOTNULL(lws);
+
+  auto runtime = OpenCLRuntime::Global();
+
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   if (kernel->get() == nullptr) {
-    auto runtime = OpenCLRuntime::Global();
     std::set<std::string> built_options;
     auto dt = DataTypeToEnum<T>::value;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected");
     built_options.emplace("-Dfully_connected=" + kernel_name);
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
     if (bias != nullptr) {
       built_options.emplace("-DBIAS");
     }
diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc
index 2141c65ec1f6074b17e29533965415f20836926d..641abd664c190026314cc4419d8ee240ee031118 100644
--- a/mace/kernels/opencl/helper.cc
+++ b/mace/kernels/opencl/helper.cc
@@ -194,12 +194,25 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) {
   }
 }
 
+const bool IsQualcommOpenCL200() {
+  auto runtime = OpenCLRuntime::Global();
+
+  if (runtime->GetGPUType() == GPU_TYPE::QUALCOMM_ADRENO &&
+      runtime->GetOpenclVersion() == "2.0") {
+    return true;
+  } else {
+    return false;
+  }
+}
+
 void TuningOrRun3DKernel(const cl::Kernel &kernel,
                          const std::string tuning_key,
                          const uint32_t *gws,
                          const std::vector<uint32_t> &lws,
                          StatsFuture *future) {
   auto runtime = OpenCLRuntime::Global();
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
     const uint32_t kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel));
@@ -236,8 +249,10 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
         << "Tuning parameters of 3D kernel must be 4D";
     cl_int error = CL_SUCCESS;
     std::vector<uint32_t> roundup_gws(3);
-    for (size_t i = 0; i < 3; ++i) {
-      roundup_gws[i] = RoundUp(gws[i], params[i]);
+    if(!is_qualcomm_opencl200) {
+      for (size_t i = 0; i < 3; ++i) {
+        roundup_gws[i] = RoundUp(gws[i], params[i]);
+      }
     }
 
     if (timer == nullptr) {
@@ -247,18 +262,31 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
       for (uint32_t i = 0; i < num_blocks; ++i) {
         uint32_t gws2 =
             (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        uint32_t roundup_gws2 = RoundUp(gws2, params[2]);
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            kernel, cl::NDRange(0, 0, i * block_size),
-            cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws2),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        if (is_qualcomm_opencl200) {
+          error = runtime->command_queue().enqueueNDRangeKernel(
+              kernel, cl::NDRange(0, 0, i * block_size),
+              cl::NDRange(gws[0], gws[1], gws2),
+              cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        } else {
+          uint32_t roundup_gws2 = RoundUp(gws2, params[2]);
+          error = runtime->command_queue().enqueueNDRangeKernel(
+              kernel, cl::NDRange(0, 0, i * block_size),
+              cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws2),
+              cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        }
         MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
       }
     } else {
       timer->ClearTiming();
-      error = runtime->command_queue().enqueueNDRangeKernel(
-          kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
-          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      if (is_qualcomm_opencl200) {
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      } else {
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      }
       MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
       timer->AccumulateTiming();
       tuning_result->assign(params.begin(), params.end());
@@ -274,11 +302,18 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
         for (uint32_t i = 0; i < num_blocks; ++i) {
           uint32_t gws2 =
               (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-          uint32_t roundup_gws2 = RoundUp(gws2, params[2]);
-          error = runtime->command_queue().enqueueNDRangeKernel(
-              kernel, cl::NDRange(0, 0, i * block_size),
-              cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws2),
-              cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+          if (is_qualcomm_opencl200) {
+            error = runtime->command_queue().enqueueNDRangeKernel(
+                kernel, cl::NDRange(0, 0, i * block_size),
+                cl::NDRange(gws[0], gws[1], gws2),
+                cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+          } else {
+            uint32_t roundup_gws2 = RoundUp(gws2, params[2]);
+            error = runtime->command_queue().enqueueNDRangeKernel(
+                kernel, cl::NDRange(0, 0, i * block_size),
+                cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws2),
+                cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+          }
           MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
           timer->AccumulateTiming();
         }
@@ -306,6 +341,8 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
                          const std::vector<uint32_t> &lws,
                          StatsFuture *future) {
   auto runtime = OpenCLRuntime::Global();
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
     const uint32_t kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel));
@@ -330,8 +367,10 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
         << "Tuning parameters of 2D kernel must be 3d";
     cl_int error = CL_SUCCESS;
     std::vector<uint32_t> roundup_gws(2);
-    for (size_t i = 0; i < 2; ++i) {
-      roundup_gws[i] = RoundUp(gws[i], params[i]);
+    if (!is_qualcomm_opencl200) {
+      for (size_t i = 0; i < 2; ++i) {
+        roundup_gws[i] = RoundUp(gws[i], params[i]);
+      }
     }
 
     if (timer == nullptr) {
@@ -341,17 +380,29 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
       for (uint32_t i = 0; i < num_blocks; ++i) {
         uint32_t gws1 =
             (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
-        uint32_t roundup_gws1 = RoundUp(gws1, params[1]);
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            kernel, cl::NDRange(0, i * block_size), cl::NDRange(roundup_gws[0], roundup_gws1),
-            cl::NDRange(params[0], params[1]), nullptr, &event);
+        if (is_qualcomm_opencl200) {
+          error = runtime->command_queue().enqueueNDRangeKernel(
+              kernel, cl::NDRange(0, i * block_size), cl::NDRange(gws[0], gws1),
+              cl::NDRange(params[0], params[1]), nullptr, &event);
+        } else {
+          uint32_t roundup_gws1 = RoundUp(gws1, params[1]);
+          error = runtime->command_queue().enqueueNDRangeKernel(
+              kernel, cl::NDRange(0, i * block_size), cl::NDRange(roundup_gws[0], roundup_gws1),
+              cl::NDRange(params[0], params[1]), nullptr, &event);
+        }
         MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
       }
     } else {
       timer->ClearTiming();
-      error = runtime->command_queue().enqueueNDRangeKernel(
-          kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
-          cl::NDRange(params[0], params[1]), nullptr, &event);
+      if (is_qualcomm_opencl200) {
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
+            cl::NDRange(params[0], params[1]), nullptr, &event);
+      } else {
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
+            cl::NDRange(params[0], params[1]), nullptr, &event);
+      }
       MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
       timer->AccumulateTiming();
       tuning_result->assign(params.begin(), params.end());
@@ -367,10 +418,16 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
         for (uint32_t i = 0; i < num_blocks; ++i) {
           uint32_t gws1 =
               (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
-          uint32_t roundup_gws1 = RoundUp(gws1, params[1]);
-          error = runtime->command_queue().enqueueNDRangeKernel(
-              kernel, cl::NDRange(0, i * block_size), cl::NDRange(roundup_gws[0], roundup_gws1),
-              cl::NDRange(params[0], params[1]), nullptr, &event);
+          if (is_qualcomm_opencl200) {
+            error = runtime->command_queue().enqueueNDRangeKernel(
+                kernel, cl::NDRange(0, i * block_size), cl::NDRange(gws[0], gws1),
+                cl::NDRange(params[0], params[1]), nullptr, &event);
+          } else {
+            uint32_t roundup_gws1 = RoundUp(gws1, params[1]);
+            error = runtime->command_queue().enqueueNDRangeKernel(
+                kernel, cl::NDRange(0, i * block_size), cl::NDRange(roundup_gws[0], roundup_gws1),
+                cl::NDRange(params[0], params[1]), nullptr, &event);
+          }
           MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
           timer->AccumulateTiming();
         }
diff --git a/mace/kernels/opencl/helper.h b/mace/kernels/opencl/helper.h
index 89712c9b96aa043f5019cde6eae23aa07109f6f7..5b4e028318c1487825f553dce28079d4bc2faccf 100644
--- a/mace/kernels/opencl/helper.h
+++ b/mace/kernels/opencl/helper.h
@@ -102,6 +102,8 @@ std::string Concat(Args... args) {
   return ss.str();
 }
 
+const bool IsQualcommOpenCL200();
+
 }  // namespace kernels
 }  // namespace mace
 #endif  // MACE_KERNELS_OPENCL_HELPER_H_
diff --git a/mace/kernels/opencl/matmul.cc b/mace/kernels/opencl/matmul.cc
index 3609b1a6bac5f5d0d3588033372033af40b589c4..9e29306186f0714839a7c8f0763c5967bc11e21e 100644
--- a/mace/kernels/opencl/matmul.cc
+++ b/mace/kernels/opencl/matmul.cc
@@ -33,6 +33,8 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
 
   auto runtime = OpenCLRuntime::Global();
 
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
     auto dt = DataTypeToEnum<T>::value;
@@ -40,6 +42,9 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
     built_options.emplace("-Dmatmul=" + kernel_name);
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
     kernel_ = runtime->BuildKernel("matmul", kernel_name, built_options);
   }
   uint32_t idx = 0;
diff --git a/mace/kernels/opencl/pooling_opencl.cc b/mace/kernels/opencl/pooling_opencl.cc
index 4e97174ea2d1d312a261e9cb8a9ee686c586bda2..f3d4714cd325b48714f5ddf25e1b24d85aecb39b 100644
--- a/mace/kernels/opencl/pooling_opencl.cc
+++ b/mace/kernels/opencl/pooling_opencl.cc
@@ -20,11 +20,14 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
 
   auto runtime = OpenCLRuntime::Global();
 
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   if (kernel_.get() == nullptr) {
     const DataType dt = DataTypeToEnum<T>::value;
     std::set<std::string> built_options;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
     built_options.emplace("-Dpooling=" + kernel_name);
+
     if (pooling_type_ == MAX && input->dtype() == output->dtype()) {
       built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
       built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
@@ -36,6 +39,9 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     if (pooling_type_ == AVG) {
       built_options.emplace("-DPOOL_AVG");
     }
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
     kernel_ = runtime->BuildKernel("pooling", kernel_name, built_options);
   }
 
diff --git a/mace/kernels/opencl/resize_bilinear_opencl.cc b/mace/kernels/opencl/resize_bilinear_opencl.cc
index d6a185194d184a6518a1445ee5e4d79f4a7a2e5b..63c71ea7fc4eb410b68ebba3dc707b5c331809c0 100644
--- a/mace/kernels/opencl/resize_bilinear_opencl.cc
+++ b/mace/kernels/opencl/resize_bilinear_opencl.cc
@@ -30,6 +30,8 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
 
   auto runtime = OpenCLRuntime::Global();
 
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache");
@@ -37,6 +39,9 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
     auto dt = DataTypeToEnum<T>::value;
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
     kernel_ =
         runtime->BuildKernel("resize_bilinear", kernel_name, built_options);
   }
diff --git a/mace/kernels/opencl/slice.cc b/mace/kernels/opencl/slice.cc
index f4e3908953c4fed503a11f2db3365d89e775cded..55773a521c34c47635032b2b3d2dd4b8da346189 100644
--- a/mace/kernels/opencl/slice.cc
+++ b/mace/kernels/opencl/slice.cc
@@ -31,6 +31,8 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
 
   auto runtime = OpenCLRuntime::Global();
 
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("slice");
@@ -38,6 +40,9 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
     built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
     built_options.emplace("-DCMD_DATA_TYPE="
                            + DtToCLCMDDt(DataTypeToEnum<T>::value));
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
     kernel_ = runtime->BuildKernel("slice", kernel_name, built_options);
   }
   const index_t channel_blk = RoundUpDiv4(output_channels);
diff --git a/mace/kernels/opencl/softmax_opencl.cc b/mace/kernels/opencl/softmax_opencl.cc
index 3ec6447ab3441adc95cbfc47c1bd15bb373603ae..321d7c296f9e756ca671e45ab4a6d554d72f40d8 100644
--- a/mace/kernels/opencl/softmax_opencl.cc
+++ b/mace/kernels/opencl/softmax_opencl.cc
@@ -28,6 +28,9 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
                            static_cast<uint32_t>(height * batch)};
 
   auto runtime = OpenCLRuntime::Global();
+
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   if (kernel_.get() == nullptr) {
     std::set<std::string> built_options;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
@@ -35,6 +38,9 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
     auto dt = DataTypeToEnum<T>::value;
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
     kernel_ = runtime->BuildKernel("softmax", kernel_name, built_options);
   }
   if (!IsVecEqual(input_shape_, logits->shape())) {
diff --git a/mace/kernels/opencl/space_to_batch_opencl.cc b/mace/kernels/opencl/space_to_batch_opencl.cc
index b2de27486ff64c211d0bc1c617958f3cc66e51d6..128164f967f3ddadd547efa3862cd79529868fee 100644
--- a/mace/kernels/opencl/space_to_batch_opencl.cc
+++ b/mace/kernels/opencl/space_to_batch_opencl.cc
@@ -38,6 +38,8 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
 
   auto runtime = OpenCLRuntime::Global();
 
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   if (kernel_.get() == nullptr) {
     std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
     std::set<std::string> built_options;
@@ -47,6 +49,9 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
     built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
     built_options.emplace("-DCMD_DATA_TYPE=" +
                           DtToCLCMDDt(DataTypeToEnum<T>::value));
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
     kernel_ =
         runtime->BuildKernel("space_to_batch", kernel_name, built_options);
   }
diff --git a/mace/kernels/opencl/winograd_transform.cc b/mace/kernels/opencl/winograd_transform.cc
index b3f4889b8b18b7bdc9340556c0e21e561e305baf..c4a20a0307e34e024556a0680051a6e36774772d 100644
--- a/mace/kernels/opencl/winograd_transform.cc
+++ b/mace/kernels/opencl/winograd_transform.cc
@@ -17,6 +17,8 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
 
   auto runtime = OpenCLRuntime::Global();
 
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   if (kernel_.get() == nullptr) {
     std::string obfuscated_kernel_name =
         MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
@@ -26,6 +28,9 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
                           DtToUpstreamCLDt(DataTypeToEnum<T>::value));
     built_options.emplace("-DCMD_DATA_TYPE=" +
                           DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
     kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
                                    built_options);
   }
@@ -90,6 +95,8 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
 
   auto runtime = OpenCLRuntime::Global();
 
+  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+
   if (kernel_.get() == nullptr) {
     std::string obfuscated_kernel_name =
         MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2");
@@ -100,6 +107,9 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
                           DtToUpstreamCLDt(DataTypeToEnum<T>::value));
     built_options.emplace("-DCMD_DATA_TYPE=" +
                           DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
+    if (is_qualcomm_opencl200) {
+      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    }
     built_options.emplace(bias != nullptr ? "-DBIAS" : "");
     switch (activation_) {
       case NOOP:
diff --git a/tools/bazel-adb-run.sh b/tools/bazel-adb-run.sh
index cf82cbf62c7eee08e1cc7df1522b7c15e1b0c53c..2e3dede092a6ddb0852ae2e75730310fd91ddeec 100755
--- a/tools/bazel-adb-run.sh
+++ b/tools/bazel-adb-run.sh
@@ -18,8 +18,8 @@ BAZEL_BIN_PATH=${BAZEL_BIN_PATH#//}
 BAZEL_BIN_PATH=bazel-bin/$BAZEL_BIN_PATH
 BIN_NAME=`echo $BAZEL_TARGET | cut -d: -f2`
 
-ANDROID_ABI=armeabi-v7a
 ANDROID_ABI=arm64-v8a
+ANDROID_ABI=armeabi-v7a
 STRIP="--strip always"
 VLOG_LEVEL=0
 PROFILING="1"