diff --git a/lite/backends/opencl/cl_context.cc b/lite/backends/opencl/cl_context.cc
index eff959d992200592c21a024f56713b9abb4b87fb..67d679fdd596b109b714bf7ba3cd45b2632b9420 100644
--- a/lite/backends/opencl/cl_context.cc
+++ b/lite/backends/opencl/cl_context.cc
@@ -157,6 +157,48 @@ cl::NDRange CLContext::LocalWorkSizeTurn(cl::NDRange global_work_size,
                      static_cast<size_t>(gws0)};
 #endif
 }
+// Derives a 3-D local work size from global_work_size by shrinking one
+// dimension at a time until the product of the extents fits the bound.
+// Dimensions are shrunk in the order [1], [0], [2]. A dimension is halved
+// while it is even and collapses to 1 otherwise, so each returned extent
+// still divides the corresponding global extent.
+// NOTE(review): presumably the counterpart of LocalWorkSizeTurn with the
+// roles of dimensions 0 and 2 swapped -- confirm against that function.
+//   global_work_size: 3-D global range the local range is derived from.
+//   max_work_size:    upper bound on the product of the local extents.
+//   divisor:          when > 1, the bound becomes max_work_size / divisor.
+// Returns the local range, indexed like global_work_size.
+cl::NDRange CLContext::LocalWorkSizeTurnReverse(cl::NDRange global_work_size,
+                                                size_t max_work_size,
+                                                int divisor) {
+  size_t lws0 = global_work_size[0];
+  size_t lws1 = global_work_size[1];
+  size_t lws2 = global_work_size[2];
+  if (divisor > 1) {
+    max_work_size /= divisor;
+  }
+  // A bound of 0 (divisor larger than the limit) used to skip every loop below
+  // and hand back the global size as the local size; fall back to 1 instead.
+  if (max_work_size == 0) {
+    max_work_size = 1;
+  }
+  // Each loop ends because its dimension eventually reaches 1 and the
+  // product of the dimensions handled before it already fits the bound.
+  while (lws1 > max_work_size) {
+    lws1 = lws1 % 2 == 0 ? lws1 / 2 : 1;
+  }
+  while (lws0 * lws1 > max_work_size) {
+    lws0 = lws0 % 2 == 0 ? lws0 / 2 : 1;
+  }
+  while (lws0 * lws1 * lws2 > max_work_size) {
+    lws2 = lws2 % 2 == 0 ? lws2 / 2 : 1;
+  }
+  return cl::NDRange{lws0, lws1, lws2};
+}
+
+bool CLContext::IsArmMali() {  // true iff the runtime's GPU type is ARM_MALI
+  return GpuType::ARM_MALI == CLRuntime::Global()->GetGpuType();
+}
 
 cl::NDRange CLContext::LocalWorkSize(cl::NDRange global_work_size,
                                      size_t max_work_size) {
diff --git a/lite/backends/opencl/cl_context.h b/lite/backends/opencl/cl_context.h
index 06e6c7ee46d8b839873d433843f0035e3963664c..69ae11a8d71cc8c3dcae2b7ba81b4e19b44d1abe 100644
--- a/lite/backends/opencl/cl_context.h
+++ b/lite/backends/opencl/cl_context.h
@@ -66,6 +66,10 @@ class CLContext {
   cl::NDRange LocalWorkSizeTurn(cl::NDRange global_work_size,
                                 size_t max_work_size,
                                 int divitor = 2);
+  cl::NDRange LocalWorkSizeTurnReverse(cl::NDRange global_work_size,
+                                       size_t max_work_size,
+                                       int divisor = 2);  // > 1 shrinks limit
+  bool IsArmMali();  // true when CLRuntime reports GpuType::ARM_MALI
   //  cl::NDRange LocalWorkSizeConv1x1(cl::NDRange global_work_size,
   //                                   size_t max_work_size);
 
diff --git a/lite/backends/opencl/cl_runtime.cc b/lite/backends/opencl/cl_runtime.cc
index 93ceb6f9c5c55a406ca51b8c2d6279d304293fb3..929ec7838e23b9ca9259c19cd1808379664dbec3 100644
--- a/lite/backends/opencl/cl_runtime.cc
+++ b/lite/backends/opencl/cl_runtime.cc
@@ -191,6 +191,9 @@ bool CLRuntime::InitializeDevice() {
     }
     return t_str;
   };
+  const std::string device_version = device_->getInfo<CL_DEVICE_VERSION>();
+  LOG(INFO) << "device_version:" << device_version;
+
   LOG(INFO) << "device_type:" << device_type_to_str(device_type);
   device_info_["CL_DEVICE_TYPE"] = device_type;
 
@@ -317,6 +320,8 @@ std::map<std::string, size_t>& CLRuntime::GetDeviceInfo() {
   return device_info_;
 }
 
+GpuType& CLRuntime::GetGpuType() { return gpu_type_; }  // mutable reference
+
 void CLRuntime::GetAdrenoContextProperties(
     std::vector<cl_context_properties>* properties,
     GPUPerfMode gpu_perf_mode,
diff --git a/lite/backends/opencl/cl_runtime.h b/lite/backends/opencl/cl_runtime.h
index 122422c79b80bc7e0823136cac150613d8c597dc..51e545cc3482ed7d080baa2734c8f84d8b486d3e 100644
--- a/lite/backends/opencl/cl_runtime.h
+++ b/lite/backends/opencl/cl_runtime.h
@@ -93,6 +93,8 @@ class CLRuntime {
 
   std::map<std::string, size_t>& GetDeviceInfo();
 
+  GpuType& GetGpuType();  // returns gpu_type_ by mutable reference
+
  private:
   CLRuntime() { Init(); }
 
diff --git a/lite/kernels/opencl/conv_image_compute.cc b/lite/kernels/opencl/conv_image_compute.cc
index 3de4512cb1d9d06b95d14c51615d5ab87e0a7419..362be682efc1c2330e27840ffded9586fa53ddf9 100644
--- a/lite/kernels/opencl/conv_image_compute.cc
+++ b/lite/kernels/opencl/conv_image_compute.cc
@@ -36,7 +36,7 @@ void ConvImageCompute::PrepareForRun() {
   float* filter_cpu = param.filter->mutable_data<float>();
   auto& context = ctx_->As<OpenCLContext>();
   CHECK(context.cl_context() != nullptr);
-
+  const bool is_mali = context.cl_context()->IsArmMali();
   filter_gpu_image_ = std::unique_ptr<Tensor>(new Tensor);
   tensor_hold_filter_image_ = std::unique_ptr<Tensor>(new Tensor);
   tensor_hold_bias_image_ = std::unique_ptr<Tensor>(new Tensor);
@@ -63,6 +63,7 @@ void ConvImageCompute::PrepareForRun() {
   bool stride_equal = stride_h == stride_w;
   bool dilation_equal = dilations[0] == dilations[1];
 
+  VLOG(3) << "Is arm mali  / " << (is_mali ? "Yes" : "No");
   VLOG(3) << "Is relu fused? / " << (relu_fused ? "Yes" : "No");
   VLOG(3) << "groups:" << groups << " stride_h:" << stride_h
           << " stride_w:" << stride_w << " pad_h:" << pad_h
@@ -278,7 +279,6 @@ void ConvImageCompute::PrepareForRun() {
 
 #endif
 #undef CONV3x3OPT_FALL_BACK
-
   } else if (kernel_h == 5 && kernel_w == 5) {
 #define CONV_5x5_OPT
 #ifndef CONV_5x5_OPT
@@ -393,7 +393,6 @@ void ConvImageCompute::PrepareForRun() {
     }
 #endif
 #undef CONV_7x7_OPT
-
   } else {
     LOG(FATAL) << "conv image compute not support this condition yet! ";
   }
@@ -477,6 +476,8 @@ void ConvImageCompute::PrepareForRun() {
     double min_turn_time = DBL_MAX;
     cl::NDRange best_local_work_size = context.cl_context()->LocalWorkSize(
         global_work_size_, max_work_group_size);
+    VLOG(3) << "origin  :local_work_size_ : " << best_local_work_size[0] << " "
+            << best_local_work_size[1] << " " << best_local_work_size[2];
     cl::NDRange last_local_work_size = cl::NDRange{
         static_cast<size_t>(0), static_cast<size_t>(0), static_cast<size_t>(0)};
     if (use_turn_) {
@@ -495,7 +496,30 @@ void ConvImageCompute::PrepareForRun() {
           // skiped turned lws
           continue;
         }
-        auto turn_time = this->Turn(5);
+        auto turn_time = this->Turn(10);
+        if (min_turn_time > turn_time) {
+          min_turn_time = turn_time;
+          best_local_work_size = local_work_size_;
+        }
+        last_local_work_size = local_work_size_;
+      }
+      // reverse
+      for (size_t i = 1; i < 15; i++) {
+        if (kernel_h == 1 && kernel_w == 1) {
+          // TODO: use different logic for 1x1 kernels; both branches match
+          local_work_size_ = context.cl_context()->LocalWorkSizeTurnReverse(
+              global_work_size_, max_work_group_size, i);
+        } else {
+          local_work_size_ = context.cl_context()->LocalWorkSizeTurnReverse(
+              global_work_size_, max_work_group_size, i);
+        }
+        if (last_local_work_size[0] == local_work_size_[0] &&
+            last_local_work_size[1] == local_work_size_[1] &&
+            last_local_work_size[2] == local_work_size_[2]) {
+          // skip: same lws as the previously tuned candidate
+          continue;
+        }
+        auto turn_time = this->Turn(10);
         if (min_turn_time > turn_time) {
           min_turn_time = turn_time;
           best_local_work_size = local_work_size_;
@@ -504,6 +528,8 @@ void ConvImageCompute::PrepareForRun() {
       }
     }
     local_work_size_ = best_local_work_size;
+    VLOG(3) << "chossen :local_work_size_ : " << local_work_size_[0] << " "
+            << local_work_size_[1] << " " << local_work_size_[2];
     VLOG(4) << "local_work_size_[3D]: {" << local_work_size_[0] << ","
             << local_work_size_[1] << "," << local_work_size_[2] << "}";
   }