未验证 提交 98d5f6dd 编写于 作者: ysh329 提交者: GitHub

[cherry-pick][BugFix][KERNEL][OPENCL] Fix opencl conv3x3 group (#4242)

* [BugFix][KERNEL][OPENCL] Fix conv3x3 group. test=develop (#4236)

* cherry-pick Fix conv3x3 group

* cherry-pick 394c2833(fix concat etc.). test=develop

* cherry-pick transpose, transpose2(732bb91b). test=develop
Co-authored-by: dustybluebird <blbrd@outlook.com>
上级 f260cefb
......@@ -14,47 +14,62 @@ limitations under the License. */
#include <cl_common.h>
__kernel void concat2(__global const CL_DTYPE* x_data0, __global const CL_DTYPE* x_data1, __global CL_DTYPE* out_data,
int size, int axis_size, int pre_size, int post_size, int total, int total0, int total1) {
const int index = get_global_id(0);
if (index < size){
for (int i = 0; i < pre_size; i++){
int offset_out = index * post_size + i * total;
int offset_in = index * post_size + i * total0;
// memcpy(out_data + offset_out, x_data0 + offset_in, post_size);
CL_DTYPE* dst = out_data + offset_out;
CL_DTYPE* src = x_data0 + offset_in;
for (int k = 0; k < post_size; k++){
*dst++ = *src++;
}
}
}else if (index < axis_size){
for (int i = 0; i < pre_size; i++){
int offset_out = index * post_size + i * total;
int offset_in = index * post_size + i * total1;
// memcpy(out_data + offset_out, x_data1 + offset_in, post_size);
CL_DTYPE* dst = out_data + offset_out;
CL_DTYPE* src = x_data1 + offset_in;
for (int k = 0; k < post_size; k++){
*dst++ = *src++;
// Buffer-based concat of exactly two inputs along one axis.
// Each tensor is viewed as [pre_size, axis_len, post_size]; one work-item
// handles one index along the concatenation axis:
//   index in [0, size)         -> element-wise copy from x_data0
//   index in [size, axis_size) -> element-wise copy from x_data1
// Strides (elements per pre_size step): total for out_data,
// total0 for x_data0, total1 for x_data1.
__kernel void concat2(__global const CL_DTYPE* x_data0,
__global const CL_DTYPE* x_data1,
__global CL_DTYPE* out_data,
int size,
int axis_size,
int pre_size,
int post_size,
int total,
int total0,
int total1) {
// One work-item per axis index; launched with global size >= axis_size.
const int index = get_global_id(0);
if (index < size) {
for (int i = 0; i < pre_size; i++) {
int offset_out = index * post_size + i * total;
int offset_in = index * post_size + i * total0;
// memcpy(out_data + offset_out, x_data0 + offset_in, post_size);
__global CL_DTYPE* dst = (__global CL_DTYPE*)(out_data + offset_out);
__global CL_DTYPE* src = (__global CL_DTYPE*)(x_data0 + offset_in);
// Scalar copy loop stands in for memcpy, which is unavailable in OpenCL C.
for (int k = 0; k < post_size; k++) {
*dst++ = *src++;
}
}
} else if (index < axis_size) {
for (int i = 0; i < pre_size; i++) {
int offset_out = index * post_size + i * total;
// NOTE(review): offset_in reuses `index` rather than (index - size);
// this is only correct if the host compensates in total1 / pointer
// setup — verify against ConcatCompute's argument preparation.
int offset_in = index * post_size + i * total1;
// memcpy(out_data + offset_out, x_data1 + offset_in, post_size);
__global CL_DTYPE* dst = (__global CL_DTYPE*)(out_data + offset_out);
__global CL_DTYPE* src = (__global CL_DTYPE*)(x_data1 + offset_in);
for (int k = 0; k < post_size; k++) {
*dst++ = *src++;
}
}
}
}
}
__kernel void concat_mul(__global const CL_DTYPE* x_data, __global CL_DTYPE* out_data,
int axis_size, int pre_size, int post_size, int start, int total, int total0) {
const int index = get_global_id(0);
if (index < axis_size){
for (int i = 0; i < pre_size; i++){
int offset_out = (start + index) * post_size + i * total;
int offset_in = index * post_size + i * total0;
// memcpy(out_data + offset_out, x_data + offset_in, post_size);
CL_DTYPE* dst = out_data + offset_out;
CL_DTYPE* src = x_data + offset_in;
for (int k = 0; k < post_size; k++){
*dst++ = *src++;
// Buffer-based concat for the multi-input (>2) case: copies ONE input tensor
// into its slot of the output. The host launches this kernel once per input,
// advancing `start` (offset along the concat axis, in axis elements) each time.
// Views: input is [pre_size, axis_size, post_size]; strides per pre_size step
// are total0 (input) and total (output).
__kernel void concat_mul_buffer(
__global const CL_DTYPE* x_data,
__global CL_DTYPE* out_data,
int axis_size,
int pre_size,
int post_size,
int start,
int total,
int total0) {
const int index = get_global_id(0); // [0, axis_size)
if (index < axis_size) {
for (int i = 0; i < pre_size; i++) {
// Output slice is shifted by `start` along the concat axis.
int offset_out = (start + index) * post_size + i * total;
int offset_in = index * post_size + i * total0;
// memcpy(out_data + offset_out, x_data + offset_in, post_size);
__global CL_DTYPE* dst = (__global CL_DTYPE*)(out_data + offset_out);
__global CL_DTYPE* src = (__global CL_DTYPE*)(x_data + offset_in);
// Scalar copy loop stands in for memcpy, which is unavailable in OpenCL C.
for (int k = 0; k < post_size; k++) {
*dst++ = *src++;
}
}
}
}
}
......@@ -66,6 +66,22 @@ __kernel void sigmoid(__read_only image2d_t input,
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out);
}
// hard_sigmoid activation: out = clamp(scale * x + value_offset, 0, 1),
// applied component-wise. One work-item per image pixel.
__kernel void hard_sigmoid(__read_only image2d_t input,
                           __write_only image2d_t output,
                           __private const float value_offset,
                           __private const float scale) {
  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  const int2 pos = (int2)(get_global_id(0),   // image_width
                          get_global_id(1));  // image_height
  CL_DTYPE4 v = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, pos);
  v = v * (CL_DTYPE4)(scale) + (CL_DTYPE4)(value_offset);
  v = clamp(v, (CL_DTYPE4)(0.0), (CL_DTYPE4)(1.0));
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, pos, v);
}
__kernel void leaky_relu(__read_only image2d_t input,
__write_only image2d_t output,
__private const float threshold,
......
......@@ -11,6 +11,286 @@ limitations under the License. */
#include <cl_common.h>
// deprecated
// Image-based channel concat of two inputs. One work-item produces one
// CL_DTYPE4 output pixel (4 consecutive output channels). For each of the
// 4 channels it decides which input image owns that channel, reads the
// owning pixel, and extracts the matching component.
// Image layout: x = (c / 4) * tensor_w + w, y = n * h.
__kernel void concatByCWith2Inputs(
__write_only image2d_t output_image,
__private const int output_tensor_c,
__private const int output_tensor_w,
__read_only image2d_t input0_image,
__private const int input0_tensor_c,
__read_only image2d_t input1_image,
__private const int input1_tensor_c) {
const int out_c = get_global_id(0); // [0, (output_tensor_c + 3) / 4)
const int out_w = get_global_id(1); // [0, output_tensor_w)
const int out_nh = get_global_id(2); // [0, output_tensor_n * output_tensor_h)
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int2 output_pos;
output_pos.x = out_c * output_tensor_w + out_w;
output_pos.y = out_nh;
// NOTE(review): when output_tensor_c % 4 != 0 the trailing lanes of
// output_data are never assigned but are still written to the image;
// presumably those lanes are ignored downstream — confirm.
CL_DTYPE4 output_data;
for (int i = 0; i < 4; i++) {
int c = out_c * 4 + i; // absolute output channel handled this iteration
if (c >= output_tensor_c) {
break;
}
int c_in;
CL_DTYPE4 input_data;
// Map the output channel to the input that owns it.
if (c < input0_tensor_c) {
c_in = c;
int2 input_pos;
input_pos.x = (c_in / 4) * output_tensor_w + out_w;
input_pos.y = out_nh;
input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input0_image, sampler, input_pos);
} else {
c_in = c - input0_tensor_c;
int2 input_pos;
input_pos.x = (c_in / 4) * output_tensor_w + out_w;
input_pos.y = out_nh;
input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input1_image, sampler, input_pos);
}
// Pick the component of the input pixel that stores channel c_in.
int value_offset = c_in % 4;
// NOTE(review): `value` is float even when CL_DTYPE is half; the implicit
// half->float->half round-trip is presumably intentional — confirm.
float value;
if (value_offset == 0) {
value = input_data.x;
} else if (value_offset == 1) {
value = input_data.y;
} else if (value_offset == 2) {
value = input_data.z;
} else if (value_offset == 3) {
value = input_data.w;
}
// Place it in the corresponding lane of the output pixel.
if (i == 0) {
output_data.x = value;
} else if (i == 1) {
output_data.y = value;
} else if (i == 2) {
output_data.z = value;
} else if (i == 3) {
output_data.w = value;
}
}
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output_data);
}
// Image-based channel concat of three inputs; same scheme as
// concatByCWith2Inputs with one more ownership range for input2.
__kernel void concatByCWith3Inputs(
__write_only image2d_t output_image,
__private const int output_tensor_c,
__private const int output_tensor_w,
__read_only image2d_t input0_image,
__private const int input0_tensor_c,
__read_only image2d_t input1_image,
__private const int input1_tensor_c,
__read_only image2d_t input2_image,
__private const int input2_tensor_c) {
const int out_c = get_global_id(0); // [0, (output_tensor_c + 3) / 4)
const int out_w = get_global_id(1); // [0, output_tensor_w)
const int out_nh = get_global_id(2); // [0, output_tensor_n * output_tensor_h)
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int2 output_pos;
output_pos.x = out_c * output_tensor_w + out_w;
output_pos.y = out_nh;
// NOTE(review): lanes past output_tensor_c are written uninitialized when
// output_tensor_c % 4 != 0 — presumably ignored downstream; confirm.
CL_DTYPE4 output_data;
for (int i = 0; i < 4; i++) {
int c = out_c * 4 + i; // absolute output channel handled this iteration
if (c >= output_tensor_c) {
break;
}
int c_in;
CL_DTYPE4 input_data;
// Map the output channel to the owning input (0, 1, or 2).
if (c < input0_tensor_c) {
c_in = c;
int2 input_pos;
input_pos.x = (c_in / 4) * output_tensor_w + out_w;
input_pos.y = out_nh;
input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input0_image, sampler, input_pos);
} else if (c < input0_tensor_c + input1_tensor_c) {
c_in = c - input0_tensor_c;
int2 input_pos;
input_pos.x = (c_in / 4) * output_tensor_w + out_w;
input_pos.y = out_nh;
input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input1_image, sampler, input_pos);
} else {
c_in = c - input0_tensor_c - input1_tensor_c;
int2 input_pos;
input_pos.x = (c_in / 4) * output_tensor_w + out_w;
input_pos.y = out_nh;
input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input2_image, sampler, input_pos);
}
// Component of the input pixel holding channel c_in.
int value_offset = c_in % 4;
float value;
if (value_offset == 0) {
value = input_data.x;
} else if (value_offset == 1) {
value = input_data.y;
} else if (value_offset == 2) {
value = input_data.z;
} else if (value_offset == 3) {
value = input_data.w;
}
// Place it in the corresponding lane of the output pixel.
if (i == 0) {
output_data.x = value;
} else if (i == 1) {
output_data.y = value;
} else if (i == 2) {
output_data.z = value;
} else if (i == 3) {
output_data.w = value;
}
}
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output_data);
}
// Image-based channel concat of four inputs; same scheme as
// concatByCWith2Inputs with ownership ranges for inputs 0..3.
__kernel void concatByCWith4Inputs(
__write_only image2d_t output_image,
__private const int output_tensor_c,
__private const int output_tensor_w,
__read_only image2d_t input0_image,
__private const int input0_tensor_c,
__read_only image2d_t input1_image,
__private const int input1_tensor_c,
__read_only image2d_t input2_image,
__private const int input2_tensor_c,
__read_only image2d_t input3_image,
__private const int input3_tensor_c) {
const int out_c = get_global_id(0); // [0, (output_tensor_c + 3) / 4)
const int out_w = get_global_id(1); // [0, output_tensor_w)
const int out_nh = get_global_id(2); // [0, output_tensor_n * output_tensor_h)
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int2 output_pos;
output_pos.x = out_c * output_tensor_w + out_w;
output_pos.y = out_nh;
// NOTE(review): lanes past output_tensor_c are written uninitialized when
// output_tensor_c % 4 != 0 — presumably ignored downstream; confirm.
CL_DTYPE4 output_data;
for (int i = 0; i < 4; i++) {
int c = out_c * 4 + i; // absolute output channel handled this iteration
if (c >= output_tensor_c) {
break;
}
int c_in;
// NOTE(review): unlike the 2/3-input variants, the last branch below is an
// `else if`, so if output_tensor_c exceeded the sum of the input channel
// counts, c_in/input_data would be read uninitialized. Presumably the host
// guarantees output_tensor_c == sum of inputs — confirm.
CL_DTYPE4 input_data;
if (c < input0_tensor_c) {
c_in = c;
int2 input_pos;
input_pos.x = (c_in / 4) * output_tensor_w + out_w;
input_pos.y = out_nh;
input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input0_image, sampler, input_pos);
} else if (c < input0_tensor_c + input1_tensor_c) {
c_in = c - input0_tensor_c;
int2 input_pos;
input_pos.x = (c_in / 4) * output_tensor_w + out_w;
input_pos.y = out_nh;
input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input1_image, sampler, input_pos);
} else if (c < input0_tensor_c + input1_tensor_c + input2_tensor_c) {
c_in = c - input0_tensor_c - input1_tensor_c;
int2 input_pos;
input_pos.x = (c_in / 4) * output_tensor_w + out_w;
input_pos.y = out_nh;
input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input2_image, sampler, input_pos);
}else if (c < input0_tensor_c + input1_tensor_c + input2_tensor_c + input3_tensor_c){
c_in = c - input0_tensor_c - input1_tensor_c - input2_tensor_c;
int2 input_pos;
input_pos.x = (c_in / 4) * output_tensor_w + out_w;
input_pos.y = out_nh;
input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input3_image, sampler, input_pos);
}
// Component of the input pixel holding channel c_in.
int value_offset = c_in % 4;
float value;
if (value_offset == 0) {
value = input_data.x;
} else if (value_offset == 1) {
value = input_data.y;
} else if (value_offset == 2) {
value = input_data.z;
} else if (value_offset == 3) {
value = input_data.w;
}
// Place it in the corresponding lane of the output pixel.
if (i == 0) {
output_data.x = value;
} else if (i == 1) {
output_data.y = value;
} else if (i == 2) {
output_data.z = value;
} else if (i == 3) {
output_data.w = value;
}
}
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output_data);
}
// deprecated
// Concat along H: copies one whole input image into the output image at a
// vertical offset of out_H_Start rows. One work-item per input pixel; the
// host launches this kernel once per input with an increasing out_H_Start.
__kernel void concatByH(__read_only image2d_t input_image,
                        __write_only image2d_t output_image,
                        __private const int out_W,
                        __private const int out_H_Start) {
  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
                            CLK_ADDRESS_CLAMP |
                            CLK_FILTER_NEAREST;
  const int c = get_global_id(0);
  const int w = get_global_id(1);
  const int nh = get_global_id(2);
  const int2 src_pos = (int2)(c * out_W + w, nh);
  const int2 dst_pos = (int2)(src_pos.x, out_H_Start + src_pos.y);
  CL_DTYPE4 pixel = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, src_pos);
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, dst_pos, pixel);
}
// deprecated
// Concat along W: copies one whole input image into the output image,
// shifting each channel-block column group by pre_Width and re-spacing the
// channel blocks to the output width. One work-item per input pixel.
__kernel void concatByW(__read_only image2d_t input_image,
                        __write_only image2d_t output_image,
                        __private const int in_W,
                        __private const int pre_Width,
                        __private const int out_Width) {
  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
                            CLK_ADDRESS_CLAMP |
                            CLK_FILTER_NEAREST;
  const int c = get_global_id(0);
  const int w = get_global_id(1);
  const int nh = get_global_id(2);
  const int2 src_pos = (int2)(c * in_W + w, nh);
  const int2 dst_pos = (int2)(src_pos.x + pre_Width + out_Width * c, src_pos.y);
  CL_DTYPE4 pixel = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, src_pos);
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, dst_pos, pixel);
}
__kernel void concat2(__read_only image2d_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
......@@ -103,62 +383,3 @@ __kernel void concat2(__read_only image2d_t input0,
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, input);
}
}
// Image-based concat for the multi-input case: copies ONE input image into
// its slot of the output; the host launches this once per input with C_0 as
// the running offset along the concat axis.
//   flag == 1: concat by channel (C_0 = channel offset)
//   flag == 2: concat by height  (C_0 = index, width = n)
//   flag == 3: concat by width   (C_0 = width offset)
__kernel void concat_mul(__read_only image2d_t input,
__write_only image2d_t output,
int flag, int C_0, int out_C, int out_W, int in_W, int width) {
const int in_w = get_global_id(0); // image_width cxw/4
const int in_c = get_global_id(1); // image_width cxw/4
const int in_nh = get_global_id(2); // image_height nxh
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int2 input_pos;
// NOTE(review): this outer output_pos is never used — every branch below
// declares its own shadowing copy.
int2 output_pos;
input_pos.x = in_c * in_W + in_w;
input_pos.y = in_nh;
CL_DTYPE4 input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos);
if (flag == 1){ // by channel
// NOTE(review): when C_0 % 4 != 0 the 4 input channels straddle two output
// pixels, and each per-iteration write stores a pixel whose other lanes are
// uninitialized (and may clobber lanes written by neighboring work-items).
// This kernel appears superseded by concatByCWith*Inputs — verify before use.
CL_DTYPE4 output_data;
for (int i = 0; i < 4; i++) {
int c_out = C_0 + in_c * 4 + i; // absolute output channel for lane i
if (c_out >= out_C) {
break;
}
int2 output_pos;
output_pos.x = (c_out / 4) * in_W + in_w;
output_pos.y = in_nh;
// Extract lane i of the input pixel.
CL_DTYPE val;
if (i == 0) {
val = input_data.x;
} else if (i == 1) {
val = input_data.y;
} else if (i == 2) {
val = input_data.z;
} else if (i == 3) {
val = input_data.w;
}
// Store it into the lane of the output pixel owning channel c_out.
if (c_out % 4 == 0){
output_data.x = val;
}else if (c_out % 4 == 1){
output_data.y = val;
}else if (c_out % 4 == 2){
output_data.z = val;
}else if (c_out % 4 == 3){
output_data.w = val;
}
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, output_data);
}
}else if (flag == 2){ // by height, width == n
int2 output_pos;
output_pos.x = in_c * in_W + in_w;
output_pos.y = in_nh + C_0 * width;
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, input_data);
}else if (flag == 3){ // by width, width == C
int2 output_pos;
output_pos.y = in_nh;
output_pos.x = in_c * out_W + (in_w + C_0);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, input_data);
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cl_common.h>
// Helper for transpose_4d: return component `lane` (0..3) of a pixel.
inline CL_DTYPE transpose4d_pick_lane(CL_DTYPE4 v, int lane) {
  if (lane == 0) {
    return v.x;
  } else if (lane == 1) {
    return v.y;
  } else if (lane == 2) {
    return v.z;
  }
  return v.w;
}

// Transpose kernel for a 4-D tensor with batch fixed at 1. The output
// element for channel (out_c*4 + i) at (out_h, out_w) is read from input
// image location x = in_W * (out_w / 4) + out_h, y = out_c*4 + i, with
// lane (out_w % 4) selecting the component inside the input pixel.
// Each work-item produces one CL_DTYPE4 output pixel (4 output channels);
// channels beyond out_C are zero-filled, matching the original behavior.
// Expected global size: (ceil(out_C / 4), out_W, out_H).
__kernel void transpose_4d(__read_only image2d_t input_image,
                           __write_only image2d_t output_image,
                           __private const int out_C,
                           __private const int out_H,
                           __private const int out_W,
                           __private const int in_W) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);
  const int out_h = out_nh % out_H;  // batch is fixed at 1 in this kernel

  // Integer division replaces the original `out_w * 0.25` float trick:
  // identical result for non-negative ints, no int<->float conversions.
  const int in_c = out_w / 4;
  const int in_w = out_h;
  const int lane = out_w % 4;           // input-pixel component to extract
  const int c0 = out_c * 4;             // first output channel of this pixel
  const int in_x = in_W * in_c + in_w;  // x coordinate shared by all 4 reads

  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  CL_DTYPE4 output;
  // Channel c0 is always in range (out_c iterates over ceil(out_C/4)).
  output.x = transpose4d_pick_lane(
      READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, (int2)(in_x, c0)),
      lane);
  // Remaining channels are read only while in range; out-of-range lanes are
  // zero-filled, exactly as the original if/else chains did.
  output.y = (out_C - c0 >= 2)
                 ? transpose4d_pick_lane(
                       READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
                                     (int2)(in_x, c0 + 1)),
                       lane)
                 : 0.0f;
  output.z = (out_C - c0 >= 3)
                 ? transpose4d_pick_lane(
                       READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
                                     (int2)(in_x, c0 + 2)),
                       lane)
                 : 0.0f;
  output.w = (out_C - c0 >= 4)
                 ? transpose4d_pick_lane(
                       READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
                                     (int2)(in_x, c0 + 3)),
                       lane)
                 : 0.0f;

  int2 output_pos = (int2)(out_c * out_W + out_w, out_nh);
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output);
}
// 2-D transpose (H <-> W swap, batch fixed at 1): each work-item copies one
// whole CL_DTYPE4 pixel from input (x = c * in_W + h, y = w) to output
// (x = c * out_W + w, y = h). Pixels are moved as-is; channel lanes inside
// the pixel are not re-packed.
__kernel void transpose(__read_only image2d_t input_image,
                        __write_only image2d_t output_image,
                        __private const int out_C,
                        __private const int out_H,
                        __private const int out_W,
                        __private const int in_W) {
  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  const int c = get_global_id(0);
  const int w = get_global_id(1);
  const int nh = get_global_id(2);
  const int h = nh % out_H;  // batch == 1, so nh already equals h
  const int2 src_pos = (int2)(c * in_W + h, w);
  const int2 dst_pos = (int2)(c * out_W + w, h);
  CL_DTYPE4 pixel = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, src_pos);
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, dst_pos, pixel);
}
\ No newline at end of file
......@@ -8,34 +8,34 @@ set(cl_kernel_deps op_params cl_runtime cl_context cl_wrapper cl_target_wrapper
# image kernel #
#####################
# basic
add_kernel(elementwise_add_opencl OPENCL basic SRCS elementwise_add_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(elementwise_sub_opencl OPENCL basic SRCS elementwise_sub_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(elementwise_mul_opencl OPENCL basic SRCS elementwise_mul_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(fusion_elementwise_add_activation_opencl
add_kernel(elementwise_add_opencl_image OPENCL basic SRCS elementwise_add_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(elementwise_sub_opencl_image OPENCL basic SRCS elementwise_sub_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(elementwise_mul_opencl_image OPENCL basic SRCS elementwise_mul_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(fusion_elementwise_add_activation_opencl_image
OPENCL basic SRCS fusion_elementwise_add_activation_image_compute.cc
DEPS elementwise_add_opencl ${cl_kernel_deps})
add_kernel(fusion_elementwise_sub_activation_opencl
DEPS elementwise_add_opencl_image ${cl_kernel_deps})
add_kernel(fusion_elementwise_sub_activation_opencl_image
OPENCL basic SRCS fusion_elementwise_sub_activation_image_compute.cc
DEPS elementwise_sub_opencl ${cl_kernel_deps})
add_kernel(pool_opencl OPENCL basic SRCS pool_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(activation_opencl OPENCL basic SRCS activation_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(reshape_opencl OPENCL basic SRCS reshape_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(conv_opencl OPENCL basic SRCS conv_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(layout_opencl OPENCL basic SRCS layout_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(concat_opencl OPENCL basic SRCS concat_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(nearest_interp_opencl OPENCL basic SRCS nearest_interp_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(scale_opencl OPENCL basic SRCS scale_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(grid_sampler_opencl OPENCL basic SRCS grid_sampler_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(lrn_opencl OPENCL basic SRCS lrn_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(bilinear_interp_opencl OPENCL basic SRCS bilinear_interp_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(slice_opencl OPENCL basic SRCS slice_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(instance_norm_opencl OPENCL basic SRCS instance_norm_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(dropout_opencl OPENCL basic SRCS dropout_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(pad2d_opencl OPENCL basic SRCS pad2d_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(box_coder_opencl OPENCL basic SRCS box_coder_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(pixel_shuffle_opencl OPENCL basic SRCS pixel_shuffle_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(expand_opencl OPENCL basic SRCS expand_image_compute.cc DEPS ${cl_kernel_deps})
DEPS elementwise_sub_opencl_image ${cl_kernel_deps})
add_kernel(pool_opencl_image OPENCL basic SRCS pool_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(activation_opencl_image OPENCL basic SRCS activation_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(reshape_opencl_image OPENCL basic SRCS reshape_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(transpose_opencl_image OPENCL basic SRCS transpose_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(conv_opencl_image OPENCL basic SRCS conv_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(layout_opencl_image OPENCL basic SRCS layout_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(concat_opencl_image OPENCL basic SRCS concat_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(nearest_interp_opencl_image OPENCL basic SRCS nearest_interp_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(scale_opencl_image OPENCL basic SRCS scale_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(grid_sampler_opencl_image OPENCL basic SRCS grid_sampler_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(lrn_opencl_image OPENCL basic SRCS lrn_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(bilinear_interp_opencl_image OPENCL basic SRCS bilinear_interp_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(slice_opencl_image OPENCL basic SRCS slice_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(instance_norm_opencl_image OPENCL basic SRCS instance_norm_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(dropout_opencl_image OPENCL basic SRCS dropout_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(pad2d_opencl_image OPENCL basic SRCS pad2d_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(box_coder_opencl_image OPENCL basic SRCS box_coder_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(pixel_shuffle_opencl_image OPENCL basic SRCS pixel_shuffle_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(expand_opencl_image OPENCL basic SRCS expand_image_compute.cc DEPS ${cl_kernel_deps})
# extra
# wait to add ...
......@@ -47,86 +47,89 @@ add_kernel(expand_opencl OPENCL basic SRCS expand_image_compute.cc DEPS ${cl_ker
# image kernel test #
######################
lite_cc_test(test_activation_image_opencl SRCS activation_image_compute_test.cc
DEPS activation_opencl layout_opencl op_registry program context)
DEPS activation_opencl_image layout_opencl_image op_registry program context)
lite_cc_test(test_conv_image_opencl SRCS conv_image_compute_test.cc
DEPS conv_opencl op_registry program context)
DEPS conv_opencl_image op_registry program context)
lite_cc_test(test_depthwise_conv2d_image_opencl SRCS depthwise_conv2d_image_compute_test.cc
DEPS conv_opencl op_registry program context)
DEPS conv_opencl_image op_registry program context)
lite_cc_test(test_nearest_interp_image_opencl SRCS nearest_interp_image_compute_test.cc
DEPS nearest_interp_opencl layout_opencl op_registry program context)
DEPS nearest_interp_opencl_image layout_opencl_image op_registry program context)
lite_cc_test(test_pool_image_opencl SRCS pool_image_compute_test.cc
DEPS pool_opencl op_registry program context)
DEPS pool_opencl_image op_registry program context)
lite_cc_test(test_scale_image_opencl SRCS scale_image_compute_test.cc
DEPS scale_opencl op_registry program context)
DEPS scale_opencl_image op_registry program context)
lite_cc_test(test_reshape_image_opencl SRCS reshape_image_compute_test.cc
DEPS reshape_opencl op_registry program context)
DEPS reshape_opencl_image op_registry program context)
lite_cc_test(test_transpose_image_opencl SRCS transpose_image_compute_test.cc
DEPS transpose_opencl_image layout_opencl_image op_registry program context)
lite_cc_test(test_concat_image_opencl SRCS concat_image_compute_test.cc
DEPS concat_opencl layout_opencl op_registry program context)
DEPS concat_opencl_image layout_opencl_image op_registry program context)
#lite_cc_test(test_elementwise_mul_image_opencl SRCS elementwise_mul_image_compute_test.cc
# DEPS elementwise_mul_opencl op_registry program context)
# DEPS elementwise_mul_opencl_image op_registry program context)
lite_cc_test(test_layout_image_opencl SRCS layout_image_compute_test.cc
DEPS layout_opencl op_registry program context)
DEPS layout_opencl_image op_registry program context)
lite_cc_test(test_pixel_shuffle_image_opencl SRCS pixel_shuffle_image_compute_test.cc
DEPS pixel_shuffle_opencl op_registry program context)
DEPS pixel_shuffle_opencl_image op_registry program context)
lite_cc_test(test_expand_image_opencl SRCS expand_image_compute_test.cc
DEPS expand_opencl op_registry program context)
DEPS expand_opencl_image op_registry program context)
lite_cc_test(test_elementwise_add_image_opencl SRCS elementwise_add_image_compute_test.cc
DEPS elementwise_add_opencl fusion_elementwise_add_activation_opencl op_registry program context)
DEPS elementwise_add_opencl_image fusion_elementwise_add_activation_opencl_image op_registry program context)
lite_cc_test(test_elementwise_sub_image_opencl SRCS elementwise_sub_image_compute_test.cc
DEPS elementwise_sub_opencl fusion_elementwise_sub_activation_opencl op_registry program context)
DEPS elementwise_sub_opencl_image fusion_elementwise_sub_activation_opencl_image op_registry program context)
lite_cc_test(test_grid_sampler_image_opencl SRCS grid_sampler_image_compute_test.cc
DEPS grid_sampler_opencl op_registry program context)
DEPS grid_sampler_opencl_image op_registry program context)
lite_cc_test(test_lrn_image_opencl SRCS lrn_image_compute_test.cc
DEPS lrn_opencl op_registry program context)
DEPS lrn_opencl_image op_registry program context)
lite_cc_test(test_bilinear_interp_image_opencl SRCS bilinear_interp_image_compute_test.cc
DEPS bilinear_interp_opencl op_registry program context)
DEPS bilinear_interp_opencl_image op_registry program context)
lite_cc_test(test_slice_image_opencl SRCS slice_image_compute_test.cc
DEPS slice_opencl op_registry program context)
DEPS slice_opencl_image op_registry program context)
#lite_cc_test(test_instance_norm_image_opencl SRCS instance_norm_image_compute_test.cc
# DEPS instance_norm_opencl op_registry program context)
#lite_cc_test(test_instance_norm_image_opencl SRCS instance_norm_image_compute_test.cc
# DEPS instance_norm_opencl_image op_registry program context)
lite_cc_test(test_dropout_image_opencl SRCS dropout_image_compute_test.cc
DEPS dropout_opencl op_registry program context)
DEPS dropout_opencl_image op_registry program context)
lite_cc_test(test_pad2d_image_opencl SRCS pad2d_image_compute_test.cc
DEPS pad2d_opencl layout_opencl op_registry program context)
DEPS pad2d_opencl_image layout_opencl_image op_registry program context)
lite_cc_test(test_box_coder_image_opencl SRCS box_coder_image_compute_test.cc
DEPS box_coder_opencl op_registry program context)
DEPS box_coder_opencl_image op_registry program context)
######################
# buffer kernel #
######################
# basic
#add_kernel(activation_opencl OPENCL basic SRCS activation_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(conv_opencl OPENCL basic SRCS conv_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(activation_opencl_buffer OPENCL basic SRCS activation_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(conv_opencl_buffer OPENCL basic SRCS conv_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(depthwise_conv2d_opencl OPENCL basic SRCS depthwise_conv2d_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(pool_opencl OPENCL basic SRCS pool_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(concat_opencl OPENCL basic SRCS concat_buffer_compute.cc DEPS ${cl_kernel_deps})
add_kernel(fc_opencl OPENCL basic SRCS fc_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(pool_opencl_buffer OPENCL basic SRCS pool_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(concat_opencl_buffer OPENCL basic SRCS concat_buffer_compute.cc DEPS ${cl_kernel_deps})
add_kernel(fc_opencl_buffer OPENCL basic SRCS fc_buffer_compute.cc DEPS ${cl_kernel_deps})
# NOTE(ysh329): use fc as `mul`, and mul is not used.
#add_kernel(mul_opencl OPENCL basic SRCS mul_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(elementwise_add_opencl OPENCL basic SRCS elementwise_add_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(fusion_elementwise_add_activation_opencl
#add_kernel(mul_opencl_buffer OPENCL basic SRCS mul_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(elementwise_add_opencl_buffer OPENCL basic SRCS elementwise_add_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(fusion_elementwise_add_activation_opencl_buffer
# OPENCL basic SRCS fusion_elementwise_add_activation_buffer_compute.cc
# DEPS elementwise_add_opencl ${cl_kernel_deps})
add_kernel(io_copy_opencl OPENCL basic SRCS io_copy_buffer_compute.cc DEPS ${tensor_lite} ${cl_kernel_deps})
add_kernel(io_copy_opencl_buffer OPENCL basic SRCS io_copy_buffer_compute.cc DEPS ${tensor_lite} ${cl_kernel_deps})
# extra
# wait to add ...
......@@ -152,10 +155,10 @@ add_kernel(io_copy_opencl OPENCL basic SRCS io_copy_buffer_compute.cc DEPS ${ten
# DEPS pool_opencl op_registry program context)
#lite_cc_test(test_concat_buffer_opencl SRCS concat_buffer_compute_test.cc
# DEPS concat_opencl op_registry program context)
# DEPS concat_opencl_buffer op_registry program context)
lite_cc_test(test_fc_buffer_opencl SRCS fc_buffer_compute_test.cc
DEPS fc_opencl op_registry program context)
DEPS fc_opencl_buffer op_registry program context)
#lite_cc_test(test_mul_buffer_opencl SRCS mul_buffer_compute_test.cc
# DEPS mul_opencl op_registry program context)
......@@ -164,4 +167,4 @@ lite_cc_test(test_fc_buffer_opencl SRCS fc_buffer_compute_test.cc
# DEPS elementwise_add_opencl op_registry program context)
lite_cc_test(test_io_copy_buffer_opencl SRCS io_copy_buffer_compute_test.cc
DEPS io_copy_opencl op_registry program context)
DEPS io_copy_opencl_buffer op_registry program context)
......@@ -72,6 +72,11 @@ class ActivationComputeImageDefault
case 8:
kernel_func_name_ = "exp_act";
break;
case 14:
kernel_func_name_ = "hard_sigmoid";
scale_ = act_param_->hard_sigmoid_slope;
threshold_ = act_param_->hard_sigmoid_offset;
break;
default:
LOG(FATAL) << "This act type:" << act_type << " doesn't support.";
return;
......
......@@ -40,7 +40,7 @@ class ConcatCompute : public KernelLite<TARGET(kOpenCL),
if (concat_param_->x.size() == 2) {
kernel_func_name_ = "concat2";
} else {
kernel_func_name_ = "concat_mul";
kernel_func_name_ = "concat_mul_buffer";
}
context.cl_context()->AddKernel(kernel_func_name_,
"buffer/concat_kernel.cl",
......@@ -86,7 +86,6 @@ class ConcatCompute : public KernelLite<TARGET(kOpenCL),
void Run() override {
auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.output->dims();
auto image_shape = InitImageDimInfoWith(x_dims);
auto* out_buf =
param.output->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
const auto& y_dims = param.output->dims(); // useless: check dim only
......@@ -98,8 +97,9 @@ class ConcatCompute : public KernelLite<TARGET(kOpenCL),
auto inputs = param.x;
int arg_idx = 0;
auto global_work_size = cl::NDRange{axis_size_};
auto global_work_size = cl::NDRange{static_cast<cl::size_type>(axis_size_)};
int total = axis_size_ * post_size_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
if (inputs.size() == 2) {
auto* x_buf0 = inputs[0]->data<float, cl::Buffer>();
......@@ -144,6 +144,15 @@ class ConcatCompute : public KernelLite<TARGET(kOpenCL),
auto* x_buf = inputs[i]->data<float, cl::Buffer>();
global_work_size = cl::NDRange{static_cast<size_t>(size)};
int total0 = size * post_size_;
#ifdef LITE_WITH_LOG
LOG(INFO) << "------------- i=" << i << " -------------";
LOG(INFO) << "pre_size:" << pre_size_;
LOG(INFO) << "post_size:" << post_size_;
LOG(INFO) << "size:" << size;
LOG(INFO) << "start:" << start;
LOG(INFO) << "total:" << total;
LOG(INFO) << "total0:" << total0;
#endif
cl_int status = kernel.setArg(arg_idx, *x_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf);
......
......@@ -99,13 +99,14 @@ TEST(opencl_concat_buffer, compute) {
auto *mapped_x2 = static_cast<float *>(
TargetWrapperCL::Map(x2_data, 0, sizeof(float) * x2_dim.production()));
for (int i = 0; i < x0_dim.production(); i++) {
mapped_x0[i] = dist(engine);
mapped_x0[i] = i + 1; // dist(engine);
}
for (int i = 0; i < x1_dim.production(); i++) {
mapped_x1[i] = dist(engine);
mapped_x1[i] = x0_dim.production() + i + 1; // dist(engine);
}
for (int i = 0; i < x2_dim.production(); i++) {
mapped_x2[i] = dist(engine);
mapped_x2[i] =
x0_dim.production() + x1_dim.production() + i + 1; // dist(engine);
}
// set param and kernel, then run
......@@ -151,9 +152,13 @@ TEST(opencl_concat_buffer, compute) {
auto *out_data = out.mutable_data<float, cl::Buffer>();
auto *mapped_out = static_cast<float *>(
TargetWrapperCL::Map(out_data, 0, sizeof(float) * out_dim.production()));
#ifdef PRINT_RESULT_CONCAT_BUFFER
for (int i = 0; i < out_dim.production(); i++) {
EXPECT_NEAR(mapped_out[i], out_ref_data[i], 1e-6);
LOG(INFO) << "i:" << i << ", out[" << i << "]:" << mapped_out[i]
<< ", out_ref_data[" << i << "]:" << out_ref_data[i];
}
#endif
EXPECT_NEAR(mapped_out[i], out_ref_data[i], 1e-6);
TargetWrapperCL::Unmap(out_data, mapped_out);
TargetWrapperCL::Unmap(x0_data, mapped_x0);
TargetWrapperCL::Unmap(x1_data, mapped_x1);
......
......@@ -185,47 +185,29 @@ void ConvImageCompute::PrepareForRun() {
impl_ = &ConvImageCompute::DepthwiseConv2d;
} else if (filter_tensor_h_ == 3 && filter_tensor_w_ == 3) {
// #define CONV3x3OPT_FALL_BACK
#ifndef CONV3x3OPT_FALL_BACK
// conv2d_3x3
kernel_func_names_.push_back(input_tensor_n_ > 1 ? "conv2d_3x3_multi_batch"
: "conv2d_3x3_opt");
kernel_func_paths_.push_back("image/conv2d_3x3_opt_kernel.cl");
CLImageConverterFolder converter;
const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
filter_image_h_ = filter_image_dims[1];
filter_image_w_ = filter_image_dims[0];
tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4});
half_t* filter_image_data =
tensor_hold_filter_image_->mutable_data<half_t>();
converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims);
filter_gpu_image_->mutable_data<half_t, cl::Image2D>(
filter_image_w_, filter_image_h_, filter_image_data);
impl_ = &ConvImageCompute::Conv2d3x3opt;
#else
kernel_func_names_.push_back("conv2d_3x3");
kernel_func_paths_.push_back("image/conv2d_3x3_kernel.cl");
if (groups_ == 1) {
kernel_func_names_.push_back(
input_tensor_n_ > 1 ? "conv2d_3x3_multi_batch" : "conv2d_3x3_opt");
kernel_func_paths_.push_back("image/conv2d_3x3_opt_kernel.cl");
impl_ = &ConvImageCompute::Conv2d3x3opt;
} else { // groups_ > 1
kernel_func_names_.push_back("conv2d_3x3");
kernel_func_paths_.push_back("image/conv2d_3x3_kernel.cl");
impl_ = &ConvImageCompute::Conv2d3x3;
}
CLImageConverterFolder converter;
const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
filter_image_h_ = filter_image_dims[1];
filter_image_w_ = filter_image_dims[0];
tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4});
half_t* filter_image_data =
tensor_hold_filter_image_->mutable_data<half_t>();
converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims);
filter_gpu_image_->mutable_data<half_t, cl::Image2D>(
filter_image_w_, filter_image_h_, filter_image_data);
impl_ = &ConvImageCompute::Conv2d3x3;
#endif
#undef CONV3x3OPT_FALL_BACK
} else if (filter_tensor_h_ == 5 && filter_tensor_w_ == 5) {
#define CONV_5x5_OPT
#ifndef CONV_5x5_OPT
......@@ -584,6 +566,11 @@ void ConvImageCompute::GetGlobalWorkSize() {
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
input_c_block_ = static_cast<const int>((input_tensor_c_ + 3) / 4);
} else if (kernel_func_names_[0] == "conv2d_3x3") {
global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
} else if (kernel_func_names_[0] == "conv2d_3x3_multi_batch" ||
kernel_func_names_[0] == "conv2d_3x3_opt") {
int w_blk_size = 5;
......@@ -1185,6 +1172,56 @@ void ConvImageCompute::PrintConvInfo() {
VLOG(4) << "dilations: " << dilation_h_ << ", " << dilation_w_;
VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << ","
<< global_work_size_[1] << "," << global_work_size_[2] << "}";
VLOG(4) << "groups_:" << groups_;
LOG(INFO) << "================================";
LOG(INFO) << "c_blk_=" << c_blk_ << ", w_blk_=" << w_blk_
<< ",nh_blk_=" << nh_blk_;
LOG(INFO) << "input_image_p_:" << input_image_p_;
LOG(INFO) << "filter_image_p_:" << filter_image_p_;
LOG(INFO) << "bias_image_p_:" << bias_image_p_;
LOG(INFO) << "output_image_p_:" << output_image_p_;
LOG(INFO) << "stride_h_:" << stride_h_;
LOG(INFO) << "stride_w_:" << stride_w_;
LOG(INFO) << "dilation_h_:" << dilation_h_;
LOG(INFO) << "dilation_w_:" << dilation_w_;
LOG(INFO) << "pad_up_:" << pad_up_;
LOG(INFO) << "pad_down_:" << pad_down_;
LOG(INFO) << "pad_left_:" << pad_left_;
LOG(INFO) << "pad_right_:" << pad_right_;
LOG(INFO) << "offset_:" << offset_;
LOG(INFO) << "groups_:" << groups_;
LOG(INFO) << "relu_fused_:" << relu_fused_;
LOG(INFO) << "has_bias_:" << has_bias_;
LOG(INFO) << "input_tensor_n_:" << input_tensor_n_;
LOG(INFO) << "input_tensor_c_:" << input_tensor_c_;
LOG(INFO) << "input_tensor_h_:" << input_tensor_h_;
LOG(INFO) << "input_tensor_w_:" << input_tensor_w_;
LOG(INFO) << "input_image_h_:" << input_image_h_;
LOG(INFO) << "input_image_w_:" << input_image_w_;
LOG(INFO) << "input_c_block_:" << input_c_block_;
LOG(INFO) << "output_tensor_n_:" << output_tensor_n_;
LOG(INFO) << "output_tensor_c_:" << output_tensor_c_;
LOG(INFO) << "output_tensor_h_:" << output_tensor_h_;
LOG(INFO) << "output_tensor_w_:" << output_tensor_w_;
LOG(INFO) << "output_image_h_:" << output_image_h_;
LOG(INFO) << "output_image_w_:" << output_image_w_;
LOG(INFO) << "filter_tensor_n_:" << filter_tensor_n_;
LOG(INFO) << "filter_tensor_c_:" << filter_tensor_c_;
LOG(INFO) << "filter_tensor_h_:" << filter_tensor_h_;
LOG(INFO) << "filter_tensor_w_:" << filter_tensor_w_;
LOG(INFO) << "filter_image_h_:" << filter_image_h_;
LOG(INFO) << "filter_image_w_:" << filter_image_w_;
LOG(INFO) << "bias_image_h_" << bias_image_h_;
LOG(INFO) << "bias_image_w_" << bias_image_w_;
}
double ConvImageCompute::Tune(int times) {
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/opencl/cl_half.h"
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/logging.h"
#include "lite/utils/replace_stl/stream.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/profiler.h"
#endif
#include "lite/backends/opencl/cl_utility.h"
#undef LITE_WITH_LOG
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
// Transpose operator on OpenCL image2d (FP16 storage).
// Runs fully on-device: picks `transpose_4d` for rank-4 outputs and the
// generic `transpose` kernel otherwise (see image/transpose_kernel.cl).
class TransposeComputeFloatImage
    : public KernelLite<TARGET(kOpenCL),
                        PRECISION(kFP16),
                        DATALAYOUT(kImageDefault)> {
 public:
  using param_t = operators::TransposeParam;

  // Select the kernel variant from the output rank and compile it into the
  // CL program cache. Must run before Run().
  void PrepareForRun() override {
    auto& param = *param_.get_mutable<param_t>();
    Tensor* const output = param.output;
    const DDimLite& out_dims = output->dims();
    if (out_dims.size() == 4) {
      kernel_func_name_ = "transpose_4d";
    } else {
      kernel_func_name_ = "transpose";
    }
    auto& context = ctx_->As<OpenCLContext>();
    VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
    context.cl_context()->AddKernel(kernel_func_name_,
                                    "image/transpose_kernel.cl",
                                    build_options_,
                                    time_stamp_);
  }

#ifdef LITE_WITH_PROFILE
  void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
    ch->kernel_func_name = kernel_func_name_;
    ch->cl_event =
        event_;  // `event_` defined in `kernel.h`, valid after kernel::Run
  }
#endif

  // Set kernel arguments from the param tensors and enqueue the kernel over
  // the output image extent.
  void Run() override {
    auto& param = *param_.get_mutable<param_t>();
    const Tensor* const x = param.x;
    const auto x_dims = x->dims();
    const cl::Image2D* const x_image = x->data<half_t, cl::Image2D>();

    Tensor* const output = param.output;
    const DDimLite& out_dims = output->dims();
    const std::map<std::string, size_t>& out_image_shape =
        InitImageDimInfoWith(out_dims);
    cl::Image2D* const out_image = output->mutable_data<half_t, cl::Image2D>(
        out_image_shape.at("width"), out_image_shape.at("height"));
#ifdef LITE_WITH_LOG
    VLOG(4) << "out_dims= " << out_dims;
#endif

    const std::vector<size_t>& default_work_size = DefaultWorkSize(
        out_dims,
        DDim(std::vector<DDim::value_type>{
            static_cast<int64_t>(out_image_shape.at("width")),
            static_cast<int64_t>(out_image_shape.at("height"))}));

    // Flatten ranks 2..4 into the (C, H, W) view the CL kernel expects;
    // rank-2 is treated as a single-channel H x W slice.
    int out_C = 0, out_H = 0, out_W = 0, in_W = 0;
    if (param.output->dims().size() == 4) {
      out_C = out_dims[1];
      out_H = out_dims[2];
      out_W = out_dims[3];
      in_W = x_dims[3];
    } else if (param.output->dims().size() == 3) {
      out_C = out_dims[0];
      out_H = out_dims[1];
      out_W = out_dims[2];
      in_W = x_dims[2];
    } else if (param.output->dims().size() == 2) {
      out_C = 1;
      out_H = out_dims[0];
      out_W = out_dims[1];
      in_W = x_dims[1];
    }

#ifdef LITE_WITH_LOG
    VLOG(4) << "out_C=" << out_C;
    VLOG(4) << "out_H=" << out_H;
    VLOG(4) << "out_W=" << out_W;
    VLOG(4) << "in_W=" << in_W;
    VLOG(4) << "default_work_size= " << default_work_size[0] << ", "
            << default_work_size[1] << ", " << default_work_size[2];
#endif

    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);
    STL::stringstream kernel_key;
    kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
    auto kernel = context.cl_context()->GetKernel(kernel_key.str());

#ifdef LITE_WITH_LOG
    VLOG(4) << TargetToStr(x->target());
    VLOG(4) << TargetToStr(param.output->target());
#endif

    int arg_idx = 0;
    cl_int status;
    status = kernel.setArg(arg_idx, *x_image);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, *out_image);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, out_C);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, out_H);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, out_W);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, in_W);
    CL_CHECK_FATAL(status);

    auto global_work_size =
        cl::NDRange{static_cast<size_t>(default_work_size[0]),
                    static_cast<size_t>(default_work_size[1]),
                    static_cast<size_t>(default_work_size[2])};
    status = EnqueueNDRangeKernel(context,
                                  kernel,
                                  cl::NullRange,
                                  global_work_size,
                                  cl::NullRange,
                                  nullptr,
                                  event_);
    CL_CHECK_FATAL(status);
  }

 private:
  std::string kernel_func_name_{"transpose"};
  std::string build_options_{"-DCL_DTYPE_half"};
  std::string time_stamp_{GetTimeStamp()};
};
// transpose2 operator.
// Unlike TransposeComputeFloatImage above, this kernel computes on the HOST:
// it copies the image back to CPU memory, permutes there, and uploads the
// result. A fast path handles the "shuffle channel" permutation
// {0, 2, 1, 3, ...}.
class Transpose2ComputeFloatImage
    : public KernelLite<TARGET(kOpenCL),
                        PRECISION(kFP16),
                        DATALAYOUT(kImageDefault)> {
 public:
  using param_t = operators::TransposeParam;

  // No CL kernel is compiled: all computation happens on the host in Run().
  void PrepareForRun() override {}

#ifdef LITE_WITH_PROFILE
  void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {}
#endif

  // True when `axis` is {0, 2, 1, i, i+1, ...}, i.e. only dims 1 and 2 are
  // swapped and all trailing dims stay in place.
  bool IsShuffleChannel(const std::vector<int>& axis) {
    bool is_shuffle_channel = true;
    if (axis.size() > 2 && axis[0] == 0 && axis[1] == 2 && axis[2] == 1) {
      // Trailing axes must be identity for the fast memcpy path to apply.
      for (int i = 3; i < axis.size(); ++i) {
        if (axis[i] != i) {
          is_shuffle_channel = false;
          break;
        }
      }
    } else {
      return false;
    }
    return is_shuffle_channel;
  }

  // Download `device_tensor` (FP16 image2d) into `host_tensor` as NCHW Dtype.
  // Blocks on the command queue so prior kernels finish first.
  template <typename Dtype>
  void DeviceTensorToHostTensor(const Tensor* device_tensor,
                                Tensor* host_tensor) {
    host_tensor->Resize(device_tensor->dims());
    Dtype* host_ptr = host_tensor->mutable_data<Dtype>();
    // Ensure any in-flight kernel writing device_tensor has completed.
    CLRuntime::Global()->command_queue().finish();
    CLImageConverterDefault default_converter;
    auto device_tensor_image_dim =
        default_converter.InitImageDimInfoWith(device_tensor->dims());
    // Staging buffer for the raw RGBA half image (4 values per pixel).
    half_t* image_data = new half_t[device_tensor_image_dim.production() * 4];
    TargetWrapperCL::ImgcpySync(image_data,
                                device_tensor->data<half_t, cl::Image2D>(),
                                device_tensor_image_dim[0],
                                device_tensor_image_dim[1],
                                0,
                                0,
                                IoDirection::DtoH);
    default_converter.ImageToNCHW(
        image_data, host_ptr, device_tensor_image_dim, host_tensor->dims());
    delete[] image_data;
  }

  // Upload NCHW Dtype data from `host_tensor` into `device_tensor` as an
  // FP16 image2d, allocating the device image if needed.
  template <typename Dtype>
  void HostTensorToDeviceTensor(const Tensor* host_tensor,
                                Tensor* device_tensor) {
    Dtype* host_ptr = const_cast<Dtype*>(host_tensor->data<Dtype>());
    CLImageConverterDefault default_converter;
    auto device_tensor_image_dim =
        default_converter.InitImageDimInfoWith(device_tensor->dims());
    // Allocate the destination image before the copy.
    device_tensor->mutable_data<half_t, cl::Image2D>(
        device_tensor_image_dim[0], device_tensor_image_dim[1]);
    half_t* image_data = new half_t[device_tensor->dims().production() * 4];
    default_converter.NCHWToImage(host_ptr, image_data, device_tensor->dims());
    TargetWrapperCL::ImgcpySync(
        device_tensor->mutable_data<half_t, cl::Image2D>(),
        image_data,
        device_tensor_image_dim[0],
        device_tensor_image_dim[1],
        0,
        0,
        IoDirection::HtoD);
    delete[] image_data;
  }

  // Fast path for axis {0, 2, 1, ...}: swap dims 1 and 2 by memcpy-ing
  // contiguous blocks of `offset` elements (product of trailing dims).
  template <typename Dtype>
  void ShuffleChannelCompute(const operators::TransposeParam& param) {
    const Tensor* input = param.x;
    Tensor* input_tensor = new Tensor();
    DeviceTensorToHostTensor<Dtype>(input, input_tensor);
    Dtype* input_ptr = input_tensor->mutable_data<Dtype>();
    Tensor* output = param.output;
    Tensor* output_tensor = new Tensor();
    output_tensor->Resize(output->dims());
    Dtype* output_ptr = output_tensor->mutable_data<Dtype>();
    // input and output's shape dimension must >= 2 && <= 6.
    const DDim& in_dim = input->dims();
    const DDim& out_dim = output->dims();
    // Elements per contiguous block (dims 3..rank-1 are untouched).
    size_t offset = 1;
    for (int i = 3; i < param.axis.size(); ++i) {
      offset *= in_dim[i];
    }
#pragma omp parallel for collapse(3)
    for (int batch = 0; batch < out_dim[0]; ++batch) {
      for (int c1 = 0; c1 < out_dim[1]; ++c1) {
        for (int c2 = 0; c2 < out_dim[2]; ++c2) {
          // out[b][c1][c2] block comes from in[b][c2][c1].
          size_t out_offset =
              ((batch * out_dim[1] + c1) * out_dim[2] + c2) * offset;
          size_t in_offset =
              ((batch * in_dim[1] + c2) * in_dim[2] + c1) * offset;
          memcpy(output_ptr + out_offset,
                 input_ptr + in_offset,
                 offset * sizeof(Dtype));
        }
      }
    }
    HostTensorToDeviceTensor<Dtype>(output_tensor, output);
    delete input_tensor;
    delete output_tensor;
  }

  // General permutation: walk the output in linear order while stepping an
  // input offset by precomputed strides (odometer over trailing dims).
  template <typename Dtype>
  void Transpose2Compute(const operators::TransposeParam& param) {
    const Tensor* input = param.x;
    Tensor* input_tensor = new Tensor();
    DeviceTensorToHostTensor<Dtype>(input, input_tensor);
    Dtype* input_ptr = input_tensor->mutable_data<Dtype>();
    Tensor* output = param.output;
    Tensor* output_tensor = new Tensor();
    output_tensor->Resize(output->dims());
    Dtype* output_ptr = output_tensor->mutable_data<Dtype>();

    // input and output's shape dimension must >= 2 && <= 6.
    const DDim& in_dim = input->dims();
    const DDim& out_dim = output->dims();

    // precompute inverted output dim and strides
    // strides[k] = input stride of the axis that produces output axis i,
    // stored in reversed order (k = permute-1-i); rout_dim mirrors out_dim
    // in the same reversed order.
    size_t rout_dim[6], strides[6];
    auto& axis = param.axis;
    int permute = axis.size();  // permute must >=2 && <= 6.
    for (int i = 0; i < permute; ++i) {
      int k = permute - 1 - i;
      strides[k] = 1;
      for (int j = axis[i] + 1; j < permute; ++j) {
        strides[k] *= in_dim[j];
      }
      rout_dim[k] = out_dim[i];
    }

    // unroll the first 2 dimensions
    int reamin_dim = 1;
    for (int i = 2; i < out_dim.size(); ++i) {
      reamin_dim *= out_dim[i];
    }

#pragma omp parallel for collapse(2)
    for (int batch = 0; batch < out_dim[0]; ++batch) {
      for (int j = 0; j < out_dim[1]; ++j) {
        // Base input offset for this (batch, j) slice.
        size_t offset = batch * strides[permute - 1] + j * strides[permute - 2];
        Dtype* out_ptr = output_ptr + (batch * out_dim[1] + j) * reamin_dim;
        // Odometer counters over the (reversed) trailing output dims.
        int indics[4] = {0, 0, 0, 0};
        for (int k = 0; k < reamin_dim; ++k) {
          out_ptr[k] = input_ptr[offset];
          indics[0] += 1;
          offset += strides[0];
          // Carry: when a digit wraps, advance the next stride and rewind
          // this one.
          for (int p = 0; p < permute - 3; ++p) {
            if (indics[p] == rout_dim[p]) {
              indics[p + 1] += 1;
              indics[p] = 0;
              offset += strides[p + 1];
              offset -= rout_dim[p] * strides[p];
            } else {
              break;
            }
          }
        }
      }
    }
    HostTensorToDeviceTensor<Dtype>(output_tensor, output);
    delete input_tensor;
    delete output_tensor;
  }

  // Dispatch to the shuffle-channel fast path or the general transpose.
  void Run() override {
    auto& param = *param_.get_mutable<param_t>();
    const std::vector<int> axis = param.axis;
    bool shuffle_channel = IsShuffleChannel(axis);
    if (shuffle_channel) {
      ShuffleChannelCompute<float>(param);
    } else {
      Transpose2Compute<float>(param);
    }
  }
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
// Register `transpose`: FP16 image2d input and output on OpenCL.
REGISTER_LITE_KERNEL(transpose,
                     kOpenCL,
                     kFP16,
                     kImageDefault,
                     paddle::lite::kernels::opencl::TransposeComputeFloatImage,
                     image2d)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kFP16),
                                      DATALAYOUT(kImageDefault))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kImageDefault))})
    .Finalize();

// Register `transpose2`: same image2d layout, plus the op's auxiliary
// "XShape" output which stays on ARM (host) memory.
REGISTER_LITE_KERNEL(transpose2,
                     kOpenCL,
                     kFP16,
                     kImageDefault,
                     paddle::lite::kernels::opencl::Transpose2ComputeFloatImage,
                     image2d)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kFP16),
                                      DATALAYOUT(kImageDefault))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kImageDefault))})
    .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kARM))})
    .Finalize();
#define LITE_WITH_LOG
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>

#include <cmath>
#include <memory>
#include <random>
#include <vector>

#include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/kernels/opencl/test_helper.h"
#include "lite/operators/reshape_op.h"
#include "lite/utils/logging.h"
#define FP16_MAX_DIFF (5e-1)
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
// Drive one transpose kernel end-to-end: build a (1, 2, 3, 4) NCHW input,
// upload it as an FP16 image2d, launch, download, and check the result
// against the axis {0, 2, 3, 1} permutation computed on the host.
static inline void TestWithKernel(
    const std::unique_ptr<paddle::lite::KernelBase>& kernel) {
  // Input (1, 2, 3, 4) -> output (1, 3, 4, 2) under axis {0, 2, 3, 1}.
  int64_t batch_size = 1;
  int64_t ic = 2;
  int64_t ih = 3;
  int64_t iw = 4;

  int64_t oc = 3;
  int64_t oh = 4;
  int64_t ow = 2;

  lite::Tensor input, output;
  operators::TransposeParam param;
  param.x = &input;
  param.output = &output;
  param.axis = std::vector<int>({0, 2, 3, 1});

  const DDim input_dim =
      lite::DDim{std::vector<int64_t>({batch_size, ic, ih, iw})};
  input.Resize(input_dim);
  const DDim output_dim =
      lite::DDim{std::vector<int64_t>({batch_size, oc, oh, ow})};
  param.output->Resize(output_dim);

  LOG(INFO) << "prepare kernel SetParam------";
  kernel->SetParam(param);

  // Image layout packs 4 channels per pixel: width = W * ceil(C / 4).
  size_t input_image_width = iw * ((ic + 3) / 4);
  size_t input_image_height = ih * batch_size;

  const size_t cl_image2d_row_pitch{0};
  const size_t cl_image2d_slice_pitch{0};

  // Deterministic input: 0, 1, 2, ... so the permutation is easy to verify.
  std::vector<float> input_v(batch_size * ic * ih * iw);
  LOG(INFO) << "gen input ...";
  float* input_v_data = &input_v[0];
  auto index = 0;
  for (auto& i : input_v) {
    i = index++;
  }

  paddle::lite::CLImageConverterDefault default_convertor;
  std::vector<half_t> x_image_data(input_image_width * input_image_height *
                                   4);  // 4 : RGBA
  LOG(INFO) << "set mapped input ...";
  default_convertor.NCHWToImage(input_v_data, x_image_data.data(), input_dim);
  // Upload; the returned device pointer is not needed afterwards.
  input.mutable_data<half_t, cl::Image2D>(
      input_image_width, input_image_height, x_image_data.data());
  LOG(INFO) << "prepare kernel ready";

  LOG(INFO) << "mutable output ...";
  CLImageConverterDefault default_converter;
  DDim out_image_shape = default_converter.InitImageDimInfoWith(output_dim);
  LOG(INFO) << "out_image_shape = " << out_image_shape[0] << " "
            << out_image_shape[1];
  output.mutable_data<half_t, cl::Image2D>(out_image_shape[0],
                                           out_image_shape[1]);

  LOG(INFO) << "kernel context ...";
  std::unique_ptr<KernelContext> context(new KernelContext);
  context->As<OpenCLContext>().InitOnce();

  std::unique_ptr<KernelContext> transpose_context(new KernelContext);
  context->As<OpenCLContext>().CopySharedTo(
      &(transpose_context->As<OpenCLContext>()));
  kernel->SetContext(std::move(transpose_context));

  LOG(INFO) << "kernel launch ...";
  kernel->Launch();
  CLRuntime::Global()->command_queue().finish();

  // Download the result. std::vector replaces the original raw new[]
  // buffers, which were never freed (leak).
  std::vector<half_t> out_image_data(out_image_shape.production() * 4);
  TargetWrapperCL::ImgcpySync(out_image_data.data(),
                              output.data<half_t, cl::Image2D>(),
                              out_image_shape[0],
                              out_image_shape[1],
                              cl_image2d_row_pitch,
                              cl_image2d_slice_pitch,
                              IoDirection::DtoH);
  std::vector<float> out_data(out_image_shape.production() * 4);
  default_converter.ImageToNCHW(
      out_image_data.data(), out_data.data(), out_image_shape, output_dim);

  // Check: iterating the input in NHWC order is exactly the output's NCHW
  // order under axis {0, 2, 3, 1}.
  index = 0;
  auto hxw = ih * iw;
  auto cxhxw = ic * hxw;
  for (auto n = 0; n < batch_size; n++) {
    for (auto h = 0; h < ih; h++) {
      for (auto w = 0; w < iw; w++) {
        for (auto c = 0; c < ic; c++) {
          auto input_index = n * cxhxw + c * hxw + h * iw + w;
          auto input_value = input_v_data[input_index];
          auto output_value = out_data[index];
          // std::fabs keeps the comparison in floating point; plain abs()
          // could bind to the integer overload and truncate.
          auto abs_diff = std::fabs(input_value - output_value);
          auto relative_diff = COMPUTE_RELATIVE_DIFF(input_value, output_value);
          EXPECT_EQ(
              (relative_diff <= FP16_MAX_DIFF) || (abs_diff <= FP16_MAX_DIFF),
              true);
          index++;
        }
      }
    }
  }
}
// Look up the registered OpenCL `transpose` kernel and exercise it with the
// shared fixture above.
TEST(transpose_opencl, compute) {
  auto kernels = KernelRegistry::Global().Create("transpose",
                                                 TARGET(kOpenCL),
                                                 PRECISION(kFP16),
                                                 DATALAYOUT(kImageDefault));
  ASSERT_FALSE(kernels.empty());
  auto kernel = std::move(kernels.front());
  TestWithKernel(kernel);
}
// Same fixture for the host-side `transpose2` kernel; both variants must
// produce the identical permutation.
TEST(transpose2_opencl, compute) {
  auto kernels = KernelRegistry::Global().Create("transpose2",
                                                 TARGET(kOpenCL),
                                                 PRECISION(kFP16),
                                                 DATALAYOUT(kImageDefault));
  ASSERT_FALSE(kernels.empty());
  auto kernel = std::move(kernels.front());
  TestWithKernel(kernel);
}
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(transpose, kOpenCL, kFP16, kImageDefault, image2d);
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册