Unverified commit 394c2833, authored by ysh329, committed by GitHub

[KERNEL][OPENCL] Fix concat; Enable conv3x3 with group (#4180)

* [KERNEL][OPENCL] Support OpenCL concat for the 3-input and 4-input cases; enable conv3x3 with group; fix the OpenCL buffer concat; clean up the CMake of the OpenCL kernels. test=develop
Parent 396607ca
......@@ -14,45 +14,60 @@ limitations under the License. */
#include <cl_common.h>
__kernel void concat2(__global const CL_DTYPE* x_data0, __global const CL_DTYPE* x_data1, __global CL_DTYPE* out_data,
int size, int axis_size, int pre_size, int post_size, int total, int total0, int total1) {
__kernel void concat2(__global const CL_DTYPE* x_data0,
__global const CL_DTYPE* x_data1,
__global CL_DTYPE* out_data,
int size,
int axis_size,
int pre_size,
int post_size,
int total,
int total0,
int total1) {
const int index = get_global_id(0);
if (index < size){
for (int i = 0; i < pre_size; i++){
if (index < size) {
for (int i = 0; i < pre_size; i++) {
int offset_out = index * post_size + i * total;
int offset_in = index * post_size + i * total0;
// memcpy(out_data + offset_out, x_data0 + offset_in, post_size);
CL_DTYPE* dst = out_data + offset_out;
CL_DTYPE* src = x_data0 + offset_in;
for (int k = 0; k < post_size; k++){
__global CL_DTYPE* dst = (__global CL_DTYPE*)(out_data + offset_out);
__global CL_DTYPE* src = (__global CL_DTYPE*)(x_data0 + offset_in);
for (int k = 0; k < post_size; k++) {
*dst++ = *src++;
}
}
}else if (index < axis_size){
for (int i = 0; i < pre_size; i++){
} else if (index < axis_size) {
for (int i = 0; i < pre_size; i++) {
int offset_out = index * post_size + i * total;
int offset_in = index * post_size + i * total1;
// memcpy(out_data + offset_out, x_data1 + offset_in, post_size);
CL_DTYPE* dst = out_data + offset_out;
CL_DTYPE* src = x_data1 + offset_in;
for (int k = 0; k < post_size; k++){
__global CL_DTYPE* dst = (__global CL_DTYPE*)(out_data + offset_out);
__global CL_DTYPE* src = (__global CL_DTYPE*)(x_data1 + offset_in);
for (int k = 0; k < post_size; k++) {
*dst++ = *src++;
}
}
}
}
__kernel void concat_mul(__global const CL_DTYPE* x_data, __global CL_DTYPE* out_data,
int axis_size, int pre_size, int post_size, int start, int total, int total0) {
const int index = get_global_id(0);
if (index < axis_size){
for (int i = 0; i < pre_size; i++){
__kernel void concat_mul_buffer(
__global const CL_DTYPE* x_data,
__global CL_DTYPE* out_data,
int axis_size,
int pre_size,
int post_size,
int start,
int total,
int total0) {
const int index = get_global_id(0); // [0, axis_size)
if (index < axis_size) {
for (int i = 0; i < pre_size; i++) {
int offset_out = (start + index) * post_size + i * total;
int offset_in = index * post_size + i * total0;
// memcpy(out_data + offset_out, x_data + offset_in, post_size);
CL_DTYPE* dst = out_data + offset_out;
CL_DTYPE* src = x_data + offset_in;
for (int k = 0; k < post_size; k++){
__global CL_DTYPE* dst = (__global CL_DTYPE*)(out_data + offset_out);
__global CL_DTYPE* src = (__global CL_DTYPE*)(x_data + offset_in);
for (int k = 0; k < post_size; k++) {
*dst++ = *src++;
}
}
......
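For readers of the buffer kernels above: concat2 and concat_mul_buffer view each tensor as a flattened [pre_size, axis_size, post_size] array around the concat axis, and every work item copies one slice of post_size contiguous elements per pre_size iteration. A minimal host-side sketch of how those kernel arguments could be derived (the names below are illustrative, not part of this patch):

#include <cstdint>
#include <vector>

// Sizes passed to concat_mul_buffer: pre_size iterations, each copying
// post_size contiguous elements; `total` is the per-iteration output stride
// (out_dims[axis] * post_size), while the corresponding input stride uses
// that input's own axis dimension.
struct ConcatSizes {
  int pre_size;   // product of dims before the concat axis
  int post_size;  // product of dims after the concat axis
  int total;      // out_dims[axis] * post_size
};

ConcatSizes ComputeConcatSizes(const std::vector<int64_t>& out_dims, int axis) {
  ConcatSizes s{1, 1, 0};
  for (int i = 0; i < axis; ++i) s.pre_size *= static_cast<int>(out_dims[i]);
  for (size_t i = axis + 1; i < out_dims.size(); ++i)
    s.post_size *= static_cast<int>(out_dims[i]);
  s.total = static_cast<int>(out_dims[axis]) * s.post_size;
  return s;
}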
......@@ -77,7 +77,7 @@ __kernel void hard_sigmoid(__read_only image2d_t input,
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
CL_DTYPE4 out = clamp(in * scale + value_offset, 0.0, 1.0);
CL_DTYPE4 out = clamp(in * (CL_DTYPE4)(scale) + (CL_DTYPE4)(value_offset), (CL_DTYPE4)(0.0), (CL_DTYPE4)(1.0));
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out);
}
......
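The hard_sigmoid change above only vectorizes the clamp operands; the computation itself stays hard_sigmoid(x) = clamp(slope * x + offset, 0, 1). A scalar reference of that formula, assuming float inputs (illustrative only, not part of this patch):

// Scalar reference of what the image kernel computes per element.
float HardSigmoidRef(float x, float slope, float offset) {
  float y = slope * x + offset;
  if (y < 0.0f) return 0.0f;
  if (y > 1.0f) return 1.0f;
  return y;
}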
......@@ -11,6 +11,286 @@ limitations under the License. */
#include <cl_common.h>
// deprecated
__kernel void concatByCWith2Inputs(
__write_only image2d_t output_image,
__private const int output_tensor_c,
__private const int output_tensor_w,
__read_only image2d_t input0_image,
__private const int input0_tensor_c,
__read_only image2d_t input1_image,
__private const int input1_tensor_c) {
const int out_c = get_global_id(0); // [0, (output_tensor_c + 3) / 4)
const int out_w = get_global_id(1); // [0, output_tensor_w)
const int out_nh = get_global_id(2); // [0, output_tensor_n * output_tensor_h)
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int2 output_pos;
output_pos.x = out_c * output_tensor_w + out_w;
output_pos.y = out_nh;
CL_DTYPE4 output_data;
for (int i = 0; i < 4; i++) {
int c = out_c * 4 + i;
if (c >= output_tensor_c) {
break;
}
int c_in;
CL_DTYPE4 input_data;
if (c < input0_tensor_c) {
c_in = c;
int2 input_pos;
input_pos.x = (c_in / 4) * output_tensor_w + out_w;
input_pos.y = out_nh;
input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input0_image, sampler, input_pos);
} else {
c_in = c - input0_tensor_c;
int2 input_pos;
input_pos.x = (c_in / 4) * output_tensor_w + out_w;
input_pos.y = out_nh;
input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input1_image, sampler, input_pos);
}
int value_offset = c_in % 4;
float value;
if (value_offset == 0) {
value = input_data.x;
} else if (value_offset == 1) {
value = input_data.y;
} else if (value_offset == 2) {
value = input_data.z;
} else if (value_offset == 3) {
value = input_data.w;
}
if (i == 0) {
output_data.x = value;
} else if (i == 1) {
output_data.y = value;
} else if (i == 2) {
output_data.z = value;
} else if (i == 3) {
output_data.w = value;
}
}
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output_data);
}
__kernel void concatByCWith3Inputs(
__write_only image2d_t output_image,
__private const int output_tensor_c,
__private const int output_tensor_w,
__read_only image2d_t input0_image,
__private const int input0_tensor_c,
__read_only image2d_t input1_image,
__private const int input1_tensor_c,
__read_only image2d_t input2_image,
__private const int input2_tensor_c) {
const int out_c = get_global_id(0); // [0, (output_tensor_c + 3) / 4)
const int out_w = get_global_id(1); // [0, output_tensor_w)
const int out_nh = get_global_id(2); // [0, output_tensor_n * output_tensor_h)
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int2 output_pos;
output_pos.x = out_c * output_tensor_w + out_w;
output_pos.y = out_nh;
CL_DTYPE4 output_data;
for (int i = 0; i < 4; i++) {
int c = out_c * 4 + i;
if (c >= output_tensor_c) {
break;
}
int c_in;
CL_DTYPE4 input_data;
if (c < input0_tensor_c) {
c_in = c;
int2 input_pos;
input_pos.x = (c_in / 4) * output_tensor_w + out_w;
input_pos.y = out_nh;
input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input0_image, sampler, input_pos);
} else if (c < input0_tensor_c + input1_tensor_c) {
c_in = c - input0_tensor_c;
int2 input_pos;
input_pos.x = (c_in / 4) * output_tensor_w + out_w;
input_pos.y = out_nh;
input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input1_image, sampler, input_pos);
} else {
c_in = c - input0_tensor_c - input1_tensor_c;
int2 input_pos;
input_pos.x = (c_in / 4) * output_tensor_w + out_w;
input_pos.y = out_nh;
input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input2_image, sampler, input_pos);
}
int value_offset = c_in % 4;
float value;
if (value_offset == 0) {
value = input_data.x;
} else if (value_offset == 1) {
value = input_data.y;
} else if (value_offset == 2) {
value = input_data.z;
} else if (value_offset == 3) {
value = input_data.w;
}
if (i == 0) {
output_data.x = value;
} else if (i == 1) {
output_data.y = value;
} else if (i == 2) {
output_data.z = value;
} else if (i == 3) {
output_data.w = value;
}
}
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output_data);
}
__kernel void concatByCWith4Inputs(
__write_only image2d_t output_image,
__private const int output_tensor_c,
__private const int output_tensor_w,
__read_only image2d_t input0_image,
__private const int input0_tensor_c,
__read_only image2d_t input1_image,
__private const int input1_tensor_c,
__read_only image2d_t input2_image,
__private const int input2_tensor_c,
__read_only image2d_t input3_image,
__private const int input3_tensor_c) {
const int out_c = get_global_id(0); // [0, (output_tensor_c + 3) / 4)
const int out_w = get_global_id(1); // [0, output_tensor_w)
const int out_nh = get_global_id(2); // [0, output_tensor_n * output_tensor_h)
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int2 output_pos;
output_pos.x = out_c * output_tensor_w + out_w;
output_pos.y = out_nh;
CL_DTYPE4 output_data;
for (int i = 0; i < 4; i++) {
int c = out_c * 4 + i;
if (c >= output_tensor_c) {
break;
}
int c_in;
CL_DTYPE4 input_data;
if (c < input0_tensor_c) {
c_in = c;
int2 input_pos;
input_pos.x = (c_in / 4) * output_tensor_w + out_w;
input_pos.y = out_nh;
input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input0_image, sampler, input_pos);
} else if (c < input0_tensor_c + input1_tensor_c) {
c_in = c - input0_tensor_c;
int2 input_pos;
input_pos.x = (c_in / 4) * output_tensor_w + out_w;
input_pos.y = out_nh;
input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input1_image, sampler, input_pos);
} else if (c < input0_tensor_c + input1_tensor_c + input2_tensor_c) {
c_in = c - input0_tensor_c - input1_tensor_c;
int2 input_pos;
input_pos.x = (c_in / 4) * output_tensor_w + out_w;
input_pos.y = out_nh;
input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input2_image, sampler, input_pos);
} else if (c < input0_tensor_c + input1_tensor_c + input2_tensor_c + input3_tensor_c) {
c_in = c - input0_tensor_c - input1_tensor_c - input2_tensor_c;
int2 input_pos;
input_pos.x = (c_in / 4) * output_tensor_w + out_w;
input_pos.y = out_nh;
input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input3_image, sampler, input_pos);
}
int value_offset = c_in % 4;
float value;
if (value_offset == 0) {
value = input_data.x;
} else if (value_offset == 1) {
value = input_data.y;
} else if (value_offset == 2) {
value = input_data.z;
} else if (value_offset == 3) {
value = input_data.w;
}
if (i == 0) {
output_data.x = value;
} else if (i == 1) {
output_data.y = value;
} else if (i == 2) {
output_data.z = value;
} else if (i == 3) {
output_data.w = value;
}
}
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output_data);
}
// deprecated
__kernel void concatByH(__read_only image2d_t input_image,
__write_only image2d_t output_image,
__private const int out_W,
__private const int out_H_Start) {
const int in_c = get_global_id(0);
const int in_w = get_global_id(1);
const int in_nh = get_global_id(2);
int2 input_pos;
input_pos.x = in_c * out_W + in_w;
input_pos.y = in_nh;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
CL_DTYPE4 input;
input = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos);
int2 output_pos;
output_pos.x = input_pos.x;
output_pos.y = out_H_Start + input_pos.y;
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, input);
}
// deprecated
__kernel void concatByW(__read_only image2d_t input_image,
__write_only image2d_t output_image,
__private const int in_W,
__private const int pre_Width,
__private const int out_Width) {
const int in_c = get_global_id(0);
const int in_w = get_global_id(1);
const int in_nh = get_global_id(2);
int2 input_pos;
input_pos.x = in_c * in_W + in_w;
input_pos.y = in_nh;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
CL_DTYPE4 input;
input = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos);
int2 output_pos;
output_pos.x = input_pos.x + pre_Width + out_Width * in_c;
output_pos.y = input_pos.y;
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, input);
}
__kernel void concat2(__read_only image2d_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
......@@ -103,62 +383,3 @@ __kernel void concat2(__read_only image2d_t input0,
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, input);
}
}
__kernel void concat_mul(__read_only image2d_t input,
__write_only image2d_t output,
int flag, int C_0, int out_C, int out_W, int in_W, int width) {
const int in_w = get_global_id(0); // image_width cxw/4
const int in_c = get_global_id(1); // image_width cxw/4
const int in_nh = get_global_id(2); // image_height nxh
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int2 input_pos;
int2 output_pos;
input_pos.x = in_c * in_W + in_w;
input_pos.y = in_nh;
CL_DTYPE4 input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos);
if (flag == 1){ // by channel
CL_DTYPE4 output_data;
for (int i = 0; i < 4; i++) {
int c_out = C_0 + in_c * 4 + i;
if (c_out >= out_C) {
break;
}
int2 output_pos;
output_pos.x = (c_out / 4) * in_W + in_w;
output_pos.y = in_nh;
CL_DTYPE val;
if (i == 0) {
val = input_data.x;
} else if (i == 1) {
val = input_data.y;
} else if (i == 2) {
val = input_data.z;
} else if (i == 3) {
val = input_data.w;
}
if (c_out % 4 == 0){
output_data.x = val;
}else if (c_out % 4 == 1){
output_data.y = val;
}else if (c_out % 4 == 2){
output_data.z = val;
}else if (c_out % 4 == 3){
output_data.w = val;
}
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, output_data);
}
}else if (flag == 2){ // by height, width == n
int2 output_pos;
output_pos.x = in_c * in_W + in_w;
output_pos.y = in_nh + C_0 * width;
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, input_data);
}else if (flag == 3){ // by width, width == C
int2 output_pos;
output_pos.y = in_nh;
output_pos.x = in_c * out_W + (in_w + C_0);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, input_data);
}
}
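As a reading aid for the image kernels above: the default image layout packs four consecutive channels into one pixel, so an NCHW element lands at image column (c / 4) * W + w and image row n * H + h, with c % 4 selecting the .x/.y/.z/.w lane. A small sketch of that mapping (hypothetical helper, mirroring the input_pos arithmetic in the concatByCWith*Inputs kernels):

struct ImageCoord {
  int x;     // column in the image2d
  int y;     // row in the image2d
  int lane;  // 0..3, which component of the CL_DTYPE4 pixel holds the value
};

// NCHW index -> image/default (CHW4) coordinate, as used by the concat kernels.
ImageCoord NCHWToImageDefault(int n, int c, int h, int w, int H, int W) {
  ImageCoord coord;
  coord.x = (c / 4) * W + w;
  coord.y = n * H + h;
  coord.lane = c % 4;
  return coord;
}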
......@@ -8,35 +8,35 @@ set(cl_kernel_deps op_params cl_runtime cl_context cl_wrapper cl_target_wrapper
# image kernel #
#####################
# basic
add_kernel(elementwise_add_opencl OPENCL basic SRCS elementwise_add_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(elementwise_sub_opencl OPENCL basic SRCS elementwise_sub_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(elementwise_mul_opencl OPENCL basic SRCS elementwise_mul_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(fusion_elementwise_add_activation_opencl
add_kernel(elementwise_add_opencl_image OPENCL basic SRCS elementwise_add_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(elementwise_sub_opencl_image OPENCL basic SRCS elementwise_sub_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(elementwise_mul_opencl_image OPENCL basic SRCS elementwise_mul_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(fusion_elementwise_add_activation_opencl_image
OPENCL basic SRCS fusion_elementwise_add_activation_image_compute.cc
DEPS elementwise_add_opencl ${cl_kernel_deps})
add_kernel(fusion_elementwise_sub_activation_opencl
DEPS elementwise_add_opencl_image ${cl_kernel_deps})
add_kernel(fusion_elementwise_sub_activation_opencl_image
OPENCL basic SRCS fusion_elementwise_sub_activation_image_compute.cc
DEPS elementwise_sub_opencl ${cl_kernel_deps})
add_kernel(pool_opencl OPENCL basic SRCS pool_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(activation_opencl OPENCL basic SRCS activation_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(reshape_opencl OPENCL basic SRCS reshape_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(transpose_opencl OPENCL basic SRCS transpose_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(conv_opencl OPENCL basic SRCS conv_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(layout_opencl OPENCL basic SRCS layout_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(concat_opencl OPENCL basic SRCS concat_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(nearest_interp_opencl OPENCL basic SRCS nearest_interp_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(scale_opencl OPENCL basic SRCS scale_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(grid_sampler_opencl OPENCL basic SRCS grid_sampler_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(lrn_opencl OPENCL basic SRCS lrn_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(bilinear_interp_opencl OPENCL basic SRCS bilinear_interp_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(slice_opencl OPENCL basic SRCS slice_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(instance_norm_opencl OPENCL basic SRCS instance_norm_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(dropout_opencl OPENCL basic SRCS dropout_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(pad2d_opencl OPENCL basic SRCS pad2d_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(box_coder_opencl OPENCL basic SRCS box_coder_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(pixel_shuffle_opencl OPENCL basic SRCS pixel_shuffle_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(expand_opencl OPENCL basic SRCS expand_image_compute.cc DEPS ${cl_kernel_deps})
DEPS elementwise_sub_opencl_image ${cl_kernel_deps})
add_kernel(pool_opencl_image OPENCL basic SRCS pool_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(activation_opencl_image OPENCL basic SRCS activation_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(reshape_opencl_image OPENCL basic SRCS reshape_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(transpose_opencl_image OPENCL basic SRCS transpose_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(conv_opencl_image OPENCL basic SRCS conv_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(layout_opencl_image OPENCL basic SRCS layout_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(concat_opencl_image OPENCL basic SRCS concat_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(nearest_interp_opencl_image OPENCL basic SRCS nearest_interp_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(scale_opencl_image OPENCL basic SRCS scale_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(grid_sampler_opencl_image OPENCL basic SRCS grid_sampler_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(lrn_opencl_image OPENCL basic SRCS lrn_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(bilinear_interp_opencl_image OPENCL basic SRCS bilinear_interp_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(slice_opencl_image OPENCL basic SRCS slice_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(instance_norm_opencl_image OPENCL basic SRCS instance_norm_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(dropout_opencl_image OPENCL basic SRCS dropout_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(pad2d_opencl_image OPENCL basic SRCS pad2d_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(box_coder_opencl_image OPENCL basic SRCS box_coder_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(pixel_shuffle_opencl_image OPENCL basic SRCS pixel_shuffle_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(expand_opencl_image OPENCL basic SRCS expand_image_compute.cc DEPS ${cl_kernel_deps})
# extra
# wait to add ...
......@@ -48,89 +48,89 @@ add_kernel(expand_opencl OPENCL basic SRCS expand_image_compute.cc DEPS ${cl_ker
# image kernel test #
######################
lite_cc_test(test_activation_image_opencl SRCS activation_image_compute_test.cc
DEPS activation_opencl layout_opencl op_registry program context)
DEPS activation_opencl_image layout_opencl_image op_registry program context)
lite_cc_test(test_conv_image_opencl SRCS conv_image_compute_test.cc
DEPS conv_opencl op_registry program context)
DEPS conv_opencl_image op_registry program context)
lite_cc_test(test_depthwise_conv2d_image_opencl SRCS depthwise_conv2d_image_compute_test.cc
DEPS conv_opencl op_registry program context)
DEPS conv_opencl_image op_registry program context)
lite_cc_test(test_nearest_interp_image_opencl SRCS nearest_interp_image_compute_test.cc
DEPS nearest_interp_opencl layout_opencl op_registry program context)
DEPS nearest_interp_opencl_image layout_opencl_image op_registry program context)
lite_cc_test(test_pool_image_opencl SRCS pool_image_compute_test.cc
DEPS pool_opencl op_registry program context)
DEPS pool_opencl_image op_registry program context)
lite_cc_test(test_scale_image_opencl SRCS scale_image_compute_test.cc
DEPS scale_opencl op_registry program context)
DEPS scale_opencl_image op_registry program context)
lite_cc_test(test_reshape_image_opencl SRCS reshape_image_compute_test.cc
DEPS reshape_opencl op_registry program context)
DEPS reshape_opencl_image op_registry program context)
lite_cc_test(test_transpose_image_opencl SRCS transpose_image_compute_test.cc
DEPS transpose_opencl layout_opencl op_registry program context)
DEPS transpose_opencl_image layout_opencl_image op_registry program context)
lite_cc_test(test_concat_image_opencl SRCS concat_image_compute_test.cc
DEPS concat_opencl layout_opencl op_registry program context)
DEPS concat_opencl_image layout_opencl_image op_registry program context)
#lite_cc_test(test_elementwise_mul_image_opencl SRCS elementwise_mul_image_compute_test.cc
# DEPS elementwise_mul_opencl op_registry program context)
# DEPS elementwise_mul_opencl_image op_registry program context)
lite_cc_test(test_layout_image_opencl SRCS layout_image_compute_test.cc
DEPS layout_opencl op_registry program context)
DEPS layout_opencl_image op_registry program context)
lite_cc_test(test_pixel_shuffle_image_opencl SRCS pixel_shuffle_image_compute_test.cc
DEPS pixel_shuffle_opencl op_registry program context)
DEPS pixel_shuffle_opencl_image op_registry program context)
lite_cc_test(test_expand_image_opencl SRCS expand_image_compute_test.cc
DEPS expand_opencl op_registry program context)
DEPS expand_opencl_image op_registry program context)
lite_cc_test(test_elementwise_add_image_opencl SRCS elementwise_add_image_compute_test.cc
DEPS elementwise_add_opencl fusion_elementwise_add_activation_opencl op_registry program context)
DEPS elementwise_add_opencl_image fusion_elementwise_add_activation_opencl_image op_registry program context)
lite_cc_test(test_elementwise_sub_image_opencl SRCS elementwise_sub_image_compute_test.cc
DEPS elementwise_sub_opencl fusion_elementwise_sub_activation_opencl op_registry program context)
DEPS elementwise_sub_opencl_image fusion_elementwise_sub_activation_opencl_image op_registry program context)
lite_cc_test(test_grid_sampler_image_opencl SRCS grid_sampler_image_compute_test.cc
DEPS grid_sampler_opencl op_registry program context)
DEPS grid_sampler_opencl_image op_registry program context)
lite_cc_test(test_lrn_image_opencl SRCS lrn_image_compute_test.cc
DEPS lrn_opencl op_registry program context)
DEPS lrn_opencl_image op_registry program context)
lite_cc_test(test_bilinear_interp_image_opencl SRCS bilinear_interp_image_compute_test.cc
DEPS bilinear_interp_opencl op_registry program context)
DEPS bilinear_interp_opencl_image op_registry program context)
lite_cc_test(test_slice_image_opencl SRCS slice_image_compute_test.cc
DEPS slice_opencl op_registry program context)
DEPS slice_opencl_image op_registry program context)
#lite_cc_test(test_instance_norm_image_opencl SRCS instance_norm_image_compute_test.cc
# DEPS instance_norm_opencl op_registry program context)
#lite_cc_test(test_instance_norm_image_opencl SRCS instance_norm_image_compute_test.cc
# DEPS instance_norm_opencl_image op_registry program context)
lite_cc_test(test_dropout_image_opencl SRCS dropout_image_compute_test.cc
DEPS dropout_opencl op_registry program context)
DEPS dropout_opencl_image op_registry program context)
lite_cc_test(test_pad2d_image_opencl SRCS pad2d_image_compute_test.cc
DEPS pad2d_opencl layout_opencl op_registry program context)
DEPS pad2d_opencl_image layout_opencl_image op_registry program context)
lite_cc_test(test_box_coder_image_opencl SRCS box_coder_image_compute_test.cc
DEPS box_coder_opencl op_registry program context)
DEPS box_coder_opencl_image op_registry program context)
######################
# buffer kernel #
######################
# basic
#add_kernel(activation_opencl OPENCL basic SRCS activation_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(conv_opencl OPENCL basic SRCS conv_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(activation_opencl_buffer OPENCL basic SRCS activation_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(conv_opencl_buffer OPENCL basic SRCS conv_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(depthwise_conv2d_opencl OPENCL basic SRCS depthwise_conv2d_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(pool_opencl OPENCL basic SRCS pool_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(concat_opencl OPENCL basic SRCS concat_buffer_compute.cc DEPS ${cl_kernel_deps})
add_kernel(fc_opencl OPENCL basic SRCS fc_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(pool_opencl_buffer OPENCL basic SRCS pool_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(concat_opencl_buffer OPENCL basic SRCS concat_buffer_compute.cc DEPS ${cl_kernel_deps})
add_kernel(fc_opencl_buffer OPENCL basic SRCS fc_buffer_compute.cc DEPS ${cl_kernel_deps})
# NOTE(ysh329): use fc as `mul`, and mul is not used.
#add_kernel(mul_opencl OPENCL basic SRCS mul_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(elementwise_add_opencl OPENCL basic SRCS elementwise_add_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(fusion_elementwise_add_activation_opencl
#add_kernel(mul_opencl_buffer OPENCL basic SRCS mul_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(elementwise_add_opencl_buffer OPENCL basic SRCS elementwise_add_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(fusion_elementwise_add_activation_opencl_buffer
# OPENCL basic SRCS fusion_elementwise_add_activation_buffer_compute.cc
# DEPS elementwise_add_opencl ${cl_kernel_deps})
add_kernel(io_copy_opencl OPENCL basic SRCS io_copy_buffer_compute.cc DEPS ${tensor_lite} ${cl_kernel_deps})
add_kernel(io_copy_opencl_buffer OPENCL basic SRCS io_copy_buffer_compute.cc DEPS ${tensor_lite} ${cl_kernel_deps})
# extra
# wait to add ...
......@@ -156,10 +156,10 @@ add_kernel(io_copy_opencl OPENCL basic SRCS io_copy_buffer_compute.cc DEPS ${ten
# DEPS pool_opencl op_registry program context)
#lite_cc_test(test_concat_buffer_opencl SRCS concat_buffer_compute_test.cc
# DEPS concat_opencl op_registry program context)
# DEPS concat_opencl_buffer op_registry program context)
lite_cc_test(test_fc_buffer_opencl SRCS fc_buffer_compute_test.cc
DEPS fc_opencl op_registry program context)
DEPS fc_opencl_buffer op_registry program context)
#lite_cc_test(test_mul_buffer_opencl SRCS mul_buffer_compute_test.cc
# DEPS mul_opencl op_registry program context)
......@@ -168,4 +168,4 @@ lite_cc_test(test_fc_buffer_opencl SRCS fc_buffer_compute_test.cc
# DEPS elementwise_add_opencl op_registry program context)
lite_cc_test(test_io_copy_buffer_opencl SRCS io_copy_buffer_compute_test.cc
DEPS io_copy_opencl op_registry program context)
DEPS io_copy_opencl_buffer op_registry program context)
......@@ -77,7 +77,7 @@ class ActivationComputeImageDefault
scale_ = act_param_->hard_sigmoid_slope;
threshold_ = act_param_->hard_sigmoid_offset;
break;
defauln:
default:
LOG(FATAL) << "This act type:" << act_type << " doesn't support.";
return;
}
......
......@@ -40,7 +40,7 @@ class ConcatCompute : public KernelLite<TARGET(kOpenCL),
if (concat_param_->x.size() == 2) {
kernel_func_name_ = "concat2";
} else {
kernel_func_name_ = "concat_mul";
kernel_func_name_ = "concat_mul_buffer";
}
context.cl_context()->AddKernel(kernel_func_name_,
"buffer/concat_kernel.cl",
......@@ -86,7 +86,6 @@ class ConcatCompute : public KernelLite<TARGET(kOpenCL),
void Run() override {
auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.output->dims();
auto image_shape = InitImageDimInfoWith(x_dims);
auto* out_buf =
param.output->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
const auto& y_dims = param.output->dims(); // useless: check dim only
......@@ -98,8 +97,9 @@ class ConcatCompute : public KernelLite<TARGET(kOpenCL),
auto inputs = param.x;
int arg_idx = 0;
auto global_work_size = cl::NDRange{axis_size_};
auto global_work_size = cl::NDRange{static_cast<cl::size_type>(axis_size_)};
int total = axis_size_ * post_size_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
if (inputs.size() == 2) {
auto* x_buf0 = inputs[0]->data<float, cl::Buffer>();
......@@ -144,6 +144,15 @@ class ConcatCompute : public KernelLite<TARGET(kOpenCL),
auto* x_buf = inputs[i]->data<float, cl::Buffer>();
global_work_size = cl::NDRange{static_cast<size_t>(size)};
int total0 = size * post_size_;
#ifdef LITE_WITH_LOG
LOG(INFO) << "------------- i=" << i << " -------------";
LOG(INFO) << "pre_size:" << pre_size_;
LOG(INFO) << "post_size:" << post_size_;
LOG(INFO) << "size:" << size;
LOG(INFO) << "start:" << start;
LOG(INFO) << "total:" << total;
LOG(INFO) << "total0:" << total0;
#endif
cl_int status = kernel.setArg(arg_idx, *x_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf);
......
......@@ -99,13 +99,14 @@ TEST(opencl_concat_buffer, compute) {
auto *mapped_x2 = static_cast<float *>(
TargetWrapperCL::Map(x2_data, 0, sizeof(float) * x2_dim.production()));
for (int i = 0; i < x0_dim.production(); i++) {
mapped_x0[i] = dist(engine);
mapped_x0[i] = i + 1; // dist(engine);
}
for (int i = 0; i < x1_dim.production(); i++) {
mapped_x1[i] = dist(engine);
mapped_x1[i] = x0_dim.production() + i + 1; // dist(engine);
}
for (int i = 0; i < x2_dim.production(); i++) {
mapped_x2[i] = dist(engine);
mapped_x2[i] =
x0_dim.production() + x1_dim.production() + i + 1; // dist(engine);
}
// set param and kernel, then run
......@@ -151,9 +152,13 @@ TEST(opencl_concat_buffer, compute) {
auto *out_data = out.mutable_data<float, cl::Buffer>();
auto *mapped_out = static_cast<float *>(
TargetWrapperCL::Map(out_data, 0, sizeof(float) * out_dim.production()));
#ifdef PRINT_RESULT_CONCAT_BUFFER
for (int i = 0; i < out_dim.production(); i++) {
EXPECT_NEAR(mapped_out[i], out_ref_data[i], 1e-6);
LOG(INFO) << "i:" << i << ", out[" << i << "]:" << mapped_out[i]
<< ", out_ref_data[" << i << "]:" << out_ref_data[i];
}
#endif
EXPECT_NEAR(mapped_out[i], out_ref_data[i], 1e-6);
TargetWrapperCL::Unmap(out_data, mapped_out);
TargetWrapperCL::Unmap(x0_data, mapped_x0);
TargetWrapperCL::Unmap(x1_data, mapped_x1);
......
......@@ -38,72 +38,89 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>();
concat_param_ = param_.get_mutable<param_t>();
if (concat_param_->x.size() == 2) {
kernel_func_name_ = "concat2";
} else {
kernel_func_name_ = "concat_mul";
}
VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
context.cl_context()->AddKernel(kernel_func_name_,
"image/concat_kernel.cl",
build_options_,
time_stamp_);
auto axis = concat_param_->axis;
auto inputs = concat_param_->x;
auto out_dims = concat_param_->output->dims();
auto axis_ = concat_param_->axis;
auto output_tensor_dims = concat_param_->output->dims();
auto* axis_tensor = concat_param_->axis_tensor;
if (axis_tensor != nullptr) {
// auto* axis_tensor_data = axis_tensor->data<int>(TARGET(kARM));
// axis = axis_tensor_data[0];
}
if (inputs.size() == 2) {
kernel_func_name_ = "concat2";
} else if (inputs.size() == 3) {
kernel_func_name_ = "concatByCWith3Inputs";
} else if (inputs.size() == 4) {
kernel_func_name_ = "concatByCWith4Inputs";
} else {
// note: do layout transform between image and buffer,
// before and after concat(buffer impl.)
kernel_func_name_ = "concat_mul_buffer"; // buffer/concat_kernel.cl
build_options_ = " -DCL_DTYPE_float";
auto in_dims = inputs[0]->dims();
axis_size_ = out_dims[axis];
axis_ = axis;
if (out_dims.size() < 4) {
if (out_dims.size() - axis == 1) {
for (int i = 0; i < axis_; i++) {
pre_size_ *= in_dims[i];
}
for (int i = axis_ + 1; i < in_dims.size(); i++) {
post_size_ *= in_dims[i];
}
}
VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
context.cl_context()->AddKernel(kernel_func_name_,
(kernel_func_name_ == "concat_mul_buffer")
? "buffer/concat_kernel.cl"
: "image/concat_kernel.cl",
build_options_,
time_stamp_);
if (output_tensor_dims.size() < 4) {
if (output_tensor_dims.size() - axis_ == 1) {
// width
width_ = out_dims[1]; // c
width_ = output_tensor_dims[1]; // c
flag_ = 3;
} else {
// height
width_ = out_dims[0]; // n
width_ = output_tensor_dims[0]; // n
flag_ = 2;
}
} else {
switch (axis_) {
case 0:
width_ = out_dims[2]; // h
width_ = output_tensor_dims[2]; // h
flag_ = 0;
break;
case 1: // channel
width_ = out_dims[3]; // w
width_ = output_tensor_dims[3]; // w
flag_ = 1;
break;
case 2: // height
width_ = out_dims[0]; // n
width_ = output_tensor_dims[0]; // n
flag_ = 2;
break;
case 3:
case -1: // width
width_ = out_dims[1]; // c
width_ = output_tensor_dims[1]; // c
flag_ = 3;
break;
default:
printf("this axis: %d does not support \n", axis_);
LOG(FATAL) << "Unsupported axis:" << axis_;
}
}
auto input0_tensor_dims = inputs[0]->dims();
for (int i = 1; i < inputs.size(); i++) {
auto dims = inputs[i]->dims();
// auto flag = CHECK_EQ_OR_FALSE(in_dims.size(), dims.size());
if (in_dims.size() != dims.size()) {
// auto flag = CHECK_EQ_OR_FALSE(input0_tensor_dims.size(), dims.size());
if (input0_tensor_dims.size() != dims.size()) {
printf("input shape must be same \n");
return;
}
for (int i = 0; i < dims.size(); i++) {
if (i != axis) {
if (in_dims[i] != dims[i]) {
if (i != axis_) {
if (input0_tensor_dims[i] != dims[i]) {
printf("input shape must be same \n");
return;
}
......@@ -113,21 +130,22 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
}
void Run() override {
auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.output->dims();
auto image_shape = InitImageDimInfoWith(x_dims);
auto* out_buf = param.output->mutable_data<half_t, cl::Image2D>(
image_shape["width"], image_shape["height"]);
const auto& y_dims = param.output->dims(); // useless: check dim only
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
const auto& output_tensor_dims = concat_param_->output->dims();
int output_tensor_w = output_tensor_dims[output_tensor_dims.size() - 1];
int output_tensor_c = output_tensor_dims[1];
auto output_image_shape = InitImageDimInfoWith(output_tensor_dims);
auto* output_image_p =
concat_param_->output->mutable_data<half_t, cl::Image2D>(
output_image_shape["width"], output_image_shape["height"]);
auto inputs = concat_param_->x;
auto inputs = param.x;
int arg_idx = 0;
int width = inputs[0]->dims()[inputs[0]->dims().size() - 1];
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(
output_tensor_dims[output_tensor_dims.size() - 1]),
static_cast<cl::size_type>(
output_image_shape["width"] /
output_tensor_dims[output_tensor_dims.size() - 1]),
static_cast<cl::size_type>(output_image_shape["height"])};
#ifdef LITE_WITH_LOG
VLOG(4) << "concat input shape: ";
......@@ -141,55 +159,54 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
VLOG(4) << "concat output shape: ";
VLOG(4) << " out dims: "
<< "[" << x_dims.size() << "D]:" << x_dims[0] << " " << x_dims[1]
<< " " << x_dims[2] << " " << x_dims[3];
<< "[" << output_tensor_dims.size()
<< "D]:" << output_tensor_dims[0] << " " << output_tensor_dims[1]
<< " " << output_tensor_dims[2] << " " << output_tensor_dims[3];
VLOG(4) << "axis_: " << axis_;
VLOG(4) << "flag_: " << flag_;
#endif
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(x_dims[x_dims.size() - 1]),
static_cast<cl::size_type>(image_shape["width"] /
x_dims[x_dims.size() - 1]),
static_cast<cl::size_type>(image_shape["height"])};
#ifdef LITE_WITH_LOG
VLOG(4) << TargetToStr(param.output->target());
VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
<< image_shape["height"];
VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
<< x_dims[1] << " " << x_dims[2] << " " << x_dims[3]
<< "x_dims[x_dims.size() - 1]" << x_dims[x_dims.size() - 1];
VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
<< y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
VLOG(4) << "width_: " << width_ << ", flag_: " << flag_;
VLOG(4) << "global_work_size: " << x_dims[x_dims.size() - 1] << " "
<< (image_shape["width"] / x_dims[x_dims.size() - 1]) << " "
<< (image_shape["height"]);
VLOG(4) << TargetToStr(concat_param_->output->target());
VLOG(4) << "output_image_shape(w,h):" << output_image_shape["width"] << " "
<< output_image_shape["height"];
VLOG(4) << "output_tensor_dims[" << output_tensor_dims.size()
<< "D]:" << output_tensor_dims[0] << " " << output_tensor_dims[1]
<< " " << output_tensor_dims[2] << " " << output_tensor_dims[3]
<< "output_tensor_dims[output_tensor_dims.size() - 1]"
<< output_tensor_dims[output_tensor_dims.size() - 1];
VLOG(4) << "output_tensor_w: " << output_tensor_w << ", flag_: " << flag_;
VLOG(4) << "width_:" << width_;
VLOG(4) << "global_work_size: "
<< output_tensor_dims[output_tensor_dims.size() - 1] << " "
<< (output_image_shape["width"] /
output_tensor_dims[output_tensor_dims.size() - 1])
<< " " << (output_image_shape["height"]);
#endif
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int out_w = x_dims[x_dims.size() - 1];
int out_c = x_dims[1];
if (inputs.size() == 2) {
auto* x_buf0 = inputs[0]->data<half_t, cl::Image2D>();
auto* x_buf1 = inputs[1]->data<half_t, cl::Image2D>();
cl_int status = kernel.setArg(arg_idx, *x_buf0);
if (kernel_func_name_ == "concat2") {
auto* input0_image_p = inputs[0]->data<half_t, cl::Image2D>();
auto* input1_image_p = inputs[1]->data<half_t, cl::Image2D>();
int input0_axis_dims = inputs[0]->dims()[axis_];
cl_int status = kernel.setArg(0, *input0_image_p);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *x_buf1);
status = kernel.setArg(1, *input1_image_p);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf);
status = kernel.setArg(2, *output_image_p);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, flag_);
status = kernel.setArg(3, flag_);
CL_CHECK_FATAL(status);
status =
kernel.setArg(++arg_idx, static_cast<int>(inputs[0]->dims()[axis_]));
status = kernel.setArg(4, input0_axis_dims);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, out_c);
status = kernel.setArg(5, output_tensor_c);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, out_w);
status = kernel.setArg(6, output_tensor_w);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, width_);
status = kernel.setArg(7, width_);
CL_CHECK_FATAL(status);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
......@@ -200,51 +217,209 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
nullptr,
nullptr);
CL_CHECK_FATAL(status);
} else {
auto start = 0;
for (int i = 0; i < inputs.size(); i++) {
arg_idx = 0;
auto in_dims = inputs[i]->dims();
image_shape = InitImageDimInfoWith(in_dims);
auto* x_buf = inputs[i]->data<half_t, cl::Image2D>();
int in_w = in_dims[in_dims.size() - 1];
} else if (kernel_func_name_ == "concatByCWith3Inputs" ||
kernel_func_name_ == "concatByCWith4Inputs") {
auto* input0 = inputs[0];
auto* input0_image_p = input0->data<half_t, cl::Image2D>();
int input0_tensor_c = input0->dims()[1];
auto* input1 = inputs.size() >= 2 ? inputs[1] : nullptr;
auto* input1_image_p =
input1 ? input1->data<half_t, cl::Image2D>() : nullptr;
int input1_tensor_c = input1 ? input1->dims()[1] : -1;
auto* input2 = inputs.size() >= 3 ? inputs[2] : nullptr;
auto* input2_image_p =
input2 ? input2->data<half_t, cl::Image2D>() : nullptr;
int input2_tensor_c = input2 ? input2->dims()[1] : -1;
auto* input3 = inputs.size() >= 4 ? inputs[3] : nullptr;
auto* input3_image_p =
input3 ? input3->data<half_t, cl::Image2D>() : nullptr;
int input3_tensor_c = input3 ? input3->dims()[1] : -1;
int output_tensor_c = output_tensor_dims[1];
int output_tensor_w = output_tensor_dims[3];
const std::vector<size_t>& default_work_size = DefaultWorkSize(
output_tensor_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(output_image_shape["width"]),
static_cast<int64_t>(output_image_shape["height"])}));
cl::NDRange global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size[0]),
static_cast<size_t>(default_work_size[1]),
static_cast<size_t>(default_work_size[2])};
cl_int status;
status = kernel.setArg(0, *output_image_p);
CL_CHECK_FATAL(status);
status = kernel.setArg(1, output_tensor_c);
CL_CHECK_FATAL(status);
status = kernel.setArg(2, output_tensor_w);
CL_CHECK_FATAL(status);
status = kernel.setArg(3, *input0_image_p);
CL_CHECK_FATAL(status);
status = kernel.setArg(4, input0_tensor_c);
CL_CHECK_FATAL(status);
status = kernel.setArg(5, *input1_image_p);
CL_CHECK_FATAL(status);
status = kernel.setArg(6, input1_tensor_c);
CL_CHECK_FATAL(status);
if (inputs.size() >= 3) {
status = kernel.setArg(7, *input2_image_p);
CL_CHECK_FATAL(status);
status = kernel.setArg(8, input2_tensor_c);
CL_CHECK_FATAL(status);
}
if (inputs.size() == 4) {
status = kernel.setArg(9, *input3_image_p);
CL_CHECK_FATAL(status);
status = kernel.setArg(10, input3_tensor_c);
CL_CHECK_FATAL(status);
}
status = EnqueueNDRangeKernel(context,
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_);
CL_CHECK_FATAL(status);
} else if (kernel_func_name_ == "concat_mul_buffer") { // inputs.size() > 4
// note: do image layout transform: image to buffer
size_t inputs_num = inputs.size();
std::vector<const cl::Image2D*> inputs_image_pointers(inputs_num);
std::vector<std::map<std::string, size_t>> inputs_image_shapes(
inputs_num);
std::vector<DDimLite> inputs_dims(inputs_num);
std::vector<cl::Buffer*> inputs_buffer_pointers(inputs_num);
for (int i = 0; i < inputs_num; i++) {
auto* input = inputs[i];
inputs_dims[i] = input->dims();
inputs_image_shapes[i] = InitImageDimInfoWith(input->dims());
inputs_image_pointers[i] = input->data<half_t, cl::Image2D>();
}
// step1. create kernels
// 1.1 img_to_buf
std::vector<std::list<std::unique_ptr<KernelBase>>>
img_to_buf_kernels_vec(inputs_num);
for (size_t i = 0; i < inputs_num; ++i) {
auto img_to_buf_kernels = KernelRegistry::Global().Create(
"layout", TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW));
img_to_buf_kernels_vec[i] = std::move(img_to_buf_kernels);
}
// 1.2 buf_to_img
std::list<std::unique_ptr<KernelBase>> buf_to_img_kernels =
KernelRegistry::Global().Create("layout",
TARGET(kOpenCL),
PRECISION(kAny),
DATALAYOUT(kImageDefault));
// step2. get real kernel
// 2.1 img_to_buf
std::vector<std::unique_ptr<KernelBase>> img_to_buf_kernel_vec(
inputs_num);
for (size_t i = 0; i < inputs_num; ++i) {
img_to_buf_kernel_vec[i] = std::move(img_to_buf_kernels_vec[i].front());
}
// 2.2 buf_to_img
std::unique_ptr<KernelBase> buf_to_img_kernel =
std::move(buf_to_img_kernels.front());
// step3. create and set param, context to kernel
std::unique_ptr<KernelContext> kernel_context(new KernelContext);
kernel_context->As<OpenCLContext>().InitOnce();
// 3.1 img_to_buf
std::vector<operators::LayoutParam> img_to_buf_params(inputs_num);
std::vector<lite::Tensor> outputs_vec(inputs_num);
std::vector<cl::Buffer*> outputs_buffer_pointers(inputs_num);
for (size_t i = 0; i < inputs_num; ++i) {
img_to_buf_params[i].x = inputs[i];
img_to_buf_params[i].y = &outputs_vec[i];
outputs_vec[i].Resize(inputs_dims[i]);
outputs_buffer_pointers[i] =
outputs_vec[i].mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
img_to_buf_kernel_vec[i]->SetParam(img_to_buf_params[i]);
std::unique_ptr<KernelContext> img_to_buf_context(new KernelContext);
kernel_context->As<OpenCLContext>().CopySharedTo(
&(img_to_buf_context->As<OpenCLContext>()));
img_to_buf_kernel_vec[i]->SetContext(std::move(img_to_buf_context));
}
// 3.2 concat_mul_buf
std::shared_ptr<lite::Tensor> concat_mul_buf_output_t(new lite::Tensor);
concat_mul_buf_output_t->Resize(concat_param_->output->dims());
auto conat_mul_buf_output_data =
concat_mul_buf_output_t->mutable_data<float, cl::Buffer>(
TARGET(kOpenCL));
// 3.3 buf_to_img
std::shared_ptr<lite::Tensor> buf_to_img_output_t(new lite::Tensor);
buf_to_img_output_t->Resize(concat_param_->output->dims());
std::shared_ptr<operators::LayoutParam> buf_to_img_param(
new operators::LayoutParam);
buf_to_img_param->x = concat_mul_buf_output_t.get();
buf_to_img_param->y = concat_param_->output;
buf_to_img_kernel->SetParam(buf_to_img_param);
std::unique_ptr<KernelContext> buf_to_img_context(new KernelContext);
kernel_context->As<OpenCLContext>().CopySharedTo(
&(buf_to_img_context->As<OpenCLContext>()));
buf_to_img_kernel->SetContext(std::move(buf_to_img_context));
// step4. run kernels
// 4.1 run kernel: image->buffer
for (size_t i = 0; i < inputs_num; ++i) {
img_to_buf_kernel_vec[i]->Launch();
}
// 4.2 run kernel: concat_mul_buffer
int cur_axis_start_idx = 0;
int total = output_tensor_dims[axis_] * post_size_;
for (size_t i = 0; i < inputs_num; ++i) {
auto* x_buf = outputs_buffer_pointers[i];
int axis_dim_size = inputs[i]->dims()[axis_];
global_work_size = cl::NDRange{static_cast<size_t>(axis_dim_size)};
int total0 = axis_dim_size * post_size_;
#ifdef LITE_WITH_LOG
VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
<< image_shape["height"];
VLOG(2) << "--------------- i:" << i << " -----------------";
VLOG(2) << "post_size_:" << post_size_;
VLOG(2) << "pre_size_:" << pre_size_;
VLOG(2) << "axis_dim_size:" << axis_dim_size;
VLOG(2) << "cur_axis_start_idx:" << cur_axis_start_idx;
VLOG(2) << "total:" << total;
VLOG(2) << "total0:" << total0;
#endif
global_work_size =
cl::NDRange{static_cast<cl::size_type>(in_dims[in_dims.size() - 1]),
static_cast<cl::size_type>(image_shape["width"] /
in_dims[in_dims.size() - 1]),
static_cast<cl::size_type>(image_shape["height"])};
cl_int status = kernel.setArg(arg_idx, *x_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf);
cl_int status;
status = kernel.setArg(0, *x_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, flag_);
status = kernel.setArg(1, *conat_mul_buf_output_data);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, start);
status = kernel.setArg(2, axis_dim_size);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, out_c);
status = kernel.setArg(3, pre_size_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, out_w);
status = kernel.setArg(4, post_size_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, in_w);
status = kernel.setArg(5, cur_axis_start_idx);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, width_);
status = kernel.setArg(6, total);
CL_CHECK_FATAL(status);
status = kernel.setArg(7, total0);
CL_CHECK_FATAL(status);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
status = EnqueueNDRangeKernel(context,
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
nullptr);
event_);
CL_CHECK_FATAL(status);
start += inputs[i]->dims()[axis_];
cur_axis_start_idx += axis_dim_size;
}
// 4.3 run kernel: buffer->image
buf_to_img_kernel->Launch();
}
}
......@@ -258,10 +433,11 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
}
#endif
int axis_size_ = 1;
int axis_ = 1;
int flag_ = 1;
int width_ = 1;
int pre_size_ = 1;
int post_size_ = 1;
param_t* concat_param_{nullptr};
std::string kernel_func_name_{};
std::string build_options_{" -DCL_DTYPE_half"};
......
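To summarize the dispatch logic added in ConcatComputeImage above: two inputs use the image kernel concat2, three and four inputs use concatByCWith3Inputs/concatByCWith4Inputs, and anything larger falls back to the buffer kernel concat_mul_buffer wrapped by image-to-buffer and buffer-to-image layout kernels. A compact restatement of that rule (illustrative only, not part of this patch):

#include <cstddef>
#include <string>

// Kernel selection rule mirrored from ConcatComputeImage::PrepareForRun.
std::string SelectConcatKernel(size_t num_inputs) {
  if (num_inputs == 2) return "concat2";               // image/concat_kernel.cl
  if (num_inputs == 3) return "concatByCWith3Inputs";  // image/concat_kernel.cl
  if (num_inputs == 4) return "concatByCWith4Inputs";  // image/concat_kernel.cl
  return "concat_mul_buffer";  // buffer/concat_kernel.cl, with layout transforms
}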
......@@ -77,31 +77,32 @@ void concat_mul_compute_ref(std::vector<const dtype *> ins_data,
}
}
// #define LOOP_TEST
// #define PRINT_RESULT
TEST(concat_image2d, compute) {
// #define LOOP_TEST_CONCAT2
// #define PRINT_RESULT_CONCAT2
#define CHECK_RESULT_CONCAT2
TEST(concat_image2d_concat2, compute) {
LOG(INFO) << "main steps of test: host -> layout(buf2img) -> concat(img) -> "
"layout(img2buf) "
"-> host";
#ifdef LOOP_TEST
for (int n = 1; n <= 100; n += 33) {
for (auto c : {1, 3}) {
for (int h = 12; h <= 100; h += 13) {
for (int w = 12; w <= 100; w += 25) {
for (atuo &axis : {0, 1, 2, 3}) {
#else
#ifdef LOOP_TEST_CONCAT2
const int axis = 1;
const int n = 1;
const int c = 2;
const int h = 3;
const int w = 4;
for (int c = 1; c < 4; ++c) {
for (int h = 1; h < 50; ++h) {
for (int w = 1; w < 50; ++w) {
#else
const int axis = 1;
#endif // LOOP_TEST
LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c
<< " " << h << " " << w << " ========";
const int n = 1;
const int c = 3; // 1;
const int h = 15; // 2;
const int w = 15; // 42;
#endif // LOOP_TEST_CONCAT2
LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c << " "
<< h << " " << w << " ========";
LOG(INFO) << "======== axis: " << axis;
// set layout kernels
auto buf_to_img_kernels =
std::list<std::unique_ptr<KernelBase>> buf_to_img_kernels =
KernelRegistry::Global().Create("layout",
TARGET(kOpenCL),
PRECISION(kAny),
......@@ -127,7 +128,7 @@ TEST(concat_image2d, compute) {
auto buf_to_img_kernel1 = std::move(buf_to_img_kernels1.front());
auto img_to_buf_kernel = std::move(img_to_buf_kernels.front());
auto concat_img_kernel = std::move(concat_img_kernels.front());
LOG(INFO) << "get 1st kernel: " << buf_to_img_kernel->doc();
LOG(INFO) << "get 1st-0 kernel: " << buf_to_img_kernel->doc();
LOG(INFO) << "get 1st-1 kernel: " << buf_to_img_kernel1->doc();
LOG(INFO) << "get 2nd kernel: " << img_to_buf_kernel->doc();
LOG(INFO) << "get 3rd kernel: " << concat_img_kernel->doc();
......@@ -135,7 +136,8 @@ TEST(concat_image2d, compute) {
// set tensors about op param
LOG(INFO) << "set tensors about op param";
lite::Tensor x0, x1, y, concat_in0, concat_in1, concat_out, y_ref;
operators::LayoutParam BufferToImageParam0, BufferToImageParam1;
operators::LayoutParam BufferToImageParam0, BufferToImageParam1,
BufferToImageParam2;
operators::LayoutParam ImageToBufferParam;
BufferToImageParam0.x = &x0;
BufferToImageParam0.y = &concat_in0;
......@@ -151,10 +153,11 @@ TEST(concat_image2d, compute) {
concatParam.axis = axis;
concatParam.output = &concat_out;
const DDim x0_dim = DDim(std::vector<DDim::value_type>{n, c, h, w});
DDim x0_dim = DDim(std::vector<DDim::value_type>{n, c, h, w});
DDim x1_dim = DDim(std::vector<DDim::value_type>{n, c, h, w});
DDim out_dim = DDim(std::vector<DDim::value_type>{n, c, h, w});
x1_dim[axis] += 2;
// note: used to make cases with different channel
// x1_dim[axis] += 2;
out_dim[axis] = x0_dim[axis] + x1_dim[axis];
x0.Resize(x0_dim);
x1.Resize(x1_dim);
......@@ -183,26 +186,25 @@ TEST(concat_image2d, compute) {
auto *mapped_y = static_cast<float *>(TargetWrapperCL::Map(
y_data, 0, sizeof(float) * out_dim.production()));
for (int i = 0; i < x0_dim.production(); ++i) {
mapped_x0[i] = static_cast<int>(i) - x0_dim.production() / 2;
mapped_x0[i] = i + 1; // (i+1) * 0.01;//*/static_cast<int>(i) -
// x0_dim.production() / 2 * 0.00987;
}
for (int i = 0; i < x1_dim.production(); ++i) {
mapped_x1[i] = static_cast<int>(i) - x1_dim.production() / 2;
mapped_x1[i] = x0_dim.production() + i +
1; // (i+1) * 0.1;//*/static_cast<int>(i) -
// x1_dim.production() / 2 * 0.00987;
}
for (int i = 0; i < out_dim.production(); ++i) {
mapped_y[i] = static_cast<int>(0);
}
auto *concat_in_data0 =
concat_in0.mutable_data<half_t, cl::Image2D>(
auto *concat_in_data0 = concat_in0.mutable_data<half_t, cl::Image2D>(
concat_image2d_shape_in0["width"],
concat_image2d_shape_in0["height"]);
auto *concat_in_data1 =
concat_in1.mutable_data<half_t, cl::Image2D>(
auto *concat_in_data1 = concat_in1.mutable_data<half_t, cl::Image2D>(
concat_image2d_shape_in1["width"],
concat_image2d_shape_in1["height"]);
auto *concat_out_data =
concat_out.mutable_data<half_t, cl::Image2D>(
concat_image2d_shape["width"],
concat_image2d_shape["height"]);
auto *concat_out_data = concat_out.mutable_data<half_t, cl::Image2D>(
concat_image2d_shape["width"], concat_image2d_shape["height"]);
// set context and kernel args
LOG(INFO) << "set context and kernel args";
......@@ -210,28 +212,25 @@ TEST(concat_image2d, compute) {
context->As<OpenCLContext>().InitOnce();
buf_to_img_kernel->SetParam(BufferToImageParam0);
std::unique_ptr<KernelContext> buf_to_img_context(
new KernelContext);
std::unique_ptr<KernelContext> buf_to_img_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(buf_to_img_context->As<OpenCLContext>()));
buf_to_img_kernel->SetContext(std::move(buf_to_img_context));
buf_to_img_kernel1->SetParam(BufferToImageParam1);
std::unique_ptr<KernelContext> buf_to_img_context1(
new KernelContext);
std::unique_ptr<KernelContext> buf_to_img_context1(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(buf_to_img_context1->As<OpenCLContext>()));
buf_to_img_kernel1->SetContext(std::move(buf_to_img_context1));
img_to_buf_kernel->SetParam(ImageToBufferParam);
std::unique_ptr<KernelContext> img_to_buf_context(
new KernelContext);
std::unique_ptr<KernelContext> img_to_buf_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(img_to_buf_context->As<OpenCLContext>()));
img_to_buf_kernel->SetContext(std::move(img_to_buf_context));
concat_img_kernel->SetParam(concatParam);
std::unique_ptr<KernelContext> concat_img_context(
new KernelContext);
std::unique_ptr<KernelContext> concat_img_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(concat_img_context->As<OpenCLContext>()));
concat_img_kernel->SetContext(std::move(concat_img_context));
......@@ -257,30 +256,38 @@ TEST(concat_image2d, compute) {
concat_mul_compute_ref<float>(
ins_ptr, in_dim, axis, out_dim, y_data_ref);
// result
#ifdef PRINT_RESULT
#ifdef PRINT_RESULT_CONCAT2
LOG(INFO) << "---- print kernel result (input -> output) ----";
for (int eidx = 0; eidx < out_dim.production(); ++eidx) {
std::cout << "x0[" << eidx << "]:" << mapped_x0[eidx] << ",\t x1["
<< eidx << "]:" << mapped_x1[eidx] << " -> y[" << eidx
<< "]:" << mapped_y[eidx] << "\t, y_ref[" << eidx
<< "]:" << y_data_ref[eidx] << ",\t IS_DIFF_PASSED:"
<< IS_DIFF_PASSED(
y_data_ref[eidx], mapped_y[eidx], FP16_MAX_DIFF)
<< std::endl;
for (int i = 0; i < out_dim.production(); ++i) {
auto abs_diff = abs(y_data_ref[i] - mapped_y[i]);
auto relative_diff =
COMPUTE_RELATIVE_DIFF(y_data_ref[i], mapped_y[i]);
LOG(INFO) << "idx:" << i << " mapped_y[" << i << "]:" << mapped_y[i]
<< " y_data_ref[" << i << "]:" << y_data_ref[i]
<< " abs_diff:" << abs_diff
<< " relative_diff:" << relative_diff
<< " FP16_MAX_DIFF:" << FP16_MAX_DIFF;
#if 0 // note: used to write to file and check
std::cout << "idx:" << i << " mapped_y[" << i
<< "]:" << mapped_y[i] << " y_data_ref[" << i
<< "]:" << y_data_ref[i] << " abs_diff:" << abs_diff
<< " relative_diff:" << relative_diff
<< " FP16_MAX_DIFF:" << FP16_MAX_DIFF << "\n";
#endif
}
#endif // PRINT_RESULT
#endif // PRINT_RESULT_CONCAT2
#ifdef CHECK_RESULT_CONCAT2
// check result: compare kernel output and cpu output(y_data_ref)
for (int i = 0; i < out_dim.production(); i++) {
auto abs_diff = abs(y_data_ref[i] - mapped_y[i]);
auto relative_diff =
COMPUTE_RELATIVE_DIFF(y_data_ref[i], mapped_y[i]);
EXPECT_EQ((relative_diff <= FP16_MAX_DIFF) ||
(abs_diff <= FP16_MAX_DIFF),
EXPECT_EQ(
(relative_diff <= FP16_MAX_DIFF) || (abs_diff <= FP16_MAX_DIFF),
true);
if ((relative_diff > FP16_MAX_DIFF) &&
(abs_diff > FP16_MAX_DIFF)) {
LOG(ERROR) << "error idx:" << i << " mapped_y[" << i
if ((relative_diff > FP16_MAX_DIFF) && (abs_diff > FP16_MAX_DIFF)) {
LOG(FATAL) << "error idx:" << i << " mapped_y[" << i
<< "]:" << mapped_y[i] << " y_data_ref[" << i
<< "]:" << y_data_ref[i] << " abs_diff:" << abs_diff
<< " relative_diff:" << relative_diff
......@@ -288,18 +295,283 @@ TEST(concat_image2d, compute) {
break;
}
}
#endif
// free
LOG(INFO) << "free: unmap x, y";
TargetWrapperCL::Unmap(x_data0, mapped_x0);
TargetWrapperCL::Unmap(x_data1, mapped_x1);
TargetWrapperCL::Unmap(y_data, mapped_y);
#ifdef LOOP_TEST_CONCAT2
} // w
} // h
} // c
#else
// nothing to do.
#endif
}
// #define LOOP_TEST_CONCAT_MUL
// #define PRINT_RESULT_CONCAT_MUL
#define CHECK_RESULT_CONCAT_MUL
TEST(concat_image2d_multi, compute) {
LOG(INFO) << "main steps of test: host -> layout(buf2img) -> concat(img) -> "
"layout(img2buf) "
"-> host";
#ifdef LOOP_TEST_CONCAT_MUL
const int axis = 1;
const int n = 1;
for (int c = 1; c < 4; ++c) {
for (int h = 1; h < 50; ++h) {
for (int w = 1; w < 50; ++w) {
#else
const int axis = 1;
const int n = 1;
const int c = 3;
const int h = 4;
const int w = 5;
#endif // LOOP_TEST_CONCAT_MUL
LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c << " "
<< h << " " << w << " ========";
LOG(INFO) << "======== axis: " << axis;
// set layout kernels
auto buf_to_img_kernels =
KernelRegistry::Global().Create("layout",
TARGET(kOpenCL),
PRECISION(kAny),
DATALAYOUT(kImageDefault));
auto buf_to_img_kernels1 =
KernelRegistry::Global().Create("layout",
TARGET(kOpenCL),
PRECISION(kAny),
DATALAYOUT(kImageDefault));
auto buf_to_img_kernels2 =
KernelRegistry::Global().Create("layout",
TARGET(kOpenCL),
PRECISION(kAny),
DATALAYOUT(kImageDefault));
auto img_to_buf_kernels = KernelRegistry::Global().Create(
"layout", TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW));
auto concat_img_kernels =
KernelRegistry::Global().Create("concat",
TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault));
ASSERT_FALSE(buf_to_img_kernels.empty());
ASSERT_FALSE(buf_to_img_kernels1.empty());
ASSERT_FALSE(buf_to_img_kernels2.empty());
ASSERT_FALSE(img_to_buf_kernels.empty());
ASSERT_FALSE(concat_img_kernels.empty());
auto buf_to_img_kernel = std::move(buf_to_img_kernels.front());
auto buf_to_img_kernel1 = std::move(buf_to_img_kernels1.front());
auto buf_to_img_kernel2 = std::move(buf_to_img_kernels2.front());
auto img_to_buf_kernel = std::move(img_to_buf_kernels.front());
auto concat_img_kernel = std::move(concat_img_kernels.front());
LOG(INFO) << "get 1st-0 kernel: " << buf_to_img_kernel->doc();
LOG(INFO) << "get 1st-1 kernel: " << buf_to_img_kernel1->doc();
LOG(INFO) << "get 1st-2 kernel: " << buf_to_img_kernel2->doc();
LOG(INFO) << "get 2nd kernel: " << img_to_buf_kernel->doc();
LOG(INFO) << "get 3rd kernel: " << concat_img_kernel->doc();
// set tensors about op param
LOG(INFO) << "set tensors about op param";
lite::Tensor x0, x1, x2, y, concat_in0, concat_in1, concat_in2,
concat_out, y_ref;
operators::LayoutParam BufferToImageParam0, BufferToImageParam1,
BufferToImageParam2;
operators::LayoutParam ImageToBufferParam;
BufferToImageParam0.x = &x0;
BufferToImageParam0.y = &concat_in0;
BufferToImageParam1.x = &x1;
BufferToImageParam1.y = &concat_in1;
BufferToImageParam2.x = &x2;
BufferToImageParam2.y = &concat_in2;
ImageToBufferParam.x = &concat_out;
ImageToBufferParam.y = &y;
std::vector<lite::Tensor *> ins;
operators::ConcatParam concatParam;
ins.push_back(&concat_in0);
ins.push_back(&concat_in1);
ins.push_back(&concat_in2);
concatParam.x = ins;
concatParam.axis = axis;
concatParam.output = &concat_out;
DDim x0_dim = DDim(std::vector<DDim::value_type>{n, c, h, w});
DDim x1_dim = DDim(std::vector<DDim::value_type>{n, c, h, w});
DDim x2_dim = DDim(std::vector<DDim::value_type>{n, c, h, w});
DDim out_dim = DDim(std::vector<DDim::value_type>{n, c * 3, h, w});
// x1_dim[axis] += 2;
out_dim[axis] = x0_dim[axis] + x1_dim[axis] + x2_dim[axis];
x0.Resize(x0_dim);
x1.Resize(x1_dim);
x2.Resize(x2_dim);
y.Resize(out_dim);
concat_in0.Resize(x0_dim);
concat_in1.Resize(x1_dim);
concat_in2.Resize(x2_dim);
concat_out.Resize(out_dim);
y_ref.Resize(out_dim);
auto concat_image2d_shape =
paddle::lite::kernels::opencl::InitImageDimInfoWith(out_dim);
auto concat_image2d_shape_in0 =
paddle::lite::kernels::opencl::InitImageDimInfoWith(x0_dim);
auto concat_image2d_shape_in1 =
paddle::lite::kernels::opencl::InitImageDimInfoWith(x1_dim);
auto concat_image2d_shape_in2 =
paddle::lite::kernels::opencl::InitImageDimInfoWith(x2_dim);
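          // InitImageDimInfoWith maps an NCHW shape onto a CL image2d; for the
          // default image layout this is expected to give
          // width = ceil(C / 4) * W and height = N * H, i.e. four channels
          // packed into each RGBA texel (assumed from the usual Paddle-Lite
          // image layout; the helper itself is not shown in this diff).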
// initialize tensors
LOG(INFO) << "initialize tensors";
auto *x_data0 = x0.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
auto *x_data1 = x1.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
auto *x_data2 = x2.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
auto *y_data = y.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
auto *y_data_ref = y_ref.mutable_data<float>(TARGET(kARM));
auto *mapped_x0 = static_cast<float *>(TargetWrapperCL::Map(
x_data0, 0, sizeof(float) * x0_dim.production()));
auto *mapped_x1 = static_cast<float *>(TargetWrapperCL::Map(
x_data1, 0, sizeof(float) * x1_dim.production()));
auto *mapped_x2 = static_cast<float *>(TargetWrapperCL::Map(
x_data2, 0, sizeof(float) * x2_dim.production()));
auto *mapped_y = static_cast<float *>(TargetWrapperCL::Map(
y_data, 0, sizeof(float) * out_dim.production()));
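          // The loops below fill the three inputs with the consecutive values
          // 1..c*h*w, c*h*w+1..2*c*h*w and 2*c*h*w+1..3*c*h*w. With n = 1 and
          // axis = 1, channel-wise concatenation should therefore reproduce
          // exactly the sequence 1..3*c*h*w in the output; mapped_y is zeroed
          // so any element the kernel fails to write stands out.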
          for (int i = 0; i < x0_dim.production(); ++i) {
            mapped_x0[i] = i + 1;
          }
          for (int i = 0; i < x1_dim.production(); ++i) {
            mapped_x1[i] = x0_dim.production() + i + 1;
          }
          for (int i = 0; i < x2_dim.production(); ++i) {
            mapped_x2[i] = x0_dim.production() + x1_dim.production() + i + 1;
          }
          for (int i = 0; i < out_dim.production(); ++i) {
            mapped_y[i] = 0.f;
          }
auto *concat_in_data0 = concat_in0.mutable_data<half_t, cl::Image2D>(
concat_image2d_shape_in0["width"],
concat_image2d_shape_in0["height"]);
auto *concat_in_data1 = concat_in1.mutable_data<half_t, cl::Image2D>(
concat_image2d_shape_in1["width"],
concat_image2d_shape_in1["height"]);
auto *concat_in_data2 = concat_in2.mutable_data<half_t, cl::Image2D>(
concat_image2d_shape_in2["width"],
concat_image2d_shape_in2["height"]);
auto *concat_out_data = concat_out.mutable_data<half_t, cl::Image2D>(
concat_image2d_shape["width"], concat_image2d_shape["height"]);
// set context and kernel args
LOG(INFO) << "set context and kernel args";
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
buf_to_img_kernel->SetParam(BufferToImageParam0);
std::unique_ptr<KernelContext> buf_to_img_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(buf_to_img_context->As<OpenCLContext>()));
buf_to_img_kernel->SetContext(std::move(buf_to_img_context));
buf_to_img_kernel1->SetParam(BufferToImageParam1);
std::unique_ptr<KernelContext> buf_to_img_context1(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(buf_to_img_context1->As<OpenCLContext>()));
buf_to_img_kernel1->SetContext(std::move(buf_to_img_context1));
buf_to_img_kernel2->SetParam(BufferToImageParam2);
std::unique_ptr<KernelContext> buf_to_img_context2(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(buf_to_img_context2->As<OpenCLContext>()));
buf_to_img_kernel2->SetContext(std::move(buf_to_img_context2));
img_to_buf_kernel->SetParam(ImageToBufferParam);
std::unique_ptr<KernelContext> img_to_buf_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(img_to_buf_context->As<OpenCLContext>()));
img_to_buf_kernel->SetContext(std::move(img_to_buf_context));
concat_img_kernel->SetParam(concatParam);
std::unique_ptr<KernelContext> concat_img_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(concat_img_context->As<OpenCLContext>()));
concat_img_kernel->SetContext(std::move(concat_img_context));
// run kernels
LOG(INFO) << "run kernel: buf_to_img_kernel";
buf_to_img_kernel->Launch();
buf_to_img_kernel1->Launch();
buf_to_img_kernel2->Launch();
LOG(INFO) << "run kernel: concat_img_kernel";
concat_img_kernel->Launch();
LOG(INFO) << "run kernel: img_to_buf_kernel";
img_to_buf_kernel->Launch();
CLRuntime::Global()->command_queue().finish();
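          // finish() blocks until the queued layout and concat kernels have
          // completed, so the host-side reference computation and the checks
          // below compare against final device results in mapped_y.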
          // compute reference result on CPU
std::vector<const float *> ins_ptr;
std::vector<const DDim> in_dim;
ins_ptr.push_back(mapped_x0);
ins_ptr.push_back(mapped_x1);
ins_ptr.push_back(mapped_x2);
in_dim.push_back(x0_dim);
in_dim.push_back(x1_dim);
in_dim.push_back(x2_dim);
concat_mul_compute_ref<float>(
ins_ptr, in_dim, axis, out_dim, y_data_ref);
// result
#ifdef PRINT_RESULT_CONCAT_MUL
LOG(INFO) << "---- print kernel result (input -> output) ----";
for (int i = 0; i < out_dim.production(); ++i) {
auto abs_diff = abs(y_data_ref[i] - mapped_y[i]);
auto relative_diff =
COMPUTE_RELATIVE_DIFF(y_data_ref[i], mapped_y[i]);
LOG(INFO) << "idx:" << i << " mapped_y[" << i << "]:" << mapped_y[i]
<< " y_data_ref[" << i << "]:" << y_data_ref[i]
<< " abs_diff:" << abs_diff
<< " relative_diff:" << relative_diff
<< " FP16_MAX_DIFF:" << FP16_MAX_DIFF;
#if 0 // note: used to write to file and check
std::cout << "idx:" << i << " mapped_y[" << i
<< "]:" << mapped_y[i] << " y_data_ref[" << i
<< "]:" << y_data_ref[i] << " abs_diff:" << abs_diff
<< " relative_diff:" << relative_diff
<< " FP16_MAX_DIFF:" << FP16_MAX_DIFF << "\n";
#endif
}
#endif // PRINT_RESULT_CONCAT_MUL
#ifdef CHECK_RESULT_CONCAT_MUL
// check result: compare kernel output and cpu output(y_data_ref)
for (int i = 0; i < out_dim.production(); i++) {
auto abs_diff = abs(y_data_ref[i] - mapped_y[i]);
auto relative_diff =
COMPUTE_RELATIVE_DIFF(y_data_ref[i], mapped_y[i]);
if ((relative_diff > FP16_MAX_DIFF) && (abs_diff > FP16_MAX_DIFF)) {
LOG(FATAL) << "error idx:" << i << " mapped_y[" << i
<< "]:" << mapped_y[i] << " y_data_ref[" << i
<< "]:" << y_data_ref[i] << " abs_diff:" << abs_diff
<< " relative_diff:" << relative_diff
<< " FP16_MAX_DIFF:" << FP16_MAX_DIFF;
break;
}
}
#endif
// free
LOG(INFO) << "free: unmap x, y";
TargetWrapperCL::Unmap(x_data0, mapped_x0);
TargetWrapperCL::Unmap(x_data1, mapped_x1);
TargetWrapperCL::Unmap(x_data2, mapped_x2);
TargetWrapperCL::Unmap(y_data, mapped_y);
#ifdef LOOP_TEST
} // axis
#ifdef LOOP_TEST_CONCAT_MUL
} // w
} // h
} // c
} // n
#else
// nothing to do.
#endif
......
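concat_mul_compute_ref<float> above supplies the CPU reference; it is defined elsewhere in the test sources and does not appear in this diff. Below is a minimal sketch of the behaviour it is assumed to implement for NCHW tensors concatenated along `axis`; the name and signature here are illustrative only.

#include <cstdint>
#include <vector>

// Copy each input's axis-slices into the output, walking the dimensions
// before `axis` (outer) and after `axis` (inner).
template <typename T>
void ConcatRefSketch(const std::vector<const T*>& ins,
                     const std::vector<std::vector<int64_t>>& in_dims,
                     int axis,
                     const std::vector<int64_t>& out_dim,
                     T* out) {
  int64_t outer = 1, inner = 1;
  for (int i = 0; i < axis; ++i) outer *= out_dim[i];
  for (size_t i = axis + 1; i < out_dim.size(); ++i) inner *= out_dim[i];
  const int64_t out_axis = out_dim[axis];
  int64_t axis_offset = 0;
  for (size_t n = 0; n < ins.size(); ++n) {
    const int64_t in_axis = in_dims[n][axis];
    for (int64_t o = 0; o < outer; ++o) {
      for (int64_t a = 0; a < in_axis; ++a) {
        const T* src = ins[n] + (o * in_axis + a) * inner;
        T* dst = out + (o * out_axis + axis_offset + a) * inner;
        for (int64_t k = 0; k < inner; ++k) dst[k] = src[k];
      }
    }
    axis_offset += in_axis;
  }
}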
......@@ -185,47 +185,29 @@ void ConvImageCompute::PrepareForRun() {
impl_ = &ConvImageCompute::DepthwiseConv2d;
} else if (filter_tensor_h_ == 3 && filter_tensor_w_ == 3) {
// #define CONV3x3OPT_FALL_BACK
#ifndef CONV3x3OPT_FALL_BACK
// conv2d_3x3
kernel_func_names_.push_back(input_tensor_n_ > 1 ? "conv2d_3x3_multi_batch"
: "conv2d_3x3_opt");
if (groups_ == 1) {
kernel_func_names_.push_back(
input_tensor_n_ > 1 ? "conv2d_3x3_multi_batch" : "conv2d_3x3_opt");
kernel_func_paths_.push_back("image/conv2d_3x3_opt_kernel.cl");
CLImageConverterFolder converter;
const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
filter_image_h_ = filter_image_dims[1];
filter_image_w_ = filter_image_dims[0];
tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4});
half_t* filter_image_data =
tensor_hold_filter_image_->mutable_data<half_t>();
converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims);
filter_gpu_image_->mutable_data<half_t, cl::Image2D>(
filter_image_w_, filter_image_h_, filter_image_data);
impl_ = &ConvImageCompute::Conv2d3x3opt;
#else
} else { // groups_ > 1
kernel_func_names_.push_back("conv2d_3x3");
kernel_func_paths_.push_back("image/conv2d_3x3_kernel.cl");
impl_ = &ConvImageCompute::Conv2d3x3;
}
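    // With the group check above, 3x3 convolutions with groups_ > 1 now fall
    // through to the generic conv2d_3x3 kernel instead of the groups == 1
    // optimized/multi-batch variants; the filter image conversion below is
    // shared by both paths.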
CLImageConverterFolder converter;
const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
filter_image_h_ = filter_image_dims[1];
filter_image_w_ = filter_image_dims[0];
tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4});
half_t* filter_image_data =
tensor_hold_filter_image_->mutable_data<half_t>();
converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims);
filter_gpu_image_->mutable_data<half_t, cl::Image2D>(
filter_image_w_, filter_image_h_, filter_image_data);
impl_ = &ConvImageCompute::Conv2d3x3;
#endif
#undef CONV3x3OPT_FALL_BACK
} else if (filter_tensor_h_ == 5 && filter_tensor_w_ == 5) {
#define CONV_5x5_OPT
#ifndef CONV_5x5_OPT
......@@ -584,6 +566,11 @@ void ConvImageCompute::GetGlobalWorkSize() {
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
input_c_block_ = static_cast<const int>((input_tensor_c_ + 3) / 4);
} else if (kernel_func_names_[0] == "conv2d_3x3") {
global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
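    // c_blk_/w_blk_/nh_blk_ are expected to be the output image extents:
    // roughly ceil(C_out / 4), W_out, and N * H_out respectively (assumed from
    // the surrounding GetGlobalWorkSize logic, which is largely collapsed in
    // this diff).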
} else if (kernel_func_names_[0] == "conv2d_3x3_multi_batch" ||
kernel_func_names_[0] == "conv2d_3x3_opt") {
int w_blk_size = 5;
......@@ -1185,6 +1172,56 @@ void ConvImageCompute::PrintConvInfo() {
VLOG(4) << "dilations: " << dilation_h_ << ", " << dilation_w_;
VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << ","
<< global_work_size_[1] << "," << global_work_size_[2] << "}";
VLOG(4) << "groups_:" << groups_;
LOG(INFO) << "================================";
LOG(INFO) << "c_blk_=" << c_blk_ << ", w_blk_=" << w_blk_
<< ",nh_blk_=" << nh_blk_;
LOG(INFO) << "input_image_p_:" << input_image_p_;
LOG(INFO) << "filter_image_p_:" << filter_image_p_;
LOG(INFO) << "bias_image_p_:" << bias_image_p_;
LOG(INFO) << "output_image_p_:" << output_image_p_;
LOG(INFO) << "stride_h_:" << stride_h_;
LOG(INFO) << "stride_w_:" << stride_w_;
LOG(INFO) << "dilation_h_:" << dilation_h_;
LOG(INFO) << "dilation_w_:" << dilation_w_;
LOG(INFO) << "pad_up_:" << pad_up_;
LOG(INFO) << "pad_down_:" << pad_down_;
LOG(INFO) << "pad_left_:" << pad_left_;
LOG(INFO) << "pad_right_:" << pad_right_;
LOG(INFO) << "offset_:" << offset_;
LOG(INFO) << "groups_:" << groups_;
LOG(INFO) << "relu_fused_:" << relu_fused_;
LOG(INFO) << "has_bias_:" << has_bias_;
LOG(INFO) << "input_tensor_n_:" << input_tensor_n_;
LOG(INFO) << "input_tensor_c_:" << input_tensor_c_;
LOG(INFO) << "input_tensor_h_:" << input_tensor_h_;
LOG(INFO) << "input_tensor_w_:" << input_tensor_w_;
LOG(INFO) << "input_image_h_:" << input_image_h_;
LOG(INFO) << "input_image_w_:" << input_image_w_;
LOG(INFO) << "input_c_block_:" << input_c_block_;
LOG(INFO) << "output_tensor_n_:" << output_tensor_n_;
LOG(INFO) << "output_tensor_c_:" << output_tensor_c_;
LOG(INFO) << "output_tensor_h_:" << output_tensor_h_;
LOG(INFO) << "output_tensor_w_:" << output_tensor_w_;
LOG(INFO) << "output_image_h_:" << output_image_h_;
LOG(INFO) << "output_image_w_:" << output_image_w_;
LOG(INFO) << "filter_tensor_n_:" << filter_tensor_n_;
LOG(INFO) << "filter_tensor_c_:" << filter_tensor_c_;
LOG(INFO) << "filter_tensor_h_:" << filter_tensor_h_;
LOG(INFO) << "filter_tensor_w_:" << filter_tensor_w_;
LOG(INFO) << "filter_image_h_:" << filter_image_h_;
LOG(INFO) << "filter_image_w_:" << filter_image_w_;
LOG(INFO) << "bias_image_h_" << bias_image_h_;
LOG(INFO) << "bias_image_w_" << bias_image_w_;
}
double ConvImageCompute::Tune(int times) {
......
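The groups_ > 1 branch added to PrepareForRun above dispatches to the generic conv2d_3x3 kernel. The point of grouping is that input and output channels are split into `groups` equal slices and each output slice only convolves over its matching input slice. Below is a minimal CPU sketch of that channel partition (stride 1, no padding, no bias); it is purely illustrative and unrelated to the OpenCL image layout.

#include <vector>

// Grouped KxK convolution reference: output channel oc belongs to group
// g = oc / (c_out / groups) and only sees input channels
// [g * c_in / groups, (g + 1) * c_in / groups).
void GroupedConvRefSketch(const std::vector<float>& in,    // [c_in, h, w]
                          const std::vector<float>& filt,  // [c_out, c_in/groups, k, k]
                          std::vector<float>* out,         // [c_out, h-k+1, w-k+1]
                          int c_in, int h, int w,
                          int c_out, int k, int groups) {
  const int c_in_per_g = c_in / groups;
  const int c_out_per_g = c_out / groups;
  const int oh = h - k + 1;
  const int ow = w - k + 1;
  out->assign(static_cast<size_t>(c_out) * oh * ow, 0.f);
  for (int oc = 0; oc < c_out; ++oc) {
    const int g = oc / c_out_per_g;  // group of this output channel
    for (int y = 0; y < oh; ++y) {
      for (int x = 0; x < ow; ++x) {
        float acc = 0.f;
        for (int ic = 0; ic < c_in_per_g; ++ic) {
          const int in_c = g * c_in_per_g + ic;  // matching input slice
          for (int ky = 0; ky < k; ++ky) {
            for (int kx = 0; kx < k; ++kx) {
              acc += in[(in_c * h + y + ky) * w + (x + kx)] *
                     filt[((oc * c_in_per_g + ic) * k + ky) * k + kx];
            }
          }
        }
        (*out)[(oc * oh + y) * ow + x] = acc;
      }
    }
  }
}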