[LITE][OPENCL][Image]develop 1x1/5x5/7x7 routing in conv_compute ,tes… (#2818)

* [LITE][OPENCL][Image]develop 1x1/5x5/7x7 routing in conv_compute ,test=develop * [LITE][OPENCL][Image]develop 1x1/5x5/7x7 routing in conv_compute ,convert bias filter in prepare for run ,test=develop

[LITE][OPENCL][Image]develop 1x1/5x5/7x7 routing in conv_compute ,tes… (#2818)
* [LITE][OPENCL][Image]develop 1x1/5x5/7x7 routing in conv_compute ,test=develop * [LITE][OPENCL][Image]develop 1x1/5x5/7x7 routing in conv_compute ,convert bias filter in prepare for run ,test=develop
8cfd96f2 · xiebaiyuan · GitHub · 396fc6f0 · 8cfd96f2 · 8cfd96f2
7 changed file
--- a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl
@@ -360,12 +360,12 @@ __read_only image2d_t new_scale,
            READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
 #endif

-#ifdef RELU
+
  output0 = activation_type4(output0);
  output1 = activation_type4(output1);
  output2 = activation_type4(output2);
  output3 = activation_type4(output3);
-#endif
+

  if (out_w0 < old_w) {
    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos0, output0);

--- a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl
+#include <cl_common.h>
+
+__kernel void conv2d_5x5(__private const int global_size_dim0,
+                         __private const int global_size_dim1,
+                         __private const int global_size_dim2,
+                         __read_only image2d_t input_image,
+                         __read_only image2d_t filter_image,
+#if defined(BIASE_CH) || defined(BIASE_ELE)
+                         __read_only image2d_t bias,
+#endif
+#ifdef BATCH_NORM
+                         __read_only image2d_t new_scale,
+                         __read_only image2d_t new_biase,
+#endif
+                         __write_only image2d_t output_image,
+                         __private const int stride,
+                         __private const int offset,
+                         __private const int input_c,
+                         __private const int dilation,
+                         __private const int input_width,  /* of one block */
+                         __private const int input_height, /* of one block */
+                         __private const int output_width,
+                         __private const int output_height) {
+
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+
+  int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
+
+  if (out_c >= global_size_dim0 || out_w >= global_size_dim1 ||
+      out_nh >= global_size_dim2) {
+    return;
+  }
+
+  const int batch_index = out_nh / output_height;
+  const int out_nh_in_one_batch = out_nh % output_height;
+
+  const int filter_n0 = 4 * out_c + 0;
+  const int filter_n1 = 4 * out_c + 1;
+  const int filter_n2 = 4 * out_c + 2;
+  const int filter_n3 = 4 * out_c + 3;
+
+  int2 stride_xy;
+  stride_xy.x = stride;
+  stride_xy.y = stride;
+
+  int2 ouput_pos_in_one_block;
+  ouput_pos_in_one_block.x = out_w;
+  ouput_pos_in_one_block.y = out_nh_in_one_batch;
+
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+
+  int2 in_pos_in_one_block;
+  in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
+  in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
+
+#ifdef BIASE_CH
+  CL_DTYPE4 output =
+      READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0));
+#elif defined(BIASE_ELE)
+  CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos);
+#else
+  CL_DTYPE4 output = 0.0f;
+#endif
+
+  CL_DTYPE4 input;
+  CL_DTYPE4 filter[4];
+  int2 filter_pos0;
+  int2 filter_pos1;
+  int2 filter_pos2;
+  int2 filter_pos3;
+  for (int i = 0; i < input_c; ++i) {
+    int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x,
+                         in_pos_in_one_block.y + batch_index * input_height);
+    for (int j = 0; j < 5; j++) {
+      for (int k = 0; k < 5; k++) {
+        input = select(
+            READ_IMG_TYPE(CL_DTYPE_CHAR,
+                          input_image,
+                          sampler,
+                          (int2)(pos_in.x + (j - 2) * dilation,
+                                 pos_in.y + (k - 2) * dilation)),
+            (CL_DTYPE4)(0.0f),
+            (ushort4)(
+                (in_pos_in_one_block.x + (j - 2) * dilation < 0 ||
+                 in_pos_in_one_block.y + (k - 2) * dilation < 0 ||
+                 in_pos_in_one_block.x + (j - 2) * dilation >= input_width ||
+                 in_pos_in_one_block.y + (k - 2) * dilation >= input_height)
+                << 15));
+        int filter_h = k;
+        int filter_w = j;
+        int filter_c = i;
+
+        filter_pos0.x = filter_c * 5 + filter_w;
+        filter_pos0.y = filter_n0 * 5 + filter_h;
+
+        filter_pos1.x = filter_c * 5 + filter_w;
+        filter_pos1.y = filter_n1 * 5 + filter_h;
+
+        filter_pos2.x = filter_c * 5 + filter_w;
+        filter_pos2.y = filter_n2 * 5 + filter_h;
+
+        filter_pos3.x = filter_c * 5 + filter_w;
+        filter_pos3.y = filter_n3 * 5 + filter_h;
+
+        filter[0] =
+            READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos0);
+        filter[1] =
+            READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos1);
+        filter[2] =
+            READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos2);
+        filter[3] =
+            READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos3);
+
+        output.x += dot(input, filter[0]);
+        output.y += dot(input, filter[1]);
+        output.z += dot(input, filter[2]);
+        output.w += dot(input, filter[3]);
+        //
+        //        if (output_pos.x == 0 && output_pos.y == 5) {
+        //          printf("i,j,k ={ %d, %d , %d }\n", i,j,k);
+        //          printf("in={ %f , %f , %f , %f } \n",
+        //                 convert_float(input.x),
+        //                 convert_float(input.y),
+        //                 convert_float(input.z),
+        //                 convert_float(input.w));
+        //          printf("filter0={ %f , %f , %f , %f } \n",
+        //                 convert_float(filter[0].x),
+        //                 convert_float(filter[0].y),
+        //                 convert_float(filter[0].z),
+        //                 convert_float(filter[0].w));
+        //          printf("filter1={ %f , %f , %f , %f } \n",
+        //                 convert_float(filter[1].x),
+        //                 convert_float(filter[1].y),
+        //                 convert_float(filter[1].z),
+        //                 convert_float(filter[1].w));
+        //          printf("filter2={ %f , %f , %f , %f } \n",
+        //                 convert_float(filter[2].x),
+        //                 convert_float(filter[2].y),
+        //                 convert_float(filter[2].z),
+        //                 convert_float(filter[2].w));
+        //          printf("filter3={ %f , %f , %f , %f } \n",
+        //                 convert_float(filter[3].x),
+        //                 convert_float(filter[3].y),
+        //                 convert_float(filter[3].z),
+        //                 convert_float(filter[3].w));
+        //          printf("output={ %f , %f , %f , %f } \n",
+        //                 convert_float(output.x),
+        //                 convert_float(output.y),
+        //                 convert_float(output.z),
+        //                 convert_float(output.w));
+        //        }
+      }
+    }
+  }
+
+#ifdef BATCH_NORM
+        output =
+            output * READ_IMG_TYPE(
+                         CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
+            READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
+#endif
+
+        output = activation_type4(output);
+
+        WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output);
+      }
--- a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl
+#include <cl_common.h>
+
+__kernel void conv2d_7x7(__private const int global_size_dim0,
+                         __private const int global_size_dim1,
+                         __private const int global_size_dim2,
+                         __read_only image2d_t input_image,
+                         __read_only image2d_t filter_image,
+#if defined(BIASE_CH) || defined(BIASE_ELE)
+                         __read_only image2d_t bias,
+#endif
+#ifdef BATCH_NORM
+                         __read_only image2d_t new_scale,
+                         __read_only image2d_t new_biase,
+#endif
+                         __write_only image2d_t output_image,
+                         __private const int stride,
+                         __private const int offset,
+                         __private const int input_c,
+                         __private const int dilation,
+                         __private const int input_width,  /* of one block */
+                         __private const int input_height, /* of one block */
+                         __private const int output_width,
+                         __private const int output_height) {
+
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+
+  int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
+
+  if (out_c >= global_size_dim0 || out_w >= global_size_dim1 ||
+      out_nh >= global_size_dim2) {
+    return;
+  }
+
+  const int batch_index = out_nh / output_height;
+  const int out_nh_in_one_batch = out_nh % output_height;
+
+  const filter_n0 = 4 * out_c + 0;
+  const filter_n1 = 4 * out_c + 1;
+  const filter_n2 = 4 * out_c + 2;
+  const filter_n3 = 4 * out_c + 3;
+
+  int2 stride_xy;
+  stride_xy.x = stride;
+  stride_xy.y = stride;
+
+  int2 ouput_pos_in_one_block;
+  ouput_pos_in_one_block.x = out_w;
+  ouput_pos_in_one_block.y = out_nh_in_one_batch;
+
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+
+  int2 in_pos_in_one_block;
+  in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
+  in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
+
+#ifdef BIASE_CH
+  CL_DTYPE4 output =
+      READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0));
+#elif defined(BIASE_ELE)
+  CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos);
+#else
+  CL_DTYPE4 output = 0.0f;
+#endif
+
+  CL_DTYPE4 input;
+  CL_DTYPE4 filter[4];
+  int2 filter_pos0;
+  int2 filter_pos1;
+  int2 filter_pos2;
+  int2 filter_pos3;
+  for (int i = 0; i < input_c; ++i) {
+    int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x,
+                         in_pos_in_one_block.y + batch_index * input_height);
+    for (int j = 0; j < 7; j++) {
+      for (int k = 0; k < 7; k++) {
+        input = select(
+            READ_IMG_TYPE(CL_DTYPE_CHAR,
+                          input_image,
+                          sampler,
+                          (int2)(pos_in.x + (j - 3) * dilation,
+                                 pos_in.y + (k - 3) * dilation)),
+            (CL_DTYPE4)(0.0f),
+            (ushort4)(
+                (in_pos_in_one_block.x + (j - 3) * dilation < 0 ||
+                 in_pos_in_one_block.y + (k - 3) * dilation < 0 ||
+                 in_pos_in_one_block.x + (j - 3) * dilation >= input_width ||
+                 in_pos_in_one_block.y + (k - 3) * dilation >= input_height)
+                << 15));
+        int filter_h = k;
+        int filter_w = j;
+        int filter_c = i;
+
+        filter_pos0.x = filter_c * 7 + filter_w;
+        filter_pos0.y = filter_n0 * 7 + filter_h;
+
+        filter_pos1.x = filter_c * 7 + filter_w;
+        filter_pos1.y = filter_n1 * 7 + filter_h;
+
+        filter_pos2.x = filter_c * 7 + filter_w;
+        filter_pos2.y = filter_n2 * 7 + filter_h;
+
+        filter_pos3.x = filter_c * 7 + filter_w;
+        filter_pos3.y = filter_n3 * 7 + filter_h;
+
+        filter[0] =
+            READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos0);
+        filter[1] =
+            READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos1);
+        filter[2] =
+            READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos2);
+        filter[3] =
+            READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos3);
+
+        output.x += dot(input, filter[0]);
+        output.y += dot(input, filter[1]);
+        output.z += dot(input, filter[2]);
+        output.w += dot(input, filter[3]);
+      }
+    }
+  }
+
+#ifdef BATCH_NORM
+  output = output * READ_IMG_TYPE(
+                        CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
+           READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
+#endif
+
+  output = activation_type4(output);
+
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output);
+}
--- a/lite/kernels/opencl/CMakeLists.txt
+++ b/lite/kernels/opencl/CMakeLists.txt
@@ -17,7 +17,7 @@ add_kernel(relu_opencl OPENCL basic SRCS relu_compute.cc DEPS ${cl_kernel_deps})
 add_kernel(depthwise_conv2d_opencl OPENCL basic SRCS depthwise_conv2d_compute.cc DEPS ${cl_kernel_deps})
 #add_kernel(conv2d_1x1_opencl OPENCL basic SRCS conv2d_1x1_compute.cc DEPS ${cl_kernel_deps})
 add_kernel(reshape_opencl OPENCL basic SRCS reshape_compute.cc DEPS ${cl_kernel_deps})
-add_kernel(conv_opencl OPENCL basic SRCS conv_compute.cc DEPS ${cl_kernel_deps})
+add_kernel(conv_opencl OPENCL basic SRCS conv_compute.cc DEPS ${cl_kernel_deps} cl_image_converter)
 add_kernel(layout_opencl OPENCL basic SRCS layout_compute.cc DEPS ${cl_kernel_deps})

 lite_cc_test(test_elementwise_add_opencl SRCS elementwise_add_compute_test.cc
@@ -70,6 +70,10 @@ lite_cc_test(test_conv_opencl SRCS conv_compute_test.cc
             DEPS conv_opencl op_registry program context
             ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)

+lite_cc_test(test_conv_image2d_opencl SRCS conv_image2d_compute_test.cc
+        DEPS conv_opencl op_registry program context cl_image_converter
+        ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
+
 lite_cc_test(test_layout_opencl SRCS layout_compute_test.cc
             DEPS layout_opencl op_registry program context
             ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
--- a/lite/kernels/opencl/conv_compute.cc
+++ b/lite/kernels/opencl/conv_compute.cc
@@ -13,9 +13,13 @@
 // limitations under the License.

 #include "lite/kernels/opencl/conv_compute.h"
+
 #include <sstream>
+
+#include "lite/backends/opencl/cl_image_converter.h"
 #include "lite/backends/opencl/cl_include.h"
 #include "lite/core/op_registry.h"
+#include "lite/kernels/opencl/image_helper.h"
 #include "lite/operators/op_params.h"

 namespace paddle {
@@ -242,7 +246,6 @@ void ConvCompute::Conv2d1x1() {

  GemmBatched(kernel, x_d, filter_d, bias_d, output_d, batch_size, m, n, k);
 }
-
 // a: filter_d ==> <m, k> <=> <oc, ic>
 // b: x_d      ==> <k, n> <=> <ic, ih*iw>
 // c: output_d ==> <m, n> <=> <oc, ih*iw>
@@ -294,6 +297,582 @@ void ConvCompute::GemmBatched(cl::Kernel& kernel,

 void ConvCompute::Run() { (this->*impl_)(); }

+/* image kernel*/
+void ConvImageCompute::PrepareForRun() {
+  const auto& param = this->Param<param_t>();
+  auto x_dims = param.x->dims();
+  auto filter_dims = param.filter->dims();
+  auto output_dims = param.output->dims();
+
+  float* filter_cpu = param.filter->mutable_data<float>();
+  auto& context = ctx_->As<OpenCLContext>();
+  CHECK(context.cl_context() != nullptr);
+
+  int bs = x_dims[0];
+  int c_in = x_dims[1];
+  int h_out = output_dims[2];
+  int w_out = output_dims[3];
+  int kernel_h = filter_dims[2];  // oihw
+  int kernel_w = filter_dims[3];
+  auto paddings = *param.paddings;
+  auto dilations = *param.dilations;
+  int stride_h = param.strides[0];
+  int stride_w = param.strides[1];
+  int pad_h = paddings[0];
+  int pad_w = paddings[2];
+  int groups = param.groups;
+  bool relu_fused = param.fuse_relu;
+  bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1);
+  bool zero_pad = (pad_h == 0) && (pad_w == 0);
+
+  bool pad_equal =
+      ((paddings[0] == paddings[1]) && (paddings[1] == paddings[2]) &&
+       (paddings[2] == paddings[3]));
+  bool stride_equal = stride_h == stride_w;
+  bool dilation_equal = dilations[0] == dilations[1];
+
+  CHECK(pad_equal && stride_equal && dilation_equal);
+
+  VLOG(3) << "Is relu fused? / " << (relu_fused ? "Yes" : "No");
+  VLOG(3) << "groups:" << groups << " stride_h:" << stride_h
+          << " stride_w:" << stride_w << " pad_h:" << pad_h
+          << " pad_w:" << pad_w << " kernel_h:" << kernel_h
+          << " kernel_h:" << kernel_h;
+  VLOG(3) << "x_dims:" << x_dims[0] << " " << x_dims[1] << " " << x_dims[2]
+          << " " << x_dims[3];
+  VLOG(3) << "output_dims:" << output_dims[0] << " " << output_dims[1] << " "
+          << output_dims[2] << " " << output_dims[3];
+  VLOG(3) << "filter_dims:" << filter_dims[0] << " " << filter_dims[1] << " "
+          << filter_dims[2] << " " << filter_dims[3];
+  if (kernel_h == 1 && kernel_w == 1) {
+    // conv2d_1x1
+    if (param.x->dims()[1] % 4 == 0) {
+      kernel_func_names_.push_back("conv2d_1x1_simple");
+    } else {
+      kernel_func_names_.push_back("conv2d_1x1");
+    }
+    kernel_func_paths_.push_back("image/conv2d_1x1_kernel.cl");
+
+    CLImageConverterNWBlock converter;
+    const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
+    std::vector<float> filter_image_v(filter_image_dims[0] *
+                                      filter_image_dims[1] * 4);  // 4 : RGBA
+    converter.NCHWToImage(filter_cpu, filter_image_v.data(), filter_dims);
+    filter_gpu_image_.mutable_data<float, cl::Image2D>(
+        filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
+
+    impl_ = &ConvImageCompute::Conv2d1x1;
+  } else if (kernel_h == 5 && kernel_w == 5) {
+    // conv2d_5x5
+    kernel_func_names_.push_back("conv2d_5x5");
+    kernel_func_paths_.push_back("image/conv2d_5x5_kernel.cl");
+
+    CLImageConverterFolder converter;
+    const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
+    std::vector<float> filter_image_v(filter_image_dims[0] *
+                                      filter_image_dims[1] * 4);  // 4 : RGBA
+    converter.NCHWToImage(filter_cpu, filter_image_v.data(), filter_dims);
+    filter_gpu_image_.mutable_data<float, cl::Image2D>(
+        filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
+
+    impl_ = &ConvImageCompute::Conv2d5x5;
+  } else if (kernel_h == 7 && kernel_w == 7) {
+    // conv2d_7x7
+    kernel_func_names_.push_back("conv2d_7x7");
+    kernel_func_paths_.push_back("image/conv2d_7x7_kernel.cl");
+
+    CLImageConverterFolder converter;
+    const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
+    std::vector<float> filter_image_v(filter_image_dims[0] *
+                                      filter_image_dims[1] * 4);  // 4 : RGBA
+    converter.NCHWToImage(filter_cpu, filter_image_v.data(), filter_dims);
+    this->filter_gpu_image_.mutable_data<float, cl::Image2D>(
+        filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
+
+    impl_ = &ConvImageCompute::Conv2d7x7;
+  } else {
+    LOG(FATAL) << "conv image compute not support this condition yet! ";
+  }
+
+  std::string build_options_single(" -DCL_DTYPE_float");
+  // relu options
+  if (relu_fused) {
+    build_options_single += " -DRELU";
+  } else if (param.activation_param.active_type ==
+             lite_api::ActivationType::kRelu6) {
+    build_options_single += " -DRELU6";
+  } else {
+    // do nothing
+  }
+  // bias options
+  const bool has_bias = param.bias != nullptr;
+  const bool is_element_wise_bias =
+      has_bias && param.output->dims() == param.bias->dims();
+  if (has_bias) {
+    build_options_single +=
+        is_element_wise_bias ? " -DBIASE_ELE" : " -DBIASE_CH";
+
+    // convert cpu buffer bias --> gpu image
+    CLImageConverterFolder bias_converter;
+    const DDim& bias_image_dims =
+        bias_converter.InitImageDimInfoWith(param.bias->dims());
+    std::vector<float> bias_image_v(bias_image_dims[0] * bias_image_dims[1] *
+                                    4);
+    float* bias_cpu_data = param.bias->mutable_data<float>();
+    bias_converter.NCHWToImage(
+        bias_cpu_data, bias_image_v.data(), param.bias->dims());
+    this->bias_gpu_image_.mutable_data<float, cl::Image2D>(
+        bias_image_dims[0], bias_image_dims[1], bias_image_v.data());
+    // convert cpu buffer bias --> gpu image --- end ----
+  }
+
+  build_options_.push_back(build_options_single);
+
+  for (size_t i = 0; i < kernel_func_names_.size(); i++) {
+    context.cl_context()->AddKernel(
+        kernel_func_names_[i], kernel_func_paths_[i], build_options_[i]);
+  }
+}
+
+void ConvImageCompute::Conv2d1x1() {
+  const auto& param = *param_.get_mutable<param_t>();
+  auto input_dims = param.x->dims();
+  auto paddings = *param.paddings;
+  auto strides = param.strides;
+  auto* input_image = param.x->data<float, cl::Image2D>();
+  auto* filter_image = filter_gpu_image_.data<float, cl::Image2D>();
+  auto filter_dims = param.filter->dims();
+  auto output_dims = param.output->dims();
+
+  int input_width = input_dims[3];
+  int input_height = input_dims[2];
+  int output_width = output_dims[3];
+  int output_height = output_dims[2];
+  auto out_image_shape = InitImageDimInfoWith(output_dims);
+  auto* out_image = param.output->mutable_data<float, cl::Image2D>(
+      out_image_shape["width"], out_image_shape["height"]);
+
+  const bool has_bias = param.bias != nullptr;
+  const bool is_element_wise_bias =
+      has_bias && param.output->dims() == param.bias->dims();
+  int offset = static_cast<int>(param.filter->dims()[2]) / 2 -
+               static_cast<int>(paddings[0]);
+
+  // calc input_c_block
+  auto input_image_shape = InitImageDimInfoWith(input_dims);
+  int input_c_block = input_image_shape["width"] / input_dims[3];
+  int input_c = input_dims[1];
+  auto dilations = *param.dilations;
+
+  const std::vector<size_t>& default_work_size =
+      DefaultWorkSize(output_dims,
+                      DDim(std::vector<DDim::value_type>{
+                          static_cast<int64_t>(out_image_shape["width"]),
+                          static_cast<int64_t>(out_image_shape["height"])}));
+
+  int c_block = default_work_size[0];
+  int w = default_work_size[1];
+  int nh = default_work_size[2];
+
+  VLOG(4) << "============ conv2d_1x1 params ============";
+  VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
+          << input_image_shape["height"];
+  VLOG(4) << "input_c_block: " << input_c_block;
+  VLOG(4) << "input_c: " << input_c;
+  VLOG(4) << "input_image: " << input_image;
+  VLOG(4) << "filter_dims: " << filter_dims;
+  VLOG(4) << "filter_image: " << filter_image;
+  VLOG(4) << "output_dims: " << output_dims;
+  VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", "
+          << out_image_shape["height"];
+  VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1];
+  VLOG(4) << "has bias: " << has_bias;
+  VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias;
+  VLOG(4) << "strides: " << strides[0] << "," << strides[1];
+  VLOG(4) << "offset: " << offset;
+  VLOG(4) << "dilations.size : " << dilations.size();
+  VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
+  VLOG(4) << "default work size{c_block, w, nh}: "
+          << "{" << c_block << ", " << w << ", " << nh << ""
+          << "}";
+
+  CHECK_GE(dilations.size(), 2);
+  CHECK(dilations[0] == dilations[1]);
+  CHECK_GE(input_dims.size(), 4);
+  CHECK_GE(paddings.size(), 2);
+  CHECK(paddings[0] == paddings[1]);
+  CHECK_GE(strides.size(), 2);
+  CHECK(strides[0] == strides[1]);
+
+  // handle bias  use buffer for channel wise , use image for element wise
+  const cl::Buffer* bias_buf = nullptr;
+  const cl::Image2D* bias_image = nullptr;
+  if (has_bias) {
+    bias_image = bias_gpu_image_.data<float, cl::Image2D>();
+  }
+
+  auto& context = ctx_->As<OpenCLContext>();
+  CHECK(context.cl_context() != nullptr);
+  std::stringstream kernel_key;
+  kernel_key << kernel_func_names_[0] << build_options_[0];
+  auto kernel = context.cl_context()->GetKernel(kernel_key.str());
+  int maped_w = maptofactor(w, 4);
+
+  VLOG(4) << "kernel_key: " << kernel_key.str();
+  VLOG(4) << "kernel ready ... " << kernel_key.str();
+  VLOG(4) << "maped_w: " << maped_w;
+  VLOG(4) << "hasbias: " << has_bias;
+
+  cl_int status;
+  int arg_idx = 0;
+  status = kernel.setArg(arg_idx, c_block);
+  CL_CHECK_FATAL(status);
+  status = kernel.setArg(++arg_idx, maped_w);
+  CL_CHECK_FATAL(status);
+  status = kernel.setArg(++arg_idx, nh);
+  CL_CHECK_FATAL(status);
+  status = kernel.setArg(++arg_idx, *input_image);
+  CL_CHECK_FATAL(status);
+  status = kernel.setArg(++arg_idx, *filter_image);
+  CL_CHECK_FATAL(status);
+  if (has_bias) {
+    status = kernel.setArg(++arg_idx, *bias_image);
+    CL_CHECK_FATAL(status);
+  }
+  status = kernel.setArg(++arg_idx, *out_image);
+  CL_CHECK_FATAL(status);
+  status = kernel.setArg(++arg_idx, strides[0]);
+  CL_CHECK_FATAL(status);
+
+  status = kernel.setArg(++arg_idx, offset);
+  CL_CHECK_FATAL(status);
+  status = kernel.setArg(++arg_idx, input_c_block);
+  CL_CHECK_FATAL(status);
+  status = kernel.setArg(++arg_idx, input_c);
+  CL_CHECK_FATAL(status);
+  status = kernel.setArg(++arg_idx, dilations[0]);
+  CL_CHECK_FATAL(status);
+  status = kernel.setArg(++arg_idx, input_width);
+  CL_CHECK_FATAL(status);
+  status = kernel.setArg(++arg_idx, input_height);
+  CL_CHECK_FATAL(status);
+  status = kernel.setArg(++arg_idx, output_width);
+  CL_CHECK_FATAL(status);
+  status = kernel.setArg(++arg_idx, output_height);
+  CL_CHECK_FATAL(status);
+  status = kernel.setArg(++arg_idx, w);
+  CL_CHECK_FATAL(status);
+
+  auto global_work_size =
+      cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
+                  static_cast<size_t>(maped_w),
+                  static_cast<size_t>(default_work_size.data()[2])};
+
+  VLOG(4) << "out_image: " << out_image;
+  VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
+          << global_work_size[1] << "," << global_work_size[2] << "}";
+
+  status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
+      kernel,
+      cl::NullRange,
+      global_work_size,
+      cl::NullRange,
+      nullptr,
+      event_.get());
+  CL_CHECK_FATAL(status);
+  context.cl_wait_list()->emplace(out_image, event_);
+}
+void ConvImageCompute::Conv2d5x5() {
+  const auto& param = *param_.get_mutable<param_t>();
+  auto input_dims = param.x->dims();
+  auto paddings = *param.paddings;
+  auto strides = param.strides;
+  auto* input_image = param.x->data<float, cl::Image2D>();
+  auto* filter_image = filter_gpu_image_.data<float, cl::Image2D>();
+  auto filter_dims = param.filter->dims();
+  auto output_dims = param.output->dims();
+
+  int input_width = input_dims[3];
+  int input_height = input_dims[2];
+  int output_width = output_dims[3];
+  int output_height = output_dims[2];
+  int filter_width = filter_dims[3];
+  int filter_height = filter_dims[2];
+  auto out_image_shape = InitImageDimInfoWith(output_dims);
+  auto* out_image = param.output->mutable_data<float, cl::Image2D>(
+      out_image_shape["width"], out_image_shape["height"]);
+
+  const bool has_bias = param.bias != nullptr;
+  const bool is_element_wise_bias =
+      has_bias && param.output->dims() == param.bias->dims();
+  int offset = static_cast<int>(param.filter->dims()[2]) / 2 -
+               static_cast<int>(paddings[0]);
+
+  // calc input_c_block
+  auto input_image_shape = InitImageDimInfoWith(input_dims);
+  int input_c_block = input_image_shape["width"] / input_dims[3];
+  int input_c = input_dims[1];
+  auto dilations = *param.dilations;
+
+  const std::vector<size_t>& default_work_size =
+      DefaultWorkSize(output_dims,
+                      DDim(std::vector<DDim::value_type>{
+                          static_cast<int64_t>(out_image_shape["width"]),
+                          static_cast<int64_t>(out_image_shape["height"])}));
+
+  int c_block = default_work_size[0];
+  int w = default_work_size[1];
+  int nh = default_work_size[2];
+
+  VLOG(4) << "============ conv2d params ============";
+  VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
+          << input_image_shape["height"];
+  VLOG(4) << "input_c_block: " << input_c_block;
+  VLOG(4) << "input_c: " << input_c;
+  VLOG(4) << "input_image: " << input_image;
+  VLOG(4) << "input_dims: " << input_dims;
+  VLOG(4) << "filter_dims: " << filter_dims;
+  VLOG(4) << "filter_image: " << filter_image;
+  VLOG(4) << "output_dims: " << output_dims;
+  VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", "
+          << out_image_shape["height"];
+  VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1];
+  VLOG(4) << "has bias: " << has_bias;
+  VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias;
+  VLOG(4) << "strides: " << strides[0] << "," << strides[1];
+  VLOG(4) << "offset: " << offset;
+  VLOG(4) << "dilations.size : " << dilations.size();
+  VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
+  VLOG(4) << "default work size{c_block, w, nh}: "
+          << "{" << c_block << ", " << w << ", " << nh << ""
+          << "}";
+
+  CHECK_GE(dilations.size(), 2);
+  CHECK(dilations[0] == dilations[1]);
+  CHECK_GE(input_dims.size(), 4);
+  CHECK_GE(paddings.size(), 2);
+  CHECK(paddings[0] == paddings[1]);
+  CHECK_GE(strides.size(), 2);
+  CHECK(strides[0] == strides[1]);
+
+  const cl::Image2D* bias_image = nullptr;
+  if (has_bias) {
+    bias_image = bias_gpu_image_.data<float, cl::Image2D>();
+  }
+
+  auto& context = ctx_->As<OpenCLContext>();
+  CHECK(context.cl_context() != nullptr);
+  STL::stringstream kernel_key;
+  kernel_key << kernel_func_names_[0] << build_options_[0];
+  auto kernel = context.cl_context()->GetKernel(kernel_key.str());
+  VLOG(4) << "kernel_key: " << kernel_key.str();
+  VLOG(4) << "kernel ready ... " << kernel_key.str();
+  VLOG(4) << "w: " << w;
+
+  cl_int status;
+  int arg_idx = 0;
+  status = kernel.setArg(arg_idx, c_block);
+  CL_CHECK_FATAL(status);
+  status = kernel.setArg(++arg_idx, w);
+  CL_CHECK_FATAL(status);
+  status = kernel.setArg(++arg_idx, nh);
+  CL_CHECK_FATAL(status);
+  status = kernel.setArg(++arg_idx, *input_image);
+  CL_CHECK_FATAL(status);
+  status = kernel.setArg(++arg_idx, *filter_image);
+  CL_CHECK_FATAL(status);
+  if (has_bias) {
+    VLOG(4) << "set bias_image: ";
+    status = kernel.setArg(++arg_idx, *bias_image);
+    CL_CHECK_FATAL(status);
+  }
+  status = kernel.setArg(++arg_idx, *out_image);
+  CL_CHECK_FATAL(status);
+  status = kernel.setArg(++arg_idx, strides[0]);
+  CL_CHECK_FATAL(status);
+
+  status = kernel.setArg(++arg_idx, offset);
+  CL_CHECK_FATAL(status);
+  status = kernel.setArg(++arg_idx, input_c_block);
+  CL_CHECK_FATAL(status);
+
+  status = kernel.setArg(++arg_idx, dilations[0]);
+  CL_CHECK_FATAL(status);
+  status = kernel.setArg(++arg_idx, input_width);
+  CL_CHECK_FATAL(status);
+  status = kernel.setArg(++arg_idx, input_height);
+  CL_CHECK_FATAL(status);
+  status = kernel.setArg(++arg_idx, output_width);
+  CL_CHECK_FATAL(status);
+  status = kernel.setArg(++arg_idx, output_height);
+  CL_CHECK_FATAL(status);
+
+  auto global_work_size =
+      cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
+                  static_cast<size_t>(default_work_size.data()[1]),
+                  static_cast<size_t>(default_work_size.data()[2])};
+
+  VLOG(4) << "out_image: " << out_image;
+  VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
+          << global_work_size[1] << "," << global_work_size[2] << "}";
+
+  status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
+      kernel,
+      cl::NullRange,
+      global_work_size,
+      cl::NullRange,
+      nullptr,
+      event_.get());
+  CL_CHECK_FATAL(status);
+  context.cl_wait_list()->emplace(out_image, event_);
+}
+void ConvImageCompute::Conv2d7x7() {
+  const auto& param = *param_.get_mutable<param_t>();
+  auto input_dims = param.x->dims();
+  auto paddings = *param.paddings;
+  auto strides = param.strides;
+  auto* input_image = param.x->data<float, cl::Image2D>();
+  auto* filter_image = filter_gpu_image_.data<float, cl::Image2D>();
+  auto filter_dims = param.filter->dims();
+  auto output_dims = param.output->dims();
+
+  int input_width = input_dims[3];
+  int input_height = input_dims[2];
+  int output_width = output_dims[3];
+  int output_height = output_dims[2];
+  int filter_width = filter_dims[3];
+  int filter_height = filter_dims[2];
+  auto out_image_shape = InitImageDimInfoWith(output_dims);
+  auto* out_image = param.output->mutable_data<float, cl::Image2D>(
+      out_image_shape["width"], out_image_shape["height"]);
+
+  const bool has_bias = param.bias != nullptr;
+  const bool is_element_wise_bias =
+      has_bias && param.output->dims() == param.bias->dims();
+  int offset = static_cast<int>(param.filter->dims()[2]) / 2 -
+               static_cast<int>(paddings[0]);
+
+  // calc input_c_block
+  auto input_image_shape = InitImageDimInfoWith(input_dims);
+  int input_c_block = input_image_shape["width"] / input_dims[3];
+  int input_c = input_dims[1];
+  auto dilations = *param.dilations;
+
+  const std::vector<size_t>& default_work_size =
+      DefaultWorkSize(output_dims,
+                      DDim(std::vector<DDim::value_type>{
+                          static_cast<int64_t>(out_image_shape["width"]),
+                          static_cast<int64_t>(out_image_shape["height"])}));
+
+  int c_block = default_work_size[0];
+  int w = default_work_size[1];
+  int nh = default_work_size[2];
+
+  VLOG(4) << "============ conv2d params ============";
+  VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
+          << input_image_shape["height"];
+  VLOG(4) << "input_c_block: " << input_c_block;
+  VLOG(4) << "input_c: " << input_c;
+  VLOG(4) << "input_image: " << input_image;
+  VLOG(4) << "input_dims: " << input_dims;
+  VLOG(4) << "filter_dims: " << filter_dims;
+  VLOG(4) << "filter_image: " << filter_image;
+  VLOG(4) << "output_dims: " << output_dims;
+  VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", "
+          << out_image_shape["height"];
+  VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1];
+  VLOG(4) << "has bias: " << has_bias;
+  VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias;
+  VLOG(4) << "strides: " << strides[0] << "," << strides[1];
+  VLOG(4) << "offset: " << offset;
+  VLOG(4) << "dilations.size : " << dilations.size();
+  VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
+  VLOG(4) << "default work size{c_block, w, nh}: "
+          << "{" << c_block << ", " << w << ", " << nh << ""
+          << "}";
+
+  CHECK_GE(dilations.size(), 2);
+  CHECK(dilations[0] == dilations[1]);
+  CHECK_GE(input_dims.size(), 4);
+  CHECK_GE(paddings.size(), 2);
+  CHECK(paddings[0] == paddings[1]);
+  CHECK_GE(strides.size(), 2);
+  CHECK(strides[0] == strides[1]);
+
+  const cl::Image2D* bias_image = nullptr;
+  if (has_bias) {
+    bias_image = bias_gpu_image_.data<float, cl::Image2D>();
+  }
+
+  auto& context = ctx_->As<OpenCLContext>();
+  CHECK(context.cl_context() != nullptr);
+  STL::stringstream kernel_key;
+  kernel_key << kernel_func_names_[0] << build_options_[0];
+  auto kernel = context.cl_context()->GetKernel(kernel_key.str());
+  VLOG(4) << "kernel_key: " << kernel_key.str();
+  VLOG(4) << "kernel ready ... " << kernel_key.str();
+  VLOG(4) << "w: " << w;
+
+  cl_int status;
+  int arg_idx = 0;
+  status = kernel.setArg(arg_idx, c_block);
+  CL_CHECK_FATAL(status);
+  status = kernel.setArg(++arg_idx, w);
+  CL_CHECK_FATAL(status);
+  status = kernel.setArg(++arg_idx, nh);
+  CL_CHECK_FATAL(status);
+  status = kernel.setArg(++arg_idx, *input_image);
+  CL_CHECK_FATAL(status);
+  status = kernel.setArg(++arg_idx, *filter_image);
+  CL_CHECK_FATAL(status);
+  if (has_bias) {
+    VLOG(4) << "set bias_image: ";
+    status = kernel.setArg(++arg_idx, *bias_image);
+    CL_CHECK_FATAL(status);
+  }
+  status = kernel.setArg(++arg_idx, *out_image);
+  CL_CHECK_FATAL(status);
+  status = kernel.setArg(++arg_idx, strides[0]);
+  CL_CHECK_FATAL(status);
+
+  status = kernel.setArg(++arg_idx, offset);
+  CL_CHECK_FATAL(status);
+  status = kernel.setArg(++arg_idx, input_c_block);
+  CL_CHECK_FATAL(status);
+
+  status = kernel.setArg(++arg_idx, dilations[0]);
+  CL_CHECK_FATAL(status);
+  status = kernel.setArg(++arg_idx, input_width);
+  CL_CHECK_FATAL(status);
+  status = kernel.setArg(++arg_idx, input_height);
+  CL_CHECK_FATAL(status);
+  status = kernel.setArg(++arg_idx, output_width);
+  CL_CHECK_FATAL(status);
+  status = kernel.setArg(++arg_idx, output_height);
+  CL_CHECK_FATAL(status);
+
+  auto global_work_size =
+      cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
+                  static_cast<size_t>(default_work_size.data()[1]),
+                  static_cast<size_t>(default_work_size.data()[2])};
+
+  VLOG(4) << "out_image: " << out_image;
+  VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
+          << global_work_size[1] << "," << global_work_size[2] << "}";
+
+  status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
+      kernel,
+      cl::NullRange,
+      global_work_size,
+      cl::NullRange,
+      nullptr,
+      event_.get());
+  CL_CHECK_FATAL(status);
+  context.cl_wait_list()->emplace(out_image, event_);
+}
+
+void ConvImageCompute::Run() { (this->*impl_)(); }
+
 }  // namespace opencl
 }  // namespace kernels
 }  // namespace lite
@@ -310,3 +889,21 @@ REGISTER_LITE_KERNEL(conv2d,
    .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kOpenCL))})
    .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kOpenCL))})
    .Finalize();
+
+REGISTER_LITE_KERNEL(conv2d,
+                     kOpenCL,
+                     kFloat,
+                     kImageDefault,
+                     paddle::lite::kernels::opencl::ConvImageCompute,
+                     image2d)
+    .BindInput("Input",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFloat),
+                                      DATALAYOUT(kImageDefault))})
+    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindOutput("Output",
+                {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                       PRECISION(kFloat),
+                                       DATALAYOUT(kImageDefault))})
+    .Finalize();
--- a/lite/kernels/opencl/conv_compute.h
+++ b/lite/kernels/opencl/conv_compute.h
@@ -17,6 +17,7 @@
 #include <memory>
 #include <string>
 #include <vector>
+
 #include "lite/backends/opencl/cl_include.h"
 #include "lite/core/kernel.h"
 #include "lite/core/tensor.h"
@@ -57,6 +58,30 @@ class ConvCompute
  std::shared_ptr<cl::Event> event_{new cl::Event};
 };

+class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
+                                           PRECISION(kFloat),
+                                           DATALAYOUT(kImageDefault)> {
+ public:
+  using param_t = operators::ConvParam;
+  using kernel_t = void (ConvImageCompute::*)();
+
+  void PrepareForRun() override;
+
+  void Run() override;
+
+ private:
+  void Conv2d1x1();
+  void Conv2d5x5();
+  void Conv2d7x7();
+
+  kernel_t impl_;
+  std::vector<std::string> kernel_func_names_{};
+  std::vector<std::string> kernel_func_paths_{};
+  std::vector<std::string> build_options_{};
+  std::shared_ptr<cl::Event> event_{new cl::Event};
+  Tensor filter_gpu_image_;
+  Tensor bias_gpu_image_;
+};
 }  // namespace opencl
 }  // namespace kernels
 }  // namespace lite

--- a/lite/kernels/opencl/conv_image2d_compute_test.cc
+++ b/lite/kernels/opencl/conv_image2d_compute_test.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gtest/gtest.h>
+
+#include <random>
+
+#include "lite/backends/opencl/cl_image_converter.h"
+#include "lite/backends/opencl/target_wrapper.h"
+#include "lite/core/op_registry.h"
+#include "lite/core/tensor.h"
+
+namespace paddle {
+namespace lite {
+// #define SHADOW_LOG LOG(INFO)
+#define SHADOW_LOG VLOG(4)
+
+template <typename Dtype1, typename Dtype2>
+static void conv_basic(const Dtype1* din,
+                       Dtype2* dout,
+                       int num,
+                       int chout,
+                       int hout,
+                       int wout,
+                       int chin,
+                       int hin,
+                       int win,
+                       const Dtype1* weights,
+                       const Dtype2* bias,
+                       int group,
+                       int kernel_w,
+                       int kernel_h,
+                       int stride_w,
+                       int stride_h,
+                       int dila_w,
+                       int dila_h,
+                       int pad_w,
+                       int pad_h,
+                       bool flag_bias,
+                       std::string flag_relu) {
+  Dtype2 beta = 0;
+  auto src_data = din;
+  auto dst_data_ref = dout;
+  auto weights_data = weights;
+  auto with_bias = flag_bias;
+  auto bias_data = bias;
+
+  int in_num = num;
+  int out_channels = chout;
+  int out_h = hout;
+  int out_w = wout;
+
+  int in_channel = chin;
+  int in_h = hin;
+  int in_w = win;
+  int out_c_group = out_channels / group;
+  int in_c_group = in_channel / group;
+
+  for (int n = 0; n < in_num; ++n) {
+    for (int g = 0; g < group; ++g) {
+      for (int oc = 0; oc < out_c_group; ++oc) {
+        for (int oh = 0; oh < out_h; ++oh) {
+          for (int ow = 0; ow < out_w; ++ow) {
+            int out_idx = n * group * out_c_group * out_h * out_w +
+                          g * out_c_group * out_h * out_w + oc * out_h * out_w +
+                          oh * out_w + ow;
+            Dtype2 bias_d =
+                with_bias ? (bias_data[g * out_c_group + oc]) : (Dtype2)0;
+            dst_data_ref[out_idx] = bias_d;  // + dst_data_ref[out_idx] * beta;
+            for (int ic = 0; ic < in_c_group; ++ic) {
+              for (int kh = 0; kh < kernel_h; ++kh) {
+                for (int kw = 0; kw < kernel_w; ++kw) {
+                  int iw = ow * stride_w - pad_w + kw * (dila_w);
+                  int ih = oh * stride_h - pad_h + kh * (dila_h);
+                  if (iw < 0 || iw >= in_w) continue;
+                  if (ih < 0 || ih >= in_h) continue;
+
+                  int iidx = n * in_channel * in_h * in_w +
+                             g * in_c_group * in_h * in_w + ic * in_h * in_w +
+                             ih * in_w + iw;
+                  int widx =
+                      g * out_c_group * in_c_group * kernel_h * kernel_w +
+                      oc * in_c_group * kernel_h * kernel_w +
+                      ic * kernel_h * kernel_w + kh * kernel_w + kw;
+
+                  dst_data_ref[out_idx] += src_data[iidx] * weights_data[widx];
+                }
+              }
+            }
+            if (flag_relu == "relu") {
+              dst_data_ref[out_idx] = dst_data_ref[out_idx] > (Dtype2)0
+                                          ? dst_data_ref[out_idx]
+                                          : (Dtype2)0;
+            } else if (flag_relu == "relu6") {
+              auto dst_tmp = (dst_data_ref[out_idx] > (Dtype2)0)
+                                 ? dst_data_ref[out_idx]
+                                 : (Dtype2)0;
+              dst_data_ref[out_idx] = (dst_tmp < 6.f) ? dst_tmp : 6.f;
+            }
+          }
+        }
+      }
+    }
+  }
+}
+int ConvOutputSize(int input_size,
+                   int filter_size,
+                   int dilation,
+                   int pad_left,
+                   int pad_right,
+                   int stride) {
+  const int dkernel = dilation * (filter_size - 1) + 1;
+  int output_size =
+      (input_size + (pad_left + pad_right) - dkernel) / stride + 1;
+
+  return output_size;
+}
+// #define PRINT_RESULT
+// #define LOOP_TEST
+TEST(conv2d, compute_image2d_1x1) {
+  // conv infos
+  const int ksize = 1;
+  const int stride = 1;
+  const int pad = 0;
+  const int group = 1;
+  const int dilation = 0;
+//  int loop_cnt = 0;
+
+#ifdef LOOP_TEST
+  for (int batch_size = 1; batch_size < 4; ++batch_size) {
+    for (int oc = 4; oc < 10; oc += 1) {   // oc
+      for (int ih = 4; ih < 9; ih += 1) {  // ih
+        int iw = ih;
+        for (int iw = 4; iw < 10; iw += 1) {    // iw
+          for (int ic = 4; ic < 10; ic += 1) {  // ic
+            for (bool bias_flag : {true, false}) {
+              for (std::string relu_flag : {"relu"}) {
+#else
+  const int batch_size = 1;
+  const int oc = 4;
+  const int ih = 8;
+  const int iw = 8;
+  const int ic = 4;
+  const bool bias_flag = true;
+  const std::string relu_flag = "relu";
+#endif
+                const int oh = ih;
+                const int ow = iw;
+
+                SHADOW_LOG << "to get kernel ...";
+                auto kernels =
+                    KernelRegistry::Global().Create("conv2d",
+                                                    TARGET(kOpenCL),
+                                                    PRECISION(kFloat),
+                                                    DATALAYOUT(kImageDefault));
+                ASSERT_FALSE(kernels.empty());
+
+                auto kernel = std::move(kernels.front());
+                SHADOW_LOG << "created conv2d_1x1 kernel";
+
+                SHADOW_LOG << "prepare kernel ------";
+
+                lite::Tensor input, filter, bias, output;
+                operators::ConvParam param;
+                param.x = &input;
+                param.filter = &filter;
+                param.output = &output;
+                if (bias_flag) {
+                  param.bias = &bias;
+                }
+                if (relu_flag == "relu") {
+                  param.fuse_relu = true;
+                } else if (relu_flag == "None") {
+                  param.fuse_relu = false;
+                } else if (relu_flag == "relu6") {
+                  param.activation_param.Relu_clipped_coef = 6.f;
+                  param.activation_param.has_active = true;
+                  param.activation_param.active_type =
+                      lite_api::ActivationType::kRelu6;
+                }
+
+                std::vector<int> paddings = {pad, pad, pad, pad};
+                std::vector<int> dilations = {dilation, dilation};
+
+                param.paddings = std::make_shared<std::vector<int>>(paddings);
+                param.dilations = std::make_shared<std::vector<int>>(dilations);
+                param.strides = std::vector<int>{stride, stride};
+
+                std::unique_ptr<KernelContext> context(new KernelContext);
+                context->As<OpenCLContext>().InitOnce();
+
+                std::unique_ptr<KernelContext> conv_1x1_context(
+                    new KernelContext);
+                context->As<OpenCLContext>().CopySharedTo(
+                    &(conv_1x1_context->As<OpenCLContext>()));
+                kernel->SetContext(std::move(conv_1x1_context));
+
+                const DDim& input_dim =
+                    lite::DDim{std::vector<int64_t>({batch_size, ic, ih, iw})};
+
+                const DDim& filter_dim =
+                    lite::DDim{std::vector<int64_t>({oc, ic, ksize, ksize})};
+                const DDim& out_dim =
+                    lite::DDim{std::vector<int64_t>({batch_size, oc, ih, iw})};
+                // element wise bias
+                const DDim& bias_dim = lite::DDim{std::vector<int64_t>({oc})};
+
+                param.x->Resize(input_dim);
+                param.filter->Resize(filter_dim);
+                param.output->Resize(out_dim);
+                if (bias_flag) {
+                  param.bias->Resize(bias_dim);
+                }
+
+                kernel->SetParam(param);
+
+                size_t input_image_width = iw * ((ic + 3) / 4);
+                size_t input_image_height = ih * batch_size;
+
+                size_t out_image_width = ow * ((oc + 3) / 4);
+                size_t out_image_height = oh * batch_size;
+
+                size_t bias_image_width = ow * ((oc + 3) / 4);
+                size_t bias_image_height = oh * batch_size;
+
+                size_t filter_image_width = ksize * ((oc + 3) / 4);
+                size_t filter_image_height = ic * ksize;
+
+                const size_t cl_image2d_row_pitch{0};
+                const size_t cl_image2d_slice_pitch{0};
+
+                std::default_random_engine engine;
+                std::uniform_real_distribution<float> gen(-5, 5);
+
+                std::vector<float> input_v(batch_size * ic * ih * iw);
+                std::vector<float> filter_v(oc * ic * ksize * ksize);
+                std::vector<float> output_v(batch_size * oc * ih * iw);
+                std::vector<float> bias_v(oc);
+
+                SHADOW_LOG << "gen input and filter ...";
+
+                for (auto& i : input_v) {
+                  i = gen(engine);
+                }
+                for (auto& f : filter_v) {
+                  f = gen(engine);
+                }
+
+                SHADOW_LOG << "after gen input and filter ...";
+                SHADOW_LOG << "input_v.size(): " << input_v.size();
+                SHADOW_LOG << "filter_v.size(): " << filter_v.size();
+                SHADOW_LOG << "output_v.size(): " << output_v.size();
+                SHADOW_LOG << "bias_v.size(): " << bias_v.size();
+                SHADOW_LOG << "input_dim.production(): "
+                           << input_dim.production();
+                SHADOW_LOG << "filter_dim.production(): "
+                           << filter_dim.production();
+                SHADOW_LOG << "out_dim.production(): " << out_dim.production();
+                SHADOW_LOG << "bias_dim.production(): "
+                           << bias_dim.production();
+                SHADOW_LOG << "4 * input_image_height * input_image_width: "
+                           << 4 * input_image_height * input_image_width;
+                SHADOW_LOG << "4 * filter_image_width * filter_image_height: "
+                           << 4 * filter_image_width * filter_image_height;
+
+                CHECK(input_dim.production() == input_v.size());
+                CHECK_LE(input_dim.production(),
+                         4 * input_image_height * input_image_width);
+                CHECK(filter_dim.production() == filter_v.size());
+                CHECK_LE(filter_dim.production(),
+                         4 * filter_image_width * filter_image_height);
+
+                paddle::lite::CLImageConverterDefault default_convertor;
+                SHADOW_LOG << "set mapped input  ...";
+                std::vector<float> x_image_v(
+                    input_image_width * input_image_height * 4);  // 4 : RGBA
+                std::vector<float> filter_image_v(
+                    filter_image_width * filter_image_height * 4);  // 4 :RGBA
+                std::vector<float> bias_image_v(
+                    bias_image_width * bias_image_height * 4);  // 4 : RGBA
+                std::vector<float> out_image_v(
+                    out_image_width * out_image_height * 4);  // 4 : RGBA
+
+                default_convertor.NCHWToImage(
+                    input_v.data(), x_image_v.data(), input_dim);
+
+                SHADOW_LOG << "set mapped filter  ...";
+                paddle::lite::CLImageConverterNWBlock nw_convertor;
+                nw_convertor.NCHWToImage(
+                    filter_v.data(), filter_image_v.data(), filter_dim);
+
+                auto* input_image2d = input.mutable_data<float, cl::Image2D>(
+                    input_image_width, input_image_height, x_image_v.data());
+                // assign filter as target arm
+                filter.Assign<float, lite::DDim, TARGET(kARM)>(filter_v.data(),
+                                                               filter_dim);
+                //                auto* filter_image2d =
+                //                filter.mutable_data<float, cl::Image2D>(
+                //                    filter_image_width,
+                //                    filter_image_height,
+                //                    filter_image_v.data());
+                SHADOW_LOG << "卷积核: ----  ";
+                for (int i = 0; i < filter_v.size(); i++) {
+                  SHADOW_LOG << "(" << i << ")" << filter_v[i];
+                }
+
+                SHADOW_LOG << "卷积核1: ----  ";
+                const float* filter_p = filter.data<float>();
+                for (int i = 0; i < filter_v.size(); i++) {
+                  SHADOW_LOG << "(" << i << ")" << *filter_p;
+                  filter_p++;
+                }
+                SHADOW_LOG << "卷积核2: ----  ";
+                const float* filter_p2 = filter.mutable_data<float>();
+                for (int i = 0; i < filter_v.size(); i++) {
+                  SHADOW_LOG << "(" << i << ")" << *filter_p2;
+                  filter_p2++;
+                }
+                if (bias_flag) {
+                  for (int i = 0; i < bias_dim.production(); ++i) {
+                    bias_v[i] = static_cast<int>(gen(engine));
+                  }
+                  bias.Assign<float, lite::DDim, TARGET(kARM)>(bias_v.data(),
+                                                               bias_dim);
+                  //                CLImageConverterFolder folder_convertor;
+                  //                folder_convertor.NCHWToImage(
+                  //                    bias_v.data(), bias_image_v.data(),
+                  //                    bias_dim);
+                  //
+                  //                auto* bias_data = bias.mutable_data<float,
+                  //                cl::Image2D>(
+                  //                    bias_image_width, bias_image_height,
+                  //                    bias_image_v.data());
+                }
+
+                SHADOW_LOG << "resize output  ...";
+                output.Resize(out_dim);
+
+                // cpu conv basic calc
+                lite::Tensor out_ref;
+                out_ref.Resize(out_dim);
+
+                SHADOW_LOG << "prepare kernel ready";
+
+                SHADOW_LOG << "kernel launch ...";
+                kernel->Launch();
+                SHADOW_LOG << "mutable output ...";
+                auto* output_image2d = output.mutable_data<float, cl::Image2D>(
+                    out_image_width, out_image_height);
+
+                auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
+                auto* out_ptr = param.output->data<float, cl::Image2D>();
+                auto it = wait_list->find(out_ptr);
+
+                if (it != wait_list->end()) {
+                  SHADOW_LOG << "--- Find the sync event for the target cl "
+                                "tensor. ---";
+                  auto& event = *(it->second);
+                  event.wait();
+                } else {
+                  LOG(FATAL) << "Could not find the sync event for the target"
+                                "cl tensor.";
+                }
+
+                TargetWrapperCL::ImgcpySync(out_image_v.data(),
+                                            output.data<float, cl::Image2D>(),
+                                            out_image_width,
+                                            out_image_height,
+                                            cl_image2d_row_pitch,
+                                            cl_image2d_slice_pitch,
+                                            IoDirection::DtoH);
+
+                DDim out_image_shape =
+                    default_convertor.InitImageDimInfoWith(output.dims());
+
+                default_convertor.ImageToNCHW(out_image_v.data(),
+                                              output_v.data(),
+                                              out_image_shape,
+                                              output.dims());
+                SHADOW_LOG << "mutable_data out_ref_data: ";
+
+                // run cpu ref
+                auto* out_ref_data = out_ref.mutable_data<float>(TARGET(kARM));
+
+                SHADOW_LOG << " conv_basic beigin ..... ";
+
+                conv_basic<float, float>(input_v.data(),
+                                         out_ref_data,
+                                         batch_size,
+                                         oc,
+                                         oh,
+                                         ow,
+                                         ic,
+                                         ih,
+                                         iw,
+                                         filter_v.data(),
+                                         bias_v.data(),  // mapped_bias,
+                                         group,
+                                         ksize,
+                                         ksize,
+                                         stride,
+                                         stride,
+                                         dilation,
+                                         dilation,
+                                         pad,
+                                         pad,
+                                         bias_flag,
+                                         relu_flag);
+                SHADOW_LOG << " conv_basic end ..... ";
+
+                SHADOW_LOG << " out_dim: " << out_dim;
+                const DDim& out_image_dims = lite::DDim{std::vector<int64_t>(
+                    {static_cast<int64_t>(out_image_width),
+                     static_cast<int64_t>(out_image_height)})};
+
+                for (int i = 0; i < out_dim.production(); i++) {
+                  EXPECT_NEAR(output_v[i], out_ref_data[i], 1e-2);
+                  if (abs(output_v[i] - out_ref_data[i]) > 1e-2) {
+                    LOG(FATAL) << "error idx:" << i;
+                  }
+                }
+
+#ifdef LOOP_TEST
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+#else
+// nothing to do.
+#endif
+}
+#undef LOOP_TEST
+#undef PRINT_RESULT
+
+// #define PRINT_RESULT
+// #define LOOP_TEST
+TEST(conv2d, compute_image2d_5x5) {
+  // conv infos
+  const int ksize = 5;
+  const int stride = 1;
+  const int pad = 2;
+  const int group = 1;
+  const int dilation = 1;
+//  int loop_cnt = 0;
+
+#ifdef LOOP_TEST
+  for (int batch_size = 2; batch_size < 4; ++batch_size) {
+    for (int oc = 1; oc < 10; oc += 1) {   // oc
+      for (int ih = 5; ih < 9; ih += 1) {  // ih
+        int iw = ih;
+        for (int ic = 1; ic < 10; ic += 1) {  // ic
+          for (bool bias_flag : {true, false}) {
+            for (std::string relu_flag : {/*true,*/ "relu"}) {
+#else
+                const int batch_size = 2;
+                const int oc = 1;
+                const int ih = 5;
+                const int iw = 5;
+                const int ic = 1;
+                const bool bias_flag = true;
+                const std::string relu_flag = "relu";
+#endif
+
+              const int oh =
+                  ConvOutputSize(ih, ksize, dilation, pad, pad, stride);
+              const int ow =
+                  ConvOutputSize(iw, ksize, dilation, pad, pad, stride);
+              SHADOW_LOG << "to get kernel ...";
+              auto kernels =
+                  KernelRegistry::Global().Create("conv2d",
+                                                  TARGET(kOpenCL),
+                                                  PRECISION(kFloat),
+                                                  DATALAYOUT(kImageDefault));
+              ASSERT_FALSE(kernels.empty());
+
+              auto kernel = std::move(kernels.front());
+              SHADOW_LOG << "created conv2d kernel";
+
+              SHADOW_LOG << "prepare kernel ------";
+
+              lite::Tensor input, filter, bias, output;
+              operators::ConvParam param;
+              param.x = &input;
+              param.filter = &filter;
+              param.output = &output;
+              if (bias_flag) {
+                param.bias = &bias;
+              }
+              if (relu_flag == "relu") {
+                param.fuse_relu = true;
+              } else if (relu_flag == "None") {
+                param.fuse_relu = false;
+              } else if (relu_flag == "relu6") {
+                param.activation_param.Relu_clipped_coef = 6.f;
+                param.activation_param.has_active = true;
+                param.activation_param.active_type =
+                    lite_api::ActivationType::kRelu6;
+              }
+
+              std::vector<int> paddings = {pad, pad, pad, pad};
+              std::vector<int> dilations = {dilation, dilation};
+
+              param.paddings = std::make_shared<std::vector<int>>(paddings);
+              param.dilations = std::make_shared<std::vector<int>>(dilations);
+              param.strides = std::vector<int>{stride, stride};
+
+              std::unique_ptr<KernelContext> context(new KernelContext);
+              context->As<OpenCLContext>().InitOnce();
+
+              std::unique_ptr<KernelContext> conv_1x1_context(
+                  new KernelContext);
+              context->As<OpenCLContext>().CopySharedTo(
+                  &(conv_1x1_context->As<OpenCLContext>()));
+              kernel->SetContext(std::move(conv_1x1_context));
+
+              const DDim& input_dim =
+                  lite::DDim{std::vector<int64_t>({batch_size, ic, ih, iw})};
+
+              const DDim& filter_dim =
+                  lite::DDim{std::vector<int64_t>({oc, ic, ksize, ksize})};
+              const DDim& out_dim =
+                  lite::DDim{std::vector<int64_t>({batch_size, oc, oh, ow})};
+              // element wise bias
+              const DDim& bias_dim = lite::DDim{std::vector<int64_t>({oc})};
+
+              param.x->Resize(input_dim);
+              param.filter->Resize(filter_dim);
+              param.output->Resize(out_dim);
+              if (bias_flag) {
+                param.bias->Resize(bias_dim);
+              }
+
+              kernel->SetParam(param);
+
+              size_t input_image_width = iw * ((ic + 3) / 4);
+              size_t input_image_height = ih * batch_size;
+
+              size_t out_image_width = ow * ((oc + 3) / 4);
+              size_t out_image_height = oh * batch_size;
+
+              size_t bias_image_width = ow * ((oc + 3) / 4);
+              size_t bias_image_height = oh * batch_size;
+
+              size_t filter_image_width = ksize * ((ic + 3) / 4);
+              size_t filter_image_height = oc * ksize;
+
+              const size_t cl_image2d_row_pitch{0};
+              const size_t cl_image2d_slice_pitch{0};
+
+              std::default_random_engine engine;
+              std::uniform_real_distribution<float> gen(-5, 5);
+
+              std::vector<float> input_v(batch_size * ic * ih * iw);
+              std::vector<float> filter_v(oc * ic * ksize * ksize);
+              std::vector<float> output_v(batch_size * oc * oh * ow);
+              std::vector<float> bias_v(oc);
+
+              SHADOW_LOG << "gen input and filter ...";
+              for (auto& i : input_v) {
+                i = gen(engine);
+              }
+              for (auto& f : filter_v) {
+                f = gen(engine);
+              }
+
+              SHADOW_LOG << "after gen input and filter ...";
+              SHADOW_LOG << "input_v.size(): " << input_v.size();
+              SHADOW_LOG << "filter_v.size(): " << filter_v.size();
+              SHADOW_LOG << "output_v.size(): " << output_v.size();
+              SHADOW_LOG << "bias_v.size(): " << bias_v.size();
+              SHADOW_LOG << "input_dim.production(): "
+                         << input_dim.production();
+              SHADOW_LOG << "filter_dim.production(): "
+                         << filter_dim.production();
+              SHADOW_LOG << "out_dim.production(): " << out_dim.production();
+              SHADOW_LOG << "bias_dim.production(): " << bias_dim.production();
+              SHADOW_LOG << "4 * input_image_height *input_image_width: "
+                         << 4 * input_image_height * input_image_width;
+              SHADOW_LOG << "4 * filter_image_width * filter_image_height: "
+                         << 4 * filter_image_width * filter_image_height;
+
+              CHECK(input_dim.production() == input_v.size());
+              CHECK_LE(input_dim.production(),
+                       4 * input_image_height * input_image_width);
+              CHECK(filter_dim.production() == filter_v.size());
+              CHECK_LE(filter_dim.production(),
+                       4 * filter_image_width * filter_image_height);
+
+              paddle::lite::CLImageConverterDefault default_convertor;
+              SHADOW_LOG << "set mapped input  ...";
+              std::vector<float> x_image_v(input_image_width *
+                                           input_image_height * 4);  // 4 :RGBA
+              std::vector<float> filter_image_v(
+                  filter_image_width * filter_image_height * 4);  // 4 : RGBA
+              std::vector<float> bias_image_v(
+                  bias_image_width * bias_image_height * 4);  // 4 : RGBA
+              std::vector<float> out_image_v(out_image_width *
+                                             out_image_height * 4);  // 4 :RGBA
+
+              default_convertor.NCHWToImage(
+                  input_v.data(), x_image_v.data(), input_dim);
+              SHADOW_LOG << "输入: ----  ";
+              for (int i = 0; i < input_v.size(); i++) {
+                SHADOW_LOG << "(" << i << ")" << input_v[i];
+              }
+              SHADOW_LOG << "输入image : ----  ";
+              for (int i = 0; i < x_image_v.size(); i++) {
+                SHADOW_LOG << "(" << i << ")" << x_image_v[i];
+              }
+              SHADOW_LOG << "set mapped filter  ...";
+              CLImageConverterFolder folder_convertor;
+
+              folder_convertor.NCHWToImage(
+                  filter_v.data(), filter_image_v.data(), filter_dim);
+              SHADOW_LOG << "卷积核: ----  ";
+              for (int i = 0; i < filter_v.size(); i++) {
+                SHADOW_LOG << "(" << i << ")" << filter_v[i];
+              }
+              SHADOW_LOG << "卷积核image: ----  ";
+              for (int i = 0; i < filter_image_v.size(); i++) {
+                SHADOW_LOG << "(" << i << ")" << filter_image_v[i];
+              }
+              auto* input_image2d = input.mutable_data<float, cl::Image2D>(
+                  input_image_width, input_image_height, x_image_v.data());
+              // assign filter as target arm
+              filter.Assign<float, lite::DDim, TARGET(kARM)>(filter_v.data(),
+                                                             filter_dim);
+              // filter kernel
+              //              auto* filter_image2d = filter.mutable_data<float,
+              //              cl::Image2D>(
+              //                  filter_image_width,
+              //                  filter_image_height,
+              //                  filter_image_v.data());
+
+              if (bias_flag) {
+                for (int i = 0; i < bias_dim.production(); ++i) {
+                  bias_v[i] = static_cast<int>(gen(engine));
+                }
+                bias.Assign<float, lite::DDim, TARGET(kARM)>(bias_v.data(),
+                                                             bias_dim);
+                //                CLImageConverterFolder folder_convertor;
+                //                folder_convertor.NCHWToImage(
+                //                    bias_v.data(), bias_image_v.data(),
+                //                    bias_dim);
+                //
+                //                auto* bias_data = bias.mutable_data<float,
+                //                cl::Image2D>(
+                //                    bias_image_width, bias_image_height,
+                //                    bias_image_v.data());
+              }
+
+              SHADOW_LOG << "resize output  ...";
+              output.Resize(out_dim);
+
+              // cpu conv basic calc
+              lite::Tensor out_ref;
+              out_ref.Resize(out_dim);
+
+              SHADOW_LOG << "prepare kernel ready";
+
+              SHADOW_LOG << "kernel launch ...";
+              kernel->Launch();
+              SHADOW_LOG << "mutable output ...";
+              auto* output_image2d = output.mutable_data<float, cl::Image2D>(
+                  out_image_width, out_image_height);
+
+              auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
+              auto* out_ptr = param.output->data<float, cl::Image2D>();
+              auto it = wait_list->find(out_ptr);
+
+              if (it != wait_list->end()) {
+                SHADOW_LOG << "--- Find the sync event for the target cl "
+                              "tensor. ---";
+                auto& event = *(it->second);
+                event.wait();
+              } else {
+                LOG(FATAL) << "Could not find the sync event for the target "
+                              "cl tensor.";
+              }
+
+              TargetWrapperCL::ImgcpySync(out_image_v.data(),
+                                          output.data<float, cl::Image2D>(),
+                                          out_image_width,
+                                          out_image_height,
+                                          cl_image2d_row_pitch,
+                                          cl_image2d_slice_pitch,
+                                          IoDirection::DtoH);
+
+              DDim out_image_shape =
+                  default_convertor.InitImageDimInfoWith(output.dims());
+
+              default_convertor.ImageToNCHW(out_image_v.data(),
+                                            output_v.data(),
+                                            out_image_shape,
+                                            output.dims());
+
+              SHADOW_LOG << "输出: ----  ";
+              for (int i = 0; i < output_v.size(); i++) {
+                SHADOW_LOG << "(" << i << ")" << output_v[i];
+              }
+
+              SHADOW_LOG << "输出image: ----  ";
+              for (int i = 0; i < out_image_v.size(); i++) {
+                SHADOW_LOG << "(" << i << ")" << out_image_v[i];
+              }
+              SHADOW_LOG << "mutable_data out_ref_data: ";
+
+              // run cpu ref
+              auto* out_ref_data = out_ref.mutable_data<float>(TARGET(kARM));
+
+              SHADOW_LOG << " conv_basic beigin ..... ";
+
+              conv_basic<float, float>(input_v.data(),
+                                       out_ref_data,
+                                       batch_size,
+                                       oc,
+                                       oh,
+                                       ow,
+                                       ic,
+                                       ih,
+                                       iw,
+                                       filter_v.data(),
+                                       bias_v.data(),  // mapped_bias,
+                                       group,
+                                       ksize,
+                                       ksize,
+                                       stride,
+                                       stride,
+                                       dilation,
+                                       dilation,
+                                       pad,
+                                       pad,
+                                       bias_flag,
+                                       relu_flag);
+              SHADOW_LOG << " conv_basic end ..... ";
+
+              SHADOW_LOG << " out_dim: " << out_dim;
+              const DDim& out_image_dims = lite::DDim{std::vector<int64_t>(
+                  {static_cast<int64_t>(out_image_width),
+                   static_cast<int64_t>(out_image_height)})};
+
+              for (int i = 0; i < out_dim.production(); i++) {
+                EXPECT_NEAR(output_v[i], out_ref_data[i], 1e-2);
+                if (abs(output_v[i] - out_ref_data[i]) > 1e-2) {
+                  LOG(FATAL) << "error idx:" << i;
+                }
+              }
+
+#ifdef LOOP_TEST
+            }
+          }
+        }
+      }
+    }
+  }
+#else
+// nothing to do.
+#endif
+}
+#undef LOOP_TEST
+#undef PRINT_RESULT
+
+// #define LOOP_TEST
+TEST(conv2d, compute_image2d_7x7) {
+  // conv infos
+  const int ksize = 7;
+  const int stride = 1;
+  const int pad = 2;
+  const int group = 1;
+  const int dilation = 1;
+//  int loop_cnt = 0;
+
+#ifdef LOOP_TEST
+  for (int batch_size = 2; batch_size < 4; ++batch_size) {
+    for (int oc = 1; oc < 10; oc += 1) {    // oc
+      for (int ih = 7; ih < 15; ih += 1) {  // ih
+        int iw = ih;
+        for (int ic = 1; ic < 10; ic += 1) {  // ic
+          for (bool bias_flag : {true, false}) {
+            for (std::string relu_flag : {"relu"}) {
+#else
+                const int batch_size = 2;
+                const int oc = 1;
+                const int ih = 7;
+                const int iw = 7;
+                const int ic = 1;
+                const bool bias_flag = false;
+                const std::string relu_flag = "";
+#endif
+
+              const int oh =
+                  ConvOutputSize(ih, ksize, dilation, pad, pad, stride);
+              const int ow =
+                  ConvOutputSize(iw, ksize, dilation, pad, pad, stride);
+              SHADOW_LOG << "to get kernel ...";
+              auto kernels =
+                  KernelRegistry::Global().Create("conv2d",
+                                                  TARGET(kOpenCL),
+                                                  PRECISION(kFloat),
+                                                  DATALAYOUT(kImageDefault));
+              ASSERT_FALSE(kernels.empty());
+
+              auto kernel = std::move(kernels.front());
+              SHADOW_LOG << "created conv2d kernel";
+
+              SHADOW_LOG << "prepare kernel ------";
+
+              lite::Tensor input, filter, bias, output;
+              operators::ConvParam param;
+              param.x = &input;
+              param.filter = &filter;
+              param.output = &output;
+              if (bias_flag) {
+                param.bias = &bias;
+              }
+              if (relu_flag == "relu") {
+                param.fuse_relu = true;
+              } else if (relu_flag == "None") {
+                param.fuse_relu = false;
+              } else if (relu_flag == "relu6") {
+                param.activation_param.Relu_clipped_coef = 6.f;
+                param.activation_param.has_active = true;
+                param.activation_param.active_type =
+                    lite_api::ActivationType::kRelu6;
+              }
+              std::vector<int> paddings = {pad, pad, pad, pad};
+              std::vector<int> dilations = {dilation, dilation};
+
+              param.paddings = std::make_shared<std::vector<int>>(paddings);
+              param.dilations = std::make_shared<std::vector<int>>(dilations);
+              param.strides = std::vector<int>{stride, stride};
+
+              std::unique_ptr<KernelContext> context(new KernelContext);
+              context->As<OpenCLContext>().InitOnce();
+
+              std::unique_ptr<KernelContext> conv_1x1_context(
+                  new KernelContext);
+              context->As<OpenCLContext>().CopySharedTo(
+                  &(conv_1x1_context->As<OpenCLContext>()));
+              kernel->SetContext(std::move(conv_1x1_context));
+
+              const DDim& input_dim =
+                  lite::DDim{std::vector<int64_t>({batch_size, ic, ih, iw})};
+
+              const DDim& filter_dim =
+                  lite::DDim{std::vector<int64_t>({oc, ic, ksize, ksize})};
+              const DDim& out_dim =
+                  lite::DDim{std::vector<int64_t>({batch_size, oc, oh, ow})};
+              // element wise bias
+              const DDim& bias_dim = lite::DDim{std::vector<int64_t>({oc})};
+
+              param.x->Resize(input_dim);
+              param.filter->Resize(filter_dim);
+              param.output->Resize(out_dim);
+              if (bias_flag) {
+                param.bias->Resize(bias_dim);
+              }
+
+              kernel->SetParam(param);
+
+              size_t input_image_width = iw * ((ic + 3) / 4);
+              size_t input_image_height = ih * batch_size;
+
+              size_t out_image_width = ow * ((oc + 3) / 4);
+              size_t out_image_height = oh * batch_size;
+
+              size_t bias_image_width = ow * ((oc + 3) / 4);
+              size_t bias_image_height = oh * batch_size;
+
+              size_t filter_image_width = ksize * ((ic + 3) / 4);
+              size_t filter_image_height = oc * ksize;
+
+              const size_t cl_image2d_row_pitch{0};
+              const size_t cl_image2d_slice_pitch{0};
+
+              std::default_random_engine engine;
+              std::uniform_real_distribution<float> gen(-5, 5);
+
+              std::vector<float> input_v(batch_size * ic * ih * iw);
+              std::vector<float> filter_v(oc * ic * ksize * ksize);
+              std::vector<float> output_v(batch_size * oc * oh * ow);
+              std::vector<float> bias_v(oc);
+
+              SHADOW_LOG << "gen input and filter ...";
+              for (auto& i : input_v) {
+                i = gen(engine);
+                //                i = 1;
+              }
+              for (auto& f : filter_v) {
+                f = gen(engine);
+                //                f = 1;
+              }
+              LOG(INFO) << "bias: " << bias_flag;
+              LOG(INFO) << "relu: " << relu_flag;
+
+              LOG(INFO) << "inputdims : " << input_dim;
+              LOG(INFO) << "filterdims: " << filter.dims();
+              LOG(INFO) << "outputdims : " << output.dims();
+              SHADOW_LOG << "after gen input and filter ...";
+              SHADOW_LOG << "input_v.size(): " << input_v.size();
+              SHADOW_LOG << "filter_v.size(): " << filter_v.size();
+              SHADOW_LOG << "output_v.size(): " << output_v.size();
+              SHADOW_LOG << "bias_v.size(): " << bias_v.size();
+              SHADOW_LOG << "input_dim.production(): "
+                         << input_dim.production();
+              SHADOW_LOG << "filter_dim.production(): "
+                         << filter_dim.production();
+              SHADOW_LOG << "out_dim.production(): " << out_dim.production();
+              SHADOW_LOG << "bias_dim.production(): " << bias_dim.production();
+              SHADOW_LOG << "4 * input_image_height * input_image_width: "
+                         << 4 * input_image_height * input_image_width;
+              SHADOW_LOG << "4 * filter_image_width * filter_image_height: "
+                         << 4 * filter_image_width * filter_image_height;
+
+              CHECK(input_dim.production() == input_v.size());
+              CHECK_LE(input_dim.production(),
+                       4 * input_image_height * input_image_width);
+              CHECK(filter_dim.production() == filter_v.size());
+              CHECK_LE(filter_dim.production(),
+                       4 * filter_image_width * filter_image_height);
+
+              paddle::lite::CLImageConverterDefault default_convertor;
+              SHADOW_LOG << "set mapped input  ...";
+              std::vector<float> x_image_v(input_image_width *
+                                           input_image_height * 4);  // 4 : RGBA
+              std::vector<float> filter_image_v(
+                  filter_image_width * filter_image_height * 4);  // 4 : RGBA
+              std::vector<float> bias_image_v(
+                  bias_image_width * bias_image_height * 4);  // 4 : RGBA
+              std::vector<float> out_image_v(out_image_width *
+                                             out_image_height * 4);  // 4 : RGBA
+
+              default_convertor.NCHWToImage(
+                  input_v.data(), x_image_v.data(), input_dim);
+              SHADOW_LOG << "输入: ----  ";
+              for (int i = 0; i < input_v.size(); i++) {
+                SHADOW_LOG << "(" << i << ")" << input_v[i];
+              }
+              SHADOW_LOG << "输入image : ----  ";
+              for (int i = 0; i < x_image_v.size(); i++) {
+                SHADOW_LOG << "(" << i << ")" << x_image_v[i];
+              }
+              SHADOW_LOG << "set mapped filter  ...";
+              CLImageConverterFolder folder_convertor;
+
+              folder_convertor.NCHWToImage(
+                  filter_v.data(), filter_image_v.data(), filter_dim);
+              SHADOW_LOG << "卷积核: ----  ";
+              for (int i = 0; i < filter_v.size(); i++) {
+                SHADOW_LOG << "(" << i << ")" << filter_v[i];
+              }
+              SHADOW_LOG << "卷积核image: ----  ";
+              for (int i = 0; i < filter_image_v.size(); i++) {
+                SHADOW_LOG << "(" << i << ")" << filter_image_v[i];
+              }
+              auto* input_image2d = input.mutable_data<float, cl::Image2D>(
+                  input_image_width, input_image_height, x_image_v.data());
+
+              // assign filter as target arm
+              filter.Assign<float, lite::DDim, TARGET(kARM)>(filter_v.data(),
+                                                             filter_dim);
+
+              //              auto* filter_image2d = filter.mutable_data<float,
+              //              cl::Image2D>(
+              //                  filter_image_width,
+              //                  filter_image_height,
+              //                  filter_image_v.data());
+
+              if (bias_flag) {
+                for (int i = 0; i < bias_dim.production(); ++i) {
+                  bias_v[i] = static_cast<int>(gen(engine));
+                }
+                bias.Assign<float, lite::DDim, TARGET(kARM)>(bias_v.data(),
+                                                             bias_dim);
+                //                CLImageConverterFolder folder_convertor;
+                //                folder_convertor.NCHWToImage(
+                //                    bias_v.data(), bias_image_v.data(),
+                //                    bias_dim);
+                //
+                //                auto* bias_data = bias.mutable_data<float,
+                //                cl::Image2D>(
+                //                    bias_image_width, bias_image_height,
+                //                    bias_image_v.data());
+              }
+
+              SHADOW_LOG << "resize output  ...";
+              output.Resize(out_dim);
+
+              // cpu conv basic calc
+              lite::Tensor out_ref;
+              out_ref.Resize(out_dim);
+
+              SHADOW_LOG << "prepare kernel ready";
+
+              SHADOW_LOG << "kernel launch ...";
+              kernel->Launch();
+              SHADOW_LOG << "mutable output ...";
+              auto* output_image2d = output.mutable_data<float, cl::Image2D>(
+                  out_image_width, out_image_height);
+
+              auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
+              auto* out_ptr = param.output->data<float, cl::Image2D>();
+              auto it = wait_list->find(out_ptr);
+
+              if (it != wait_list->end()) {
+                SHADOW_LOG << "--- Find the sync event for the target cl "
+                              "tensor. ---";
+                auto& event = *(it->second);
+                event.wait();
+              } else {
+                LOG(FATAL) << "Could not find the sync event for the target "
+                              "cl tensor.";
+              }
+
+              TargetWrapperCL::ImgcpySync(out_image_v.data(),
+                                          output.data<float, cl::Image2D>(),
+                                          out_image_width,
+                                          out_image_height,
+                                          cl_image2d_row_pitch,
+                                          cl_image2d_slice_pitch,
+                                          IoDirection::DtoH);
+
+              DDim out_image_shape =
+                  default_convertor.InitImageDimInfoWith(output.dims());
+
+              default_convertor.ImageToNCHW(out_image_v.data(),
+                                            output_v.data(),
+                                            out_image_shape,
+                                            output.dims());
+
+              SHADOW_LOG << "输出: ----  ";
+              for (int i = 0; i < output_v.size(); i++) {
+                SHADOW_LOG << "(" << i << ")" << output_v[i];
+              }
+
+              SHADOW_LOG << "输出image: ----  ";
+              for (int i = 0; i < out_image_v.size(); i++) {
+                SHADOW_LOG << "(" << i << ")" << out_image_v[i];
+              }
+              SHADOW_LOG << "mutable_data out_ref_data: ";
+
+              // run cpu ref
+              auto* out_ref_data = out_ref.mutable_data<float>(TARGET(kARM));
+
+              SHADOW_LOG << " conv_basic beigin ..... ";
+
+              conv_basic<float, float>(input_v.data(),
+                                       out_ref_data,
+                                       batch_size,
+                                       oc,
+                                       oh,
+                                       ow,
+                                       ic,
+                                       ih,
+                                       iw,
+                                       filter_v.data(),
+                                       bias_v.data(),  // mapped_bias,
+                                       group,
+                                       ksize,
+                                       ksize,
+                                       stride,
+                                       stride,
+                                       dilation,
+                                       dilation,
+                                       pad,
+                                       pad,
+                                       bias_flag,
+                                       relu_flag);
+              SHADOW_LOG << " conv_basic end ..... ";
+
+              SHADOW_LOG << " out_dim: " << out_dim;
+              const DDim& out_image_dims = lite::DDim{std::vector<int64_t>(
+                  {static_cast<int64_t>(out_image_width),
+                   static_cast<int64_t>(out_image_height)})};
+
+              for (int i = 0; i < out_dim.production(); i++) {
+                EXPECT_NEAR(output_v[i], out_ref_data[i], 1e-2);
+                if (abs(output_v[i] - out_ref_data[i]) > 1e-2) {
+                  LOG(FATAL) << "error idx:" << i;
+                }
+              }
+
+#ifdef LOOP_TEST
+            }
+          }
+        }
+      }
+    }
+  }
+#else
+// nothing to do.
+#endif
+}
+#undef LOOP_TEST
+#undef PRINT_RESULT
+#undef SHADOW_LOG
+
+}  // namespace lite
+}  // namespace paddle
+
+USE_LITE_KERNEL(conv2d, kOpenCL, kFloat, kImageDefault, image2d);