[LITE][OPENCL] Add depthwise_conv_3x3 opencl kernel (#2601)

* [LITE][OPENCL] Add depthwise_conv_3x3 opencl kernel * [LITE][OPENCL] Add depthwise_conv_3x3 opencl kernel. test=develop * [LITE][OPENCL] Add Pool opencl kernel. test=develop

[LITE][OPENCL] Add depthwise_conv_3x3 opencl kernel (#2601)
* [LITE][OPENCL] Add depthwise_conv_3x3 opencl kernel * [LITE][OPENCL] Add depthwise_conv_3x3 opencl kernel. test=develop * [LITE][OPENCL] Add Pool opencl kernel. test=develop
600c8c20 · Jiaying Zhao · GitHub · af37a14f · 600c8c20 · 600c8c20
13 changed file
--- a/lite/backends/opencl/cl_image_converter.h
+++ b/lite/backends/opencl/cl_image_converter.h
@@ -114,6 +114,7 @@ class CLImageConverterNWBlock : public CLImageConverterBase {
                   const DDim &tensor_dim) override;
 };
 class CLImageConverterDWBlock : public CLImageConverterBase {
+ public:
  DDim InitImageDimInfoWith(const DDim &tensor_dim) override;
  void NCHWToImage(float *tensor,
                   float *image,

--- a/lite/backends/opencl/cl_kernel/cl_common.h
+++ b/lite/backends/opencl/cl_kernel/cl_common.h
@@ -40,10 +40,10 @@ limitations under the License. */
 #define WRITE_IMG_TYPE(type_char, img, pos, value) \
  _WRITE_IMG_TYPE(type_char, img, pos, value)

-#define _READ_IMG_TYPE(type_char, img, pos, sampler) \
+#define _READ_IMG_TYPE(type_char, img, sampler, pos) \
  read_image##type_char(img, sampler, pos)
-#define READ_IMG_TYPE(type_char, img, pos, sampler) \
-  _READ_IMG_TYPE(type_char, img, pos, sampler)
+#define READ_IMG_TYPE(type_char, img, sampler, pos) \
+  _READ_IMG_TYPE(type_char, img, sampler, pos)

 inline CL_DTYPE activation(CL_DTYPE in
 #ifdef PRELU

--- a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+
+#include <cl_common.h>
+
+__kernel void depth_conv_3x3(__private const int global_size_dim0,
+                                              __private const int global_size_dim1,
+                                              __private const int global_size_dim2,
+                                              __read_only image2d_t input,
+                                              __read_only image2d_t filter,
+#if defined(BIASE_CH) || defined(BIASE_ELE)
+                                              __read_only image2d_t bias,
+#endif
+#ifdef BATCH_NORM
+                                              __read_only image2d_t new_scale,
+                                              __read_only image2d_t new_biase,
+#endif
+                                              __write_only image2d_t output_image,
+                                              __private const int stride,
+                                              __private const int offset,
+                                              __private const int dilation,
+                                              __private const int input_c,
+                                              __private const int input_width,/* of one block */
+                                              __private const int input_height, /* of one block */
+                                              __private const int output_width,
+                                              __private const int output_height) {
+
+    const int out_c = get_global_id(0);
+    const int out_w = get_global_id(1);
+    const int out_nh = get_global_id(2);
+
+    int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
+
+
+    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                              CLK_ADDRESS_CLAMP          |
+                              CLK_FILTER_NEAREST;
+
+    const int batch_index = out_nh / output_height;
+
+    const int out_nh_in_one_batch = out_nh % output_height;
+
+
+    int2 stride_xy = (int2)(stride, stride);
+    int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch);
+
+    int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
+
+#ifdef BIASE_CH
+    CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0));
+#elif defined(BIASE_ELE)
+    CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos);
+#else
+    CL_DTYPE4 output = 0.0f;
+#endif
+
+    const int filter_width = 3;
+    const int filter_height = 3;
+
+    int2 pos_in_input_block = (int2)(out_c * input_width, batch_index * input_height);
+
+    int2 pos_in_filter_block = (int2)(out_c * filter_width, batch_index * filter_height);
+
+    int filter_x = pos_in_filter_block.x ;
+    int filter_y = pos_in_filter_block.y ;
+
+    CL_DTYPE4 inputs[9];
+
+        inputs[0] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
+                           (CL_DTYPE4)(0.0f),
+                           (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15));
+
+        inputs[1] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
+                           (CL_DTYPE4)(0.0f),
+                           (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15));
+
+        inputs[2] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
+                           (CL_DTYPE4)(0.0f),
+                           (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15));
+
+        inputs[3] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y)),
+                           (CL_DTYPE4)(0.0f),
+                           (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15));
+        /*
+        if (output_pos.x == 112 && output_pos.y == 0) {
+              CL_DTYPE4 input1 = inputs[3];
+              float4 in = (float4)(input1.x, input1.y, input1.z, input1.w);
+              printf(" input4 3 - %v4hlf \n", in);
+              printf(" --- %d ---\n", in_pos_in_one_block.x - 1);
+        }
+        */
+
+
+        inputs[4] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y)),
+                           (CL_DTYPE4)(0.0f),
+                           (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15));
+
+        inputs[5] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y)),
+                           (CL_DTYPE4)(0.0f),
+                           (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15));
+
+        inputs[6] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
+                           (CL_DTYPE4)(0.0f),
+                           (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15));
+
+        inputs[7] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
+                           (CL_DTYPE4)(0.0f),
+                           (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15));
+
+        inputs[8] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
+                           (CL_DTYPE4)(0.0f),
+                           (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15));
+
+    CL_DTYPE4 filters[9];
+    filters[0] =  READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y));
+    filters[1] =  READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y));
+    filters[2] =  READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y));
+    filters[3] =  READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y + 1));
+    filters[4] =  READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y + 1));
+    filters[5] =  READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y + 1));
+    filters[6] =  READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y + 2));
+    filters[7] =  READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y + 2));
+    filters[8] =  READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y + 2));
+
+    for(int i = 0 ;i < 9 ; i++){
+     output += inputs[i] * filters[i];
+    }
+#ifdef BATCH_NORM
+    output = output * READ_IMG_TYPE(CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) + READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
+#endif
+
+#ifdef RELU
+    output = activation(output);
+#endif
+
+
+    /*
+
+    if (output_pos.x == 112 && output_pos.y == 0) {
+
+        for (int i = 0; i < 9; ++i) {
+            CL_DTYPE4 input1 = inputs[i];
+            float4 in = (float4)(input1.x, input1.y, input1.z, input1.w);
+            printf(" input4 %d - %v4hlf \n", i, in);
+        }
+
+        float4 out = (float4)(output.x, output.y, output.z, output.w);
+        printf(" depth wise output output4 = %v4hlf \n", out);
+        printf(" pos_in_input_block -x %d \n ", pos_in_input_block.x);
+        printf(" pos_in_input_block -y %d \n ", pos_in_input_block.y);
+        printf(" in_pos_in_one_block - x %d \n", in_pos_in_one_block.x);
+        printf(" in_pos_in_one_block - y %d \n", in_pos_in_one_block.y);
+    }
+
+    */
+
+    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output);
+
+}
+
+
+
+__kernel void depth_conv_3x3s1(__private const int ou_ch_blk,
+                                              __private const int ou_w_blk,
+                                              __private const int ou_nh,
+                                              __read_only image2d_t input,
+                                              __read_only image2d_t filter,
+#if defined(BIASE_CH) || defined(BIASE_ELE)
+                                              __read_only image2d_t bias,
+#endif
+#ifdef BATCH_NORM
+                                              __read_only image2d_t new_scale,
+                                              __read_only image2d_t new_biase,
+#endif
+                                              __write_only image2d_t output_image,
+                                              __private const int stride,
+                                              __private const int pad,
+                                              __private const int dilation,
+                                              __private const int in_ch,
+                                              __private const int in_w,/* of one block */
+                                              __private const int in_h, /* of one block */
+                                              __private const int ou_w,
+                                              __private const int ou_h) {
+
+    const int ou_ch_blk_id = get_global_id(0);
+    const int ou_w_blk_id = get_global_id(1);
+    const int ou_nh_id = get_global_id(2);
+    const int w_blk_size = 2;
+
+    const int batch_id = ou_nh_id / ou_h;
+    int ou_col_id = ou_w_blk_id * w_blk_size;
+    int ou_row_id = ou_nh_id % ou_h;
+    int ou_x = mad24(ou_ch_blk_id, ou_w, ou_col_id);
+
+    // input pos in one block and on batch
+    int col_id = ou_col_id - pad;
+    int row_id = ou_row_id - pad;
+
+    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                              CLK_ADDRESS_CLAMP          |
+                              CLK_FILTER_NEAREST;
+
+#ifdef BIASE_CH
+    CL_DTYPE4 output[2];
+    output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_ch_blk_id, 0));
+    output[1] = output[0];
+#elif defined(BIASE_ELE)
+    CL_DTYPE4 output[2];
+    output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_x, ou_nh_id));
+    if (ou_col_id + 1 < ou_w) {
+        output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_x + 1, ou_nh_id));
+    }
+#else
+    CL_DTYPE4 output[2] = {0.0f};
+#endif
+
+    CL_DTYPE4 inputs[12];
+
+    int filter_x = ou_ch_blk_id * 3;
+    int filter_y = 0;
+    CL_DTYPE4 filters[9];
+    filters[0] =  READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y));
+    filters[1] =  READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y));
+    filters[2] =  READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y));
+
+    int in_x = mad24(ou_ch_blk_id, in_w, col_id);
+    int in_y = mad24(batch_id, in_h, row_id);
+
+    int y0 = select(in_y, -1, row_id < 0 || row_id >= in_h);
+    int x0 = select(in_x, -1, col_id < 0 || col_id >= in_w);
+    inputs[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y0));
+    int x1 = select(in_x + 1, -1, col_id + 1 < 0 || col_id + 1 >= in_w);
+    inputs[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x1, y0));
+    int x2 = select(in_x + 2, -1, col_id + 2 < 0 || col_id + 2 >= in_w);
+    inputs[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y0));
+    int x3 = select(in_x + 3, -1, col_id + 3 < 0 || col_id + 3 >= in_w);
+    inputs[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y0));
+
+    output[0] = mad(inputs[0], filters[0], output[0]);
+    output[1] = mad(inputs[1], filters[0], output[1]);
+
+    output[0] = mad(inputs[1], filters[1], output[0]);
+    output[1] = mad(inputs[2], filters[1], output[1]);
+
+    output[0] = mad(inputs[2], filters[2], output[0]);
+    output[1] = mad(inputs[3], filters[2], output[1]);
+
+
+    filters[3] =  READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y + 1));
+    filters[4] =  READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y + 1));
+    filters[5] =  READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y + 1));
+
+
+    int y1 = select(in_y + 1, -1, row_id + 1 < 0 || row_id + 1 >= in_h);
+    inputs[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y1));
+    inputs[5] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x1, y1));
+    inputs[6] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y1));
+    inputs[7] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y1));
+
+
+    output[0] = mad(inputs[4], filters[3], output[0]);
+    output[1] = mad(inputs[5], filters[3], output[1]);
+
+    output[0] = mad(inputs[5], filters[4], output[0]);
+    output[1] = mad(inputs[6], filters[4], output[1]);
+
+    output[0] = mad(inputs[6], filters[5], output[0]);
+    output[1] = mad(inputs[7], filters[5], output[1]);
+
+
+    filters[6] =  READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y + 2));
+    filters[7] =  READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y + 2));
+    filters[8] =  READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y + 2));
+
+    int y2 = select(in_y + 2, -1, row_id + 2 < 0 || row_id + 2 >= in_h);
+    inputs[8] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y2));
+    inputs[9] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x1, y2));
+    inputs[10] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y2));
+    inputs[11] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y2));
+
+
+    output[0] = mad(inputs[8], filters[6], output[0]);
+    output[1] = mad(inputs[9], filters[6], output[1]);
+
+    output[0] = mad(inputs[9], filters[7], output[0]);
+    output[1] = mad(inputs[10], filters[7], output[1]);
+
+    output[0] = mad(inputs[10], filters[8], output[0]);
+    output[1] = mad(inputs[11], filters[8], output[1]);
+#ifdef BATCH_NORM
+    CL_DTYPE4 scale = READ_IMG_TYPE(CL_DTYPE_CHAR, new_scale, sampler, (int2)(ou_ch_blk_id, 0));
+    CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(ou_ch_blk_id, 0));
+    output[0] = mad(scale, output[0], biase);
+    if (ou_col_id + 1 < ou_w) {
+        output[1] = mad(scale, output[1], biase);
+    }
+#endif
+
+#ifdef RELU
+    output[0] = activation(output[0]);
+    output[1] = activation(output[1]);
+#endif
+
+    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(ou_x, ou_nh_id), output[0]);
+    if (ou_col_id + 1 < ou_w) {
+        WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(ou_x + 1, ou_nh_id), output[1]);
+    }
+
+}
+
--- a/lite/backends/opencl/cl_kernel/image/pool_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/pool_kernel.cl
@@ -12,15 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include <cl_common.h>
 #define MIN_VALUE -FLT_MAX

-__kernel void pool_max(
-    __private const int in_height, __private const int in_width,
-    __private const int out_height, __private const int out_width,
-    __private const int pad_top, __private const int pad_left,
-    __private const int stride_h, __private const int stride_w,
-    __private const int ksize_h, __private const int ksize_w,
-    __read_only image2d_t input, __write_only image2d_t output) {
+__kernel void pool_max(__read_only image2d_t input,
+    __write_only image2d_t output,
+    __private const int in_height,
+    __private const int in_width,
+    __private const int out_height,
+    __private const int out_width,
+    __private const int ksize_h,
+    __private const int ksize_w,
+    __private const int stride_h,
+    __private const int stride_w,
+    __private const int pad_top,
+    __private const int pad_left) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);
@@ -40,25 +46,30 @@ __kernel void pool_max(

  const int pos_in_x = out_c * in_width;
  const int pos_in_y = out_n * in_height;
-  float4 max_value = (float4)(MIN_VALUE);
+  CL_DTYPE4 max_value = (CL_DTYPE4)(MIN_VALUE);
  for (int y = start_h; y < end_h; ++y) {
    for (int x = start_w; x < end_w; ++x) {
-      float4 tmp = read_imagef(input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
+      CL_DTYPE4 tmp = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
      max_value = max(max_value, tmp);
    }
  }

  const int pos_out_x = mad24(out_c, out_width, out_w);
-  write_imagef(output, (int2)(pos_out_x, out_nh), max_value);
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(pos_out_x, out_nh), max_value);
 }

-__kernel void pool_avg(
-    __private const int in_height, __private const int in_width,
-    __private const int out_height, __private const int out_width,
-    __private const int pad_top, __private const int pad_left,
-    __private const int stride_h, __private const int stride_w,
-    __private const int ksize_h, __private const int ksize_w,
-    __read_only image2d_t input, __write_only image2d_t output) {
+__kernel void pool_avg(__read_only image2d_t input,
+  __write_only image2d_t output,
+  __private const int in_height,
+  __private const int in_width,
+  __private const int out_height,
+  __private const int out_width,
+  __private const int ksize_h,
+  __private const int ksize_w,
+  __private const int stride_h,
+  __private const int stride_w,
+  __private const int pad_top,
+  __private const int pad_left) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);
@@ -76,15 +87,14 @@ __kernel void pool_avg(

  const int pos_in_x = out_c * in_width;
  const int pos_in_y = out_n * in_height;
-  float4 sum = (float4)(0.0f);
-  int num = 0;
+  CL_DTYPE4 sum = (CL_DTYPE4)(0.0f);
+
  for (int y = start_h; y < end_h; ++y) {
    for (int x = start_w; x < end_w; ++x) {
-      sum += read_imagef(input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
-      num++;
+      sum += READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
    }
  }
-  float4 avg = sum / num;
+  CL_DTYPE4 avg = sum / (ksize_h * ksize_w);
  const int pos_out_x = mad24(out_c, out_width, out_w);
-  write_imagef(output, (int2)(pos_out_x, out_nh), avg);
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(pos_out_x, out_nh), avg);
 }
--- a/lite/backends/opencl/target_wrapper.cc
+++ b/lite/backends/opencl/target_wrapper.cc
@@ -58,17 +58,18 @@ void TargetWrapperCL::Free(void *ptr) {

 template <>
 void *TargetWrapperCL::MallocImage<float>(const size_t cl_image2d_width,
-                                          const size_t cl_image2d_height) {
+                                          const size_t cl_image2d_height,
+                                          void *host_ptr) {
  cl::ImageFormat img_format(CL_RGBA, GetCLChannelType(PRECISION(kFloat)));
  cl_int status;
  cl::Image2D *cl_image =
      new cl::Image2D(CLRuntime::Global()->context(),
-                      CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
+                      CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR : 0),
                      img_format,
                      cl_image2d_width,
                      cl_image2d_height,
                      0,
-                      nullptr,
+                      host_ptr,
                      &status);
  if (status != CL_SUCCESS) {
    delete cl_image;
@@ -80,17 +81,18 @@ void *TargetWrapperCL::MallocImage<float>(const size_t cl_image2d_width,

 template <>
 void *TargetWrapperCL::MallocImage<int8_t>(const size_t cl_image2d_width,
-                                           const size_t cl_image2d_height) {
+                                           const size_t cl_image2d_height,
+                                           void *host_ptr) {
  cl::ImageFormat img_format(CL_RGBA, GetCLChannelType(PRECISION(kInt8)));
  cl_int status;
  cl::Image2D *cl_image =
      new cl::Image2D(CLRuntime::Global()->context(),
-                      CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
+                      CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR : 0),
                      img_format,
                      cl_image2d_width,
                      cl_image2d_height,
                      0,
-                      nullptr,
+                      host_ptr,
                      &status);
  if (status != CL_SUCCESS) {
    delete cl_image;
@@ -102,17 +104,18 @@ void *TargetWrapperCL::MallocImage<int8_t>(const size_t cl_image2d_width,

 template <>
 void *TargetWrapperCL::MallocImage<int32_t>(const size_t cl_image2d_width,
-                                            const size_t cl_image2d_height) {
+                                            const size_t cl_image2d_height,
+                                            void *host_ptr) {
  cl::ImageFormat img_format(CL_RGBA, GetCLChannelType(PRECISION(kInt32)));
  cl_int status;
  cl::Image2D *cl_image =
      new cl::Image2D(CLRuntime::Global()->context(),
-                      CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
+                      CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR : 0),
                      img_format,
                      cl_image2d_width,
                      cl_image2d_height,
                      0,
-                      nullptr,
+                      host_ptr,
                      &status);
  if (status != CL_SUCCESS) {
    delete cl_image;

--- a/lite/backends/opencl/target_wrapper.h
+++ b/lite/backends/opencl/target_wrapper.h
@@ -48,7 +48,8 @@ class TargetWrapper<TARGET(kOpenCL), cl::CommandQueue, cl::Event> {

  template <typename R>
  static void* MallocImage(const size_t cl_image2d_width,
-                           const size_t cl_image2d_height);
+                           const size_t cl_image2d_height,
+                           void* host_ptr = nullptr);
  static void FreeImage(void* image);

  static void* Map(void* buffer, size_t offset, size_t size);

--- a/lite/core/memory.h
+++ b/lite/core/memory.h
@@ -100,13 +100,14 @@ class Buffer {
  template <typename T>
  void ResetLazyImage2D(TargetType target,
                        const size_t img_w,
-                        const size_t img_h) {
+                        const size_t img_h,
+                        void* host_ptr = nullptr) {
    size_t size = sizeof(T) * img_w * img_h *
                  4;  // 4 for RGBA, un-used for opencl Image2D
    if (target != target_ || cl_image2d_width_ < img_w ||
        cl_image2d_height_ < img_h) {
      Free();
-      data_ = TargetWrapperCL::MallocImage<T>(img_w, img_h);
+      data_ = TargetWrapperCL::MallocImage<T>(img_w, img_h, host_ptr);
      target_ = target;
      space_ = size;  // un-used for opencl Image2D
      cl_image2d_width_ = img_w;

--- a/lite/core/tensor.h
+++ b/lite/core/tensor.h
@@ -147,9 +147,11 @@ class TensorLite {

 #ifdef LITE_WITH_OPENCL
  template <typename T, typename R = T>
-  R *mutable_data(const size_t img_w, const size_t img_h) {
+  R *mutable_data(const size_t img_w,
+                  const size_t img_h,
+                  void *host_ptr = nullptr) {
    target_ = TARGET(kOpenCL);
-    buffer_->ResetLazyImage2D<T>(target_, img_w, img_h);
+    buffer_->ResetLazyImage2D<T>(target_, img_w, img_h, host_ptr);
    return static_cast<cl::Image2D *>(buffer_->data());
  }
 #endif

--- a/lite/kernels/opencl/CMakeLists.txt
+++ b/lite/kernels/opencl/CMakeLists.txt
@@ -23,7 +23,7 @@ lite_cc_test(test_elementwise_add_opencl SRCS elementwise_add_compute_test.cc
        ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)

 lite_cc_test(test_pool_opencl SRCS pool_compute_test.cc
-        DEPS pool_opencl op_registry program context
+        DEPS pool_opencl op_registry program context cl_image_converter
        ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)

 lite_cc_test(test_fc_opencl SRCS fc_compute_test.cc
@@ -45,7 +45,7 @@ lite_cc_test(test_io_copy_compute_opencl SRCS io_copy_compute_test.cc
 #        ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)

 lite_cc_test(test_depthwise_conv2d_opencl SRCS depthwise_conv2d_compute_test.cc
-        DEPS depthwise_conv2d_opencl op_registry program context
+        DEPS depthwise_conv2d_opencl op_registry program context cl_image_converter
        ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)

 lite_cc_test(test_conv2d_1x1_opencl SRCS conv2d_1x1_compute_test.cc

--- a/lite/kernels/opencl/depthwise_conv2d_compute.cc
+++ b/lite/kernels/opencl/depthwise_conv2d_compute.cc
@@ -16,6 +16,7 @@
 #include "lite/backends/opencl/cl_include.h"
 #include "lite/core/kernel.h"
 #include "lite/core/op_registry.h"
+#include "lite/kernels/opencl/image_helper.h"
 #include "lite/operators/op_params.h"
 #include "lite/utils/replace_stl/stream.h"

@@ -114,6 +115,216 @@ class DepthwiseConv2dCompute
  std::shared_ptr<cl::Event> event_{new cl::Event};
 };

+class DepthwiseConv2dComputeFP16Image
+    : public KernelLite<TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)> {
+ public:
+  using param_t = operators::ConvParam;
+
+  void PrepareForRun() override {
+    const auto& param = *param_.get_mutable<param_t>();
+    if (param.fuse_relu) {
+      build_options_ += " -DRELU";
+    }
+    auto& context = ctx_->As<OpenCLContext>();
+    context.cl_context()->AddKernel(
+        kernel_func_name_, "image/depthwise_conv2d_kernel.cl", build_options_);
+  }
+
+  void Run() override {
+    const auto& param = *param_.get_mutable<param_t>();
+    auto x_dims = param.x->dims();
+    auto filter_dims = param.filter->dims();
+    auto output_dims = param.output->dims();
+    auto paddings = *param.paddings;
+    auto strides = param.strides;
+    auto dilations = *param.dilations;
+    int offset = filter_dims[2] / 2 - paddings[0];
+    int input_c_block = (x_dims[1] + 3) / 4;
+
+    auto& context = ctx_->As<OpenCLContext>();
+    CHECK(context.cl_context() != nullptr);
+    auto* input_img = param.x->data<float, cl::Image2D>();
+    auto* filter_img = param.filter->data<float, cl::Image2D>();
+
+    auto* bias_img = param.bias == nullptr
+                         ? static_cast<cl::Image2D*>(nullptr)
+                         : param.bias->data<float, cl::Image2D>();
+
+    auto image_shape = InitImageDimInfoWith(output_dims);
+
+    auto* output_img = param.output->mutable_data<float, cl::Image2D>(
+        image_shape["width"], image_shape["height"]);
+
+    STL::stringstream kernel_key;
+    kernel_key << kernel_func_name_ << build_options_;
+    auto kernel = context.cl_context()->GetKernel(kernel_key.str());
+
+    int c_block = (output_dims[1] + 3) / 4;
+    int w = output_dims[3];
+    int nh = output_dims[0] * output_dims[2];
+    auto global_work_size = cl::NDRange(c_block, w, nh);
+
+    LOG(INFO) << "setArg";
+    LOG(INFO) << "c_block = " << c_block;
+    LOG(INFO) << "w = " << w;
+    LOG(INFO) << "nh = " << nh;
+
+    LOG(INFO) << "strides = " << strides[0];
+    LOG(INFO) << "offset = " << offset;
+    LOG(INFO) << "dilations = " << dilations[0];
+    LOG(INFO) << "input_c_block = " << input_c_block;
+    LOG(INFO) << "x_dims[3] = " << x_dims[3];
+    LOG(INFO) << "x_dims[2] = " << x_dims[2];
+    LOG(INFO) << "output_dims[3] = " << output_dims[3];
+    LOG(INFO) << "output_dims[2] = " << output_dims[2];
+
+    cl_int status;
+    int arg_idx = 0;
+    status = kernel.setArg(arg_idx, static_cast<const int>(c_block));
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, static_cast<const int>(w));
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, static_cast<const int>(nh));
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, *input_img);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, *filter_img);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, *output_img);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, static_cast<const int>(strides[0]));
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, static_cast<const int>(offset));
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, static_cast<const int>(dilations[0]));
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, static_cast<const int>(input_c_block));
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, static_cast<const int>(x_dims[3]));
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, static_cast<const int>(x_dims[2]));
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, static_cast<const int>(output_dims[3]));
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, static_cast<const int>(output_dims[2]));
+    CL_CHECK_FATAL(status);
+
+    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
+        kernel,
+        cl::NullRange,
+        global_work_size,
+        cl::NullRange,
+        nullptr,
+        event_.get());
+    CL_CHECK_FATAL(status);
+    context.cl_wait_list()->emplace(output_img, event_);
+  }
+
+ private:
+  std::string kernel_func_name_{"depth_conv_3x3"};
+  std::string build_options_{"-DCL_DTYPE_float"};
+  std::shared_ptr<cl::Event> event_{new cl::Event};
+};
+
+class DepthwiseConv2d3x3s1ComputeFP16Image
+    : public KernelLite<TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
+ public:
+  using param_t = operators::ConvParam;
+
+  void PrepareForRun() override {
+    const auto& param = *param_.get_mutable<param_t>();
+    if (param.fuse_relu) {
+      build_options_ += " -DRELU";
+    }
+    auto& context = ctx_->As<OpenCLContext>();
+    context.cl_context()->AddKernel(
+        kernel_func_name_, "image/depthwise_conv2d_kernel.cl", build_options_);
+  }
+
+  void Run() override {
+    const auto& param = *param_.get_mutable<param_t>();
+    auto x_dims = param.x->dims();
+    auto filter_dims = param.filter->dims();
+    auto output_dims = param.output->dims();
+    auto paddings = *param.paddings;
+    auto strides = param.strides;
+    auto dilations = *param.dilations;
+
+    auto& context = ctx_->As<OpenCLContext>();
+    CHECK(context.cl_context() != nullptr);
+    auto* input_img = param.x->data<float, cl::Image2D>();
+    auto* filter_img = param.filter->data<float, cl::Image2D>();
+
+    auto* bias_img = param.bias == nullptr
+                         ? static_cast<cl::Image2D*>(nullptr)
+                         : param.bias->data<float, cl::Image2D>();
+
+    auto image_shape = InitImageDimInfoWith(output_dims);
+
+    auto* output_img = param.output->mutable_data<float, cl::Image2D>(
+        image_shape["width"], image_shape["height"]);
+
+    STL::stringstream kernel_key;
+    kernel_key << kernel_func_name_ << build_options_;
+    auto kernel = context.cl_context()->GetKernel(kernel_key.str());
+
+    int c_block = (output_dims[1] + 3) / 4;
+    int w = output_dims[3];
+    int nh = output_dims[0] * output_dims[2];
+
+    int w_blk_size = 2;
+    int w_blk = (w + w_blk_size - 1) / w_blk_size;
+
+    auto global_work_size = cl::NDRange(c_block, w_blk, nh);
+
+    cl_int status;
+    int arg_idx = 0;
+    status = kernel.setArg(arg_idx, static_cast<const int>(c_block));
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, static_cast<const int>(w_blk));
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, static_cast<const int>(nh));
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, *input_img);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, *filter_img);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, *output_img);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, static_cast<const int>(strides[0]));
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, static_cast<const int>(paddings[0]));
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, static_cast<const int>(dilations[0]));
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, static_cast<const int>(x_dims[1]));
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, static_cast<const int>(x_dims[3]));
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, static_cast<const int>(x_dims[2]));
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, static_cast<const int>(output_dims[3]));
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, static_cast<const int>(output_dims[2]));
+    CL_CHECK_FATAL(status);
+
+    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
+        kernel,
+        cl::NullRange,
+        global_work_size,
+        cl::NullRange,
+        nullptr,
+        event_.get());
+    CL_CHECK_FATAL(status);
+    context.cl_wait_list()->emplace(output_img, event_);
+  }
+
+ private:
+  std::string kernel_func_name_{"depth_conv_3x3s1"};
+  std::string build_options_{"-DCL_DTYPE_float"};
+  std::shared_ptr<cl::Event> event_{new cl::Event};
+};
+
 }  // namespace opencl
 }  // namespace kernels
 }  // namespace lite
@@ -130,3 +341,28 @@ REGISTER_LITE_KERNEL(depthwise_conv2d,
    .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kOpenCL))})
    .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kOpenCL))})
    .Finalize();
+
+REGISTER_LITE_KERNEL(
+    depthwise_conv2d,
+    kOpenCL,
+    kFloat,
+    kNHWC,
+    paddle::lite::kernels::opencl::DepthwiseConv2dComputeFP16Image,
+    image2d)
+    .BindInput("Input",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFloat),
+                                      DATALAYOUT(kNHWC))})
+    .BindInput("Bias",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFloat),
+                                      DATALAYOUT(kNHWC))})
+    .BindInput("Filter",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFloat),
+                                      DATALAYOUT(kNHWC))})
+    .BindOutput("Output",
+                {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                       PRECISION(kFloat),
+                                       DATALAYOUT(kNHWC))})
+    .Finalize();
--- a/lite/kernels/opencl/depthwise_conv2d_compute_test.cc
+++ b/lite/kernels/opencl/depthwise_conv2d_compute_test.cc
@@ -14,6 +14,7 @@

 #include <gtest/gtest.h>
 #include <random>
+#include "lite/backends/opencl/cl_image_converter.h"
 #include "lite/backends/opencl/target_wrapper.h"
 #include "lite/core/op_registry.h"
 #include "lite/core/tensor.h"
@@ -89,7 +90,7 @@ void depth_conv(const T* input_data,
  }
 }

-TEST(depthwise_conv2d, compute) {
+TEST(depthwise_conv2d, compute_buffer) {
  LOG(INFO) << "to get kernel ...";
  auto kernels = KernelRegistry::Global().Create("depthwise_conv2d",
                                                 TARGET(kOpenCL),
@@ -176,7 +177,135 @@ TEST(depthwise_conv2d, compute) {
  TargetWrapperCL::Unmap(input_data, mapped_input);
 }

+TEST(depthwise_conv2d, compute_image2d) {
+  LOG(INFO) << "to get kernel ...";
+  auto kernels = KernelRegistry::Global().Create("depthwise_conv2d",
+                                                 TARGET(kOpenCL),
+                                                 PRECISION(kFloat),
+                                                 DATALAYOUT(kNHWC));
+  ASSERT_FALSE(kernels.empty());
+
+  auto kernel = std::move(kernels.front());
+
+  LOG(INFO) << "get kernel";
+  lite::Tensor input, filter, output;
+  operators::ConvParam param;
+  param.x = &input;
+  param.filter = &filter;
+  param.output = &output;
+  std::vector<int> paddings = {0, 0};
+  param.paddings = std::make_shared<std::vector<int>>(paddings);
+  param.strides = std::vector<int>{1, 1};
+  std::vector<int> dilations = {1, 1};
+  param.dilations = std::make_shared<std::vector<int>>(dilations);
+
+  std::unique_ptr<KernelContext> context(new KernelContext);
+  context->As<OpenCLContext>().InitOnce();
+
+  kernel->SetParam(param);
+  std::unique_ptr<KernelContext> dep_context(new KernelContext);
+  context->As<OpenCLContext>().CopySharedTo(
+      &(dep_context->As<OpenCLContext>()));
+  kernel->SetContext(std::move(dep_context));
+
+  LOG(INFO) << "kernel ready";
+  std::default_random_engine engine;
+  std::uniform_real_distribution<float> gen(-5, 5);
+  std::vector<float> input_v(1 * 32 * 112 * 112);
+  std::vector<float> filter_v(32 * 1 * 3 * 3);
+  for (auto& i : input_v) {
+    i = gen(engine);
+  }
+  for (auto& f : filter_v) {
+    f = gen(engine);
+  }
+
+  LOG(INFO) << "prepare input";
+  input.Resize({1, 32, 112, 112});
+  CLImageConverterDefault* default_converter = new CLImageConverterDefault();
+  DDim input_image_shape =
+      default_converter->InitImageDimInfoWith(input.dims());
+  LOG(INFO) << "input_image_shape = " << input_image_shape[0] << " "
+            << input_image_shape[1];
+  std::vector<float> input_image_data(input_image_shape.production() *
+                                      4);  // 4 : RGBA
+  default_converter->NCHWToImage(
+      input_v.data(), input_image_data.data(), input.dims());
+  auto* input_image = input.mutable_data<float, cl::Image2D>(
+      input_image_shape[0], input_image_shape[1], input_image_data.data());
+
+  LOG(INFO) << "prepare kernel";
+  filter.Resize({32, 1, 3, 3});
+  CLImageConverterNWBlock* nw_converter = new CLImageConverterNWBlock();
+  DDim filter_image_shape = nw_converter->InitImageDimInfoWith(filter.dims());
+  LOG(INFO) << "filter_image_shape = " << filter_image_shape[0] << " "
+            << filter_image_shape[1];
+  std::vector<float> filter_image_data(filter_image_shape.production() *
+                                       4);  // 4 : RGBA
+  nw_converter->NCHWToImage(
+      filter_v.data(), filter_image_data.data(), filter.dims());
+  auto* filter_image = filter.mutable_data<float, cl::Image2D>(
+      filter_image_shape[0], filter_image_shape[1], filter_image_data.data());
+
+  LOG(INFO) << "launch";
+  output.Resize({1, 32, 110, 110});
+  DDim output_image_shape =
+      default_converter->InitImageDimInfoWith(output.dims());
+  LOG(INFO) << "output_image_shape = " << output_image_shape[0] << " "
+            << output_image_shape[1];
+  auto* output_image = output.mutable_data<float, cl::Image2D>(
+      output_image_shape[0], output_image_shape[1]);
+
+  kernel->Launch();
+
+  auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
+  auto* out_ptr = param.output->data<float, cl::Image2D>();
+  auto it = wait_list->find(out_ptr);
+  if (it != wait_list->end()) {
+    VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
+    LOG(INFO) << "--- Find the sync event for the target cl tensor. ---";
+    auto& event = *(it->second);
+    event.wait();
+  } else {
+    LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
+    LOG(INFO) << "Could not find the sync event for the target cl tensor.";
+  }
+
+  lite::Tensor output_ref;
+  output_ref.Resize({1, 32, 110, 110});
+  auto* output_ref_data = output_ref.mutable_data<float>(TARGET(kARM));
+  depth_conv<float, 1, 1>(input_v.data(),
+                          input.dims(),
+                          filter_v.data(),
+                          filter.dims(),
+                          output_ref_data,
+                          output_ref.dims());
+
+  const size_t cl_image2d_row_pitch{0};
+  const size_t cl_image2d_slice_pitch{0};
+
+  float* output_image_data = new float[output_image_shape.production() * 4];
+  TargetWrapperCL::ImgcpySync(output_image_data,
+                              output_image,
+                              output_image_shape[0],
+                              output_image_shape[1],
+                              cl_image2d_row_pitch,
+                              cl_image2d_slice_pitch,
+                              IoDirection::DtoH);
+
+  float* output_data = new float[output_image_shape.production() * 4];
+  default_converter->ImageToNCHW(
+      output_image_data, output_data, output_image_shape, output.dims());
+
+  LOG(INFO) << "output_data vs output_ref_data";
+  for (int i = 0; i < output.dims().production(); i++) {
+    EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-4);
+    LOG(INFO) << output_data[i] << " " << output_ref_data[i];
+  }
+}
+
 }  // namespace lite
 }  // namespace paddle

 USE_LITE_KERNEL(depthwise_conv2d, kOpenCL, kFloat, kNCHW, def);
+USE_LITE_KERNEL(depthwise_conv2d, kOpenCL, kFloat, kNHWC, image2d);
--- a/lite/kernels/opencl/pool_compute.cc
+++ b/lite/kernels/opencl/pool_compute.cc
@@ -16,6 +16,7 @@
 #include "lite/backends/opencl/cl_include.h"
 #include "lite/core/kernel.h"
 #include "lite/core/op_registry.h"
+#include "lite/kernels/opencl/image_helper.h"
 #include "lite/operators/op_params.h"
 #include "lite/utils/replace_stl/stream.h"
 #include "lite/utils/string.h"
@@ -117,6 +118,107 @@ class PoolCompute
  std::shared_ptr<cl::Event> event_{new cl::Event};
 };

+class PoolComputeImage2D
+    : public KernelLite<TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)> {
+ public:
+  using param_t = operators::PoolParam;
+
+  void PrepareForRun() override {
+    const auto& param = *param_.get_mutable<param_t>();
+    kernel_func_name_ += param.pooling_type;
+    auto& context = ctx_->As<OpenCLContext>();
+    context.cl_context()->AddKernel(
+        kernel_func_name_, "image/pool_kernel.cl", build_options_);
+  }
+
+  void Run() override {
+    const auto& param = *param_.get_mutable<param_t>();
+    const auto& in_dims = param.x->dims();
+    const auto& out_dims = param.output->dims();
+    const std::string pooling_type = param.pooling_type;
+    const bool global_pooling = param.global_pooling;
+    std::vector<int> paddings = *param.paddings;
+    std::vector<int> strides = param.strides;
+    std::vector<int> ksize = param.ksize;
+    if (global_pooling) {
+      for (size_t i = 0; i < ksize.size(); ++i) {
+        paddings[2 * i] = 0;
+        paddings[2 * i + 1] = 0;
+        ksize[i] = static_cast<int>(in_dims[i + 2]);
+      }
+    }
+    bool pads_equal =
+        (paddings[0] == paddings[1]) && (paddings[2] == paddings[3]);
+    if (!pads_equal) {
+      LOG(FATAL)
+          << "padding requires pad_left == pad_right, pad_top == pad_bottom";
+    }
+    auto& context = ctx_->As<OpenCLContext>();
+    CHECK(context.cl_context() != nullptr);
+
+    auto* x_img = param.x->data<float, cl::Image2D>();
+    LOG(INFO) << "x_image" << x_img;
+
+    auto out_image_shape = InitImageDimInfoWith(out_dims);
+    LOG(INFO) << "out_image_shape = " << out_image_shape["width"] << " "
+              << out_image_shape["height"];
+    auto* out_img = param.output->mutable_data<float, cl::Image2D>(
+        out_image_shape["width"], out_image_shape["height"]);
+    LOG(INFO) << "out_image" << out_img;
+
+    STL::stringstream kernel_key;
+    kernel_key << kernel_func_name_ << build_options_;
+    auto kernel = context.cl_context()->GetKernel(kernel_key.str());
+
+    int c_block = (out_dims[1] + 3) / 4;
+    int w = out_dims[3];
+    int nh = out_dims[0] * out_dims[2];
+    auto global_work_size = cl::NDRange(c_block, w, nh);
+
+    cl_int status;
+    int arg_idx = 0;
+    status = kernel.setArg(arg_idx, *x_img);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, *out_img);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, static_cast<const int>(in_dims[2]));
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, static_cast<const int>(in_dims[3]));
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, static_cast<const int>(out_dims[2]));
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, static_cast<const int>(out_dims[3]));
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, static_cast<const int>(ksize[0]));
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, static_cast<const int>(ksize[1]));
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, static_cast<const int>(strides[0]));
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, static_cast<const int>(strides[1]));
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, static_cast<const int>(paddings[2]));
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, static_cast<const int>(paddings[0]));
+    CL_CHECK_FATAL(status);
+
+    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
+        kernel,
+        cl::NullRange,
+        global_work_size,
+        cl::NullRange,
+        nullptr,
+        event_.get());
+    CL_CHECK_FATAL(status);
+    context.cl_wait_list()->emplace(out_img, event_);
+  }
+
+ private:
+  std::string kernel_func_name_{"pool_"};
+  std::string build_options_{"-DCL_DTYPE_float"};
+  std::shared_ptr<cl::Event> event_{new cl::Event};
+};
+
 }  // namespace opencl
 }  // namespace kernels
 }  // namespace lite
@@ -131,3 +233,19 @@ REGISTER_LITE_KERNEL(pool2d,
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kOpenCL))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
    .Finalize();
+
+REGISTER_LITE_KERNEL(pool2d,
+                     kOpenCL,
+                     kFloat,
+                     kNHWC,
+                     paddle::lite::kernels::opencl::PoolComputeImage2D,
+                     image2d)
+    .BindInput("X",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFloat),
+                                      DATALAYOUT(kNHWC))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                       PRECISION(kFloat),
+                                       DATALAYOUT(kNHWC))})
+    .Finalize();
--- a/lite/kernels/opencl/pool_compute_test.cc
+++ b/lite/kernels/opencl/pool_compute_test.cc
@@ -73,7 +73,7 @@ void pool_avg(const int padding_height,
  }
 }

-TEST(pool2d, compute) {
+TEST(pool2d, compute_buffer) {
  LOG(INFO) << "to get kernel ...";
  auto kernels = KernelRegistry::Global().Create(
      "pool2d", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW));
@@ -143,7 +143,102 @@ TEST(pool2d, compute) {
  TargetWrapperCL::Unmap(out_data, mapped_out);
 }

+TEST(pool2d, compute_image2d) {
+  LOG(INFO) << "to get kernel ...";
+  auto kernels = KernelRegistry::Global().Create(
+      "pool2d", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC));
+  ASSERT_FALSE(kernels.empty());
+
+  auto kernel = std::move(kernels.front());
+
+  LOG(INFO) << "get kernel";
+
+  lite::Tensor x, out;
+  operators::PoolParam param;
+  param.x = &x;
+  param.output = &out;
+  param.global_pooling = false;
+  param.pooling_type = "avg";
+  std::vector<int> paddings = {0, 0, 0, 0};
+  param.strides = std::vector<int>{1, 1};
+  param.ksize = std::vector<int>{7, 7};
+  param.paddings = std::make_shared<std::vector<int>>(paddings);
+
+  std::unique_ptr<KernelContext> context(new KernelContext);
+  context->As<OpenCLContext>().InitOnce();
+
+  kernel->SetParam(param);
+  std::unique_ptr<KernelContext> pool_context(new KernelContext);
+  context->As<OpenCLContext>().CopySharedTo(
+      &(pool_context->As<OpenCLContext>()));
+  kernel->SetContext(std::move(pool_context));
+
+  const DDim in_dim = DDim(std::vector<DDim::value_type>{4, 11, 107, 107});
+  const DDim out_dim = DDim(std::vector<DDim::value_type>{4, 11, 101, 101});
+  x.Resize(in_dim);
+  out.Resize(out_dim);
+
+  std::default_random_engine engine;
+  std::uniform_real_distribution<float> dist(-5, 5);
+  std::vector<float> input_v(4 * 11 * 107 * 107);
+  for (auto& i : input_v) {
+    i = dist(engine);
+  }
+
+  LOG(INFO) << "prepare input";
+  CLImageConverterDefault* default_converter = new CLImageConverterDefault();
+  DDim x_image_shape = default_converter->InitImageDimInfoWith(in_dim);
+  LOG(INFO) << "x_image_shape = " << x_image_shape[0] << " "
+            << x_image_shape[1];
+  std::vector<float> x_image_data(x_image_shape.production() * 4);  // 4 : RGBA
+  default_converter->NCHWToImage(input_v.data(), x_image_data.data(), in_dim);
+  auto* x_image = x.mutable_data<float, cl::Image2D>(
+      x_image_shape[0], x_image_shape[1], x_image_data.data());
+  LOG(INFO) << "x_image" << x_image;
+
+  DDim out_image_shape = default_converter->InitImageDimInfoWith(out_dim);
+  LOG(INFO) << "out_image_shape = " << out_image_shape[0] << " "
+            << out_image_shape[1];
+  auto* out_image = out.mutable_data<float, cl::Image2D>(out_image_shape[0],
+                                                         out_image_shape[1]);
+  LOG(INFO) << "out_image" << out_image;
+  kernel->Launch();
+
+  auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
+  auto* out_ptr = param.output->data<float, cl::Image2D>();
+  auto it = wait_list->find(out_ptr);
+  if (it != wait_list->end()) {
+    VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
+    auto& event = *(it->second);
+    event.wait();
+  } else {
+    LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
+  }
+
+  std::unique_ptr<float[]> out_ref(new float[out_dim.production()]);
+  pool_avg(0, 0, 1, 1, 7, 7, input_v.data(), in_dim, out_ref.get(), out_dim);
+
+  const size_t cl_image2d_row_pitch{0};
+  const size_t cl_image2d_slice_pitch{0};
+  float* out_image_data = new float[out_image_shape.production() * 4];
+  TargetWrapperCL::ImgcpySync(out_image_data,
+                              out_image,
+                              out_image_shape[0],
+                              out_image_shape[1],
+                              cl_image2d_row_pitch,
+                              cl_image2d_slice_pitch,
+                              IoDirection::DtoH);
+  float* out_data = new float[out_image_shape.production() * 4];
+  default_converter->ImageToNCHW(
+      out_image_data, out_data, out_image_shape, out_dim);
+
+  for (int i = 0; i < out_dim.production(); i++) {
+    EXPECT_NEAR(out_data[i], out_ref[i], 1e-6);
+  }
+}
+
 }  // namespace lite
 }  // namespace paddle

 USE_LITE_KERNEL(pool2d, kOpenCL, kFloat, kNCHW, def);
+USE_LITE_KERNEL(pool2d, kOpenCL, kFloat, kNHWC, image2d);