Unverified commit b38753da authored by Yuan Shuai, committed by GitHub

[LITE][OPENCL] Add opencl image2d conv3x3. test=develop (#2853)

* [LITE][OPENCL] Add opencl image2d conv3x3. test=develop
Parent 6e39cfa6
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cl_common.h>
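// conv2d_3x3: direct 3x3 convolution on image2d storage. Input/output images
// pack 4 channels into each RGBA pixel and are laid out as
// (channel_block * width, batch * height); the filter image is assumed to use
// the host-side CLImageConverterFolder layout (see PrepareForRun below).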
__kernel void conv2d_3x3(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int dilation,
__private const int input_width,/* of one block */
__private const int input_height,/* of one block */
__private const int output_width,
__private const int output_height,
__private const int output_c,
__private const int filter_channel,
__private const int filter_width,
__private const int filter_height,
__private const int group) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
if (out_c >= global_size_dim0 ||
out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) {
return;
}
int2 stride_xy;
stride_xy.x = stride;
stride_xy.y = stride;
int2 output_pos_in_one_block;
output_pos_in_one_block.x = out_w;
output_pos_in_one_block.y = out_nh;
int2 in_pos_in_one_block;
in_pos_in_one_block.x = output_pos_in_one_block.x * stride + offset;
in_pos_in_one_block.y = output_pos_in_one_block.y * stride + offset;
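// in_pos_in_one_block maps the output coordinate back to the matching input
// coordinate inside the current channel block (the host passes
// offset = filter_height / 2 - pad).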
#ifdef BIASE_CH
CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE)
CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos);
#else
CL_DTYPE4 output = 0.0f;
#endif
CL_DTYPE4 input[9]; // 3x3 region of input
if (group == 1) {
for (int i = 0; i < input_c; ++i) { // loop over input channel blocks (4 channels per block)
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
input[0] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y - dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
input[1] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
(int2)(pos_in.x, pos_in.y - dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
input[2] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y - dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
input[3] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15));
input[4] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
(int2)(pos_in.x, pos_in.y)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15));
input[5] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15));
input[6] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y + dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
input[7] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
(int2)(pos_in.x, pos_in.y + dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
input[8] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y + dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
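// Accumulate the 3x3 window against the filter image. The weight image is
// assumed to be laid out with x = input_block * 3 + kernel_col and
// y = out_c * 4 * 3 + out_lane * 3 + kernel_row; the taps j = 0..8 are
// unrolled by hand below.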
int j = 0;
int2 pos_of_weight;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
CL_DTYPE4 weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y += 3;
CL_DTYPE4 weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y += 3;
CL_DTYPE4 weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y += 3;
CL_DTYPE4 weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 1;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 2;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 3;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 4;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 5;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 6;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 7;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 8;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
}
} else { // group != 1
for (int i = 0; i < 4; i++) {
int used_input_channel_num =
(out_c * 4 + i) / (output_c / group) * filter_channel;
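// Output lane (out_c * 4 + i) belongs to group (out_c * 4 + i) / (output_c / group);
// used_input_channel_num is the first input channel of that group, and the
// filter holds filter_channel input channels per group.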
for (int f_c = 0; f_c < filter_channel; ++f_c) {
int input_c = used_input_channel_num + f_c;
int input_block = input_c / 4;
int2 pos_in = (int2)(input_block * input_width + in_pos_in_one_block.x,
in_pos_in_one_block.y);
input[0] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y - dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
input[1] =
select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
(int2)(pos_in.x, pos_in.y - dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
input[2] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y - dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
input[3] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
input[4] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, (int2)(pos_in.x, pos_in.y)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
input[5] =
select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
input[6] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y + dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
input[7] =
select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
(int2)(pos_in.x, pos_in.y + dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
input[8] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y + dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
CL_DTYPE tmp_out = 0;
for (int j = 0; j < 9; j++) {
int2 pos_of_weight;
pos_of_weight.x = (f_c / 4) * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + i * 3 + j / 3;
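// Same assumed folder layout as the group == 1 path: x selects the
// (f_c block, kernel col) and y selects the (output lane, kernel row)
// of the filter image.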
CL_DTYPE4 weight = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
int f_c_offset = f_c % 4;
CL_DTYPE f_value;
if (f_c_offset == 0) {
f_value = weight.x;
} else if (f_c_offset == 1) {
f_value = weight.y;
} else if (f_c_offset == 2) {
f_value = weight.z;
} else if (f_c_offset == 3) {
f_value = weight.w;
}
int input_c_offset = input_c % 4;
CL_DTYPE input_value;
if (input_c_offset == 0) {
input_value = input[j].x;
} else if (input_c_offset == 1) {
input_value = input[j].y;
} else if (input_c_offset == 2) {
input_value = input[j].z;
} else if (input_c_offset == 3) {
input_value = input[j].w;
}
tmp_out += f_value * input_value;
}
if (i == 0) {
output.x += tmp_out;
} else if (i == 1) {
output.y += tmp_out;
} else if (i == 2) {
output.z += tmp_out;
} else if (i == 3) {
output.w += tmp_out;
}
}
}
}
output = activation_type4(output);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output);
}
@@ -362,6 +362,20 @@ void ConvImageCompute::PrepareForRun() {
filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
impl_ = &ConvImageCompute::Conv2d1x1;
} else if (kernel_h == 3 && kernel_w == 3) {
// conv2d_3x3
kernel_func_names_.push_back("conv2d_3x3");
kernel_func_paths_.push_back("image/conv2d_3x3_kernel.cl");
CLImageConverterFolder converter;
const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
std::vector<float> filter_image_v(filter_image_dims[0] *
filter_image_dims[1] * 4); // 4 : RGBA
converter.NCHWToImage(filter_cpu, filter_image_v.data(), filter_dims);
filter_gpu_image_.mutable_data<float, cl::Image2D>(
filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
impl_ = &ConvImageCompute::Conv2d3x3;
} else if (kernel_h == 5 && kernel_w == 5) {
// conv2d_5x5
kernel_func_names_.push_back("conv2d_5x5");
@@ -582,6 +596,184 @@ void ConvImageCompute::Conv2d1x1() {
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
}
void ConvImageCompute::Conv2d3x3() {
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
auto paddings = *param.paddings;
auto strides = param.strides;
auto* input_image = param.x->data<float, cl::Image2D>();
auto* filter_image = filter_gpu_image_.data<float, cl::Image2D>();
auto filter_dims = param.filter->dims();
auto output_dims = param.output->dims();
int input_width = input_dims[3];
int input_height = input_dims[2];
int input_channel = input_dims[1];
int output_width = output_dims[3];
int output_height = output_dims[2];
int output_channel = output_dims[1];
int filter_width = filter_dims[3];
int filter_height = filter_dims[2];
int filter_channel = filter_dims[1];
auto out_image_shape = InitImageDimInfoWith(output_dims);
auto* out_image = param.output->mutable_data<float, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
const bool has_bias = param.bias != nullptr;
const bool is_element_wise_bias =
has_bias && param.output->dims() == param.bias->dims();
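// offset re-centers the sliding window: filter_height / 2 - pad
// (assumes symmetric padding, checked further below).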
int offset = static_cast<int>(param.filter->dims()[2]) / 2 -
static_cast<int>(paddings[0]);
// calc input_c_block
auto input_image_shape = InitImageDimInfoWith(input_dims);
int input_c_block = input_image_shape["width"] / input_dims[3];
int input_c = input_dims[1];
auto dilations = *param.dilations;
// Re-derive the group count from shapes: a standard conv (filter spans all
// input channels) runs as group = 1; a depthwise filter keeps param.groups;
// otherwise infer groups from the input/filter channel ratio.
int new_groups{param.groups};
if (filter_dims[0] == output_dims[1] && filter_dims[1] == input_dims[1]) {
new_groups = 1;
} else if (!(filter_dims[0] == input_dims[1] && filter_dims[1] == 1)) {
new_groups = input_channel / filter_channel;
}
/* TODO(ysh329): mobile has no case below
else {
  LOG(FATAL) << "Unsupported conv3x3 case with"
             << " input_dims:" << input_dims
             << " output_dims:" << output_dims
             << " filter_dims:" << filter_dims;
}
*/
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
VLOG(4) << "============ conv2d params ============";
VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
<< input_image_shape["height"];
VLOG(4) << "input_c_block: " << input_c_block;
VLOG(4) << "input_c: " << input_c;
VLOG(4) << "input_image: " << input_image;
VLOG(4) << "input_dims: " << input_dims;
VLOG(4) << "filter_dims: " << filter_dims;
VLOG(4) << "filter_image: " << filter_image;
VLOG(4) << "output_dims: " << output_dims;
VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", "
<< out_image_shape["height"];
VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1];
VLOG(4) << "has bias: " << has_bias;
VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias;
VLOG(4) << "strides: " << strides[0] << "," << strides[1];
VLOG(4) << "offset: " << offset;
VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
VLOG(4) << "param.groups(groups):" << param.groups;
VLOG(4) << "new_groups:" << new_groups;
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
CHECK_GE(input_dims.size(), 4);
CHECK_GE(paddings.size(), 2);
CHECK(paddings[0] == paddings[1]);
CHECK_GE(strides.size(), 2);
CHECK(strides[0] == strides[1]);
const cl::Image2D* bias_image = nullptr;
if (has_bias) {
bias_image = bias_gpu_image_.data<float, cl::Image2D>();
}
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
VLOG(4) << "w: " << w;
cl_int status;
int arg_idx = 0;
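// The argument order below must match the conv2d_3x3 kernel signature;
// the bias image is only bound when the BIASE_CH/BIASE_ELE build options
// are defined.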
status = kernel.setArg(arg_idx, c_block);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, nh);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *filter_image);
CL_CHECK_FATAL(status);
if (has_bias) {
VLOG(4) << "set bias_image: ";
status = kernel.setArg(++arg_idx, *bias_image);
CL_CHECK_FATAL(status);
}
status = kernel.setArg(++arg_idx, *out_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, strides[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, offset);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_c_block);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, dilations[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_channel);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, filter_channel);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, filter_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, filter_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, new_groups);
CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(default_work_size.data()[1]),
static_cast<size_t>(default_work_size.data()[2])};
VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
}
void ConvImageCompute::Conv2d5x5() {
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
@@ -726,6 +918,7 @@ void ConvImageCompute::Conv2d5x5() {
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
}
void ConvImageCompute::Conv2d7x7() {
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
......
@@ -71,6 +71,7 @@ class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
private:
void Conv2d1x1();
void Conv2d3x3();
void Conv2d5x5();
void Conv2d7x7();
......
@@ -446,6 +446,371 @@ TEST(conv2d, compute_image2d_1x1) {
#undef LOOP_TEST
#undef PRINT_RESULT
// #define PRINT_RESULT
// #define LOOP_TEST
TEST(conv2d, compute_image2d_3x3) {
// conv infos
const int ksize = 3;
// int loop_cnt = 0;
#ifdef LOOP_TEST
const int pad = 1;
const int dilation = 1;
const int stride = 2;
const int group = 1;
for (int batch_size = 1; batch_size < 2; ++batch_size) {
for (int oc = 1; oc < 10; oc += 1) { // oc
for (int ih = 5; ih < 9; ih += 1) { // ih
int iw = ih;
for (int ic = 1; ic < 10; ic += 1) { // ic
for (bool bias_flag : {true, false}) {
for (std::string relu_flag : {/*true,*/ "relu"}) {
#else
const int pad = 1;
const int dilation = 1;
#if 0 // small scale with group, but result of cpu reference is wrong
const int stride = 2;
const int group = 2;
const int batch_size = 1;
const int ic = 1;
const int ih = 3;
const int iw = 3;
const int oc = 2;
#else // big scale with group
const int stride = 1;
const int group = 32;
const int batch_size = 1;
const int ic = 32;
const int ih = 112;
const int iw = 112;
const int oc = 32;
#endif
const bool bias_flag = false;
const std::string relu_flag = "relu";
#endif
int filter_channel = ic;
if (group > 1) {
filter_channel = 1;
}
const int oh =
ConvOutputSize(ih, ksize, dilation, pad, pad, stride);
const int ow =
ConvOutputSize(iw, ksize, dilation, pad, pad, stride);
SHADOW_LOG << "to get kernel ...";
auto kernels =
KernelRegistry::Global().Create("conv2d",
TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty());
CHECK(batch_size == 1) << "conv3x3 only supprt batch_size == 1";
auto kernel = std::move(kernels.front());
SHADOW_LOG << "created conv2d kernel";
SHADOW_LOG << "prepare kernel ------";
lite::Tensor input, filter, bias, output;
operators::ConvParam param;
param.x = &input;
param.filter = &filter;
param.output = &output;
param.groups = group;
if (bias_flag) {
param.bias = &bias;
}
if (relu_flag == "relu") {
param.fuse_relu = true;
} else if (relu_flag == "None") {
param.fuse_relu = false;
} else if (relu_flag == "relu6") {
param.activation_param.Relu_clipped_coef = 6.f;
param.activation_param.has_active = true;
param.activation_param.active_type =
lite_api::ActivationType::kRelu6;
}
std::vector<int> paddings = {pad, pad, pad, pad};
std::vector<int> dilations = {dilation, dilation};
param.paddings = std::make_shared<std::vector<int>>(paddings);
param.dilations = std::make_shared<std::vector<int>>(dilations);
param.strides = std::vector<int>{stride, stride};
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
std::unique_ptr<KernelContext> conv_3x3_context(
new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(conv_3x3_context->As<OpenCLContext>()));
kernel->SetContext(std::move(conv_3x3_context));
const DDim& input_dim =
lite::DDim{std::vector<int64_t>({batch_size, ic, ih, iw})};
const DDim& filter_dim = lite::DDim{
std::vector<int64_t>({oc, filter_channel, ksize, ksize})};
const DDim& out_dim =
lite::DDim{std::vector<int64_t>({batch_size, oc, oh, ow})};
// element wise bias
const DDim& bias_dim = lite::DDim{std::vector<int64_t>({oc})};
LOG(INFO) << "input_dim:" << input_dim
<< " filter_dim:" << filter_dim
<< " out_dim:" << out_dim;
param.x->Resize(input_dim);
param.filter->Resize(filter_dim);
param.output->Resize(out_dim);
if (bias_flag) {
param.bias->Resize(bias_dim);
}
kernel->SetParam(param);
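// image2d extents: 4 channels are packed into each RGBA pixel, so the
// image width holds (channels + 3) / 4 blocks of the spatial width and the
// height stacks batches; the filter image stacks output channels along its
// height, per the folder converter.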
size_t input_image_width = iw * ((ic + 3) / 4);
size_t input_image_height = ih * batch_size;
size_t out_image_width = ow * ((oc + 3) / 4);
size_t out_image_height = oh * batch_size;
size_t bias_image_width = ow * ((oc + 3) / 4);
size_t bias_image_height = oh * batch_size;
size_t filter_image_width = ksize * ((filter_channel + 3) / 4);
size_t filter_image_height = oc * ksize;
const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0};
std::default_random_engine engine;
std::uniform_real_distribution<float> gen(-5, 5);
std::vector<float> input_v(batch_size * ic * ih * iw);
std::vector<float> filter_v(oc * filter_channel * ksize * ksize);
std::vector<float> output_v(batch_size * oc * oh * ow);
std::vector<float> bias_v(oc);
SHADOW_LOG << "gen input and filter ...";
for (int i = 0; i < input_v.size(); ++i) {
input_v[i] = i; // gen(engine);
}
for (int i = 0; i < filter_v.size(); ++i) {
filter_v[i] = 1; // gen(engine);
}
SHADOW_LOG << "after gen input and filter ...";
SHADOW_LOG << "input_v.size(): " << input_v.size();
SHADOW_LOG << "filter_v.size(): " << filter_v.size();
SHADOW_LOG << "output_v.size(): " << output_v.size();
SHADOW_LOG << "bias_v.size(): " << bias_v.size();
SHADOW_LOG << "input_dim.production(): "
<< input_dim.production();
SHADOW_LOG << "filter_dim.production(): "
<< filter_dim.production();
SHADOW_LOG << "out_dim.production(): " << out_dim.production();
SHADOW_LOG << "bias_dim.production(): " << bias_dim.production();
SHADOW_LOG << "input_image_height:" << input_image_height
<< " input_image_width:" << input_image_width;
SHADOW_LOG << "filter_image_height:" << filter_image_height
<< " filter_image_width:" << filter_image_width;
SHADOW_LOG << "4 * input_image_height *input_image_width: "
<< 4 * input_image_height * input_image_width;
SHADOW_LOG << "4 * filter_image_width * filter_image_height: "
<< 4 * filter_image_width * filter_image_height;
CHECK(input_dim.production() == input_v.size());
CHECK_LE(input_dim.production(),
4 * input_image_height * input_image_width);
CHECK(filter_dim.production() == filter_v.size());
CHECK_LE(filter_dim.production(),
4 * filter_image_width * filter_image_height);
paddle::lite::CLImageConverterDefault default_convertor;
SHADOW_LOG << "set mapped input ...";
std::vector<float> x_image_v(input_image_width *
input_image_height * 4); // 4 :RGBA
std::vector<float> filter_image_v(
filter_image_width * filter_image_height * 4); // 4 : RGBA
std::vector<float> bias_image_v(
bias_image_width * bias_image_height * 4); // 4 : RGBA
std::vector<float> out_image_v(out_image_width *
out_image_height * 4); // 4 :RGBA
default_convertor.NCHWToImage(
input_v.data(), x_image_v.data(), input_dim);
SHADOW_LOG << "输入: ---- ";
for (int i = 0; i < input_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << input_v[i];
}
SHADOW_LOG << "输入image : ---- ";
for (int i = 0; i < x_image_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << x_image_v[i];
}
SHADOW_LOG << "set mapped filter ...";
CLImageConverterFolder folder_convertor;
folder_convertor.NCHWToImage(
filter_v.data(), filter_image_v.data(), filter_dim);
SHADOW_LOG << "卷积核: ---- ";
for (int i = 0; i < filter_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << filter_v[i];
}
SHADOW_LOG << "卷积核image: ---- ";
for (int i = 0; i < filter_image_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << filter_image_v[i];
}
auto* input_image2d = input.mutable_data<float, cl::Image2D>(
input_image_width, input_image_height, x_image_v.data());
// assign filter as target arm
filter.Assign<float, lite::DDim, TARGET(kARM)>(filter_v.data(),
filter_dim);
// filter kernel
// auto* filter_image2d = filter.mutable_data<float,
// cl::Image2D>(
// filter_image_width,
// filter_image_height,
// filter_image_v.data());
if (bias_flag) {
for (int i = 0; i < bias_dim.production(); ++i) {
bias_v[i] = static_cast<int>(gen(engine));
}
bias.Assign<float, lite::DDim, TARGET(kARM)>(bias_v.data(),
bias_dim);
// CLImageConverterFolder folder_convertor;
// folder_convertor.NCHWToImage(
// bias_v.data(), bias_image_v.data(),
// bias_dim);
//
// auto* bias_data = bias.mutable_data<float,
// cl::Image2D>(
// bias_image_width, bias_image_height,
// bias_image_v.data());
}
SHADOW_LOG << "resize output ...";
output.Resize(out_dim);
// cpu conv basic calc
lite::Tensor out_ref;
out_ref.Resize(out_dim);
SHADOW_LOG << "prepare kernel ready";
SHADOW_LOG << "kernel launch ...";
kernel->Launch();
SHADOW_LOG << "mutable output ...";
auto* output_image2d = output.mutable_data<float, cl::Image2D>(
out_image_width, out_image_height);
auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = param.output->data<float, cl::Image2D>();
auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) {
SHADOW_LOG << "--- Find the sync event for the target cl "
"tensor. ---";
auto& event = *(it->second);
event.wait();
} else {
LOG(FATAL) << "Could not find the sync event for the target "
"cl tensor.";
}
TargetWrapperCL::ImgcpySync(out_image_v.data(),
output.data<float, cl::Image2D>(),
out_image_width,
out_image_height,
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
IoDirection::DtoH);
DDim out_image_shape =
default_convertor.InitImageDimInfoWith(output.dims());
default_convertor.ImageToNCHW(out_image_v.data(),
output_v.data(),
out_image_shape,
output.dims());
SHADOW_LOG << "输出: ---- ";
for (int i = 0; i < output_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << output_v[i];
}
SHADOW_LOG << "输出image: ---- ";
for (int i = 0; i < out_image_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << out_image_v[i];
}
SHADOW_LOG << "mutable_data out_ref_data: ";
// run cpu ref
auto* out_ref_data = out_ref.mutable_data<float>(TARGET(kARM));
SHADOW_LOG << " conv_basic beigin ..... ";
conv_basic<float, float>(input_v.data(),
out_ref_data,
batch_size,
oc,
oh,
ow,
ic,
ih,
iw,
filter_v.data(),
bias_v.data(), // mapped_bias,
group,
ksize,
ksize,
stride,
stride,
dilation,
dilation,
pad,
pad,
bias_flag,
relu_flag);
SHADOW_LOG << " conv_basic end ..... ";
SHADOW_LOG << " out_dim: " << out_dim;
const DDim& out_image_dims = lite::DDim{std::vector<int64_t>(
{static_cast<int64_t>(out_image_width),
static_cast<int64_t>(out_image_height)})};
#ifdef PRINT_RESULT
for (int i = 0; i < out_dim.production(); i++) {
VLOG(4) << "output_v[" << i << "]:" << output_v[i]
<< " out_ref_data[" << i << "]:" << out_ref_data[i];
}
#endif
for (int i = 0; i < out_dim.production(); i++) {
EXPECT_NEAR(output_v[i], out_ref_data[i], 1e-2);
if (std::abs(output_v[i] - out_ref_data[i]) > 1e-2) {
LOG(FATAL) << "error idx:" << i;
}
}
#ifdef LOOP_TEST
}
}
}
}
}
}
#else
// nothing to do.
#endif
}
#undef LOOP_TEST
#undef PRINT_RESULT
// #define PRINT_RESULT
// #define LOOP_TEST
TEST(conv2d, compute_image2d_5x5) {
......