add OpenCL depthwise3x3s1 kernel

1ab1fe02 · zhaojiaying01 · 00a81db2 · 1ab1fe02 · 1ab1fe02 · 1ab1fe02
9 changed file
--- a/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp
+++ b/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp
@@ -233,5 +233,113 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper,
  }
 }

+void DWConvAddBnRelu(framework::CLHelper *cl_helper,
+                     const ConvParam<GPU_CL> &param, bool ifRelu,
+                     const framework::CLImage *biase,
+                     const framework::CLImage *new_scale,
+                     const framework::CLImage *new_bias) {
+  auto kernel = cl_helper->KernelAt(0);
+  auto default_work_size = cl_helper->DefaultWorkSize(*param.Output());
+  int c_block = default_work_size[0];
+  int w = default_work_size[1];
+  int nh = default_work_size[2];
+  int w_blk_size = 2;
+  int w_blk = (w + w_blk_size - 1) / w_blk_size;
+
+  default_work_size[1] = w_blk;
+  auto input = param.Input()->GetCLImage();
+  auto filter = param.Filter()->GetCLImage();
+
+  auto output = param.Output()->GetCLImage();
+  int stride = param.Strides()[0];
+  int pad = param.Paddings()[0];
+  int dilation = param.Dilations()[0];
+
+  int input_channel = param.Input()->dims()[1];
+  int input_height = param.Input()->dims()[2];
+  int input_width = param.Input()->dims()[3];
+
+  int output_height = param.Output()->dims()[2];
+  int output_width = param.Output()->dims()[3];
+
+  //  DLOG << " w " << w;
+  //  DLOG << " nh " << nh;
+  //  DLOG << " stride " << stride;
+  //  DLOG << " dilation " << dilation;
+  //  DLOG << " input width " << input_width;
+  //  DLOG << " input height " << input_height;
+  //  DLOG << " output width " << output_width;
+  //  DLOG << " output height " << output_height;
+  //  DLOG << " input dim " << param.Input()->dims();
+  //  DLOG << " output dim " << param.Output()->dims();
+  //  DLOG << " filter dim " << param.Filter()->dims();
+
+  cl_int status;
+  int index = 0;
+
+  status = clSetKernelArg(kernel, index++, sizeof(int), &c_block);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, index++, sizeof(int), &w_blk);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, index++, sizeof(int), &nh);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &input);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &filter);
+  CL_CHECK_ERRORS(status);
+
+  if (biase) {
+    auto bias_mem = biase->GetCLImage();
+    status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &bias_mem);
+    CL_CHECK_ERRORS(status);
+  }
+
+  if (new_scale && new_bias) {
+    auto new_scale_mem = new_scale->GetCLImage();
+    status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_scale_mem);
+    CL_CHECK_ERRORS(status);
+
+    auto new_bias_mem = new_bias->GetCLImage();
+    status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_bias_mem);
+    CL_CHECK_ERRORS(status);
+  }
+
+  status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &output);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, index++, sizeof(int), &stride);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, index++, sizeof(int), &pad);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, index++, sizeof(int), &dilation);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, index++, sizeof(int), &input_channel);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, index++, sizeof(int), &input_width);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, index++, sizeof(int), &input_height);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, index++, sizeof(int), &output_width);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, index++, sizeof(int), &output_height);
+  CL_CHECK_ERRORS(status);
+
+  status = clEnqueueNDRangeKernel(
+      cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL,
+      default_work_size.data(), NULL, 0, NULL, NULL);
+  CL_CHECK_ERRORS(status);
+}
+
 }  // namespace operators
 }  // namespace paddle_mobile
--- a/src/operators/kernel/cl/cl-kernel-func/conv_func.h
+++ b/src/operators/kernel/cl/cl-kernel-func/conv_func.h
@@ -41,6 +41,12 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper,
                   const framework::CLImage *new_scale = nullptr,
                   const framework::CLImage *new_bias = nullptr);

+void DWConvAddBnRelu(framework::CLHelper *cl_helper,
+                     const ConvParam<GPU_CL> &param, bool ifRelu = false,
+                     const framework::CLImage *biase = nullptr,
+                     const framework::CLImage *new_scale = nullptr,
+                     const framework::CLImage *new_bias = nullptr);
+
 }  // namespace operators
 }  // namespace paddle_mobile


--- a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
+++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
@@ -583,6 +583,155 @@ __kernel void depth_conv_3x3(__private const int global_size_dim0,
 }


+
+__kernel void depth_conv_3x3s1(__private const int ou_ch_blk,
+                                              __private const int ou_w_blk,
+                                              __private const int ou_nh,
+                                              __read_only image2d_t input,
+                                              __read_only image2d_t filter,
+#if defined(BIASE_CH) || defined(BIASE_ELE)
+                                              __read_only image2d_t bias,
+#endif
+#ifdef BATCH_NORM
+                                              __read_only image2d_t new_scale,
+                                              __read_only image2d_t new_biase,
+#endif
+                                              __write_only image2d_t output_image,
+                                              __private const int stride,
+                                              __private const int pad,
+                                              __private const int dilation,
+                                              __private const int in_ch,
+                                              __private const int in_w,/* of one block */
+                                              __private const int in_h, /* of one block */
+                                              __private const int ou_w,
+                                              __private const int ou_h) {
+
+    const int ou_ch_blk_id = get_global_id(0);
+    const int ou_w_blk_id = get_global_id(1);
+    const int ou_nh_id = get_global_id(2);
+    const int w_blk_size = 2;
+
+    const int batch_id = ou_nh_id / ou_h;
+    int ou_col_id = ou_w_blk_id * w_blk_size;
+    int ou_row_id = ou_nh_id % ou_h;
+    int ou_x = mad24(ou_ch_blk_id, ou_w, ou_col_id);
+
+    // input pos in one block and on batch
+    int col_id = ou_col_id - pad;
+    int row_id = ou_row_id - pad;
+
+    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                              CLK_ADDRESS_CLAMP          |
+                              CLK_FILTER_NEAREST;
+
+#ifdef BIASE_CH
+    half4 output[2];
+    output[0] = read_imageh(bias, sampler, (int2)(ou_ch_blk_id, 0));
+    output[1] = output[0];
+#elif defined(BIASE_ELE)
+    half4 output[2];
+    output[0] = read_imageh(bias, sampler, (int2)(ou_x, ou_nh_id));
+    if (ou_col_id + 1 < ou_w) {
+        output[1] = read_imageh(bias, sampler, (int2)(ou_x + 1, ou_nh_id));
+    }
+#else
+    half4 output[2] = {0.0f};
+#endif
+
+    half4 inputs[12];
+
+    int filter_x = ou_ch_blk_id * 3;
+    int filter_y = 0;
+    half4 filters[9];
+    filters[0] =  read_imageh(filter, sampler,(int2)(filter_x,filter_y));
+    filters[1] =  read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y));
+    filters[2] =  read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y));
+
+    int in_x = mad24(ou_ch_blk_id, in_w, col_id);
+    int in_y = mad24(batch_id, in_h, row_id);
+
+    int y0 = select(in_y, -1, row_id < 0 || row_id >= in_h);
+    int x0 = select(in_x, -1, col_id < 0 || col_id >= in_w);
+    inputs[0] = read_imageh(input, sampler, (int2)(x0, y0));
+    int x1 = select(in_x + 1, -1, col_id + 1 < 0 || col_id + 1 >= in_w);
+    inputs[1] = read_imageh(input, sampler, (int2)(x1, y0));
+    int x2 = select(in_x + 2, -1, col_id + 2 < 0 || col_id + 2 >= in_w);
+    inputs[2] = read_imageh(input, sampler, (int2)(x2, y0));
+    int x3 = select(in_x + 3, -1, col_id + 3 < 0 || col_id + 3 >= in_w);
+    inputs[3] = read_imageh(input, sampler, (int2)(x3, y0));
+
+    output[0] = mad(inputs[0], filters[0], output[0]);
+    output[1] = mad(inputs[1], filters[0], output[1]);
+
+    output[0] = mad(inputs[1], filters[1], output[0]);
+    output[1] = mad(inputs[2], filters[1], output[1]);
+
+    output[0] = mad(inputs[2], filters[2], output[0]);
+    output[1] = mad(inputs[3], filters[2], output[1]);
+
+
+    filters[3] =  read_imageh(filter, sampler,(int2)(filter_x,filter_y + 1));
+    filters[4] =  read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y + 1));
+    filters[5] =  read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y + 1));
+
+
+    int y1 = select(in_y + 1, -1, row_id + 1 < 0 || row_id + 1 >= in_h);
+    inputs[4] = read_imageh(input, sampler, (int2)(x0, y1));
+    inputs[5] = read_imageh(input, sampler, (int2)(x1, y1));
+    inputs[6] = read_imageh(input, sampler, (int2)(x2, y1));
+    inputs[7] = read_imageh(input, sampler, (int2)(x3, y1));
+
+
+    output[0] = mad(inputs[4], filters[3], output[0]);
+    output[1] = mad(inputs[5], filters[3], output[1]);
+
+    output[0] = mad(inputs[5], filters[4], output[0]);
+    output[1] = mad(inputs[6], filters[4], output[1]);
+
+    output[0] = mad(inputs[6], filters[5], output[0]);
+    output[1] = mad(inputs[7], filters[5], output[1]);
+
+
+    filters[6] =  read_imageh(filter, sampler,(int2)(filter_x,filter_y + 2));
+    filters[7] =  read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y + 2));
+    filters[8] =  read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y + 2));
+
+    int y2 = select(in_y + 2, -1, row_id + 2 < 0 || row_id + 2 >= in_h);
+    inputs[8] = read_imageh(input, sampler, (int2)(x0, y2));
+    inputs[9] = read_imageh(input, sampler, (int2)(x1, y2));
+    inputs[10] = read_imageh(input, sampler, (int2)(x2, y2));
+    inputs[11] = read_imageh(input, sampler, (int2)(x3, y2));
+
+
+    output[0] = mad(inputs[8], filters[6], output[0]);
+    output[1] = mad(inputs[9], filters[6], output[1]);
+
+    output[0] = mad(inputs[9], filters[7], output[0]);
+    output[1] = mad(inputs[10], filters[7], output[1]);
+
+    output[0] = mad(inputs[10], filters[8], output[0]);
+    output[1] = mad(inputs[11], filters[8], output[1]);
+#ifdef BATCH_NORM
+    half4 scale = read_imageh(new_scale, sampler, (int2)(ou_ch_blk_id, 0));
+    half4 biase = read_imageh(new_biase, sampler, (int2)(ou_ch_blk_id, 0));
+    output[0] = mad(scale, output[0], biase);
+    if (ou_col_id + 1 < ou_w) {
+        output[1] = mad(scale, output[1], biase);
+    }
+#endif
+
+#ifdef RELU
+    output[0] = activation(output[0]);
+    output[1] = activation(output[1]);
+#endif
+
+    write_imageh(output_image, (int2)(ou_x, ou_nh_id), output[0]);
+    if (ou_col_id + 1 < ou_w) {
+        write_imageh(output_image, (int2)(ou_x + 1, ou_nh_id), output[1]);
+    }
+
+}
+
 __kernel void conv_1x1(__private const int global_size_dim0,
                       __private const int global_size_dim1,
                       __private const int global_size_dim2,

--- a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
@@ -157,12 +157,17 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
  } else if (param->Filter()->dims()[1] == 1 &&
             param->Input()->dims()[1] == param->Output()->dims()[1] &&
             param->Filter()->dims()[2] == 3) {
-    param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT;
    param->Filter()->InitDWImage(cl_helper_.CLContext(),
                                 cl_helper_.CLCommandQueue());
-
-    this->cl_helper_.AddKernel("depth_conv_3x3", conv_kernel_file,
-                               build_options);
+    if (param->Strides()[0] == 1 && param->Dilations()[0] == 1) {
+      param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT;
+      this->cl_helper_.AddKernel("depth_conv_3x3s1", conv_kernel_file,
+                                 build_options);
+    } else {
+      param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT;
+      this->cl_helper_.AddKernel("depth_conv_3x3", conv_kernel_file,
+                                 build_options);
+    }

  } else if (param->Filter()->dims()[2] == 3 &&
             param->Filter()->dims()[3] == 3) {
@@ -207,6 +212,10 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
      ConvAddBnRelu(&this->cl_helper_, param, true, param.Bias(),
                    param.NewScale(), param.NewBias());
      break;
+    case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT:
+      DWConvAddBnRelu(&this->cl_helper_, param, true, param.Bias(),
+                      param.NewScale(), param.NewBias());
+      break;
    default:
      PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
                                    param.ExecMode());

--- a/src/operators/kernel/cl/conv_add_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_kernel.cpp
@@ -54,12 +54,17 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
  } else if (param->Filter()->dims()[1] == 1 &&
             param->Input()->dims()[1] == param->Output()->dims()[1] &&
             param->Filter()->dims()[2] == 3) {
-    param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT;
    param->Filter()->InitDWImage(cl_helper_.CLContext(),
                                 cl_helper_.CLCommandQueue());
-
-    this->cl_helper_.AddKernel("depth_conv_3x3", conv_kernel_file,
-                               build_options);
+    if (param->Strides()[0] == 1 && param->Dilations()[0] == 1) {
+      param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT;
+      this->cl_helper_.AddKernel("depth_conv_3x3s1", conv_kernel_file,
+                                 build_options);
+    } else {
+      param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT;
+      this->cl_helper_.AddKernel("depth_conv_3x3", conv_kernel_file,
+                                 build_options);
+    }

  } else if (param->Filter()->dims()[2] == 3 &&
             param->Filter()->dims()[3] == 3) {
@@ -118,6 +123,9 @@ void ConvAddKernel<GPU_CL, float>::Compute(
    case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
      ConvAddBnRelu(&this->cl_helper_, param, false, param.Bias());
      break;
+    case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT:
+      DWConvAddBnRelu(&this->cl_helper_, param, false, param.Bias());
+      break;
    default:
      PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
                                    param.ExecMode());

--- a/src/operators/kernel/cl/conv_add_relu_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_relu_kernel.cpp
@@ -55,12 +55,17 @@ bool ConvAddReluKernel<GPU_CL, float>::Init(
  } else if (param->Filter()->dims()[1] == 1 &&
             param->Input()->dims()[1] == param->Output()->dims()[1] &&
             param->Filter()->dims()[2] == 3) {
-    param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT;
    param->Filter()->InitDWImage(cl_helper_.CLContext(),
                                 cl_helper_.CLCommandQueue());
-
-    this->cl_helper_.AddKernel("depth_conv_3x3", conv_kernel_file,
-                               build_options);
+    if (param->Strides()[0] == 1 && param->Dilations()[0] == 1) {
+      param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT;
+      this->cl_helper_.AddKernel("depth_conv_3x3s1", conv_kernel_file,
+                                 build_options);
+    } else {
+      param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT;
+      this->cl_helper_.AddKernel("depth_conv_3x3", conv_kernel_file,
+                                 build_options);
+    }

  } else if (param->Filter()->dims()[2] == 3 &&
             param->Filter()->dims()[3] == 3) {
@@ -122,6 +127,9 @@ void ConvAddReluKernel<GPU_CL, float>::Compute(
    case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
      ConvAddBnRelu(&this->cl_helper_, param, true, param.Bias());
      break;
+    case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT:
+      DWConvAddBnRelu(&this->cl_helper_, param, true, param.Bias());
+      break;
    default:
      PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
                                    param.ExecMode());

--- a/src/operators/kernel/cl/conv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/cl/conv_bn_relu_kernel.cpp
@@ -112,12 +112,17 @@ bool ConvBNReluKernel<GPU_CL, float>::Init(
  } else if (param->Filter()->dims()[1] == 1 &&
             param->Input()->dims()[1] == param->Output()->dims()[1] &&
             param->Filter()->dims()[2] == 3) {
-    param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT;
    param->Filter()->InitDWImage(cl_helper_.CLContext(),
                                 cl_helper_.CLCommandQueue());
-
-    this->cl_helper_.AddKernel("depth_conv_3x3", conv_kernel_file,
-                               build_options);
+    if (param->Strides()[0] == 1 && param->Dilations()[0] == 1) {
+      param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT;
+      this->cl_helper_.AddKernel("depth_conv_3x3s1", conv_kernel_file,
+                                 build_options);
+    } else {
+      param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT;
+      this->cl_helper_.AddKernel("depth_conv_3x3", conv_kernel_file,
+                                 build_options);
+    }

  } else if (param->Filter()->dims()[2] == 3 &&
             param->Filter()->dims()[3] == 3) {
@@ -161,6 +166,10 @@ void ConvBNReluKernel<GPU_CL, float>::Compute(
      ConvAddBnRelu(&this->cl_helper_, param, true, nullptr, param.NewScale(),
                    param.NewBias());
      break;
+    case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT:
+      DWConvAddBnRelu(&this->cl_helper_, param, true, nullptr, param.NewScale(),
+                      param.NewBias());
+      break;
    default:
      PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
                                    param.ExecMode());

--- a/src/operators/kernel/cl/conv_kernel.cpp
+++ b/src/operators/kernel/cl/conv_kernel.cpp
@@ -51,11 +51,15 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
  } else if (param->Filter()->dims()[1] == 1 &&
             param->Input()->dims()[1] == param->Output()->dims()[1] &&
             param->Filter()->dims()[2] == 3) {
-    param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT;
    param->Filter()->InitDWImage(cl_helper_.CLContext(),
                                 cl_helper_.CLCommandQueue());
-
-    this->cl_helper_.AddKernel("depth_conv_3x3", conv_kernel_file);
+    if (param->Strides()[0] == 1 && param->Dilations()[0] == 1) {
+      param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT;
+      this->cl_helper_.AddKernel("depth_conv_3x3s1", conv_kernel_file);
+    } else {
+      param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT;
+      this->cl_helper_.AddKernel("depth_conv_3x3", conv_kernel_file);
+    }
    DLOG << "depth_conv 3x3";

  } else if (param->Filter()->dims()[2] == 3 &&
@@ -100,6 +104,9 @@ void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
    case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
      ConvAddBnRelu(&this->cl_helper_, param);
      break;
+    case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT:
+      DWConvAddBnRelu(&this->cl_helper_, param);
+      break;
    default:
      PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
                                    param.ExecMode());

--- a/src/operators/kernel/cl/conv_relu_kernel.cpp
+++ b/src/operators/kernel/cl/conv_relu_kernel.cpp
@@ -52,12 +52,18 @@ bool ConvReluKernel<GPU_CL, float>::Init(FusionConvReluParam<GPU_CL> *param) {
  } else if (param->Filter()->dims()[1] == 1 &&
             param->Input()->dims()[1] == param->Output()->dims()[1] &&
             param->Filter()->dims()[2] == 3) {
-    param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT;
    param->Filter()->InitDWImage(cl_helper_.CLContext(),
                                 cl_helper_.CLCommandQueue());
+    if (param->Strides()[0] == 1 && param->Dilations()[0] == 1) {
+      param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT;
+      this->cl_helper_.AddKernel("depth_conv_3x3s1", conv_kernel_file,
+                                 build_options);
+    } else {
+      param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT;
+      this->cl_helper_.AddKernel("depth_conv_3x3", conv_kernel_file,
+                                 build_options);
+    }

-    this->cl_helper_.AddKernel("depth_conv_3x3", conv_kernel_file,
-                               build_options);
    DLOG << "depth_conv 3x3";

  } else if (param->Filter()->dims()[2] == 3 &&
@@ -103,6 +109,9 @@ void ConvReluKernel<GPU_CL, float>::Compute(
    case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
      ConvAddBnRelu(&this->cl_helper_, param, true);
      break;
+    case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT:
+      DWConvAddBnRelu(&this->cl_helper_, param, true);
+      break;
    default:
      PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
                                    param.ExecMode());