add opencl depthwise_conv_trans_op (#1949)

* add opencl depthwise_conv_trans_op * test=develop

add opencl depthwise_conv_trans_op (#1949)
* add opencl depthwise_conv_trans_op * test=develop
4b1a4ca0 · StarryRain · Jiaying Zhao · fc5d91d4 · 4b1a4ca0 · 4b1a4ca0
7 changed files
--- a/mobile/src/framework/load_ops.h
+++ b/mobile/src/framework/load_ops.h
@@ -151,7 +151,7 @@ LOAD_OP1(shape, CPU);
 LOAD_OP2(depthwise_conv2d, CPU, GPU_CL);
 #endif
 #ifdef CONV_TRANSPOSE_OP
-LOAD_OP1(conv2d_transpose, CPU);
+LOAD_OP2(conv2d_transpose, CPU, GPU_CL);
 #endif
 #ifdef SCALE_OP
 LOAD_OP2(scale, CPU, GPU_CL);

--- a/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp
+++ b/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp
@@ -468,5 +468,175 @@ void SWConvAddBnRelu(framework::CLHelper *cl_helper,
  }
  CL_CHECK_ERRORS(status);
 }
+
+// Binds the arguments of the kernel in slot 0 of cl_helper and enqueues it
+// over the default work size of param.Output(). The binding order matches the
+// parameter list of depthwise_transpose in conv_transpose_kernel.cl.
+//
+//   cl_helper  provides the kernel, the default work size and the queue.
+//   param      source of the input/filter/output images and of the stride,
+//              padding and dilation (only element [0] of each is read).
+//   ifRelu     not read by this function.
+//   biase      bound right after the filter image when non-null.
+//   new_scale, new_bias  bound after biase when both are non-null.
+//
+// The optional images shift every later argument index, so the kernel must
+// have been compiled with the matching optional parameters.
+void DWConvTransposeAddBnRelu(framework::CLHelper *cl_helper,
+                              const ConvTransposeParam<GPU_CL> &param,
+                              bool ifRelu, const framework::CLImage *biase,
+                              const framework::CLImage *new_scale,
+                              const framework::CLImage *new_bias) {
+  auto kernel = cl_helper->KernelAt(0);
+  auto global_work_size = cl_helper->DefaultWorkSize(*param.Output());
+
+  // Number of positions one work item covers along dimensions 1 and 2. With
+  // 1x1 tiles the rounded-up block counts equal the default sizes.
+  const int tile_w = 1;
+  const int tile_h = 1;
+  int c_block = global_work_size[0];
+  int w_items = global_work_size[1];
+  int nh_items = global_work_size[2];
+  int w_blk = (w_items + tile_w - 1) / tile_w;
+  int h_blk = (nh_items + tile_h - 1) / tile_h;
+  global_work_size[1] = w_blk;
+  global_work_size[2] = h_blk;
+
+  auto input_image = param.Input()->GetCLImage();
+  auto filter_image = param.Filter()->GetCLImage();
+  auto output_image = param.Output()->GetCLImage();
+
+  int stride = param.Strides()[0];
+  int pad = param.Paddings()[0];
+  int dilation = param.Dilations()[0];
+
+  // Dimension 1 is read as the channel count, 2 as height and 3 as width.
+  int input_channel = param.Input()->dims()[1];
+  int input_height = param.Input()->dims()[2];
+  int input_width = param.Input()->dims()[3];
+  int output_height = param.Output()->dims()[2];
+  int output_width = param.Output()->dims()[3];
+  int filter_height = param.Filter()->dims()[2];
+  int filter_width = param.Filter()->dims()[3];
+
+  cl_int status;
+  int index = 0;
+
+  // Leading scalar arguments: the three work-item extents.
+  const int *work_extents[] = {&c_block, &w_blk, &h_blk};
+  for (const int *value : work_extents) {
+    status = clSetKernelArg(kernel, index++, sizeof(int), value);
+    CL_CHECK_ERRORS(status);
+  }
+
+  status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &input_image);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &filter_image);
+  CL_CHECK_ERRORS(status);
+
+  // Optional images; each one present takes the next argument index.
+  if (biase != nullptr) {
+    auto bias_mem = biase->GetCLImage();
+    status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &bias_mem);
+    CL_CHECK_ERRORS(status);
+  }
+
+  if (new_scale != nullptr && new_bias != nullptr) {
+    auto new_scale_mem = new_scale->GetCLImage();
+    status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_scale_mem);
+    CL_CHECK_ERRORS(status);
+
+    auto new_bias_mem = new_bias->GetCLImage();
+    status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_bias_mem);
+    CL_CHECK_ERRORS(status);
+  }
+
+  status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &output_image);
+  CL_CHECK_ERRORS(status);
+
+  // Trailing scalar arguments, in the order the kernel declares them.
+  const int *conv_geometry[] = {&stride,        &pad,          &dilation,
+                                &input_channel, &input_width,  &input_height,
+                                &output_width,  &output_height, &filter_width,
+                                &filter_height};
+  for (const int *value : conv_geometry) {
+    status = clSetKernelArg(kernel, index++, sizeof(int), value);
+    CL_CHECK_ERRORS(status);
+  }
+
+  // A 1x60x1 work group is requested only when use_lws is set and dimension 1
+  // is a multiple of 60; otherwise the group size is left to the runtime.
+  const size_t local_work_size[3] = {1, 60, 1};
+  const bool use_local =
+      global_work_size.data()[1] % 60 == 0 && use_lws;
+  status = clEnqueueNDRangeKernel(
+      cl_helper->CLCommandQueue(), kernel, global_work_size.size(), nullptr,
+      global_work_size.data(), use_local ? local_work_size : nullptr, 0,
+      nullptr, nullptr);
+  CL_CHECK_ERRORS(status);
+}
+
+// Binds the eight arguments of the kernel in slot 0 of cl_helper (the order
+// matches the parameter list of conv_transpose in conv_transpose_kernel.cl)
+// and enqueues it with one work item per
+// (output channel block, input column, batch * input row).
+//
+//   cl_helper  provides the kernel and the command queue.
+//   param      source of the input, filter and output images and their dims.
+//   ifRelu, biase, new_scale, new_bias  not read by this function.
+void ConvTransposeAddBnRelu(framework::CLHelper *cl_helper,
+                            const ConvTransposeParam<GPU_CL> &param,
+                            bool ifRelu, const framework::CLImage *biase,
+                            const framework::CLImage *new_scale,
+                            const framework::CLImage *new_bias) {
+  auto kernel = cl_helper->KernelAt(0);
+  const auto *input = param.Input();
+  auto *output = param.Output();
+  auto *filter = param.Filter();
+  const int n = input->dims()[0];
+  const int input_c = input->dims()[1];
+  // Channels are grouped four at a time, rounding up.
+  const int input_c_block = (input_c + 3) / 4;
+  const int input_width = input->dims()[3];
+  const int input_height = input->dims()[2];
+  const int output_c = output->dims()[1];
+  const int output_c_block = (output_c + 3) / 4;
+  const int output_width = output->dims()[3];
+  const int output_height = output->dims()[2];
+
+  auto inputImage = input->GetCLImage();
+  auto outputImage = output->GetCLImage();
+  auto filterImage = filter->GetCLImage();
+
+  cl_int status;
+  status = clSetKernelArg(kernel, 0, sizeof(int), &input_c_block);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 1, sizeof(int), &input_width);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 2, sizeof(int), &input_height);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 3, sizeof(int), &output_width);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 4, sizeof(int), &output_height);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &inputImage);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &filterImage);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &outputImage);
+  CL_CHECK_ERRORS(status);
+
+  // The grid follows the input extent, not the output extent.
+  const size_t work_size[3] = {static_cast<size_t>(output_c_block),
+                               static_cast<size_t>(input_width),
+                               static_cast<size_t>(n * input_height)};
+
+  DLOG << "conv transpose " << input_c_block << input_width << input_height
+       << output_width << output_height << work_size[0] << work_size[1]
+       << work_size[2];
+
+  // Keep the enqueue result so a failed launch is reported like every other
+  // OpenCL call in this function instead of being dropped silently.
+  status = clEnqueueNDRangeKernel(cl_helper->CLCommandQueue(), kernel, 3,
+                                  NULL, work_size, NULL, 0, NULL, NULL);
+  CL_CHECK_ERRORS(status);
+}
 }  // namespace operators
 }  // namespace paddle_mobile
--- a/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.h
+++ b/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.h
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#ifdef CONV_OP
+#if defined(CONV_OP) || defined(CONV_TRANSPOSE_OP)

 #pragma once

@@ -52,6 +52,18 @@ void SWConvAddBnRelu(framework::CLHelper *cl_helper,
                     const framework::CLImage *biase = nullptr,
                     const framework::CLImage *new_scale = nullptr,
                     const framework::CLImage *new_bias = nullptr);
+// Sets the arguments of the kernel in slot 0 of cl_helper in the order used
+// by the depthwise_transpose OpenCL kernel and enqueues it over the default
+// work size of param.Output(). biase is bound when non-null; new_scale and
+// new_bias are bound only when both are non-null. ifRelu is accepted but not
+// read by the implementation in conv_func.cpp.
+void DWConvTransposeAddBnRelu(framework::CLHelper *cl_helper,
+                              const ConvTransposeParam<GPU_CL> &param,
+                              bool ifRelu = false,
+                              const framework::CLImage *biase = nullptr,
+                              const framework::CLImage *new_scale = nullptr,
+                              const framework::CLImage *new_bias = nullptr);
+// Sets the eight arguments of the kernel in slot 0 of cl_helper in the order
+// used by the conv_transpose OpenCL kernel and enqueues it over
+// (output channel blocks, input width, batch * input height). ifRelu, biase,
+// new_scale and new_bias are accepted but not read by the implementation in
+// conv_func.cpp.
+void ConvTransposeAddBnRelu(framework::CLHelper *cl_helper,
+                            const ConvTransposeParam<GPU_CL> &param,
+                            bool ifRelu = false,
+                            const framework::CLImage *biase = nullptr,
+                            const framework::CLImage *new_scale = nullptr,
+                            const framework::CLImage *new_bias = nullptr);

 }  // namespace operators
 }  // namespace paddle_mobile

--- a/mobile/src/operators/kernel/cl/cl_kernel/conv_transpose.cl
+++ b/mobile/src/operators/kernel/cl/cl_kernel/conv_transpose.cl
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "cl_common.h"
-
-__kernel void conv_transpose(__private const int input_c_block,
-                                              __private const int input_width,/* of one block */
-                                              __private const int input_height,/* of one block */
-                                              __private const int output_width,
-                                              __private const int output_height,
-                                              __read_only image2d_t input_image,
-                                              __read_only image2d_t filter,
-                                              __write_only image2d_t output_image) {
-
-    const int out_c = get_global_id(0);
-    const int in_w = get_global_id(1);
-    const int in_nh = get_global_id(2);
-    const int n = in_nh / input_height;
-    const int h = in_nh % input_height;
-
-    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
-                              CLK_ADDRESS_CLAMP          |
-                              CLK_FILTER_NEAREST;
-
-    half4 input1, input2, input3, input4;
-    half4 output1 = 0.0f, output2 = 0.0f, output3 = 0.0f, output4 = 0.0f;
-    half4 w = 0.0f;
-    int2 pos_in;
-    for (int i = 0; i < input_c_block; i += 1) {
-      pos_in = (int2)(mad24(i, input_width, in_w), in_nh);
-      input1 = select(read_imageh(input_image, sampler,
-                                               (int2)(pos_in.x, pos_in.y)),
-                                               (half4)(0.0f),
-                                               (ushort4)((in_w < 0 || h < 0 || in_w >= input_width || h >= input_height) << 15));
-      input2 = select(read_imageh(input_image, sampler,
-                                                     (int2)(pos_in.x + 1, pos_in.y)),
-                                                     (half4)(0.0f),
-                                                     (ushort4)((in_w + 1 < 0 || h < 0 || in_w + 1 >= input_width || h >= input_height) << 15));
-      input3 = select(read_imageh(input_image, sampler,
-                                                     (int2)(pos_in.x, pos_in.y + 1)),
-                                                     (half4)(0.0f),
-                                                     (ushort4)((in_w < 0 || h + 1 < 0 || in_w >= input_width || h + 1 >= input_height) << 15));
-      input4 = select(read_imageh(input_image, sampler,
-                                                     (int2)(pos_in.x + 1, pos_in.y + 1)),
-                                                     (half4)(0.0f),
-                                                     (ushort4)((in_w + 1 < 0 || h + 1 < 0 || in_w + 1 >= input_width || h + 1 >= input_height) << 15));
-
-      int wx = i * 3;
-      int wy = out_c * 4 * 3;
-      w = read_imageh(filter, sampler, (int2)(wx, wy));
-      output4.x += dot(input4, w);
-      w = read_imageh(filter, sampler, (int2)(wx + 1, wy));
-      output3.x += dot(input3, w);
-      w = read_imageh(filter, sampler, (int2)(wx + 2, wy));
-      output4.x += dot(input3, w);
-      w = read_imageh(filter, sampler, (int2)(wx, wy + 1));
-      output2.x += dot(input2, w);
-      w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 1));
-      output1.x += dot(input1, w);
-      w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 1));
-      output2.x += dot(input1, w);
-      w = read_imageh(filter, sampler, (int2)(wx, wy + 2));
-      output4.x += dot(input2, w);
-      w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 2));
-      output3.x += dot(input1, w);
-      w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 2));
-      output4.x += dot(input1, w);
-
-      wy = (out_c * 4 + 1) * 3;
-      w = read_imageh(filter, sampler, (int2)(wx, wy));
-      output4.y += dot(input4, w);
-      w = read_imageh(filter, sampler, (int2)(wx + 1, wy));
-      output3.y += dot(input3, w);
-      w = read_imageh(filter, sampler, (int2)(wx + 2, wy));
-      output4.y += dot(input3, w);
-      w = read_imageh(filter, sampler, (int2)(wx, wy + 1));
-      output2.y += dot(input2, w);
-      w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 1));
-      output1.y += dot(input1, w);
-      w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 1));
-      output2.y += dot(input1, w);
-      w = read_imageh(filter, sampler, (int2)(wx, wy + 2));
-      output4.y += dot(input2, w);
-      w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 2));
-      output3.y += dot(input1, w);
-      w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 2));
-      output4.y += dot(input1, w);
-
-      wy = (out_c * 4 + 2) * 3;
-      w = read_imageh(filter, sampler, (int2)(wx, wy));
-      output4.z += dot(input4, w);
-      w = read_imageh(filter, sampler, (int2)(wx + 1, wy));
-      output3.z += dot(input3, w);
-      w = read_imageh(filter, sampler, (int2)(wx + 2, wy));
-      output4.z += dot(input3, w);
-      w = read_imageh(filter, sampler, (int2)(wx, wy + 1));
-      output2.z += dot(input2, w);
-      w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 1));
-      output1.z += dot(input1, w);
-      w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 1));
-      output2.z += dot(input1, w);
-      w = read_imageh(filter, sampler, (int2)(wx, wy + 2));
-      output4.z += dot(input2, w);
-      w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 2));
-      output3.z += dot(input1, w);
-      w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 2));
-      output4.z += dot(input1, w);
-
-      wy = (out_c * 4 + 3) * 3;
-      w = read_imageh(filter, sampler, (int2)(wx, wy));
-      output4.w += dot(input4, w);
-      w = read_imageh(filter, sampler, (int2)(wx + 1, wy));
-      output3.w += dot(input3, w);
-      w = read_imageh(filter, sampler, (int2)(wx + 2, wy));
-      output4.w += dot(input3, w);
-      w = read_imageh(filter, sampler, (int2)(wx, wy + 1));
-      output2.w += dot(input2, w);
-      w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 1));
-      output1.w += dot(input1, w);
-      w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 1));
-      output2.w += dot(input1, w);
-      w = read_imageh(filter, sampler, (int2)(wx, wy + 2));
-      output4.w += dot(input2, w);
-      w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 2));
-      output3.w += dot(input1, w);
-      w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 2));
-      output4.w += dot(input1, w);
-    }
-
-    int2 pos_out = (int2)(out_c * output_width + 2 * in_w, n * output_height + 2 * h);
-    write_imageh(output_image, pos_out, output1);
-    write_imageh(output_image, (int2)(pos_out.x + 1, pos_out.y), output2);
-    write_imageh(output_image, (int2)(pos_out.x, pos_out.y + 1), output3);
-    write_imageh(output_image, (int2)(pos_out.x + 1, pos_out.y + 1), output4);
-}
\ No newline at end of file
--- a/mobile/src/operators/kernel/cl/cl_kernel/conv_transpose_kernel.cl
+++ b/mobile/src/operators/kernel/cl/cl_kernel/conv_transpose_kernel.cl
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "cl_common.h"
+
+// Transposed convolution kernel that produces a 2x2 tile of output pixels per
+// work item from a 2x2 neighbourhood of input pixels and a 3x3 filter
+// (conv_transpose_kernel.cpp registers it for 3x3 filters with stride 2).
+//
+// Work-item mapping:
+//   global id 0 -> out_c  : output channel block (four channels per half4)
+//   global id 1 -> in_w   : input column inside one channel block
+//   global id 2 -> in_nh  : batch index * input_height + input row
+//
+// Arguments:
+//   input_c_block  number of input channel blocks that are accumulated
+//   input_width    width of one input channel block in the input image
+//   input_height   height of one batch in the input image
+//   output_width   width of one output channel block in the output image
+//   output_height  height of one batch in the output image
+//   input_image    image read at x = i * input_width + in_w, y = in_nh
+//   filter         image read at x = i * 3 + {0,1,2},
+//                  y = (out_c * 4 + lane) * 3 + {0,1,2}
+//   output_image   image written at x = out_c * output_width + 2 * in_w (+1),
+//                  y = n * output_height + 2 * h (+1)
+//
+// NOTE(review): the four writes are not bounds-checked, so this presumably
+// relies on output_width >= 2 * input_width and
+// output_height >= 2 * input_height -- TODO confirm against the caller.
+__kernel void conv_transpose(__private const int input_c_block,
+                             __private const int input_width,/* of one block */
+                             __private const int input_height,/* of one block */
+                             __private const int output_width,
+                             __private const int output_height,
+                             __read_only image2d_t input_image,
+                             __read_only image2d_t filter,
+                             __write_only image2d_t output_image) {
+
+    const int out_c = get_global_id(0);
+    const int in_w = get_global_id(1);
+    const int in_nh = get_global_id(2);
+    // Split the combined row index into batch index and row inside the batch.
+    const int n = in_nh / input_height;
+    const int h = in_nh % input_height;
+
+    // NOTE(review): every read below passes integer coordinates, for which the
+    // OpenCL C specification expects CLK_NORMALIZED_COORDS_FALSE -- confirm
+    // that the target drivers accept this sampler.
+    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                              CLK_ADDRESS_CLAMP          |
+                              CLK_FILTER_NEAREST;
+
+    // input1..input4 hold the pixels at (in_w, h), (in_w + 1, h),
+    // (in_w, h + 1) and (in_w + 1, h + 1); output1..output4 are the four
+    // output pixels of the 2x2 tile in the same order.
+    half4 input1, input2, input3, input4;
+    half4 output1 = 0.0f, output2 = 0.0f, output3 = 0.0f, output4 = 0.0f;
+    half4 w = 0.0f;
+    int2 pos_in;
+    // Accumulate over all input channel blocks.
+    for (int i = 0; i < input_c_block; i += 1) {
+        pos_in = (int2)(mad24(i, input_width, in_w), in_nh);
+        // select() replaces a read with zero when the mask's top bit is set,
+        // i.e. when the neighbour lies outside
+        // [0, input_width) x [0, input_height).
+        input1 = select(read_imageh(input_image, sampler,
+                                    (int2)(pos_in.x, pos_in.y)),
+                        (half4)(0.0f),
+                        (ushort4)((in_w < 0 || h < 0 || in_w >= input_width || h >= input_height) << 15));
+        input2 = select(read_imageh(input_image, sampler,
+                                    (int2)(pos_in.x + 1, pos_in.y)),
+                        (half4)(0.0f),
+                        (ushort4)((in_w + 1 < 0 || h < 0 || in_w + 1 >= input_width || h >= input_height) << 15));
+        input3 = select(read_imageh(input_image, sampler,
+                                    (int2)(pos_in.x, pos_in.y + 1)),
+                        (half4)(0.0f),
+                        (ushort4)((in_w < 0 || h + 1 < 0 || in_w >= input_width || h + 1 >= input_height) << 15));
+        input4 = select(read_imageh(input_image, sampler,
+                                    (int2)(pos_in.x + 1, pos_in.y + 1)),
+                        (half4)(0.0f),
+                        (ushort4)((in_w + 1 < 0 || h + 1 < 0 || in_w + 1 >= input_width || h + 1 >= input_height) << 15));
+
+        // Lane x: first output channel of this block (filter rows start at
+        // out_c * 4 * 3). Each of the nine taps feeds one output pixel.
+        int wx = i * 3;
+        int wy = out_c * 4 * 3;
+        w = read_imageh(filter, sampler, (int2)(wx, wy));
+        output4.x += dot(input4, w);
+        w = read_imageh(filter, sampler, (int2)(wx + 1, wy));
+        output3.x += dot(input3, w);
+        w = read_imageh(filter, sampler, (int2)(wx + 2, wy));
+        output4.x += dot(input3, w);
+        w = read_imageh(filter, sampler, (int2)(wx, wy + 1));
+        output2.x += dot(input2, w);
+        w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 1));
+        output1.x += dot(input1, w);
+        w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 1));
+        output2.x += dot(input1, w);
+        w = read_imageh(filter, sampler, (int2)(wx, wy + 2));
+        output4.x += dot(input2, w);
+        w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 2));
+        output3.x += dot(input1, w);
+        w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 2));
+        output4.x += dot(input1, w);
+
+        // Lane y: second output channel, same tap pattern.
+        wy = (out_c * 4 + 1) * 3;
+        w = read_imageh(filter, sampler, (int2)(wx, wy));
+        output4.y += dot(input4, w);
+        w = read_imageh(filter, sampler, (int2)(wx + 1, wy));
+        output3.y += dot(input3, w);
+        w = read_imageh(filter, sampler, (int2)(wx + 2, wy));
+        output4.y += dot(input3, w);
+        w = read_imageh(filter, sampler, (int2)(wx, wy + 1));
+        output2.y += dot(input2, w);
+        w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 1));
+        output1.y += dot(input1, w);
+        w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 1));
+        output2.y += dot(input1, w);
+        w = read_imageh(filter, sampler, (int2)(wx, wy + 2));
+        output4.y += dot(input2, w);
+        w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 2));
+        output3.y += dot(input1, w);
+        w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 2));
+        output4.y += dot(input1, w);
+
+        // Lane z: third output channel, same tap pattern.
+        wy = (out_c * 4 + 2) * 3;
+        w = read_imageh(filter, sampler, (int2)(wx, wy));
+        output4.z += dot(input4, w);
+        w = read_imageh(filter, sampler, (int2)(wx + 1, wy));
+        output3.z += dot(input3, w);
+        w = read_imageh(filter, sampler, (int2)(wx + 2, wy));
+        output4.z += dot(input3, w);
+        w = read_imageh(filter, sampler, (int2)(wx, wy + 1));
+        output2.z += dot(input2, w);
+        w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 1));
+        output1.z += dot(input1, w);
+        w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 1));
+        output2.z += dot(input1, w);
+        w = read_imageh(filter, sampler, (int2)(wx, wy + 2));
+        output4.z += dot(input2, w);
+        w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 2));
+        output3.z += dot(input1, w);
+        w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 2));
+        output4.z += dot(input1, w);
+
+        // Lane w: fourth output channel, same tap pattern.
+        wy = (out_c * 4 + 3) * 3;
+        w = read_imageh(filter, sampler, (int2)(wx, wy));
+        output4.w += dot(input4, w);
+        w = read_imageh(filter, sampler, (int2)(wx + 1, wy));
+        output3.w += dot(input3, w);
+        w = read_imageh(filter, sampler, (int2)(wx + 2, wy));
+        output4.w += dot(input3, w);
+        w = read_imageh(filter, sampler, (int2)(wx, wy + 1));
+        output2.w += dot(input2, w);
+        w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 1));
+        output1.w += dot(input1, w);
+        w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 1));
+        output2.w += dot(input1, w);
+        w = read_imageh(filter, sampler, (int2)(wx, wy + 2));
+        output4.w += dot(input2, w);
+        w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 2));
+        output3.w += dot(input1, w);
+        w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 2));
+        output4.w += dot(input1, w);
+    }
+
+    // Top-left corner of the 2x2 output tile owned by this work item.
+    int2 pos_out = (int2)(out_c * output_width + 2 * in_w, n * output_height + 2 * h);
+    write_imageh(output_image, pos_out, output1);
+    write_imageh(output_image, (int2)(pos_out.x + 1, pos_out.y), output2);
+    write_imageh(output_image, (int2)(pos_out.x, pos_out.y + 1), output3);
+    write_imageh(output_image, (int2)(pos_out.x + 1, pos_out.y + 1), output4);
+}
+
+// Depthwise transposed convolution: each work item computes one output pixel
+// (four channels packed in a half4) by gathering every input pixel / filter
+// tap pair that lands on it.
+//
+// Work-item mapping:
+//   global id 0 -> channel block
+//   global id 1 -> output column inside the channel block
+//   global id 2 -> batch index * out_h + output row
+//
+// Arguments:
+//   item_ch, item_w, item_h  work-item extents; not read by the kernel body
+//   input_image, filter_image  source images
+//   bias                 present only when BIASE_CH or BIASE_ELE is defined
+//   new_scale, new_biase present only when BATCH_NORM is defined
+//   output_image         destination image
+//   stride, pad          convolution stride and padding (same for w and h)
+//   dilation, in_ch      not read by the kernel body
+//   in_w, in_h           input width / height of one channel block / batch
+//   out_w, out_h         output width / height of one channel block / batch
+//   filter_w, filter_h   filter width / height
+//
+// Index math: an input position i and filter tap k contribute to output
+// position o when i * stride + k - pad == o. The kernel starts at the
+// smallest such i (clamped to 0) with the matching largest k, then walks i
+// upwards while k drops by stride until k would become negative.
+__kernel void depthwise_transpose(__private const int item_ch,
+                                  __private const int item_w,
+                                  __private const int item_h,
+                                  __read_only image2d_t input_image,
+                                  __read_only image2d_t filter_image,
+#if defined(BIASE_CH) || defined(BIASE_ELE)
+                                  __read_only image2d_t bias,
+#endif
+#ifdef BATCH_NORM
+                                  __read_only image2d_t new_scale,
+                                  __read_only image2d_t new_biase,
+#endif
+                                  __write_only image2d_t output_image,
+                                  __private const int stride,
+                                  __private const int pad,
+                                  __private const int dilation,
+                                  __private const int in_ch,
+                                  __private const int in_w,
+                                  __private const int in_h,
+                                  __private const int out_w,
+                                  __private const int out_h,
+                                  __private const int filter_w,
+                                  __private const int filter_h) {
+
+    // NOTE(review): reads below use integer coordinates, for which the OpenCL
+    // C specification expects CLK_NORMALIZED_COORDS_FALSE; the flags are kept
+    // as they were -- confirm that the target drivers accept this sampler.
+    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                              CLK_ADDRESS_CLAMP          |
+                              CLK_FILTER_NEAREST;
+
+    // item_id
+    const int item_ch_id = get_global_id(0);
+    const int item_w_id = get_global_id(1);
+    const int item_h_id = get_global_id(2);
+
+    // out_id
+    int out_b_id = item_h_id / out_h;
+    int out_w_id_per_ch_blk = item_w_id;
+    int out_h_id_per_batch = item_h_id % out_h;
+    int out_w_id = item_ch_id * out_w + out_w_id_per_ch_blk;
+
+    // in_id: first contributing input column / row, clamped to 0.
+    int in_w_id_per_ch_blk = (out_w_id_per_ch_blk + pad - filter_w + stride) / stride;
+    in_w_id_per_ch_blk = in_w_id_per_ch_blk > 0 ? in_w_id_per_ch_blk : 0;
+    int in_h_id_per_batch = (out_h_id_per_batch + pad - filter_h + stride) / stride;
+    in_h_id_per_batch = in_h_id_per_batch > 0 ? in_h_id_per_batch : 0;
+
+    // filter_id: filter tap that pairs with the first contributing input
+    // column / row. align_* is the non-positive correction that keeps the tap
+    // on the stride grid.
+    int align_w_i = out_w_id_per_ch_blk + pad - filter_w + 1;
+    int align_w = align_w_i % stride > 0 ?
+                  align_w_i % stride - stride : align_w_i % stride;
+    int filter_w_id_per_ch_blk = out_w_id_per_ch_blk + pad < filter_w ? out_w_id_per_ch_blk + pad : filter_w + align_w - 1;
+
+    int align_h_i = out_h_id_per_batch + pad - filter_h + 1;
+    int align_h = align_h_i % stride > 0 ?
+                  align_h_i % stride - stride : align_h_i % stride;
+    int filter_h_id = out_h_id_per_batch + pad < filter_h ? out_h_id_per_batch + pad : filter_h + align_h - 1;
+
+    // The accumulator starts from the bias when one is compiled in.
+#ifdef BIASE_CH
+    half4 output;
+    output = read_imageh(bias, sampler, (int2)(item_ch_id, 0));
+#elif defined(BIASE_ELE)
+    half4 output;
+    output = read_imageh(bias, sampler, (int2)(out_w_id, item_h_id));
+#else
+    half4 output = 0.0f;
+#endif
+    half4 filter = 0.0f;
+    half4 input = 0.0f;
+    for (int h = filter_h_id; h >= 0; h -= stride) {
+        // Out-of-range rows / columns are redirected to coordinate -1, which
+        // lies outside the image and is resolved by the sampler's clamp mode.
+        int in_h_id = select(out_b_id * in_h + in_h_id_per_batch, -1,
+                             in_h_id_per_batch < 0 || in_h_id_per_batch >= in_h);
+        // Every filter row scans the same input columns, so the column cursor
+        // restarts from the first contributing column on each row. Advancing
+        // the shared start index instead would shift later rows to the right.
+        int in_w_cursor = in_w_id_per_ch_blk;
+        for (int w = filter_w_id_per_ch_blk; w >= 0; w -= stride) {
+            int in_w_id = select(item_ch_id * in_w + in_w_cursor, -1,
+                                 in_w_cursor < 0 || in_w_cursor >= in_w);
+            int filter_w_id = item_ch_id * filter_w + w;
+            input = read_imageh(input_image, sampler, (int2)(in_w_id, in_h_id));
+            filter = read_imageh(filter_image, sampler, (int2)(filter_w_id, h));
+
+            output = mad(input, filter, output);
+            in_w_cursor++;
+        }
+        in_h_id_per_batch++;
+    }
+
+#ifdef BATCH_NORM
+    half4 scale = read_imageh(new_scale, sampler, (int2)(item_ch_id, 0));
+    half4 biase = read_imageh(new_biase, sampler, (int2)(item_ch_id, 0));
+    output = mad(scale, output, biase);
+#endif
+
+#ifdef RELU
+    output = activation(output);
+#endif
+
+    write_imageh(output_image, (int2)(out_w_id, item_h_id), output);
+}
+
+
+
+
+
+
--- a/mobile/src/operators/kernel/cl/conv_transpose_kernel.cpp
+++ b/mobile/src/operators/kernel/cl/conv_transpose_kernel.cpp
@@ -14,6 +14,7 @@ limitations under the License. */
 #ifdef CONV_TRANSPOSE_OP

 #include "operators/kernel/conv_transpose_kernel.h"
+#include "operators/kernel/cl/cl-kernel-func/conv_func.h"

 namespace paddle_mobile {
 namespace operators {
@@ -21,60 +22,45 @@ namespace operators {
+// Validates the convolution attributes, picks the execution mode, uploads the
+// filter in the layout that mode needs and registers the matching OpenCL
+// kernel. Returns true on success; an unsupported configuration raises
+// through PADDLE_MOBILE_THROW_EXCEPTION.
 template <>
 bool ConvTransposeKernel<GPU_CL, float>::Init(
    ConvTransposeParam<GPU_CL>* param) {
-  param->Filter()->InitConv2dTransposeFilterCLImage(
-      cl_helper_.CLContext(), cl_helper_.CLCommandQueue());
-  this->cl_helper_.AddKernel("conv_transpose", "conv_transpose.cl");
+  // Stride, padding and dilation must be the same in both spatial
+  // dimensions, and dilation must be 1.
+  PADDLE_MOBILE_ENFORCE(param->Strides()[0] == param->Strides()[1] &&
+                            param->Paddings()[0] == param->Paddings()[1] &&
+                            param->Dilations()[0] == param->Dilations()[1] &&
+                            param->Dilations()[0] == 1,
+                        "need equal");
+
+  // Depthwise case: filter dimension 1 is 1 and the input and output have
+  // the same size in dimension 1 (read as the channel count).
+  if (param->Filter()->dims()[1] == 1 &&
+      param->Input()->dims()[1] == param->Output()->dims()[1]) {
+    param->ExecMode() = ConvTransposeParam<GPU_CL>::EXEC_DEPTHWISETRANS_FLOAT;
+    param->Filter()->InitDWImage(cl_helper_.CLContext(),
+                                 cl_helper_.CLCommandQueue());
+    this->cl_helper_.AddKernel("depthwise_transpose",
+                               "conv_transpose_kernel.cl");
+  // General case, limited to 3x3 filters with stride 2.
+  } else if (param->Filter()->dims()[2] == 3 &&
+             param->Filter()->dims()[3] == 3 && param->Strides()[0] == 2) {
+    param->ExecMode() = ConvTransposeParam<GPU_CL>::EXEC_CONVTRANS3x3s2_FLOAT;
+    param->Filter()->InitConv2dTransposeFilterCLImage(
+        cl_helper_.CLContext(), cl_helper_.CLCommandQueue());
+    this->cl_helper_.AddKernel("conv_transpose", "conv_transpose_kernel.cl");
+  } else {
+    PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
+  }
  return true;
 }

+// Runs the transposed convolution by dispatching on the execution mode that
+// Init stored in param; any other mode raises through
+// PADDLE_MOBILE_THROW_EXCEPTION.
 template <>
 void ConvTransposeKernel<GPU_CL, float>::Compute(
    const ConvTransposeParam<GPU_CL>& param) {
-  auto kernel = this->cl_helper_.KernelAt(0);
-  const auto* input = param.Input();
-  auto* output = param.Output();
-  auto* filter = param.Filter();
-  const int n = input->dims()[0];
-  const int input_c = input->dims()[1];
-  const int input_c_block = (input_c + 3) / 4;
-  const int input_width = input->dims()[3];
-  const int input_height = input->dims()[2];
-  const int output_c = output->dims()[1];
-  const int output_c_block = (output_c + 3) / 4;
-  const int output_width = output->dims()[3];
-  const int output_height = output->dims()[2];
-
-  auto inputImage = input->GetCLImage();
-  auto outputImage = output->GetCLImage();
-  auto filterImage = filter->GetCLImage();
-
-  cl_int status;
-  status = clSetKernelArg(kernel, 0, sizeof(int), &input_c_block);
-  CL_CHECK_ERRORS(status);
-  status = clSetKernelArg(kernel, 1, sizeof(int), &input_width);
-  CL_CHECK_ERRORS(status);
-  status = clSetKernelArg(kernel, 2, sizeof(int), &input_height);
-  CL_CHECK_ERRORS(status);
-  status = clSetKernelArg(kernel, 3, sizeof(int), &output_width);
-  CL_CHECK_ERRORS(status);
-  status = clSetKernelArg(kernel, 4, sizeof(int), &output_height);
-  CL_CHECK_ERRORS(status);
-  status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &inputImage);
-  CL_CHECK_ERRORS(status);
-  status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &filterImage);
-  CL_CHECK_ERRORS(status);
-  status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &outputImage);
-  CL_CHECK_ERRORS(status);
-
-  const size_t work_size[3] = {(size_t)output_c_block, (size_t)input_width,
-                               (size_t)(n * input_height)};
-
-  DLOG << "conv transpose " << input_c_block << input_width << input_height
-       << output_width << output_height << work_size[0] << work_size[1]
-       << work_size[2];
-
-  clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
-                         work_size, NULL, 0, NULL, NULL);
+  switch (param.ExecMode()) {
+    // Both helpers are called without bias, batch-norm images or ReLU, so
+    // their optional parameters keep the defaults declared in conv_func.h.
+    case ConvTransposeParam<GPU_CL>::EXEC_DEPTHWISETRANS_FLOAT:
+      DWConvTransposeAddBnRelu(&this->cl_helper_, param);
+      break;
+    case ConvTransposeParam<GPU_CL>::EXEC_CONVTRANS3x3s2_FLOAT:
+      ConvTransposeAddBnRelu(&this->cl_helper_, param);
+      break;
+    default:
+      PADDLE_MOBILE_THROW_EXCEPTION(
+          "Invalid convolution transpose execute mode %d", param.ExecMode());
+  }
 }

 template class ConvTransposeKernel<GPU_CL, float>;

--- a/mobile/src/operators/op_param.h
+++ b/mobile/src/operators/op_param.h
@@ -2492,8 +2492,8 @@ class ConvTransposeParam : public OpParam {
                     const VariableNameMap &outputs, const AttributeMap &attrs,
                     Scope *scope)
      : OpParam(inputs, outputs, attrs, scope) {
-    filter_ = FilterFrom<GType>(inputs, *scope);
-    input_ = InputFrom<GType>(inputs, *scope);
+    filter_ = OpParam::FilterFrom<GType>(inputs, *scope);
+    input_ = OpParam::InputFrom<GType>(inputs, *scope);
    // output_ = OutputFrom<GType>(outputs, scope);
    if (outputs.count("Output")) {
      output_ = OpParam::OutputFrom<GType>(outputs, *scope);
@@ -2518,6 +2518,10 @@ class ConvTransposeParam : public OpParam {

  const vector<int> &Paddings() const { return paddings_; }

+  // Returns the filter variable. filter_ is a GType *, so it is handed out
+  // as a pointer; a `const vector<int> &` cannot bind to it.
+  GType *Filters() const { return filter_; }
+
+  // Returns the transformed filter variable. transformed_filter_ is a
+  // GType *, so it is handed out as a pointer; a `const vector<int> &`
+  // cannot bind to it.
+  GType *TransFilters() const { return transformed_filter_; }
+
  const vector<int> &Dilations() const { return dilations_; }

  const vector<int> &OutputSize() const { return output_size_; }
@@ -2529,6 +2533,8 @@ class ConvTransposeParam : public OpParam {
    EXEC_GEMM_FLOAT,
    EXEC_DECONV3X3_FLOAT,
    EXEC_DECONV4X4_FLOAT,
+    EXEC_DEPTHWISETRANS_FLOAT,
+    EXEC_CONVTRANS3x3s2_FLOAT,
  };

  ExecMode &ExecMode() const { return exec_mode_; }
@@ -2537,6 +2543,7 @@ class ConvTransposeParam : public OpParam {
  GType *input_;
  GType *output_;
  GType *filter_;
+  // Starts out null so the accessor never exposes an indeterminate pointer;
+  // the visible part of the constructor does not assign it.
+  GType *transformed_filter_ = nullptr;
  vector<int> strides_;
  vector<int> paddings_;
  vector<int> dilations_;