Unverified commit cf70102b authored by Jiaying Zhao, committed by GitHub

optimize GPU conv performance and structure (#1595)

* optimize GPU conv performance and structure

* add CL macro for test_conv_gpu

* fix build failure
Parent 619acd23
......@@ -103,9 +103,9 @@ class CLEngine {
return std::move(event_ptr);
}
bool BuildProgram(cl_program program) {
bool BuildProgram(cl_program program, const std::string &options = "") {
cl_int status;
std::string path = "-cl-fast-relaxed-math -I " +
std::string path = options + " -cl-fast-relaxed-math -I " +
CLEngine::Instance()->GetCLPath() + "/cl_kernel";
status = clBuildProgram(program, 0, 0, path.c_str(), 0, 0);
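The options argument is prepended to the existing -cl-fast-relaxed-math flags, so callers can inject -D macros when compiling a kernel file. A minimal sketch of the intended use, assuming a cl_context and only the CLEngine calls visible in this diff:
// Sketch: compile the shared conv_kernel.cl with fusion macros enabled.
// The option string ends up in clBuildProgram's options, ahead of
// "-cl-fast-relaxed-math -I <cl_path>/cl_kernel".
auto program = CLEngine::Instance()->CreateProgramWith(
    context, CLEngine::Instance()->GetCLPath() + "/cl_kernel/conv_kernel.cl");
CLEngine::Instance()->BuildProgram(program.get(), "-DBIASE -DRELU");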
......@@ -149,7 +149,7 @@ class CLEngine {
cl_int status_;
std::string cl_path_;
std::string cl_path_ = "/data/local/tmp/bin";
std::unique_ptr<_cl_program, CLProgramDeleter> program_;
std::unique_ptr<_cl_context, CLContextDeleter> context_ = nullptr;
......
......@@ -32,9 +32,10 @@ class CLHelper {
explicit CLHelper(CLScope *scope) : scope_(scope) {}
void AddKernel(const std::string &kernel_name, const std::string &file_name) {
void AddKernel(const std::string &kernel_name, const std::string &file_name,
const std::string &options = "") {
DLOG << " begin add kernel ";
auto kernel = scope_->GetKernel(kernel_name, file_name);
auto kernel = scope_->GetKernel(kernel_name, file_name, options);
DLOG << " add kernel ing ";
kernels.emplace_back(std::move(kernel));
}
......
......@@ -146,6 +146,20 @@ class CLImage {
DLOG << " end init cl image";
}
void InitEmpty(cl_context context, cl_command_queue command_queue,
const DDim &image_dims) {
DLOG << " to get image dims ";
image_dims_ = image_dims;
DLOG << " end get image dims " << image_dims_;
InitCLImage(context, image_dims_[0], image_dims_[1], nullptr);
command_queue_ = command_queue;
cl_event_ = CLEngine::Instance()->CreateEvent(context);
initialized_ = true;
DLOG << " end init cl image";
}
cl_mem GetCLImage() const { return cl_image_.get(); }
const DDim &ImageDims() const { return image_dims_; }
......
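InitEmpty above sizes a CL image from image_dims alone and passes nullptr for the host data, so an output image can be allocated before the op runs. A hedged usage sketch (the dims are illustrative; the test at the end of this diff does the equivalent via InferShape):
// Sketch: allocate an op's output image without copying any host tensor.
framework::CLImage output;
output.InitEmpty(context, command_queue,
                 framework::make_ddim({256, 64}));  // illustrative image dims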
......@@ -426,5 +426,25 @@ void CLImageConverterNormal::ImageToNCHW(half_t *image, float *tensor,
default_converter.ImageToNCHW(image, tensor, image_dim, tensor_dim);
}
const DDim &CLImageConverterWinoTransWeight::InitImageDimInfoWith(
const DDim &tensor_dim) {
PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4");
size_t N, C, H, W;
N = tensor_dim[0];
C = tensor_dim[1];
H = tensor_dim[2];
W = tensor_dim[3];
size_t width = (C + 3) / 4;
size_t height = N * 16; // N * (wino_blk_size + 2) * (wino_blk_size + 2)
return make_ddim({width, height});
}
void CLImageConverterWinoTransWeight::NCHWToImage(float *tensor, half_t *image,
const DDim &tensor_dim) {}
void CLImageConverterWinoTransWeight::ImageToNCHW(half_t *image, float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) {}
} // namespace framework
} // namespace paddle_mobile
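The converter above lays out the transformed weights as one 4x4 Winograd transform per filter: height = N * (wino_blk_size + 2)^2 with wino_blk_size = 2, and width = (C + 3) / 4, presumably because each RGBA texel packs four input channels. A worked example with illustrative filter dims:
// Sketch: image dims for a 24x12x3x3 filter (N = 24, C = 12, 3x3 kernel).
size_t N = 24, C = 12;
size_t width = (C + 3) / 4;  // = 3, four channels packed per texel
size_t height = N * 16;      // = 384, N * (2 + 2) * (2 + 2)
// InitImageDimInfoWith(make_ddim({24, 12, 3, 3})) therefore yields {3, 384}.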
......@@ -101,5 +101,13 @@ class CLImageConverterDWBlock : public CLImageConverterBase {
const DDim &tensor_dim);
};
class CLImageConverterWinoTransWeight : public CLImageConverterBase {
public:
const DDim &InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
};
} // namespace framework
} // namespace paddle_mobile
......@@ -37,9 +37,10 @@ class CLScope {
cl_command_queue CommandQueue() { return command_queue_; }
std::unique_ptr<_cl_kernel, CLKernelDeleter> GetKernel(
const std::string &kernel_name, const std::string &file_name) {
const std::string &kernel_name, const std::string &file_name,
const std::string &options) {
DLOG << " to get program " << file_name;
auto program = Program(file_name);
auto program = Program(file_name, options);
DLOG << " end get program ~ ";
DLOG << " to create kernel: " << kernel_name;
std::unique_ptr<_cl_kernel, CLKernelDeleter> kernel(
......@@ -51,8 +52,12 @@ class CLScope {
cl_context Context() { return context_; }
cl_program Program(const std::string &file_name) {
auto it = programs_.find(file_name);
cl_program Program(const std::string &file_name, const std::string &options) {
std::string program_key = file_name;
if (!options.empty()) {
program_key += options;
}
auto it = programs_.find(program_key);
if (it != programs_.end()) {
return it->second.get();
}
......@@ -61,13 +66,13 @@ class CLScope {
context_,
CLEngine::Instance()->GetCLPath() + "/cl_kernel/" + file_name);
DLOG << " --- begin build program -> " << file_name << " --- ";
CLEngine::Instance()->BuildProgram(program.get());
DLOG << " --- end build program -> " << file_name << " --- ";
DLOG << " --- begin build program -> " << program_key << " --- ";
CLEngine::Instance()->BuildProgram(program.get(), options);
DLOG << " --- end build program -> " << program_key << " --- ";
programs_[file_name] = std::move(program);
programs_[program_key] = std::move(program);
return programs_[file_name].get();
return programs_[program_key].get();
}
private:
......
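Keying the program cache on file name plus options means the same .cl file built with different macros produces distinct cached programs, while a repeated request with identical options hits the cache. A hedged sketch of the resulting behavior, assuming a CLScope *scope:
// Sketch: two option sets -> two separate builds; the third call reuses
// the entry cached under "conv_kernel.cl-DBIASE".
cl_program p1 = scope->Program("conv_kernel.cl", "-DBIASE");
cl_program p2 = scope->Program("conv_kernel.cl", "-DBIASE -DRELU");
cl_program p3 = scope->Program("conv_kernel.cl", "-DBIASE");  // p3 == p1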
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/kernel/cl/cl-kernel-func/conv_func.h"
#include <vector>
#include "framework/cl/cl_image_converter.h"
#include "framework/cl/cl_tensor.h"
......@@ -20,19 +21,23 @@ namespace paddle_mobile {
namespace operators {
template <>
void winograd_transform_weight<4, 3>(framework::CLHelper &cl_helper,
framework::CLImage &weight){};
void winograd_transform_weight<4, 3>(framework::CLHelper *cl_helper,
framework::CLImage *weight) {}
template <>
void WinogradConv3x3<4, 3>(framework::CLHelper &cl_helper,
const ConvParam<GPU_CL> &param) {}
void WinogradConv3x3<4, 3>(framework::CLHelper *cl_helper,
const ConvParam<GPU_CL> &param, bool ifRelu,
const framework::CLImage *biase,
const framework::CLImage *new_scale,
const framework::CLImage *new_bias) {}
void ConvAddBnRelu(framework::CLHelper &cl_helper,
void ConvAddBnRelu(framework::CLHelper *cl_helper,
const ConvParam<GPU_CL> &param, bool ifRelu,
const CLImage *biase, const CLImage *new_scale,
const CLImage *new_bias) {
auto kernel = cl_helper.KernelAt(0);
auto default_work_size = cl_helper.DefaultWorkSize(*param.Output());
const framework::CLImage *biase,
const framework::CLImage *new_scale,
const framework::CLImage *new_bias) {
auto kernel = cl_helper->KernelAt(0);
auto default_work_size = cl_helper->DefaultWorkSize(*param.Output());
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
......@@ -137,7 +142,7 @@ void ConvAddBnRelu(framework::CLHelper &cl_helper,
static_cast<const uint32_t>(maped_w),
static_cast<const uint32_t>(default_work_size.data()[2])};
status = clEnqueueNDRangeKernel(cl_helper.CLCommandQueue(), kernel,
status = clEnqueueNDRangeKernel(cl_helper->CLCommandQueue(), kernel,
default_work_size.size(), NULL, work_size,
NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
......@@ -201,7 +206,7 @@ void ConvAddBnRelu(framework::CLHelper &cl_helper,
CL_CHECK_ERRORS(status);
status = clEnqueueNDRangeKernel(
cl_helper.CLCommandQueue(), kernel, default_work_size.size(), NULL,
cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
}
......
......@@ -22,23 +22,24 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
using namespace framework;
inline int maptofactor(int i, int factor) { return (i + factor - 1) / factor; }
template <int tile, int kernel>
void winograd_transform_weight(framework::CLHelper &cl_helper,
framework::CLImage &weight);
void winograd_transform_weight(framework::CLHelper *cl_helper,
framework::CLImage *weight);
template <int tile, int kernel>
void WinogradConv3x3(framework::CLHelper &cl_helper,
const ConvParam<GPU_CL> &param);
void WinogradConv3x3(framework::CLHelper *cl_helper,
const ConvParam<GPU_CL> &param, bool ifRelu = false,
const framework::CLImage *biase = nullptr,
const framework::CLImage *new_scale = nullptr,
const framework::CLImage *new_bias = nullptr);
void ConvAddBnRelu(framework::CLHelper &cl_helper,
void ConvAddBnRelu(framework::CLHelper *cl_helper,
const ConvParam<GPU_CL> &param, bool ifRelu = false,
const CLImage *biase = nullptr,
const CLImage *new_scale = nullptr,
const CLImage *new_bias = nullptr);
const framework::CLImage *biase = nullptr,
const framework::CLImage *new_scale = nullptr,
const framework::CLImage *new_bias = nullptr);
} // namespace operators
} // namespace paddle_mobile
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define BIASE
#define BATCH_NORM
#define RELU
#include "conv_kernel.inc.cl"
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define BIASE
#include "conv_kernel.inc.cl"
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define BIASE
#define RELU
#include "conv_kernel.inc.cl"
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define BATCH_NORM
#define BIASE
#define RELU
#include "conv_kernel.inc.cl"
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define BATCH_NORM
#define RELU
#include "conv_kernel.inc.cl"
......@@ -126,6 +126,10 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
param->SetOffset(offset);
const std::string conv_kernel_file = "conv_kernel.cl";
const std::string wino_kernel_file = "winograd_transform.cl";
const std::string build_options = "-DBIASE -DBATCH_NORM -DRELU";
/*
if (param->Filter()->dims()[2] == 1 &&
param->Filter()->dims()[3] == 1 &&
......@@ -137,26 +141,44 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
}
*/
if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT;
param->Filter()->InitNImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("conv_1x1_spl", "conv_add_bn_relu_kernel.cl");
DLOG << " conv add bn relu conv 1x1";
this->cl_helper_.AddKernel("conv_1x1_spl", conv_kernel_file, build_options);
} else if (param->Filter()->dims()[1] == 1 &&
param->Input()->dims()[1] == param->Output()->dims()[1] &&
param->Filter()->dims()[2] == 3) {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT;
param->Filter()->InitDWImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl");
DLOG << " conv add bn relu depth_conv_3x3";
this->cl_helper_.AddKernel("depth_conv_3x3", conv_kernel_file,
build_options);
} else if (param->Filter()->dims()[2] == 3 &&
param->Filter()->dims()[3] == 3) {
// if (param->Strides()[0] == param->Strides()[1] &&
// param->Strides()[0] == 1 && param->Input()->dims()[2] >= 32) {
// param->ExecMode() = ConvParam<GPU_CL>::EXEC_WINOGRAD3X3_FLOAT;
// this->cl_helper_.AddKernel("winograd_filter_transform_2x2",
// wino_kernel_file, build_options);
// this->cl_helper_.AddKernel("winograd_input_transform_2x2",
// wino_kernel_file, build_options);
// this->cl_helper_.AddKernel("matmul", "matmul.cl");
// this->cl_helper_.AddKernel("winograd_output_transform_2x2",
// wino_kernel_file, build_options);
//
// winograd_transform_weight<4, 3>(&this->cl_helper_, param->Filter());
//
// } else {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT;
param->Filter()->InitCLImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("conv_3x3", "conv_add_bn_relu_kernel.cl");
DLOG << " conv add bn relu conv_3x3";
this->cl_helper_.AddKernel("conv_3x3", conv_kernel_file, build_options);
// }
} else {
PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
}
......@@ -167,8 +189,21 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
template <>
void ConvAddBNReluKernel<GPU_CL, float>::Compute(
const FusionConvAddBNReluParam<GPU_CL> &param) {
ConvAddBnRelu(this->cl_helper_, param, true, param.Bias(), param.NewScale(),
param.NewBias());
switch (param.ExecMode()) {
case ConvParam<GPU_CL>::EXEC_WINOGRAD3X3_FLOAT:
WinogradConv3x3<4, 3>(&this->cl_helper_, param, true, param.Bias(),
param.NewScale(), param.NewBias());
break;
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT:
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT:
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
ConvAddBnRelu(&this->cl_helper_, param, true, param.Bias(),
param.NewScale(), param.NewBias());
break;
default:
PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
param.ExecMode());
}
}
template class ConvAddBNReluKernel<GPU_CL, float>;
......
......@@ -33,34 +33,65 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
static_cast<int>(param->Paddings()[1]);
param->SetOffset(offset);
const std::string conv_kernel_file = "conv_kernel.cl";
const std::string wino_kernel_file = "winograd_transform.cl";
const std::string build_options = "-DBIASE";
if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT;
param->Filter()->InitNImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("conv_1x1_spl", "conv_add_kernel.cl");
this->cl_helper_.AddKernel("conv_1x1_spl", conv_kernel_file, build_options);
} else if (param->Filter()->dims()[1] == 1 &&
param->Input()->dims()[1] == param->Output()->dims()[1] &&
param->Filter()->dims()[2] == 3) {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT;
param->Filter()->InitDWImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_kernel.cl");
this->cl_helper_.AddKernel("depth_conv_3x3", conv_kernel_file,
build_options);
} else if (param->Filter()->dims()[2] == 3 &&
param->Filter()->dims()[3] == 3) {
// if (param->Strides()[0] == param->Strides()[1] &&
// param->Strides()[0] == 1 && param->Input()->dims()[2] >= 32) {
// param->ExecMode() = ConvParam<GPU_CL>::EXEC_WINOGRAD3X3_FLOAT;
// this->cl_helper_.AddKernel("winograd_filter_transform_2x2",
// wino_kernel_file, build_options);
// this->cl_helper_.AddKernel("winograd_input_transform_2x2",
// wino_kernel_file, build_options);
// this->cl_helper_.AddKernel("matmul", "matmul.cl");
// this->cl_helper_.AddKernel("winograd_output_transform_2x2",
// wino_kernel_file, build_options);
//
// winograd_transform_weight<4, 3>(&this->cl_helper_, param->Filter());
//
// } else {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT;
param->Filter()->InitCLImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("conv_3x3", "conv_add_kernel.cl");
this->cl_helper_.AddKernel("conv_3x3", conv_kernel_file, build_options);
// }
} else if (param->Filter()->dims()[2] == 7 &&
param->Filter()->dims()[3] == 7) {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW7x7_FLOAT;
param->Filter()->InitCLImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("conv_7x7", "conv_add_kernel.cl");
this->cl_helper_.AddKernel("conv_7x7", conv_kernel_file, build_options);
} else if (param->Filter()->dims()[2] == 5 &&
param->Filter()->dims()[3] == 5) {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW5x5_FLOAT;
param->Filter()->InitCLImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("conv_5x5", "conv_add_kernel.cl");
this->cl_helper_.AddKernel("conv_5x5", conv_kernel_file, build_options);
}
return true;
......@@ -69,7 +100,21 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
template <>
void ConvAddKernel<GPU_CL, float>::Compute(
const FusionConvAddParam<GPU_CL> &param) {
ConvAddBnRelu(this->cl_helper_, param, false, param.Bias());
switch (param.ExecMode()) {
case ConvParam<GPU_CL>::EXEC_WINOGRAD3X3_FLOAT:
WinogradConv3x3<4, 3>(&this->cl_helper_, param, false, param.Bias());
break;
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT:
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT:
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW5x5_FLOAT:
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW7x7_FLOAT:
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
ConvAddBnRelu(&this->cl_helper_, param, false, param.Bias());
break;
default:
PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
param.ExecMode());
}
}
template class ConvAddKernel<GPU_CL, float>;
......
......@@ -34,35 +34,66 @@ bool ConvAddReluKernel<GPU_CL, float>::Init(
static_cast<int>(param->Paddings()[1]);
param->SetOffset(offset);
const std::string conv_kernel_file = "conv_kernel.cl";
const std::string wino_kernel_file = "winograd_transform.cl";
const std::string build_options = "-DBIASE -DRELU";
if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT;
param->Filter()->InitNImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("conv_1x1_spl", "conv_add_relu_kernel.cl");
this->cl_helper_.AddKernel("conv_1x1_spl", conv_kernel_file, build_options);
} else if (param->Filter()->dims()[1] == 1 &&
param->Input()->dims()[1] == param->Output()->dims()[1] &&
param->Filter()->dims()[2] == 3) {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT;
param->Filter()->InitDWImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_relu_kernel.cl");
this->cl_helper_.AddKernel("depth_conv_3x3", conv_kernel_file,
build_options);
} else if (param->Filter()->dims()[2] == 3 &&
param->Filter()->dims()[3] == 3) {
// if (param->Strides()[0] == param->Strides()[1] &&
// param->Strides()[0] == 1 && param->Input()->dims()[2] >= 32) {
// param->ExecMode() = ConvParam<GPU_CL>::EXEC_WINOGRAD3X3_FLOAT;
// this->cl_helper_.AddKernel("winograd_filter_transform_2x2",
// wino_kernel_file, build_options);
// this->cl_helper_.AddKernel("winograd_input_transform_2x2",
// wino_kernel_file, build_options);
// this->cl_helper_.AddKernel("matmul", "matmul.cl");
// this->cl_helper_.AddKernel("winograd_output_transform_2x2",
// wino_kernel_file, build_options);
//
// winograd_transform_weight<4, 3>(&this->cl_helper_, param->Filter());
//
// } else {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT;
param->Filter()->InitCLImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("conv_3x3", "conv_add_relu_kernel.cl");
this->cl_helper_.AddKernel("conv_3x3", conv_kernel_file, build_options);
// }
} else if (param->Filter()->dims()[2] == 7 &&
param->Filter()->dims()[3] == 7) {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW7x7_FLOAT;
param->Filter()->InitCLImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("conv_7x7", "conv_add_relu_kernel.cl");
this->cl_helper_.AddKernel("conv_7x7", conv_kernel_file, build_options);
} else if (param->Filter()->dims()[2] == 5 &&
param->Filter()->dims()[3] == 5) {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW5x5_FLOAT;
param->Filter()->InitCLImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("conv_5x5", "conv_add_relu_kernel.cl");
this->cl_helper_.AddKernel("conv_5x5", conv_kernel_file, build_options);
} else {
PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
}
......@@ -73,7 +104,21 @@ bool ConvAddReluKernel<GPU_CL, float>::Init(
template <>
void ConvAddReluKernel<GPU_CL, float>::Compute(
const FusionConvAddReluParam<GPU_CL> &param) {
ConvAddBnRelu(this->cl_helper_, param, true, param.Bias());
switch (param.ExecMode()) {
case ConvParam<GPU_CL>::EXEC_WINOGRAD3X3_FLOAT:
WinogradConv3x3<4, 3>(&this->cl_helper_, param, true, param.Bias());
break;
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT:
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT:
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW5x5_FLOAT:
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW7x7_FLOAT:
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
ConvAddBnRelu(&this->cl_helper_, param, true, param.Bias());
break;
default:
PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
param.ExecMode());
}
}
template class ConvAddReluKernel<GPU_CL, float>;
......
......@@ -100,28 +100,51 @@ bool ConvBNAddReluKernel<GPU_CL, float>::Init(
param->SetOffset(offset);
const std::string conv_kernel_file = "conv_kernel.cl";
const std::string wino_kernel_file = "winograd_transform.cl";
const std::string build_options = "-DBIASE -DBATCH_NORM -DRELU";
if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT;
param->Filter()->InitNImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("convBNAdd_1x1_spl",
"conv_bn_add_relu_kernel.cl");
DLOG << " conv bn add relu conv 1x1";
this->cl_helper_.AddKernel("convBNAdd_1x1_spl", conv_kernel_file,
build_options);
} else if (param->Filter()->dims()[1] == 1 &&
param->Input()->dims()[1] == param->Output()->dims()[1] &&
param->Filter()->dims()[2] == 3) {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT;
param->Filter()->InitDWImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("depth_convBNAdd_3x3",
"conv_bn_add_relu_kernel.cl");
DLOG << " conv bn add relu depth_conv_3x3";
this->cl_helper_.AddKernel("depth_convBNAdd_3x3", conv_kernel_file,
build_options);
} else if (param->Filter()->dims()[2] == 3 &&
param->Filter()->dims()[3] == 3) {
// if (param->Strides()[0] == param->Strides()[1] &&
// param->Strides()[0] == 1 && param->Input()->dims()[2] >= 32) {
// param->ExecMode() = ConvParam<GPU_CL>::EXEC_WINOGRAD3X3_FLOAT;
// this->cl_helper_.AddKernel("winograd_filter_transform_2x2",
// wino_kernel_file, build_options);
// this->cl_helper_.AddKernel("winograd_input_transform_2x2",
// wino_kernel_file, build_options);
// this->cl_helper_.AddKernel("matmul", "matmul.cl");
// this->cl_helper_.AddKernel("winograd_output_transform_2x2_bn_add",
// wino_kernel_file, build_options);
//
// winograd_transform_weight<4, 3>(&this->cl_helper_, param->Filter());
//
// } else {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT;
param->Filter()->InitCLImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("convBNAdd_3x3", "conv_bn_add_relu_kernel.cl");
DLOG << " conv bn add relu conv_3x3";
this->cl_helper_.AddKernel("convBNAdd_3x3", conv_kernel_file,
build_options);
// }
} else {
PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
}
......@@ -132,8 +155,21 @@ bool ConvBNAddReluKernel<GPU_CL, float>::Init(
template <>
void ConvBNAddReluKernel<GPU_CL, float>::Compute(
const FusionConvBNAddReluParam<GPU_CL> &param) {
ConvAddBnRelu(this->cl_helper_, param, true, param.Bias(), param.NewScale(),
param.NewBias());
switch (param.ExecMode()) {
case ConvParam<GPU_CL>::EXEC_WINOGRAD3X3_FLOAT:
WinogradConv3x3<4, 3>(&this->cl_helper_, param, true, param.Bias(),
param.NewScale(), param.NewBias());
break;
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT:
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT:
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
ConvAddBnRelu(&this->cl_helper_, param, true, param.Bias(),
param.NewScale(), param.NewBias());
break;
default:
PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
param.ExecMode());
}
}
template class ConvBNAddReluKernel<GPU_CL, float>;
......
......@@ -98,26 +98,49 @@ bool ConvBNReluKernel<GPU_CL, float>::Init(
param->SetOffset(offset);
const std::string conv_kernel_file = "conv_kernel.cl";
const std::string wino_kernel_file = "winograd_transform.cl";
const std::string build_options = "-DBATCH_NORM -DRELU";
if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT;
param->Filter()->InitNImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("conv_1x1_spl", "conv_bn_relu_kernel.cl");
DLOG << " conv bn relu conv 1x1";
this->cl_helper_.AddKernel("conv_1x1_spl", conv_kernel_file, build_options);
} else if (param->Filter()->dims()[1] == 1 &&
param->Input()->dims()[1] == param->Output()->dims()[1] &&
param->Filter()->dims()[2] == 3) {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT;
param->Filter()->InitDWImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("depth_conv_3x3", "conv_bn_relu_kernel.cl");
DLOG << " conv bn relu depth_conv_3x3";
this->cl_helper_.AddKernel("depth_conv_3x3", conv_kernel_file,
build_options);
} else if (param->Filter()->dims()[2] == 3 &&
param->Filter()->dims()[3] == 3) {
// if (param->Strides()[0] == param->Strides()[1] &&
// param->Strides()[0] == 1 && param->Input()->dims()[2] >= 32) {
// param->ExecMode() = ConvParam<GPU_CL>::EXEC_WINOGRAD3X3_FLOAT;
// this->cl_helper_.AddKernel("winograd_filter_transform_2x2",
// wino_kernel_file, build_options);
// this->cl_helper_.AddKernel("winograd_input_transform_2x2",
// wino_kernel_file, build_options);
// this->cl_helper_.AddKernel("matmul", "matmul.cl");
// this->cl_helper_.AddKernel("winograd_output_transform_2x2",
// wino_kernel_file, build_options);
//
// winograd_transform_weight<4, 3>(&this->cl_helper_, param->Filter());
//
// } else {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT;
param->Filter()->InitCLImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("conv_3x3", "conv_bn_relu_kernel.cl");
DLOG << " conv bn relu conv_3x3";
this->cl_helper_.AddKernel("conv_3x3", conv_kernel_file, build_options);
// }
} else {
PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
}
......@@ -127,8 +150,21 @@ bool ConvBNReluKernel<GPU_CL, float>::Init(
template <>
void ConvBNReluKernel<GPU_CL, float>::Compute(
const FusionConvBNReluParam<GPU_CL> &param) {
ConvAddBnRelu(this->cl_helper_, param, true, nullptr, param.NewScale(),
param.NewBias());
switch (param.ExecMode()) {
case ConvParam<GPU_CL>::EXEC_WINOGRAD3X3_FLOAT:
WinogradConv3x3<4, 3>(&this->cl_helper_, param, true, nullptr,
param.NewScale(), param.NewBias());
break;
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT:
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT:
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
ConvAddBnRelu(&this->cl_helper_, param, true, nullptr, param.NewScale(),
param.NewBias());
break;
default:
PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
param.ExecMode());
}
}
template class ConvBNReluKernel<GPU_CL, float>;
......
......@@ -37,25 +37,49 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
DLOG << " height of one block: " << param->Filter()->dims()[2];
DLOG << " filter dims: " << param->Filter()->dims();
const std::string conv_kernel_file = "conv_kernel.cl";
const std::string wino_kernel_file = "winograd_transform.cl";
if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT;
param->Filter()->InitNImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("conv_1x1_spl", "conv_kernel.cl");
this->cl_helper_.AddKernel("conv_1x1_spl", conv_kernel_file);
DLOG << "conv 1x1";
} else if (param->Filter()->dims()[1] == 1 &&
param->Input()->dims()[1] == param->Output()->dims()[1] &&
param->Filter()->dims()[2] == 3) {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT;
param->Filter()->InitDWImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("depth_conv_3x3", "depthwise_conv_kernel.cl");
this->cl_helper_.AddKernel("depth_conv_3x3", conv_kernel_file);
DLOG << "depth_conv 3x3";
} else if (param->Filter()->dims()[2] == 3 &&
param->Filter()->dims()[3] == 3) {
// if (param->Strides()[0] == param->Strides()[1] &&
// param->Strides()[0] == 1 && param->Input()->dims()[2] >= 32) {
// param->ExecMode() = ConvParam<GPU_CL>::EXEC_WINOGRAD3X3_FLOAT;
// this->cl_helper_.AddKernel("winograd_filter_transform_2x2",
// wino_kernel_file);
// this->cl_helper_.AddKernel("winograd_input_transform_2x2",
// wino_kernel_file);
// this->cl_helper_.AddKernel("matmul", "matmul.cl");
// this->cl_helper_.AddKernel("winograd_output_transform_2x2",
// wino_kernel_file);
//
// winograd_transform_weight<4, 3>(&this->cl_helper_, param->Filter());
//
// } else {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT;
param->Filter()->InitCLImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("conv_3x3", "conv_kernel.cl");
this->cl_helper_.AddKernel("conv_3x3", conv_kernel_file);
// }
DLOG << "conv 3x3";
} else {
......@@ -67,7 +91,19 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
template <>
void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
ConvAddBnRelu(this->cl_helper_, param);
switch (param.ExecMode()) {
case ConvParam<GPU_CL>::EXEC_WINOGRAD3X3_FLOAT:
WinogradConv3x3<4, 3>(&this->cl_helper_, param);
break;
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT:
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT:
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
ConvAddBnRelu(&this->cl_helper_, param);
break;
default:
PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
param.ExecMode());
}
}
template class ConvKernel<GPU_CL, float>;
......
......@@ -479,6 +479,11 @@ class ConvParam : public OpParam {
EXEC_DEPTHWISE5x5_INT8,
EXEC_SLIDINGWINDOW3x3S1_FLOAT,
EXEC_SLIDINGWINDOW3x3S2_FLOAT,
EXEC_DEPTHWISE3x3_FLOAT,
EXEC_SLIDINGWINDOW1x1_FLOAT,
EXEC_SLIDINGWINDOW3x3_FLOAT,
EXEC_SLIDINGWINDOW5x5_FLOAT,
EXEC_SLIDINGWINDOW7x7_FLOAT,
};
ExecMode &ExecMode() const { return exec_mode_; }
......
......@@ -494,4 +494,7 @@ if (NOT FOUND_MATCH)
ADD_EXECUTABLE(test-dwconv-bn-relu-op operators/test_dwconv_bn_relu_op.cpp test_helper.h test_include.h)
target_link_libraries(test-dwconv-bn-relu-op paddle-mobile)
ADD_EXECUTABLE(test-conv-gpu operators/test_conv_gpu.cpp test_helper.h test_include.h)
target_link_libraries(test-conv-gpu paddle-mobile)
endif ()
......@@ -21,7 +21,10 @@ int main() {
paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile;
#endif
#ifdef PADDLE_MOBILE_CPU
#ifdef PADDLE_MOBILE_CL
paddle_mobile::PaddleMobile<paddle_mobile::GPU_CL> paddle_mobile;
paddle_mobile.SetCLPath("/data/local/tmp/bin");
#else
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
#endif
paddle_mobile.SetThreadNum(4);
......@@ -38,13 +41,13 @@ int main() {
input_tensor.data<float>() + input_tensor.numel());
#ifndef PADDLE_MOBILE_FPGA
// warm up ten times
for (int i = 0; i < 10; ++i) {
paddle_mobile.Predict(input, dims);
}
// for (int i = 0; i < 10; ++i) {
// paddle_mobile.Predict(input, dims);
// }
auto time3 = time();
for (int i = 0; i < 10; ++i) {
paddle_mobile.Predict(input, dims);
}
// for (int i = 0; i < 10; ++i) {
paddle_mobile.Predict(input, dims);
// }
auto time4 = time();
std::cout << "predict cost :" << time_diff(time3, time4) << "ms"
<< std::endl;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_MOBILE_CL
#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"
#include "common/common.h"
#include "framework/cl/cl_helper.h"
#include "framework/cl/cl_image.h"
#include "operators/conv_op.h"
#include "operators/kernel/cl/cl-kernel-func/conv_func.h"
namespace paddle_mobile {
template <typename Itype, typename Otype, int Kernel, int Pad, int Stride>
int TestConvOp(int in_channels, int in_height, int in_width, int out_channels,
int groups) {
int kernel_h = Kernel;
int kernel_w = Kernel;
int pad_h = Pad;
int pad_w = Pad;
int stride_h = Stride;
int stride_w = Stride;
int dilation_h = 1;
int dilation_w = 1;
int batch_size = 1;
int input_c = in_channels;
int input_h = in_height;
int input_w = in_width;
int output_c = out_channels;
framework::DDim input_shape =
framework::make_ddim({batch_size, input_c, input_h, input_w});
framework::DDim filter_shape =
framework::make_ddim({output_c, input_c / groups, kernel_h, kernel_w});
// std::cerr << " init " << std::endl;
VariableNameMap inputs;
VariableNameMap outputs;
auto scope = std::make_shared<framework::Scope>();
inputs["Input"] = std::vector<std::string>({"input"});
inputs["Filter"] = std::vector<std::string>({"filter"});
outputs["Output"] = std::vector<std::string>({"output"});
cl_context context = scope->GetCLScpoe()->Context();
cl_command_queue command_queue = scope->GetCLScpoe()->CommandQueue();
// std::cerr << " input " << std::endl;
auto input_var = scope.get()->Var("input");
auto input = input_var->template GetMutable<framework::CLImage>();
const int in_numel = framework::product(input_shape);
float *in_data = new float[in_numel];
for (int i = 0; i < in_numel; ++i) {
in_data[i] = (i % 36 / 6) + 1;
}
input->SetTensorData(in_data, input_shape);
input->InitNormalCLImage(context, command_queue);
DLOG << "input image \n" << *input;
// std::cerr << " filter " << std::endl;
auto filter_var = scope.get()->Var("filter");
auto filter = filter_var->template GetMutable<framework::CLImage>();
const int filter_numel = product(filter_shape);
float *filter_data = new float[filter_numel];
for (int i = 0; i < filter_numel; ++i) {
filter_data[i] = i % 9;
}
filter->SetTensorData(filter_data, filter_shape);
// std::cerr << " attrs " << std::endl;
framework::AttributeMap attrs;
attrs["strides"].Set<vector<int>>(std::vector<int>({stride_h, stride_w}));
attrs["paddings"].Set<vector<int>>(std::vector<int>({pad_h, pad_w}));
attrs["dilations"].Set<vector<int>>(
std::vector<int>({dilation_h, dilation_w}));
attrs["groups"].Set<int>(groups);
std::cerr << " output " << std::endl;
auto output_var = scope.get()->Var("output");
auto output = output_var->template GetMutable<framework::CLImage>();
auto *op = new operators::ConvOp<GPU_CL, float>("conv2d", inputs, outputs,
attrs, scope.get());
op->InferShape();
framework::DDim ddim = output->dims();
DLOG << "output dims = " << ddim;
output->InitEmptyImage(context, command_queue, ddim);
// std::cerr << " op->init " << std::endl;
op->Init();
auto time1 = time();
op->Run();
auto time2 = time();
std::cerr << "time cost : " << time_diff(time1, time2) << std::endl;
delete op;
return 0;
}
} // namespace paddle_mobile
int TestAll(const int in_channels, const int in_height, const int in_width,
const int out_channels, const int groups) {
std::cerr << "in_channels=" << in_channels << ", in_height=" << in_height
<< ", in_width=" << in_width << ", out_channels=" << out_channels
<< ", groups=" << groups << std::endl;
std::cerr << "float, kernel=3, pad=1, stride=1" << std::endl;
paddle_mobile::TestConvOp<float, float, 3, 1, 1>(
in_channels, in_height, in_width, out_channels, groups);
return 0;
}
#endif
int main() {
// TestAll(4, 6, 6, 4, 1);
// TestAll(6, 32, 32, 24, 1);
// TestAll(12, 32, 32, 24, 1);
// TestAll(24, 32, 32, 24, 1);
// TestAll(36, 32, 32, 24, 1);
// TestAll(48, 32, 32, 24, 1);
// TestAll(60, 32, 32, 24, 1);
// TestAll(72, 32, 32, 24, 1);
// TestAll(116, 32, 32, 24, 1);
// TestAll(232, 32, 32, 24, 1);
// TestAll(464, 32, 32, 24, 1);
//
// TestAll(6, 64, 64, 24, 1);
// TestAll(12, 64, 64, 24, 1);
// TestAll(24, 64, 64, 24, 1);
// TestAll(36, 64, 64, 24, 1);
// TestAll(48, 64, 64, 24, 1);
// TestAll(60, 64, 64, 24, 1);
// TestAll(72, 64, 64, 24, 1);
// TestAll(116, 64, 64, 24, 1);
// TestAll(232, 64, 64, 24, 1);
// TestAll(464, 64, 64, 24, 1);
//
// TestAll(6, 128, 128, 24, 1);
// TestAll(12, 128, 128, 24, 1);
// TestAll(24, 128, 128, 24, 1);
// TestAll(36, 128, 128, 24, 1);
// TestAll(48, 128, 128, 24, 1);
// TestAll(60, 128, 128, 24, 1);
// TestAll(72, 128, 128, 24, 1);
// TestAll(116, 128, 128, 24, 1);
// TestAll(232, 128, 128, 24, 1);
// TestAll(464, 128, 128, 24, 1);
//
//
// TestAll(6, 32, 32, 6, 1);
// TestAll(12, 32, 32, 12, 1);
// TestAll(24, 32, 32, 24, 1);
// TestAll(36, 32, 32, 36, 1);
// TestAll(48, 32, 32, 48, 1);
// TestAll(60, 32, 32, 60, 1);
// TestAll(72, 32, 32, 72, 1);
// TestAll(116, 32, 32, 116, 1);
// TestAll(232, 32, 32, 232, 1);
// TestAll(464, 32, 32, 464, 1);
//
// TestAll(6, 64, 64, 6, 1);
// TestAll(12, 64, 64, 12, 1);
// TestAll(24, 64, 64, 24, 1);
// TestAll(36, 64, 64, 36, 1);
// TestAll(48, 64, 64, 48, 1);
// TestAll(60, 64, 64, 60, 1);
// TestAll(72, 64, 64, 72, 1);
// TestAll(116, 64, 64, 116, 1);
// TestAll(232, 64, 64, 232, 1);
// TestAll(464, 64, 64, 464, 1);
//
// TestAll(6, 128, 128, 6, 1);
// TestAll(12, 128, 128, 12, 1);
// TestAll(24, 128, 128, 24, 1);
// TestAll(36, 128, 128, 36, 1);
// TestAll(48, 128, 128, 48, 1);
// TestAll(60, 128, 128, 60, 1);
// TestAll(72, 128, 128, 72, 1);
// TestAll(116, 128, 128, 116, 1);
// TestAll(232, 128, 128, 232, 1);
// TestAll(464, 128, 128, 464, 1);
return 0;
}