[OPENCL] remove conv redundant's for opencl kernel. test=develop (#3924)

Remove conv redundancies for the OpenCL kernel.

[OPENCL] remove conv redundant's for opencl kernel. test=develop (#3924)
Remove conv redundancies for the OpenCL kernel.
d341fccb · ysh329 · GitHub · 4780849f · d341fccb · d341fccb
13 changed files
--- a/lite/backends/opencl/cl_context.cc
+++ b/lite/backends/opencl/cl_context.cc
@@ -119,7 +119,7 @@ cl::NDRange CLContext::DefaultWorkSize(const CLImage &image) {
  }
 }

-cl::NDRange CLContext::LocalWorkSizeTurn(cl::NDRange global_work_size,
+cl::NDRange CLContext::LocalWorkSizeTune(cl::NDRange global_work_size,
                                         size_t max_work_size,
                                         int divisor) {
  int preferred_lws = 0;
@@ -157,7 +157,7 @@ cl::NDRange CLContext::LocalWorkSizeTurn(cl::NDRange global_work_size,
                     static_cast<size_t>(gws0)};
 #endif
 }
-cl::NDRange CLContext::LocalWorkSizeTurnReverse(cl::NDRange global_work_size,
+cl::NDRange CLContext::LocalWorkSizeTuneReverse(cl::NDRange global_work_size,
                                                size_t max_work_size,
                                                int divisor) {
  int preferred_lws = 0;

--- a/lite/backends/opencl/cl_context.h
+++ b/lite/backends/opencl/cl_context.h
@@ -62,10 +62,10 @@ class CLContext {

  cl::NDRange LocalWorkSize(cl::NDRange global_work_size, size_t max_work_size);

-  cl::NDRange LocalWorkSizeTurn(cl::NDRange global_work_size,
+  cl::NDRange LocalWorkSizeTune(cl::NDRange global_work_size,
                                size_t max_work_size,
                                int divitor = 2);
-  cl::NDRange LocalWorkSizeTurnReverse(cl::NDRange global_work_size,
+  cl::NDRange LocalWorkSizeTuneReverse(cl::NDRange global_work_size,
                                       size_t max_work_size,
                                       int divitor = 2);
  bool IsArmMali();

--- a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl
@@ -6,9 +6,7 @@ __kernel void conv2d_1x1_opt(
    __private const int global_size_dim2,
    __read_only image2d_t input_image,
    __read_only image2d_t filter,
-#if defined(BIASE_CH) || defined(BIASE_ELE)
    __read_only image2d_t bias,
-#endif
 #ifdef BATCH_NORM
    __read_only image2d_t new_scale,
    __read_only image2d_t new_biase,
@@ -284,9 +282,7 @@ __kernel void conv2d_1x1_simple(
    __private const int global_size_dim2,
    __read_only image2d_t input_image,
    __read_only image2d_t filter,
-#if defined(BIASE_CH) || defined(BIASE_ELE)
    __read_only image2d_t bias,
-#endif
 #ifdef BATCH_NORM
    __read_only image2d_t new_scale,
    __read_only image2d_t new_biase,

--- a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl
@@ -19,9 +19,7 @@ __kernel void conv2d_3x3(__private const int global_size_dim0,
                         __private const int global_size_dim2,
                         __read_only image2d_t input_image,
                         __read_only image2d_t filter,
-#if defined(BIASE_CH) || defined(BIASE_ELE)
                         __read_only image2d_t bias,
-#endif
                         __write_only image2d_t output_image,
                         __private const int stride,
                         __private const int offset,

--- a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl
@@ -19,9 +19,7 @@ __kernel void conv2d_3x3_opt(__private const int item_ch,
                             __private const int item_h,
                             __read_only image2d_t input_image,
                             __read_only image2d_t filter_image,
-#if defined(BIASE_CH) || defined(BIASE_ELE)
                             __read_only image2d_t bias,
-#endif
                             __write_only image2d_t output_image,
                             __private const int stride,
                             __private const int pad,
@@ -264,9 +262,7 @@ __kernel void conv2d_3x3_multi_batch(__private const int item_ch,
                                     __private const int item_h,
                                     __read_only image2d_t input_image,
                                     __read_only image2d_t filter_image,
-#if defined(BIASE_CH) || defined(BIASE_ELE)
                                     __read_only image2d_t bias,
-#endif
                                     __write_only image2d_t output_image,
                                     __private const int stride,
                                     __private const int pad,

--- a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl
@@ -5,9 +5,7 @@ __kernel void conv2d_5x5(__private const int global_size_dim0,
                         __private const int global_size_dim2,
                         __read_only image2d_t input_image,
                         __read_only image2d_t filter_image,
-#if defined(BIASE_CH) || defined(BIASE_ELE)
                         __read_only image2d_t bias,
-#endif
 #ifdef BATCH_NORM
                         __read_only image2d_t new_scale,
                         __read_only image2d_t new_biase,

--- a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl
@@ -20,9 +20,7 @@ __kernel void conv2d_5x5_opt(__private const int item_ch,
                             __private const int item_h,
                             __read_only image2d_t input_image,
                             __read_only image2d_t filter_image,
-#if defined(BIASE_CH) || defined(BIASE_ELE)
                             __read_only image2d_t bias,
-#endif
                             __write_only image2d_t output_image,
                             __private const int stride,
                             __private const int pad,
@@ -268,9 +266,7 @@ __kernel void conv2d_5x5_multi_batch(__private const int item_ch,
                                     __private const int item_h,
                                     __read_only image2d_t input_image,
                                     __read_only image2d_t filter_image,
-#if defined(BIASE_CH) || defined(BIASE_ELE)
                                     __read_only image2d_t bias,
-#endif
                                     __write_only image2d_t output_image,
                                     __private const int stride,
                                     __private const int pad,
@@ -513,4 +509,4 @@ __kernel void conv2d_5x5_multi_batch(__private const int item_ch,
                   (int2)(out_w_base_id + out_w_id4, item_h_id),
                   output[4]);
  }
-}
\ No newline at end of file
+}
--- a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl
@@ -5,9 +5,7 @@ __kernel void conv2d_7x7(__private const int global_size_dim0,
                         __private const int global_size_dim2,
                         __read_only image2d_t input_image,
                         __read_only image2d_t filter_image,
-#if defined(BIASE_CH) || defined(BIASE_ELE)
                         __read_only image2d_t bias,
-#endif
 #ifdef BATCH_NORM
                         __read_only image2d_t new_scale,
                         __read_only image2d_t new_biase,

--- a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl
@@ -20,9 +20,7 @@ __kernel void conv2d_7x7_opt(__private const int item_ch,
                             __private const int item_h,
                             __read_only image2d_t input_image,
                             __read_only image2d_t filter_image,
-#if defined(BIASE_CH) || defined(BIASE_ELE)
                             __read_only image2d_t bias,
-#endif
                             __write_only image2d_t output_image,
                             __private const int stride,
                             __private const int pad,
@@ -268,9 +266,7 @@ __kernel void conv2d_7x7_multi_batch(__private const int item_ch,
                                     __private const int item_h,
                                     __read_only image2d_t input_image,
                                     __read_only image2d_t filter_image,
-#if defined(BIASE_CH) || defined(BIASE_ELE)
                                     __read_only image2d_t bias,
-#endif
                                     __write_only image2d_t output_image,
                                     __private const int stride,
                                     __private const int pad,
@@ -513,4 +509,4 @@ __kernel void conv2d_7x7_multi_batch(__private const int item_ch,
                   (int2)(out_w_base_id + out_w_id4, item_h_id),
                   output[4]);
  }
-}
\ No newline at end of file
+}
--- a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl
@@ -19,9 +19,7 @@ __kernel void depth_conv2d(__private const int global_size_dim0,
                           __private const int global_size_dim2,
                           __read_only image2d_t input,
                           __read_only image2d_t filter,
-#if defined(BIASE_CH) || defined(BIASE_ELE)
                           __read_only image2d_t bias,
-#endif
 #ifdef BATCH_NORM
                           __read_only image2d_t new_scale,
                           __read_only image2d_t new_biase,

--- a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl
@@ -20,9 +20,7 @@ __kernel void depth_conv2d_3x3(
    __private const int global_size_dim2,
    __read_only image2d_t input,
    __read_only image2d_t filter,
-#if defined(BIASE_CH) || defined(BIASE_ELE)
    __read_only image2d_t bias,
-#endif
    __write_only image2d_t output_image,
    __private const int stride,
    __private const int offset,
@@ -249,9 +247,7 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk,
                                 __private const int ou_nh,
                                 __read_only image2d_t input,
                                 __read_only image2d_t filter,
-#if defined(BIASE_CH) || defined(BIASE_ELE)
                                 __read_only image2d_t bias,
-#endif
                                 __write_only image2d_t output_image,
                                 __private const int stride,
                                 __private const int pad,

--- a/lite/kernels/opencl/conv_image_compute.cc
+++ b/lite/kernels/opencl/conv_image_compute.cc
--- a/lite/kernels/opencl/conv_image_compute.h
+++ b/lite/kernels/opencl/conv_image_compute.h
@@ -33,6 +33,7 @@ namespace paddle {
 namespace lite {
 namespace kernels {
 namespace opencl {
+
 class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
                                           PRECISION(kFP16),
                                           DATALAYOUT(kImageDefault)> {
@@ -42,8 +43,11 @@ class ConvImageCompute : public KernelLite<TARGET(kOpenCL),

  void PrepareForRun() override;

+  void ReInitWhenNeeded() override;
+
  void Run() override;
-  double Turn(int times = 5);
+
+  double Tune(int times = 5);

 #ifdef LITE_WITH_PROFILE
  void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
@@ -56,16 +60,20 @@ class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
 #endif

 private:
-  void Conv2d1x1opt(bool is_turn = false);
-  void Conv2d3x3(bool is_turn = false);
-  void Conv2d3x3opt(bool is_turn = false);
-  void Conv2d5x5(bool is_turn = false);
-  void Conv2d5x5opt(bool is_turn = false);
-  void Conv2d7x7(bool is_turn = false);
-  void Conv2d7x7opt(bool is_turn = false);
-  void DepthwiseConv2d3x3s1(bool is_turn = false);
-  void DepthwiseConv2d3x3(bool is_turn = false);
-  void DepthwiseConv2d(bool is_turn = false);
+  void PrintConvInfo();
+  void GetGlobalWorkSize();
+  void Conv2d1x1opt(bool enable_tune = false);
+  void Conv2d3x3(bool enable_tune = false);
+  void Conv2d3x3opt(bool enable_tune = false);
+  void Conv2d5x5(bool enable_tune = false);
+  void Conv2d5x5opt(bool enable_tune = false);
+  void Conv2d7x7(bool enable_tune = false);
+  void Conv2d7x7opt(bool enable_tune = false);
+  void DepthwiseConv2d3x3s1(bool enable_tune = false);
+  void DepthwiseConv2d3x3(bool enable_tune = false);
+  void DepthwiseConv2d(bool enable_tune = false);
+
+  param_t* conv_param_{nullptr};

  kernel_t impl_;
  std::vector<std::string> kernel_func_names_{};
@@ -79,19 +87,72 @@ class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
  std::unique_ptr<Tensor> tensor_hold_bias_image_{nullptr};
  cl::NDRange global_work_size_ = cl::NDRange{
      static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
+
+  // opencl kernel args
  int c_blk_ = 1;
  int w_blk_ = 1;
  int nh_blk_ = 1;

+  const cl::Image2D* input_image_p_{nullptr};
+  const cl::Image2D* filter_image_p_{nullptr};
+  const cl::Image2D* bias_image_p_{nullptr};
+  const cl::Image2D* output_image_p_{nullptr};
+
+  int stride_h_{-1};
+  int stride_w_{-1};
+
+  int dilation_h_{-1};
+  int dilation_w_{-1};
+
+  int pad_up_{-1};
+  int pad_down_{-1};
+  int pad_left_{-1};
+  int pad_right_{-1};
+
+  int offset_{-1};
+  int groups_{-1};
+  bool relu_fused_{false};
+  bool has_bias_{false};
+
+  int input_tensor_n_{-1};
+  int input_tensor_c_{-1};
+  int input_tensor_h_{-1};
+  int input_tensor_w_{-1};
+  int input_image_h_{-1};
+  int input_image_w_{-1};
+  int input_c_block_{-1};
+
+  int output_tensor_n_{-1};
+  int output_tensor_c_{-1};
+  int output_tensor_h_{-1};
+  int output_tensor_w_{-1};
+  int output_image_h_{-1};
+  int output_image_w_{-1};
+
+  int filter_tensor_n_{-1};
+  int filter_tensor_c_{-1};
+  int filter_tensor_h_{-1};
+  int filter_tensor_w_{-1};
+  int filter_image_h_{-1};
+  int filter_image_w_{-1};
+
+  int bias_image_h_{-1};
+  int bias_image_w_{-1};
+
  int default_c_blk_ = 1;
  int default_w_blk_ = 1;
  int default_nh_blk_ = 1;
+  // =================
+
+  DDim last_input_dims_{};
+  bool is_first_epoch_for_run_{true};

  cl::Kernel kernel_;
+  cl_int status_;
  cl::NDRange local_work_size_ = cl::NDRange{
      static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
  bool use_lws_{true};
-  bool use_turn_{false};
+  bool use_tune_{false};
 };

 }  // namespace opencl