From 9f343ac26252447788e8f92558dad5102339a913 Mon Sep 17 00:00:00 2001
From: "ysh329@sina.com" <ysh329@sina.com>
Date: Tue, 4 Aug 2020 11:41:31 +0000
Subject: [PATCH] fix opencl concat. test=develop

---
 .../opencl/cl_kernel/image/concat_kernel.cl   | 283 ++++++++++++++
 lite/core/optimizer.h                         |   6 +-
 lite/core/profile/precision_profiler.h        |  22 +-
 lite/kernels/opencl/concat_image_compute.cc   | 367 +++++++++---------
 lite/kernels/opencl/conv_image_compute.cc     |  43 +-
 lite/kernels/opencl/test_helper.h             |   2 +-
 6 files changed, 513 insertions(+), 210 deletions(-)

diff --git a/lite/backends/opencl/cl_kernel/image/concat_kernel.cl b/lite/backends/opencl/cl_kernel/image/concat_kernel.cl
index 40cc52d54d..dd967ba71c 100644
--- a/lite/backends/opencl/cl_kernel/image/concat_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/concat_kernel.cl
@@ -11,6 +11,285 @@ limitations under the License. */
 
 #include <cl_common.h>
 
+__kernel void concatByCWith2Inputs(
+                    __write_only image2d_t output_image,
+                    __private const int out_C,
+                    __private const int out_W,
+                    __read_only image2d_t input_image_0,
+                    __private const int C_0,
+                    __read_only image2d_t input_image_1,
+                    __private const int C_1) {
+		    
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+        CLK_ADDRESS_CLAMP |
+        CLK_FILTER_NEAREST;
+
+  int2 output_pos;
+  output_pos.x = out_c * out_W + out_w;
+  output_pos.y = out_nh;
+  CL_DTYPE4 output_data;
+
+  for (int i = 0; i < 4; i++) {
+    int c = out_c * 4 + i;
+    if (c >= out_C) {
+        break;
+    }
+    int c_in;
+    CL_DTYPE4 input_data;
+    if (c < C_0) {
+      c_in = c;
+      int2 input_pos;
+      input_pos.x = (c_in / 4) * out_W + out_w;
+      input_pos.y = out_nh;
+      input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_0, sampler, input_pos);
+    } else {
+      c_in = c - C_0;
+      int2 input_pos;
+      input_pos.x = (c_in / 4) * out_W + out_w;
+      input_pos.y = out_nh;
+      input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_1, sampler, input_pos);
+    }
+    int value_offset = c_in % 4;
+    float value;
+    if (value_offset == 0) {
+      value = input_data.x;
+    } else if (value_offset == 1) {
+      value = input_data.y;
+    } else if (value_offset == 2) {
+      value = input_data.z;
+    } else if (value_offset == 3) {
+      value = input_data.w;
+    }
+   if (i == 0) {
+      output_data.x = value;
+    } else if (i == 1) {
+      output_data.y = value;
+    } else if (i == 2) {
+      output_data.z = value;
+    } else if (i == 3) {
+      output_data.w = value;
+    }
+  }
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output_data);
+}
+
+
+__kernel void concatByCWith3Inputs(
+                    __write_only image2d_t output_image,
+                    __private const int out_C,
+                    __private const int out_W,
+                    __read_only image2d_t input_image_0,
+                    __private const int C_0,
+                    __read_only image2d_t input_image_1,
+                    __private const int C_1,
+                    __read_only image2d_t input_image_2,
+                    __private const int C_2) {
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+        CLK_ADDRESS_CLAMP |
+        CLK_FILTER_NEAREST;
+
+  int2 output_pos;
+  output_pos.x = out_c * out_W + out_w;
+  output_pos.y = out_nh;
+  CL_DTYPE4 output_data;
+
+  for (int i = 0; i < 4; i++) {
+    int c = out_c * 4 + i;
+    if (c >= out_C) {
+        break;
+    }
+    int c_in;
+    CL_DTYPE4 input_data;
+    if (c < C_0) {
+      c_in = c;
+      int2 input_pos;
+      input_pos.x = (c_in / 4) * out_W + out_w;
+      input_pos.y = out_nh;
+      input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_0, sampler, input_pos);
+    } else if (c < C_0 + C_1) {
+      c_in = c - C_0;
+      int2 input_pos;
+      input_pos.x = (c_in / 4) * out_W + out_w;
+      input_pos.y = out_nh;
+      input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_1, sampler, input_pos);
+    } else {
+      c_in = c - C_0 - C_1;
+      int2 input_pos;
+      input_pos.x = (c_in / 4) * out_W + out_w;
+      input_pos.y = out_nh;
+      input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_2, sampler, input_pos);
+    }
+    int value_offset = c_in % 4;
+    float value;
+    if (value_offset == 0) {
+      value = input_data.x;
+    } else if (value_offset == 1) {
+      value = input_data.y;
+    } else if (value_offset == 2) {
+      value = input_data.z;
+    } else if (value_offset == 3) {
+      value = input_data.w;
+    }
+    if (i == 0) {
+      output_data.x = value;
+    } else if (i == 1) {
+      output_data.y = value;
+    } else if (i == 2) {
+      output_data.z = value;
+    } else if (i == 3) {
+      output_data.w = value;
+    }
+  }
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output_data);
+}
+
+
+__kernel void concatByCWith4Inputs(
+                    __write_only image2d_t output_image,
+                    __private const int out_C,
+                    __private const int out_W,
+                    __read_only image2d_t input_image_0,
+                    __private const int C_0,
+                    __read_only image2d_t input_image_1,
+                    __private const int C_1,
+                    __read_only image2d_t input_image_2,
+                    __private const int C_2,
+                    __read_only image2d_t input_image_3,
+                    __private const int C_3) {
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+        CLK_ADDRESS_CLAMP |
+        CLK_FILTER_NEAREST;
+
+  int2 output_pos;
+  output_pos.x = out_c * out_W + out_w;
+  output_pos.y = out_nh;
+  CL_DTYPE4 output_data;
+
+  for (int i = 0; i < 4; i++) {
+    int c = out_c * 4 + i;
+    if (c >= out_C) {
+        break;
+    }
+    int c_in;
+    CL_DTYPE4 input_data;
+    if (c < C_0) {
+      c_in = c;
+      int2 input_pos;
+      input_pos.x = (c_in / 4) * out_W + out_w;
+      input_pos.y = out_nh;
+      input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_0, sampler, input_pos);
+    } else if (c < C_0 + C_1) {
+      c_in = c - C_0;
+      int2 input_pos;
+      input_pos.x = (c_in / 4) * out_W + out_w;
+      input_pos.y = out_nh;
+      input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_1, sampler, input_pos);
+    } else if (c < C_0 + C_1 + C_2) {
+      c_in = c - C_0 - C_1;
+      int2 input_pos;
+      input_pos.x = (c_in / 4) * out_W + out_w;
+      input_pos.y = out_nh;
+      input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_2, sampler, input_pos);
+    }else if (c < C_0 + C_1 + C_2 + C_3){
+      c_in = c - C_0 - C_1 - C_2;
+      int2 input_pos;
+      input_pos.x = (c_in / 4) * out_W + out_w;
+      input_pos.y = out_nh;
+      input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_3, sampler, input_pos);
+    }
+    int value_offset = c_in % 4;
+    float value;
+    if (value_offset == 0) {
+      value = input_data.x;
+    } else if (value_offset == 1) {
+      value = input_data.y;
+    } else if (value_offset == 2) {
+      value = input_data.z;
+    } else if (value_offset == 3) {
+      value = input_data.w;
+    }
+    if (i == 0) {
+      output_data.x = value;
+    } else if (i == 1) {
+      output_data.y = value;
+    } else if (i == 2) {
+      output_data.z = value;
+    } else if (i == 3) {
+      output_data.w = value;
+    }
+  }
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output_data);
+}
+
+
+__kernel void concatByH(__read_only image2d_t input_image,
+                        __write_only image2d_t output_image,
+                        __private const int out_W,
+                        __private const int out_H_Start) {
+
+  const int in_c = get_global_id(0);
+  const int in_w = get_global_id(1);
+  const int in_nh = get_global_id(2);
+
+  int2 input_pos;
+  input_pos.x = in_c * out_W + in_w;
+  input_pos.y = in_nh;
+
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+        CLK_ADDRESS_CLAMP |
+        CLK_FILTER_NEAREST;
+  CL_DTYPE4 input;
+  input = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,input_pos);
+
+  int2 output_pos;
+  output_pos.x = input_pos.x;
+  output_pos.y = out_H_Start + input_pos.y;
+
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, input);
+
+}
+
+
+__kernel void concatByW(__read_only image2d_t input_image,
+                        __write_only image2d_t output_image,
+                        __private const int in_W,
+                        __private const int pre_Width,
+                        __private const int out_Width) {
+
+  const int in_c = get_global_id(0);
+  const int in_w = get_global_id(1);
+  const int in_nh = get_global_id(2);
+
+  int2 input_pos;
+  input_pos.x = in_c * in_W + in_w;
+  input_pos.y = in_nh;
+
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+        CLK_ADDRESS_CLAMP |
+        CLK_FILTER_NEAREST;
+  CL_DTYPE4 input;
+  input = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,input_pos);
+
+  int2 output_pos;
+  output_pos.x = input_pos.x + pre_Width + out_Width * in_c;
+  output_pos.y = input_pos.y;
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, input);
+}
+
+
+// deprecated
 __kernel void concat2(__read_only image2d_t input0,
                       __read_only image2d_t input1,
                       __write_only image2d_t output,
@@ -104,6 +383,7 @@ __kernel void concat2(__read_only image2d_t input0,
   }
 }
 
+// deprecated
 __kernel void concat_mul(__read_only image2d_t input,
                          __write_only image2d_t output,
                          int flag, int C_0, int out_C, int out_W, int in_W, int width) {
@@ -162,3 +442,6 @@ __kernel void concat_mul(__read_only image2d_t input,
     WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, input_data);
   }
 }
+
+
+
diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h
index 42dac8e59b..d3618aa625 100644
--- a/lite/core/optimizer.h
+++ b/lite/core/optimizer.h
@@ -161,8 +161,10 @@ class Optimizer {
 
            "runtime_context_assign_pass",
            "argument_type_display_pass",
-
-           "memory_optimize_pass"}};
+#ifndef LITE_WITH_PRECISION_PROFILE
+           "memory_optimize_pass"
+#endif
+          }};
 
       if (passes.size() == 1) {
         // multi_stream_analysis_pass must be in the front of
diff --git a/lite/core/profile/precision_profiler.h b/lite/core/profile/precision_profiler.h
index fda2b74f8f..0a862d1e32 100644
--- a/lite/core/profile/precision_profiler.h
+++ b/lite/core/profile/precision_profiler.h
@@ -36,6 +36,10 @@
 #include "lite/backends/cuda/math/type_trans.h"
 #endif
 
+#ifdef LITE_ON_TINY_PUBLISH
+#include "lite/utils/replace_stl/stream.h"
+#endif
+
 namespace paddle {
 namespace lite {
 namespace profile {
@@ -88,19 +92,25 @@ class PrecisionProfiler {
   PrecisionProfiler() {}
 
   std::string GetSummaryHeader() {
+#ifdef LITE_ON_TINY_PUBLISH
+    using replace_stl::setw;
+#else
     using std::setw;
+#endif
     using std::left;
     using std::fixed;
     STL::stringstream ss;
     ss << "\n\n========================================= "
        << "Detailed Precision Profiler Summary "
-       << "=========================================" << std::endl;
+       << "========================================="
+       << "\n";
     ss << setw(45) << left << "operator:(kernel_info)"
        << " " << setw(70) << left << "output_tensor_name:(tensor_info)"
        << " " << setw(15) << left << "dims"
        << " " << setw(15) << left << "mean"
        << " " << setw(15) << left << "std_deviation"
-       << " " << setw(15) << left << "ave_grow_rate*" << std::endl;
+       << " " << setw(15) << left << "ave_grow_rate*"
+       << "\n";
 
     // write to file with path: `log_dir`
     if (log_dir_ != "") {
@@ -368,7 +378,11 @@ class PrecisionProfiler {
   }
 
   std::string GetInstPrecision(const Instruction* inst = nullptr) {
+#ifdef LITE_ON_TINY_PUBLISH
+    using replace_stl::setw;
+#else
     using std::setw;
+#endif
     using std::left;
     using std::fixed;
     STL::stringstream ss;
@@ -429,7 +443,7 @@ class PrecisionProfiler {
              << output_arg_info << " " << setw(15) << left << tout->dims()
              << " " << setw(15) << left << mean_str << " " << setw(15) << left
              << std_dev_str << " " << setw(15) << left << ave_grow_rate_str
-             << std::endl;
+             << "\n";
         } else if (type->IsTensorList()) {
           auto touts =
               op_scope->FindVar(out_name)->GetMutable<std::vector<Tensor>>();
@@ -466,7 +480,7 @@ class PrecisionProfiler {
                << output_arg_info << " " << setw(15) << left << tout->dims()
                << " " << setw(15) << left << mean_str << " " << setw(15) << left
                << std_dev_str << " " << setw(15) << left << ave_grow_rate_str
-               << std::endl;
+               << "\n";
           }
         }
       }
diff --git a/lite/kernels/opencl/concat_image_compute.cc b/lite/kernels/opencl/concat_image_compute.cc
index 25830b6a08..83dc7dec87 100644
--- a/lite/kernels/opencl/concat_image_compute.cc
+++ b/lite/kernels/opencl/concat_image_compute.cc
@@ -38,213 +38,230 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
   void PrepareForRun() override {
     auto& context = ctx_->As<OpenCLContext>();
     concat_param_ = param_.get_mutable<param_t>();
-    if (concat_param_->x.size() == 2) {
-      kernel_func_name_ = "concat2";
-    } else {
-      kernel_func_name_ = "concat_mul";
-    }
-    VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
-    context.cl_context()->AddKernel(kernel_func_name_,
-                                    "image/concat_kernel.cl",
-                                    build_options_,
-                                    time_stamp_);
 
+    auto input_num = concat_param_->x.size();
+    auto* output = concat_param_->output;
+    auto output_dims_size = output->dims().size();
     auto axis = concat_param_->axis;
-    auto inputs = concat_param_->x;
-    auto out_dims = concat_param_->output->dims();
-    auto* axis_tensor = concat_param_->axis_tensor;
-    if (axis_tensor != nullptr) {
-      // auto* axis_tensor_data = axis_tensor->data<int>(TARGET(kARM));
-      // axis = axis_tensor_data[0];
-    }
-    auto in_dims = inputs[0]->dims();
-    axis_size_ = out_dims[axis];
-    axis_ = axis;
-    if (out_dims.size() < 4) {
-      if (out_dims.size() - axis == 1) {
-        // width
-        width_ = out_dims[1];  // c
-        flag_ = 3;
+    if (output_dims_size < 4) {
+      if (output_dims_size - axis == 1) {
+        kernel_func_name_ = "concatByW";
       } else {
-        // height
-        width_ = out_dims[0];  // n
-        flag_ = 2;
-      }
-    } else {
-      switch (axis_) {
-        case 0:
-          width_ = out_dims[2];  // h
-          flag_ = 0;
-          break;
-        case 1:                  // channel
-          width_ = out_dims[3];  // w
-          flag_ = 1;
-          break;
-        case 2:                  // height
-          width_ = out_dims[0];  // n
-          flag_ = 2;
-          break;
-        case 3:
-        case -1:                 // width
-          width_ = out_dims[1];  // c
-          flag_ = 3;
-          break;
-        default:
-          printf("this axis: %d does not support \n", axis_);
+        kernel_func_name_ = "concatByH";
       }
-    }
-
-    for (int i = 1; i < inputs.size(); i++) {
-      auto dims = inputs[i]->dims();
-      // auto flag = CHECK_EQ_OR_FALSE(in_dims.size(), dims.size());
-      if (in_dims.size() != dims.size()) {
-        printf("input shape must be same \n");
-        return;
-      }
-      for (int i = 0; i < dims.size(); i++) {
-        if (i != axis) {
-          if (in_dims[i] != dims[i]) {
-            printf("input shape must be same \n");
-            return;
-          }
-        }
+    } else if (output_dims_size == 4) {  // output->dims.size() == 4
+      if (input_num == 2) {
+        kernel_func_name_ = "concatByCWith2Inputs";
+      } else if (input_num == 3) {
+        kernel_func_name_ = "concatByCWith3Inputs";
+      } else if (input_num == 4) {
+        kernel_func_name_ = "concatByCWith4Inputs";
+      } else {
+        LOG(FATAL) << "Unsupported input tensors number:" << input_num << ".";
       }
+    } else {  // output->dims.size() > 4
+      LOG(FATAL) << "Unsupported output dims " << output->dims()
+                 << ", whose dims.size() is bigger than 4.";
     }
+    VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
+    context.cl_context()->AddKernel(kernel_func_name_,
+                                    "image/concat_kernel.cl",
+                                    build_options_,
+                                    time_stamp_);
   }
 
   void Run() override {
-    auto& param = *param_.get_mutable<param_t>();
-    const auto& x_dims = param.output->dims();
-    auto image_shape = InitImageDimInfoWith(x_dims);
-    auto* out_buf = param.output->mutable_data<half_t, cl::Image2D>(
-        image_shape["width"], image_shape["height"]);
-    const auto& y_dims = param.output->dims();  // useless: check dim only
+    auto output_tensor_dims = concat_param_->output->dims();
+    auto output_image_shape = InitImageDimInfoWith(output_tensor_dims);
+    auto output_image_p =
+        concat_param_->output->mutable_data<half_t, cl::Image2D>(
+            output_image_shape["width"], output_image_shape["height"]);
+
+    auto inputs = concat_param_->x;
+    auto axis = concat_param_->axis;
 
     auto& context = ctx_->As<OpenCLContext>();
     CHECK(context.cl_context() != nullptr);
     STL::stringstream kernel_key;
     kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
+    auto kernel = context.cl_context()->GetKernel(kernel_key.str());
 
-    auto inputs = param.x;
-    int arg_idx = 0;
-    int width = inputs[0]->dims()[inputs[0]->dims().size() - 1];
+    if (kernel_func_name_ == "concatByW" || kernel_func_name_ == "concatByH") {
+      auto output_tensor_w = output_tensor_dims[output_tensor_dims.size() - 1];
+      if (output_tensor_dims.size() - axis == 1) {
+        for (size_t input_idx = 0; input_idx < inputs.size(); ++input_idx) {
+          auto* input = inputs[input_idx];
+          auto input_tensor_dims = input->dims();
+          auto input_image_shape = InitImageDimInfoWith(input_tensor_dims);
+          auto input_tensor_w = input_tensor_dims[input_tensor_dims.size() - 1];
+          auto* input_image_p = input->data<half_t, cl::Image2D>();
 
-#ifdef LITE_WITH_LOG
-    VLOG(4) << "concat input shape:  ";
-    for (size_t i = 0; i < inputs.size(); i++) {
-      VLOG(4) << "inputs [" << i << "]"
-              << "[" << inputs[i]->dims().size() << "D]:"
-              << "   dims:" << inputs[i]->dims()[0] << " "
-              << inputs[i]->dims()[1] << " " << inputs[i]->dims()[2] << " "
-              << inputs[i]->dims()[3];
-    }
+          size_t input_tensor_pre_w = 0;
+          for (size_t ii_idx = 0; ii_idx < input_idx; ++ii_idx) {
+            auto input_tensor_dims = inputs[ii_idx]->dims();
+            input_tensor_pre_w +=
+                input_tensor_dims[input_tensor_dims.size() - 1];
+          }
 
-    VLOG(4) << "concat output shape:  ";
-    VLOG(4) << " out  dims:  "
-            << "[" << x_dims.size() << "D]:" << x_dims[0] << " " << x_dims[1]
-            << " " << x_dims[2] << " " << x_dims[3];
-    VLOG(4) << "axis_: " << axis_;
-    VLOG(4) << "flag_: " << flag_;
-#endif
+          int input_special_w = input_tensor_dims[output_tensor_dims.size() -
+                                                  2];  // not a good var name
 
-    auto global_work_size =
-        cl::NDRange{static_cast<cl::size_type>(x_dims[x_dims.size() - 1]),
-                    static_cast<cl::size_type>(image_shape["width"] /
-                                               x_dims[x_dims.size() - 1]),
-                    static_cast<cl::size_type>(image_shape["height"])};
+          const std::vector<size_t>& default_work_size = DefaultWorkSize(
+              input_tensor_dims,
+              DDim(std::vector<DDim::value_type>{
+                  static_cast<int64_t>(input_image_shape["width"]),
+                  static_cast<int64_t>(input_image_shape["height"])}));
+          cl::NDRange global_work_size =
+              cl::NDRange{static_cast<size_t>(default_work_size[0]),
+                          static_cast<size_t>(default_work_size[1]),
+                          static_cast<size_t>(default_work_size[2])};
+          cl_int status;
+          status = kernel.setArg(0, *input_image_p);
+          CL_CHECK_FATAL(status);
+          status = kernel.setArg(1, *output_image_p);
+          CL_CHECK_FATAL(status);
+          status = kernel.setArg(2, input_special_w);
+          CL_CHECK_FATAL(status);
+          status = kernel.setArg(3, input_tensor_pre_w);
+          CL_CHECK_FATAL(status);
+          status = kernel.setArg(4, output_tensor_w);
+          CL_CHECK_FATAL(status);
 
-#ifdef LITE_WITH_LOG
-    VLOG(4) << TargetToStr(param.output->target());
-    VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
-            << image_shape["height"];
-    VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
-            << x_dims[1] << " " << x_dims[2] << " " << x_dims[3]
-            << "x_dims[x_dims.size() - 1]" << x_dims[x_dims.size() - 1];
-    VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
-            << y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
-    VLOG(4) << "width_: " << width_ << ", flag_: " << flag_;
-    VLOG(4) << "global_work_size: " << x_dims[x_dims.size() - 1] << "  "
-            << (image_shape["width"] / x_dims[x_dims.size() - 1]) << "  "
-            << (image_shape["height"]);
-#endif
+          status = EnqueueNDRangeKernel(context,
+                                        kernel,
+                                        cl::NullRange,
+                                        global_work_size,
+                                        cl::NullRange,
+                                        nullptr,
+                                        event_);
+          CL_CHECK_FATAL(status);
+        }
+      } else {
+        size_t output_image_height_start = 0;  // output image height start
+        for (size_t input_idx = 0; input_idx < inputs.size(); ++input_idx) {
+          auto* input = inputs[input_idx];
+          auto input_tensor_dims = input->dims();
+          auto input_image_shape = InitImageDimInfoWith(input_tensor_dims);
+          auto input_tensor_w = input_tensor_dims[input_tensor_dims.size() - 1];
+          auto* input_image_p = input->data<half_t, cl::Image2D>();
 
-    auto kernel = context.cl_context()->GetKernel(kernel_key.str());
-    int out_w = x_dims[x_dims.size() - 1];
-    int out_c = x_dims[1];
-    if (inputs.size() == 2) {
-      auto* x_buf0 = inputs[0]->data<half_t, cl::Image2D>();
-      auto* x_buf1 = inputs[1]->data<half_t, cl::Image2D>();
-      cl_int status = kernel.setArg(arg_idx, *x_buf0);
-      CL_CHECK_FATAL(status);
-      status = kernel.setArg(++arg_idx, *x_buf1);
-      CL_CHECK_FATAL(status);
-      status = kernel.setArg(++arg_idx, *out_buf);
+          const std::vector<size_t>& default_work_size = DefaultWorkSize(
+              input_tensor_dims,
+              DDim(std::vector<DDim::value_type>{
+                  static_cast<int64_t>(input_image_shape["width"]),
+                  static_cast<int64_t>(input_image_shape["height"])}));
+          cl::NDRange global_work_size =
+              cl::NDRange{static_cast<size_t>(default_work_size[0]),
+                          static_cast<size_t>(default_work_size[1]),
+                          static_cast<size_t>(default_work_size[2])};
+
+          cl_int status;
+          status = kernel.setArg(0, *input_image_p);
+          CL_CHECK_FATAL(status);
+          status = kernel.setArg(1, *output_image_p);
+          CL_CHECK_FATAL(status);
+          status = kernel.setArg(2, output_tensor_w);
+          CL_CHECK_FATAL(status);
+          status = kernel.setArg(3, output_image_height_start);
+          CL_CHECK_FATAL(status);
+
+          status = EnqueueNDRangeKernel(context,
+                                        kernel,
+                                        cl::NullRange,
+                                        global_work_size,
+                                        cl::NullRange,
+                                        nullptr,
+                                        event_);
+          CL_CHECK_FATAL(status);
+
+          // compute new output_image_height_start
+          if (output_tensor_dims.size() == 3) {
+            output_image_height_start += input_tensor_dims[1];
+          } else if (output_tensor_dims.size() == 2) {
+            output_image_height_start += input_tensor_dims[0];
+          }
+        }
+      }
+    } else if (kernel_func_name_ == "concatByCWith2Inputs" ||
+               kernel_func_name_ == "concatByCWith3Inputs" ||
+               kernel_func_name_ == "concatByCWith4Inputs") {
+      auto* input0 = inputs[0];
+      auto* input0_image_p = input0->data<half_t, cl::Image2D>();
+      size_t input0_tensor_c = input0->dims()[1];
+
+      auto* input1 = inputs.size() >= 2 ? inputs[1] : nullptr;
+      auto* input1_image_p =
+          input1 ? input1->data<half_t, cl::Image2D>() : nullptr;
+      size_t input1_tensor_c = input1 ? input1->dims()[1] : -1;
+
+      auto* input2 = inputs.size() >= 3 ? inputs[2] : nullptr;
+      auto* input2_image_p =
+          input2 ? input2->data<half_t, cl::Image2D>() : nullptr;
+      size_t input2_tensor_c = input2 ? input2->dims()[1] : -1;
+
+      auto* input3 = inputs.size() >= 4 ? inputs[3] : nullptr;
+      auto* input3_image_p =
+          input3 ? input3->data<half_t, cl::Image2D>() : nullptr;
+      size_t input3_tensor_c = input3 ? input3->dims()[1] : -1;
+
+      LOG(INFO) << "input0_image_p:" << input0_image_p;
+      LOG(INFO) << "input0_tensor_c:" << input0_tensor_c;
+      LOG(INFO) << "input1_image_p:" << input1_image_p;
+      LOG(INFO) << "input1_tensor_c:" << input1_tensor_c;
+      LOG(INFO) << "input2_image_p:" << input2_image_p;
+      LOG(INFO) << "input2_tensor_c:" << input2_tensor_c;
+      LOG(INFO) << "input3_image_p:" << input3_image_p;
+      LOG(INFO) << "input3_tensor_c:" << input3_tensor_c;
+
+      const std::vector<size_t>& default_work_size = DefaultWorkSize(
+          output_tensor_dims,
+          DDim(std::vector<DDim::value_type>{
+              static_cast<int64_t>(output_image_shape["width"]),
+              static_cast<int64_t>(output_image_shape["height"])}));
+      cl::NDRange global_work_size =
+          cl::NDRange{static_cast<size_t>(default_work_size[0]),
+                      static_cast<size_t>(default_work_size[1]),
+                      static_cast<size_t>(default_work_size[2])};
+
+      cl_int status;
+      status = kernel.setArg(0, *output_image_p);
       CL_CHECK_FATAL(status);
-      status = kernel.setArg(++arg_idx, flag_);
+      status = kernel.setArg(
+          1, static_cast<size_t>(output_tensor_dims[1]));  // output_tensor_c
       CL_CHECK_FATAL(status);
-      status =
-          kernel.setArg(++arg_idx, static_cast<int>(inputs[0]->dims()[axis_]));
+      status = kernel.setArg(
+          2, static_cast<size_t>(output_tensor_dims[3]));  // output_tensor_w
       CL_CHECK_FATAL(status);
-      status = kernel.setArg(++arg_idx, out_c);
+      status = kernel.setArg(3, *input0_image_p);
       CL_CHECK_FATAL(status);
-      status = kernel.setArg(++arg_idx, out_w);
+      status = kernel.setArg(4, input0_tensor_c);
       CL_CHECK_FATAL(status);
-      status = kernel.setArg(++arg_idx, width_);
+      status = kernel.setArg(5, *input1_image_p);
       CL_CHECK_FATAL(status);
-
-      status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
-          kernel,
-          cl::NullRange,
-          global_work_size,
-          cl::NullRange,
-          nullptr,
-          nullptr);
+      status = kernel.setArg(6, input1_tensor_c);
       CL_CHECK_FATAL(status);
-    } else {
-      auto start = 0;
-      for (int i = 0; i < inputs.size(); i++) {
-        arg_idx = 0;
-        auto in_dims = inputs[i]->dims();
-        image_shape = InitImageDimInfoWith(in_dims);
-        auto* x_buf = inputs[i]->data<half_t, cl::Image2D>();
-        int in_w = in_dims[in_dims.size() - 1];
-#ifdef LITE_WITH_LOG
-        VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
-                << image_shape["height"];
-#endif
-        global_work_size =
-            cl::NDRange{static_cast<cl::size_type>(in_dims[in_dims.size() - 1]),
-                        static_cast<cl::size_type>(image_shape["width"] /
-                                                   in_dims[in_dims.size() - 1]),
-                        static_cast<cl::size_type>(image_shape["height"])};
-        cl_int status = kernel.setArg(arg_idx, *x_buf);
-        CL_CHECK_FATAL(status);
-        status = kernel.setArg(++arg_idx, *out_buf);
-        CL_CHECK_FATAL(status);
-        status = kernel.setArg(++arg_idx, flag_);
-        CL_CHECK_FATAL(status);
-        status = kernel.setArg(++arg_idx, start);
+      if (inputs.size() >= 3) {
+        status = kernel.setArg(7, *input2_image_p);
         CL_CHECK_FATAL(status);
-        status = kernel.setArg(++arg_idx, out_c);
+        status = kernel.setArg(8, input2_tensor_c);
         CL_CHECK_FATAL(status);
-        status = kernel.setArg(++arg_idx, out_w);
-        CL_CHECK_FATAL(status);
-        status = kernel.setArg(++arg_idx, in_w);
-        CL_CHECK_FATAL(status);
-        status = kernel.setArg(++arg_idx, width_);
+      }
+      if (inputs.size() == 4) {
+        status = kernel.setArg(9, *input3_image_p);
         CL_CHECK_FATAL(status);
+        status = kernel.setArg(10, input3_tensor_c);
         CL_CHECK_FATAL(status);
-
-        status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
-            kernel,
-            cl::NullRange,
-            global_work_size,
-            cl::NullRange,
-            nullptr,
-            nullptr);
-        CL_CHECK_FATAL(status);
-        start += inputs[i]->dims()[axis_];
       }
+      status = EnqueueNDRangeKernel(context,
+                                    kernel,
+                                    cl::NullRange,
+                                    global_work_size,
+                                    cl::NullRange,
+                                    nullptr,
+                                    event_);
+      CL_CHECK_FATAL(status);
+    } else {
+      LOG(FATAL) << "Unsupported kernel func name: " << kernel_func_name_;
     }
   }
 
diff --git a/lite/kernels/opencl/conv_image_compute.cc b/lite/kernels/opencl/conv_image_compute.cc
index f53c464e99..3494c727eb 100644
--- a/lite/kernels/opencl/conv_image_compute.cc
+++ b/lite/kernels/opencl/conv_image_compute.cc
@@ -185,47 +185,29 @@ void ConvImageCompute::PrepareForRun() {
 
     impl_ = &ConvImageCompute::DepthwiseConv2d;
   } else if (filter_tensor_h_ == 3 && filter_tensor_w_ == 3) {
-// #define CONV3x3OPT_FALL_BACK
-#ifndef CONV3x3OPT_FALL_BACK
     // conv2d_3x3
-    kernel_func_names_.push_back(input_tensor_n_ > 1 ? "conv2d_3x3_multi_batch"
-                                                     : "conv2d_3x3_opt");
-    kernel_func_paths_.push_back("image/conv2d_3x3_opt_kernel.cl");
-
-    CLImageConverterFolder converter;
-    const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
-    filter_image_h_ = filter_image_dims[1];
-    filter_image_w_ = filter_image_dims[0];
-    tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4});
-
-    half_t* filter_image_data =
-        tensor_hold_filter_image_->mutable_data<half_t>();
-
-    converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims);
-    filter_gpu_image_->mutable_data<half_t, cl::Image2D>(
-        filter_image_w_, filter_image_h_, filter_image_data);
-
-    impl_ = &ConvImageCompute::Conv2d3x3opt;
-#else
-    kernel_func_names_.push_back("conv2d_3x3");
-    kernel_func_paths_.push_back("image/conv2d_3x3_kernel.cl");
+    if (groups_ == 1) {
+      kernel_func_names_.push_back(
+          input_tensor_n_ > 1 ? "conv2d_3x3_multi_batch" : "conv2d_3x3_opt");
+      kernel_func_paths_.push_back("image/conv2d_3x3_opt_kernel.cl");
+      impl_ = &ConvImageCompute::Conv2d3x3opt;
+    } else {  // groups_ > 1
+      kernel_func_names_.push_back("conv2d_3x3");
+      kernel_func_paths_.push_back("image/conv2d_3x3_kernel.cl");
+      impl_ = &ConvImageCompute::Conv2d3x3;
+    }
 
     CLImageConverterFolder converter;
     const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
     filter_image_h_ = filter_image_dims[1];
     filter_image_w_ = filter_image_dims[0];
     tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4});
-
     half_t* filter_image_data =
         tensor_hold_filter_image_->mutable_data<half_t>();
 
     converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims);
     filter_gpu_image_->mutable_data<half_t, cl::Image2D>(
         filter_image_w_, filter_image_h_, filter_image_data);
-
-    impl_ = &ConvImageCompute::Conv2d3x3;
-#endif
-#undef CONV3x3OPT_FALL_BACK
   } else if (filter_tensor_h_ == 5 && filter_tensor_w_ == 5) {
 #define CONV_5x5_OPT
 #ifndef CONV_5x5_OPT
@@ -584,6 +566,11 @@ void ConvImageCompute::GetGlobalWorkSize() {
                                     static_cast<size_t>(w_blk_),
                                     static_cast<size_t>(nh_blk_)};
     input_c_block_ = static_cast<const int>((input_tensor_c_ + 3) / 4);
+  } else if (kernel_func_names_[0] == "conv2d_3x3") {
+    global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
+                                    static_cast<size_t>(w_blk_),
+                                    static_cast<size_t>(nh_blk_)};
+
   } else if (kernel_func_names_[0] == "conv2d_3x3_multi_batch" ||
              kernel_func_names_[0] == "conv2d_3x3_opt") {
     int w_blk_size = 5;
diff --git a/lite/kernels/opencl/test_helper.h b/lite/kernels/opencl/test_helper.h
index a1b875688e..2df88758d0 100644
--- a/lite/kernels/opencl/test_helper.h
+++ b/lite/kernels/opencl/test_helper.h
@@ -19,7 +19,7 @@
 #define COMPUTE_RELATIVE_DIFF(res0, res1) abs(abs(res0 - res1) / (res1 + 1e-5))
 
 #define IS_DIFF_PASSED(res0, res1, threshold)        \
-  (((COMPTUE_ABS_DIFF(res0, res1) < threshold) ||    \
+  (((COMPUTE_ABS_DIFF(res0, res1) < threshold) ||    \
     (COMPUTE_RELATIVE_DIFF(res0, res1) < threshold)) \
        ? true                                        \
        : false)
-- 
GitLab