[opencl] expand opencl kernel & unit test (#3742)

* [OPENCL] develop pixel_shuffle opencl kernel & unit test ,test=develop * [OPENCL] develop pixel_shuffle opencl kernel & unit test ,test=develop * [OPENCL] develop pixel_shuffle opencl kernel & unit test ,test=develop * [OPENCL] develop pixel_shuffle opencl kernel & unit test ,test=develop * [OPENCL] develop pixel_shuffle opencl kernel & unit test ,test=develop * [OPENCL] develop pixel_shuffle opencl kernel & unit test ,test=develop * [OPENCL] develop pixel_shuffle opencl kernel & unit test ,test=develop * [OPENCL] develop pixel_shuffle opencl kernel & unit test ,test=develop * [OPENCL] develop expand opencl kernel & unit test ,test=develop * [OPENCL] develop expand opencl kernel & unit test ,test=develop

[opencl] expand opencl kernel & unit test (#3742)
* [OPENCL] develop pixel_shuffle opencl kernel & unit test ,test=develop * [OPENCL] develop pixel_shuffle opencl kernel & unit test ,test=develop * [OPENCL] develop pixel_shuffle opencl kernel & unit test ,test=develop * [OPENCL] develop pixel_shuffle opencl kernel & unit test ,test=develop * [OPENCL] develop pixel_shuffle opencl kernel & unit test ,test=develop * [OPENCL] develop pixel_shuffle opencl kernel & unit test ,test=develop * [OPENCL] develop pixel_shuffle opencl kernel & unit test ,test=develop * [OPENCL] develop pixel_shuffle opencl kernel & unit test ,test=develop * [OPENCL] develop expand opencl kernel & unit test ,test=develop * [OPENCL] develop expand opencl kernel & unit test ,test=develop
fbffa674 · xiebaiyuan · GitHub · 154021ad · fbffa674 · fbffa674
4 changed files
--- a/lite/backends/opencl/cl_kernel/image/expand_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/expand_kernel.cl
+#include <cl_common.h>
+/*
+ * expend_c1: expand kernel used when the input has a single channel, i.e.
+ * only lane x of each RGBA texel carries data.
+ *
+ * One work-item produces one output texel and is addressed by
+ *   (get_global_id(0), get_global_id(1), get_global_id(2))
+ *     = (out_c, out_w, out_nh).
+ * Work-items outside OUT_C x OUT_W x OUT_NH return without writing.
+ * The source texel is located by integer division of the output w / h / n
+ * index by w_times / h_times / n_times. Lanes y, z and w are zeroed before
+ * the texel is written.
+ *
+ * IN_C, IN_W, IN_NH, input_width, output_width and c_times are part of the
+ * argument list but are not read by this kernel.
+ *
+ * NOTE(review): mapping with `out / times` repeats each element `times`
+ * times in a row (a a b b) rather than tiling the tensor (a b a b) -
+ * confirm this is the semantics the expand op is expected to have.
+ */
+__kernel void expend_c1(__private const int OUT_C,
+                        __private const int OUT_W,
+                        __private const int OUT_NH,
+                        __private const int IN_C,
+                        __private const int IN_W,
+                        __private const int IN_NH,
+                        __private const int input_width,  /* of one block */
+                        __private const int input_height, /* of one block */
+                        __private const int output_width,
+                        __private const int output_height,
+                        __read_only image2d_t input,
+                        __write_only image2d_t output,
+                        __private const int n_times,
+                        __private const int c_times,
+                        __private const int h_times,
+                        __private const int w_times) {
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+  if (out_c >= OUT_C || out_w >= OUT_W || out_nh >= OUT_NH) {
+    return;
+  }
+  /* Split the fused (n, h) row index of the output image. */
+  const int out_n = out_nh / output_height;
+  const int out_h = out_nh % output_height;
+  /* Map the output position back to the input position. */
+  const int in_w = out_w / w_times;
+  const int in_h = out_h / h_times;
+  const int in_n = out_n / n_times;
+  const int in_nh = in_n * input_height + in_h;
+  int2 output_pos = (int2)(out_c * OUT_W + out_w, out_nh);
+  int2 input_pos = (int2)(in_w, in_nh);
+  /* The image is read with integer texel coordinates, which requires a
+   * sampler with unnormalized coordinates; results are undefined otherwise. */
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos);
+  /* Only lane x is kept; clear the remaining lanes. */
+  in.y = 0;
+  in.z = 0;
+  in.w = 0;
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, in);
+}
+/*
+ * expend_c2: expand kernel used when the input has two channels, i.e.
+ * lanes x and y of each RGBA texel carry data.
+ *
+ * One work-item produces one output texel and is addressed by
+ *   (get_global_id(0), get_global_id(1), get_global_id(2))
+ *     = (out_c, out_w, out_nh).
+ * Work-items outside OUT_C x OUT_W x OUT_NH return without writing.
+ * The source texel is located by integer division of the output w / h / n
+ * index by w_times / h_times / n_times. Lanes z and w are zeroed before the
+ * texel is written.
+ *
+ * IN_C, IN_W, IN_NH, input_width, output_width and c_times are part of the
+ * argument list but are not read by this kernel.
+ *
+ * NOTE(review): mapping with `out / times` repeats each element `times`
+ * times in a row (a a b b) rather than tiling the tensor (a b a b) -
+ * confirm this is the semantics the expand op is expected to have.
+ */
+__kernel void expend_c2(__private const int OUT_C,
+                        __private const int OUT_W,
+                        __private const int OUT_NH,
+                        __private const int IN_C,
+                        __private const int IN_W,
+                        __private const int IN_NH,
+                        __private const int input_width,  /* of one block */
+                        __private const int input_height, /* of one block */
+                        __private const int output_width,
+                        __private const int output_height,
+                        __read_only image2d_t input,
+                        __write_only image2d_t output,
+                        __private const int n_times,
+                        __private const int c_times,
+                        __private const int h_times,
+                        __private const int w_times) {
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+  if (out_c >= OUT_C || out_w >= OUT_W || out_nh >= OUT_NH) {
+    return;
+  }
+  /* Split the fused (n, h) row index of the output image. */
+  const int out_n = out_nh / output_height;
+  const int out_h = out_nh % output_height;
+  /* Map the output position back to the input position. */
+  const int in_w = out_w / w_times;
+  const int in_h = out_h / h_times;
+  const int in_n = out_n / n_times;
+  const int in_nh = in_n * input_height + in_h;
+  int2 output_pos = (int2)(out_c * OUT_W + out_w, out_nh);
+  int2 input_pos = (int2)(in_w, in_nh);
+  /* The image is read with integer texel coordinates, which requires a
+   * sampler with unnormalized coordinates; results are undefined otherwise. */
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos);
+  /* Only lanes x and y are kept; clear the remaining lanes. */
+  in.z = 0;
+  in.w = 0;
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, in);
+}
+/*
+ * expend_c3: expand kernel used when the input has three channels, i.e.
+ * lanes x, y and z of each RGBA texel carry data.
+ *
+ * One work-item produces one output texel and is addressed by
+ *   (get_global_id(0), get_global_id(1), get_global_id(2))
+ *     = (out_c, out_w, out_nh).
+ * Work-items outside OUT_C x OUT_W x OUT_NH return without writing.
+ * The source texel is located by integer division of the output w / h / n
+ * index by w_times / h_times / n_times. Lane w is zeroed before the texel is
+ * written.
+ *
+ * IN_C, IN_W, IN_NH, input_width, output_width and c_times are part of the
+ * argument list but are not read by this kernel.
+ *
+ * NOTE(review): mapping with `out / times` repeats each element `times`
+ * times in a row (a a b b) rather than tiling the tensor (a b a b) -
+ * confirm this is the semantics the expand op is expected to have.
+ */
+__kernel void expend_c3(__private const int OUT_C,
+                        __private const int OUT_W,
+                        __private const int OUT_NH,
+                        __private const int IN_C,
+                        __private const int IN_W,
+                        __private const int IN_NH,
+                        __private const int input_width,  /* of one block */
+                        __private const int input_height, /* of one block */
+                        __private const int output_width,
+                        __private const int output_height,
+                        __read_only image2d_t input,
+                        __write_only image2d_t output,
+                        __private const int n_times,
+                        __private const int c_times,
+                        __private const int h_times,
+                        __private const int w_times) {
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+  if (out_c >= OUT_C || out_w >= OUT_W || out_nh >= OUT_NH) {
+    return;
+  }
+  /* Split the fused (n, h) row index of the output image. */
+  const int out_n = out_nh / output_height;
+  const int out_h = out_nh % output_height;
+  /* Map the output position back to the input position. */
+  const int in_w = out_w / w_times;
+  const int in_h = out_h / h_times;
+  const int in_n = out_n / n_times;
+  const int in_nh = in_n * input_height + in_h;
+  int2 output_pos = (int2)(out_c * OUT_W + out_w, out_nh);
+  int2 input_pos = (int2)(in_w, in_nh);
+  /* The image is read with integer texel coordinates, which requires a
+   * sampler with unnormalized coordinates; results are undefined otherwise. */
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos);
+  /* Lanes x, y and z are kept; clear the unused fourth lane. */
+  in.w = 0;
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, in);
+}
+/*
+ * expend_c4: expand kernel used when the input has four channels, i.e. all
+ * four lanes of each RGBA texel carry data.
+ *
+ * One work-item produces one output texel and is addressed by
+ *   (get_global_id(0), get_global_id(1), get_global_id(2))
+ *     = (out_c, out_w, out_nh).
+ * Work-items outside OUT_C x OUT_W x OUT_NH return without writing.
+ * The source texel is located by integer division of the output w / h / n
+ * index by w_times / h_times / n_times and is copied unchanged.
+ *
+ * IN_C, IN_W, IN_NH, input_width, output_width and c_times are part of the
+ * argument list but are not read by this kernel.
+ *
+ * NOTE(review): mapping with `out / times` repeats each element `times`
+ * times in a row (a a b b) rather than tiling the tensor (a b a b) -
+ * confirm this is the semantics the expand op is expected to have.
+ */
+__kernel void expend_c4(__private const int OUT_C,
+                        __private const int OUT_W,
+                        __private const int OUT_NH,
+                        __private const int IN_C,
+                        __private const int IN_W,
+                        __private const int IN_NH,
+                        __private const int input_width,  /* of one block */
+                        __private const int input_height, /* of one block */
+                        __private const int output_width,
+                        __private const int output_height,
+                        __read_only image2d_t input,
+                        __write_only image2d_t output,
+                        __private const int n_times,
+                        __private const int c_times,
+                        __private const int h_times,
+                        __private const int w_times) {
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+  if (out_c >= OUT_C || out_w >= OUT_W || out_nh >= OUT_NH) {
+    return;
+  }
+  /* Split the fused (n, h) row index of the output image. */
+  const int out_n = out_nh / output_height;
+  const int out_h = out_nh % output_height;
+  /* Map the output position back to the input position. */
+  const int in_w = out_w / w_times;
+  const int in_h = out_h / h_times;
+  const int in_n = out_n / n_times;
+  const int in_nh = in_n * input_height + in_h;
+  int2 output_pos = (int2)(out_c * OUT_W + out_w, out_nh);
+  int2 input_pos = (int2)(in_w, in_nh);
+  /* The image is read with integer texel coordinates, which requires a
+   * sampler with unnormalized coordinates; results are undefined otherwise. */
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos);
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, in);
+}
+/*
+ * expend_cn: expand kernel used for every channel count not handled by
+ * expend_c1 .. expend_c4. The channel-block index is carried over unchanged
+ * from output to input.
+ *
+ * One work-item produces one output texel and is addressed by
+ *   (get_global_id(0), get_global_id(1), get_global_id(2))
+ *     = (out_c, out_w, out_nh).
+ * Work-items outside OUT_C x OUT_W x OUT_NH return without writing.
+ * The source texel is located by integer division of the output w / h / n
+ * index by w_times / h_times / n_times, offset by in_c * IN_W along x, and
+ * is copied unchanged.
+ *
+ * IN_C, IN_NH, input_width, output_width and c_times are part of the
+ * argument list but are not read by this kernel.
+ *
+ * NOTE(review): mapping with `out / times` repeats each element `times`
+ * times in a row (a a b b) rather than tiling the tensor (a b a b) -
+ * confirm this is the semantics the expand op is expected to have.
+ */
+__kernel void expend_cn(__private const int OUT_C,
+                        __private const int OUT_W,
+                        __private const int OUT_NH,
+                        __private const int IN_C,
+                        __private const int IN_W,
+                        __private const int IN_NH,
+                        __private const int input_width,  /* of one block */
+                        __private const int input_height, /* of one block */
+                        __private const int output_width,
+                        __private const int output_height,
+                        __read_only image2d_t input,
+                        __write_only image2d_t output,
+                        __private const int n_times,
+                        __private const int c_times,
+                        __private const int h_times,
+                        __private const int w_times) {
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+  if (out_c >= OUT_C || out_w >= OUT_W || out_nh >= OUT_NH) {
+    return;
+  }
+  /* Split the fused (n, h) row index of the output image. */
+  const int out_n = out_nh / output_height;
+  const int out_h = out_nh % output_height;
+  /* Map the output position back to the input position; the channel block
+   * is the same on both sides. */
+  const int in_c = out_c;
+  const int in_w = out_w / w_times;
+  const int in_h = out_h / h_times;
+  const int in_n = out_n / n_times;
+  const int in_nh = in_n * input_height + in_h;
+  int2 output_pos = (int2)(out_c * OUT_W + out_w, out_nh);
+  int2 input_pos = (int2)(in_c * IN_W + in_w, in_nh);
+  /* The image is read with integer texel coordinates, which requires a
+   * sampler with unnormalized coordinates; results are undefined otherwise. */
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos);
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, in);
+}
\ No newline at end of file
--- a/lite/kernels/opencl/CMakeLists.txt
+++ b/lite/kernels/opencl/CMakeLists.txt
@@ -35,6 +35,8 @@ add_kernel(dropout_opencl OPENCL basic SRCS dropout_image_compute.cc DEPS ${cl_k
 add_kernel(pad2d_opencl OPENCL basic SRCS pad2d_image_compute.cc DEPS ${cl_kernel_deps})
 add_kernel(box_coder_opencl OPENCL basic SRCS box_coder_image_compute.cc DEPS ${cl_kernel_deps})
 add_kernel(pixel_shuffle_opencl OPENCL basic SRCS pixel_shuffle_image_compute.cc DEPS ${cl_kernel_deps})
+add_kernel(expand_opencl OPENCL basic SRCS expand_image_compute.cc DEPS ${cl_kernel_deps})  # image2d expand kernel, built from expand_image_compute.cc
 # extra
 # wait to add ...
@@ -77,6 +79,9 @@ lite_cc_test(test_layout_image_opencl SRCS layout_image_compute_test.cc
 lite_cc_test(test_pixel_shuffle_image_opencl SRCS pixel_shuffle_image_compute_test.cc
             DEPS pixel_shuffle_opencl op_registry program context)
+lite_cc_test(test_expand_image_opencl SRCS expand_image_compute_test.cc
+             DEPS expand_opencl op_registry program context)  # unit test for the expand_opencl kernel
 lite_cc_test(test_elementwise_add_image_opencl SRCS elementwise_add_image_compute_test.cc
             DEPS elementwise_add_opencl fusion_elementwise_add_activation_opencl op_registry program context)
 lite_cc_test(test_elementwise_sub_image_opencl SRCS elementwise_sub_image_compute_test.cc

--- a/lite/kernels/opencl/expand_image_compute.cc
+++ b/lite/kernels/opencl/expand_image_compute.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <vector>
+#include "lite/backends/opencl/cl_half.h"
+#include "lite/backends/opencl/cl_include.h"
+#include "lite/core/kernel.h"
+#include "lite/core/op_registry.h"
+#include "lite/kernels/opencl/image_helper.h"
+#include "lite/operators/op_params.h"
+#include "lite/utils/replace_stl/stream.h"
+#include "lite/utils/string.h"
+#ifdef LITE_WITH_PROFILE
+#include "lite/core/profile/profiler.h"
+#endif
+#include "lite/backends/opencl/cl_utility.h"
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace opencl {
+// OpenCL image2d (kFP16, kImageDefault) kernel for the `expand` op.
+//
+// - PrepareForRun() validates the parameters, picks one of the
+//   `expend_c1/c2/c3/c4/cn` kernels from image/expand_kernel.cl according to
+//   the input channel count and fetches the built cl::Kernel.
+// - ReInitWhenNeeded() recomputes the output image shape and the global work
+//   size on the first call and whenever the input dims change.
+// - Run() binds the 16 kernel arguments and enqueues the kernel.
+class ExpandComputeImage2D : public KernelLite<TARGET(kOpenCL),
+                                               PRECISION(kFP16),
+                                               DATALAYOUT(kImageDefault)> {
+ public:
+  using param_t = operators::ExpandParam;
+  std::string doc() const override { return "expand using cl::Image2D, kFP16"; }
+  // Validates the op parameters and loads the kernel matching the input
+  // channel count. Only 4-D inputs with expand_times[1] == 1 are accepted.
+  void PrepareForRun() override {
+    expand_param_ = param_.get_mutable<param_t>();
+    const auto& expand_times = expand_param_->expand_times;
+    const auto in_dims = expand_param_->X->dims();
+    CHECK(in_dims.size() == 4) << "expand image now only support indims size 4";
+    CHECK(expand_times.size() == 4)
+        << "expand image now only support in_expand_timesdims size 4";
+    CHECK(expand_times[1] == 1) << "expand image do not support expend c now";
+    // One kernel per channel count 1..4 (a single RGBA texel block) and a
+    // generic one otherwise. The split is kept so that expanding along C can
+    // be added later.
+    switch (in_dims[1]) {
+      case 1:
+        kernel_func_name_ = "expend_c1";
+        break;
+      case 2:
+        kernel_func_name_ = "expend_c2";
+        break;
+      case 3:
+        kernel_func_name_ = "expend_c3";
+        break;
+      case 4:
+        kernel_func_name_ = "expend_c4";
+        break;
+      default:
+        kernel_func_name_ = "expend_cn";
+        break;
+    }
+    VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
+    auto& context = ctx_->As<OpenCLContext>();
+    context.cl_context()->AddKernel(kernel_func_name_,
+                                    "image/expand_kernel.cl",
+                                    build_options_,
+                                    time_stamp_);
+    // The kernel is looked up by name + build options + time stamp.
+    STL::stringstream kernel_key;
+    kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
+    kernel_ = context.cl_context()->GetKernel(kernel_key.str());
+  }
+  // Refreshes the cached output image shape and global work size on the
+  // first call and every time the input dims differ from the last call.
+  void ReInitWhenNeeded() override {
+    VLOG(1) << "ReInitWhenNeeded:  " << kernel_func_name_;
+    const auto x_dims = expand_param_->X->dims();
+    const auto out_dims = expand_param_->Out->dims();
+    const auto& expand_times = expand_param_->expand_times;
+    VLOG(1) << "x_dims:  " << x_dims;
+    VLOG(1) << "out_dims:  " << out_dims;
+    VLOG(1) << "expand_times:  " << expand_times[0] << " " << expand_times[1]
+            << " " << expand_times[2] << " " << expand_times[3];
+    if (first_epoch_for_reinit_ || x_dims != last_x_dims_) {
+      last_x_dims_ = x_dims;
+      first_epoch_for_reinit_ = false;
+      // compute image shape
+      paddle::lite::CLImageConverterDefault default_convertor;
+      out_img_shape_ = default_convertor.InitImageDimInfoWith(out_dims);
+      VLOG(1) << "out_img_shape_:  " << out_img_shape_[0] << "  "
+              << out_img_shape_[1];
+      // compute global work size: (channel blocks of 4, W, N * H)
+      const size_t work_size_0 = static_cast<size_t>((out_dims[1] + 3) / 4);
+      const size_t work_size_1 = static_cast<size_t>(out_dims[3]);
+      const size_t work_size_2 =
+          static_cast<size_t>(out_dims[0] * out_dims[2]);
+      global_work_size_ = cl::NDRange{work_size_0, work_size_1, work_size_2};
+      VLOG(1) << "global_work_size_:  " << global_work_size_[0] << " "
+              << global_work_size_[1] << " " << global_work_size_[2];
+    }
+  }
+  // Binds the kernel arguments in the order of the `expend_*` kernel
+  // signature and enqueues the kernel over global_work_size_.
+  void Run() override {
+    auto* x_img = expand_param_->X->data<half_t, cl::Image2D>();
+    auto* out_img = expand_param_->Out->mutable_data<half_t, cl::Image2D>(
+        out_img_shape_[0], out_img_shape_[1]);
+    const auto& expand_times = expand_param_->expand_times;
+    const auto x_dims = expand_param_->X->dims();
+    const auto out_dims = expand_param_->Out->dims();
+    const int in_h = static_cast<int>(x_dims[2]);
+    const int in_w = static_cast<int>(x_dims[3]);
+    const int out_h = static_cast<int>(out_dims[2]);
+    const int out_w = static_cast<int>(out_dims[3]);
+    // Number of 4-channel blocks along C and fused N * H row counts.
+    const int out_c_block = static_cast<int>((out_dims[1] + 3) / 4);
+    const int out_nh = static_cast<int>(out_dims[0] * out_dims[2]);
+    const int in_c_block = static_cast<int>((x_dims[1] + 3) / 4);
+    const int in_nh = static_cast<int>(x_dims[0] * x_dims[2]);
+    const int expand_times_n = expand_times[0];
+    const int expand_times_c = expand_times[1];
+    const int expand_times_h = expand_times[2];
+    const int expand_times_w = expand_times[3];
+    auto& context = ctx_->As<OpenCLContext>();
+    CHECK(context.cl_context() != nullptr);
+    auto kernel = kernel_;
+    cl_int status;
+    status = kernel.setArg(0, out_c_block);  // OUT_C
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(1, out_w);  // OUT_W
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(2, out_nh);  // OUT_NH
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(3, in_c_block);  // IN_C
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(4, in_w);  // IN_W
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(5, in_nh);  // IN_NH
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(6, in_w);  // input_width
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(7, in_h);  // input_height
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(8, out_w);  // output_width
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(9, out_h);  // output_height
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(10, *x_img);  // input
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(11, *out_img);  // output
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(12, expand_times_n);  // n_times
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(13, expand_times_c);  // c_times
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(14, expand_times_h);  // h_times
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(15, expand_times_w);  // w_times
+    CL_CHECK_FATAL(status);
+    status = EnqueueNDRangeKernel(context,
+                                  kernel,
+                                  cl::NullRange,
+                                  global_work_size_,
+                                  cl::NullRange,
+                                  nullptr,
+                                  event_);
+    CL_CHECK_FATAL(status);
+  }
+#ifdef LITE_WITH_PROFILE
+  // Reports the selected kernel name and the last enqueue event to the
+  // profiler.
+  void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
+    ch->kernel_func_name = kernel_func_name_;
+    ch->cl_event =
+        event_;  // `event_` defined in `kernel.h`, valid after kernel::Run
+  }
+#endif
+ private:
+  std::string kernel_func_name_{};
+  std::string build_options_{"-DCL_DTYPE_half"};
+  std::string time_stamp_{GetTimeStamp()};
+  param_t* expand_param_{nullptr};
+  cl::Kernel kernel_;
+  // True until ReInitWhenNeeded() has run once.
+  bool first_epoch_for_reinit_{true};
+  // Input dims seen by the last ReInitWhenNeeded() that recomputed state.
+  DDim last_x_dims_;
+  DDim out_img_shape_ = DDim(std::vector<DDim::value_type>(
+      {static_cast<DDim::value_type>(1), static_cast<DDim::value_type>(1)}));
+  cl::NDRange global_work_size_ = cl::NDRange{
+      static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
+};
+}  // namespace opencl
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+REGISTER_LITE_KERNEL(expand,         // op type
+                     kOpenCL,        // target
+                     kFP16,          // precision
+                     kImageDefault,  // data layout
+                     paddle::lite::kernels::opencl::ExpandComputeImage2D,
+                     image2d)
+    .BindInput("X",  // input image: OpenCL / FP16 / image-default layout
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kImageDefault))})
+    .BindOutput("Out",  // output image: same target / precision / layout
+                {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                       PRECISION(kFP16),
+                                       DATALAYOUT(kImageDefault))})
+    .Finalize();
--- a/lite/kernels/opencl/expand_image_compute_test.cc
+++ b/lite/kernels/opencl/expand_image_compute_test.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <gtest/gtest.h>
+#include <cmath>
+#include <memory>
+#include <random>
+#include <vector>
+#include "lite/backends/opencl/target_wrapper.h"
+#include "lite/core/op_registry.h"
+#include "lite/core/tensor.h"
+#include "lite/kernels/opencl/test_helper.h"
+#define FP16_MAX_DIFF (5e-1)
+namespace paddle {
+namespace lite {
+// Runs the expand image2d kernel on a 1x1x2x3 input (values 0..5) with
+// expand_times {1, 1, 2, 3} and compares the 1x1x4x9 result against a
+// hard-coded reference within FP16_MAX_DIFF.
+TEST(expand_hw_image2d, compute) {
+  LOG(INFO) << "create kernel ...";
+  auto kernels = KernelRegistry::Global().Create(
+      "expand", TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault));
+  ASSERT_FALSE(kernels.empty());
+  const int INPUT_N = 1;
+  const int INPUT_C = 1;
+  const int INPUT_H = 2;
+  const int INPUT_W = 3;
+  const int EXPAND_N = 1;
+  const int EXPAND_C = 1;
+  const int EXPAND_H = 2;
+  const int EXPAND_W = 3;
+  auto kernel = std::move(kernels.front());
+  LOG(INFO) << "prepare to test kernel ====> " << kernel->doc();
+  lite::Tensor x, out;
+  operators::ExpandParam param;
+  param.X = &x;
+  param.Out = &out;
+  param.expand_times = {EXPAND_N, EXPAND_C, EXPAND_H, EXPAND_W};
+  std::unique_ptr<KernelContext> context(new KernelContext);
+  context->As<OpenCLContext>().InitOnce();
+  kernel->SetParam(param);
+  // The kernel gets its own context that shares the initialised CL state.
+  std::unique_ptr<KernelContext> expand_context(new KernelContext);
+  context->As<OpenCLContext>().CopySharedTo(
+      &(expand_context->As<OpenCLContext>()));
+  kernel->SetContext(std::move(expand_context));
+  const DDim in_dim =
+      DDim(std::vector<DDim::value_type>{INPUT_N, INPUT_C, INPUT_H, INPUT_W});
+  const DDim out_dim = DDim(std::vector<DDim::value_type>{INPUT_N * EXPAND_N,
+                                                          INPUT_C * EXPAND_C,
+                                                          INPUT_H * EXPAND_H,
+                                                          INPUT_W * EXPAND_W});
+  LOG(INFO) << "in_dim: " << in_dim;
+  LOG(INFO) << "expand_times: " << EXPAND_N << EXPAND_C << EXPAND_H << EXPAND_W;
+  LOG(INFO) << "out_dim: " << out_dim;
+  x.Resize(in_dim);
+  out.Resize(out_dim);
+  // Input is the sequence 0, 1, 2, ... so every element is identifiable.
+  std::vector<float> input_v(INPUT_N * INPUT_C * INPUT_H * INPUT_W);
+  int index = 0;
+  for (auto& i : input_v) {
+    i = index++;
+  }
+  VLOG(1) << "input_v ..... ";
+  for (size_t i = 0; i < input_v.size(); i++) {
+    VLOG(10) << input_v[i];
+  }
+  LOG(INFO) << "prepare input";
+  CLImageConverterDefault default_converter;
+  DDim x_image_shape = default_converter.InitImageDimInfoWith(in_dim);
+  LOG(INFO) << "x_image_shape = " << x_image_shape[0] << " "
+            << x_image_shape[1];
+  std::vector<half_t> x_image_data(x_image_shape.production() * 4);  // 4 : RGBA
+  default_converter.NCHWToImage(input_v.data(), x_image_data.data(), in_dim);
+  x.mutable_data<half_t, cl::Image2D>(
+      x_image_shape[0], x_image_shape[1], x_image_data.data());
+  VLOG(1) << "x_image_data ..... ";
+  for (size_t i = 0; i < x_image_data.size(); i++) {
+    VLOG(10) << Half2Float(x_image_data[i]);
+  }
+  DDim out_image_shape = default_converter.InitImageDimInfoWith(out_dim);
+  LOG(INFO) << "out_image_shape = " << out_image_shape[0] << " "
+            << out_image_shape[1];
+  auto* out_image = out.mutable_data<half_t, cl::Image2D>(out_image_shape[0],
+                                                          out_image_shape[1]);
+  kernel->Launch();
+  CLRuntime::Global()->command_queue().finish();
+  std::vector<float> out_data_v{0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0,
+                                1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4,
+                                5, 5, 5, 3, 3, 3, 4, 4, 4, 5, 5, 5};
+  // Guard the comparison loop below against reading past the reference.
+  ASSERT_EQ(out_data_v.size(), static_cast<size_t>(out_dim.production()));
+  const size_t cl_image2d_row_pitch{0};
+  const size_t cl_image2d_slice_pitch{0};
+  // Host buffers are owned by vectors so they are released on every exit
+  // path, including early returns from ASSERT_*.
+  std::vector<half_t> out_image_data(out_image_shape.production() * 4);
+  TargetWrapperCL::ImgcpySync(out_image_data.data(),
+                              out_image,
+                              out_image_shape[0],
+                              out_image_shape[1],
+                              cl_image2d_row_pitch,
+                              cl_image2d_slice_pitch,
+                              IoDirection::DtoH);
+  VLOG(1) << "out_image_data ..... ";
+  for (size_t i = 0; i < out_image_data.size(); i++) {
+    VLOG(10) << Half2Float(out_image_data[i]);
+  }
+  std::vector<float> out_data(out_image_shape.production() * 4);
+  default_converter.ImageToNCHW(
+      out_image_data.data(), out_data.data(), out_image_shape, out_dim);
+  VLOG(1) << "out_data ..... ";
+  for (int i = 0; i < out_dim.production(); i++) {
+    VLOG(10) << out_data[i];
+  }
+  for (int i = 0; i < out_dim.production(); i++) {
+    // std::fabs: an unqualified abs() may bind to the integer overload and
+    // truncate the difference.
+    const float abs_diff = std::fabs(out_data[i] - out_data_v[i]);
+    auto relative_diff = COMPUTE_RELATIVE_DIFF(out_data[i], out_data_v[i]);
+    const bool within_tolerance =
+        (relative_diff <= FP16_MAX_DIFF) || (abs_diff <= FP16_MAX_DIFF);
+    EXPECT_TRUE(within_tolerance);
+    if (!within_tolerance) {
+      LOG(ERROR) << "error idx:" << i << " out_data[" << i
+                 << "]:" << out_data[i] << " out_ref[" << i
+                 << "]:" << out_data_v[i] << " abs_diff:" << abs_diff
+                 << " relative_diff:" << relative_diff
+                 << " FP16_MAX_DIFF:" << FP16_MAX_DIFF;
+    }
+  }
+}
+// Runs the expand image2d kernel on a 1x2x2x3 input (values 0..11) with
+// expand_times {1, 1, 2, 1} and compares the 1x2x4x3 result against a
+// hard-coded reference within FP16_MAX_DIFF.
+TEST(expand_c2hw_image2d, compute) {
+  LOG(INFO) << "create kernel ...";
+  auto kernels = KernelRegistry::Global().Create(
+      "expand", TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault));
+  ASSERT_FALSE(kernels.empty());
+  const int INPUT_N = 1;
+  const int INPUT_C = 2;
+  const int INPUT_H = 2;
+  const int INPUT_W = 3;
+  const int EXPAND_N = 1;
+  const int EXPAND_C = 1;
+  const int EXPAND_H = 2;
+  const int EXPAND_W = 1;
+  auto kernel = std::move(kernels.front());
+  LOG(INFO) << "prepare to test kernel ====> " << kernel->doc();
+  lite::Tensor x, out;
+  operators::ExpandParam param;
+  param.X = &x;
+  param.Out = &out;
+  param.expand_times = {EXPAND_N, EXPAND_C, EXPAND_H, EXPAND_W};
+  std::unique_ptr<KernelContext> context(new KernelContext);
+  context->As<OpenCLContext>().InitOnce();
+  kernel->SetParam(param);
+  // The kernel gets its own context that shares the initialised CL state.
+  std::unique_ptr<KernelContext> expand_context(new KernelContext);
+  context->As<OpenCLContext>().CopySharedTo(
+      &(expand_context->As<OpenCLContext>()));
+  kernel->SetContext(std::move(expand_context));
+  const DDim in_dim =
+      DDim(std::vector<DDim::value_type>{INPUT_N, INPUT_C, INPUT_H, INPUT_W});
+  const DDim out_dim = DDim(std::vector<DDim::value_type>{INPUT_N * EXPAND_N,
+                                                          INPUT_C * EXPAND_C,
+                                                          INPUT_H * EXPAND_H,
+                                                          INPUT_W * EXPAND_W});
+  LOG(INFO) << "in_dim: " << in_dim;
+  LOG(INFO) << "expand_times: " << EXPAND_N << EXPAND_C << EXPAND_H << EXPAND_W;
+  LOG(INFO) << "out_dim: " << out_dim;
+  x.Resize(in_dim);
+  out.Resize(out_dim);
+  // Input is the sequence 0, 1, 2, ... so every element is identifiable.
+  std::vector<float> input_v(INPUT_N * INPUT_C * INPUT_H * INPUT_W);
+  int index = 0;
+  for (auto& i : input_v) {
+    i = index++;
+  }
+  VLOG(1) << "input_v ..... ";
+  for (size_t i = 0; i < input_v.size(); i++) {
+    VLOG(10) << input_v[i];
+  }
+  LOG(INFO) << "prepare input";
+  CLImageConverterDefault default_converter;
+  DDim x_image_shape = default_converter.InitImageDimInfoWith(in_dim);
+  LOG(INFO) << "x_image_shape = " << x_image_shape[0] << " "
+            << x_image_shape[1];
+  std::vector<half_t> x_image_data(x_image_shape.production() * 4);  // 4 : RGBA
+  default_converter.NCHWToImage(input_v.data(), x_image_data.data(), in_dim);
+  x.mutable_data<half_t, cl::Image2D>(
+      x_image_shape[0], x_image_shape[1], x_image_data.data());
+  VLOG(1) << "x_image_data ..... ";
+  for (size_t i = 0; i < x_image_data.size(); i++) {
+    VLOG(10) << Half2Float(x_image_data[i]);
+  }
+  DDim out_image_shape = default_converter.InitImageDimInfoWith(out_dim);
+  LOG(INFO) << "out_image_shape = " << out_image_shape[0] << " "
+            << out_image_shape[1];
+  auto* out_image = out.mutable_data<half_t, cl::Image2D>(out_image_shape[0],
+                                                          out_image_shape[1]);
+  kernel->Launch();
+  CLRuntime::Global()->command_queue().finish();
+  std::vector<float> out_data_v{0, 1, 2, 0, 1, 2, 3, 4,  5,  3, 4,  5,
+                                6, 7, 8, 6, 7, 8, 9, 10, 11, 9, 10, 11};
+  // Guard the comparison loop below against reading past the reference.
+  ASSERT_EQ(out_data_v.size(), static_cast<size_t>(out_dim.production()));
+  const size_t cl_image2d_row_pitch{0};
+  const size_t cl_image2d_slice_pitch{0};
+  // Host buffers are owned by vectors so they are released on every exit
+  // path, including early returns from ASSERT_*.
+  std::vector<half_t> out_image_data(out_image_shape.production() * 4);
+  TargetWrapperCL::ImgcpySync(out_image_data.data(),
+                              out_image,
+                              out_image_shape[0],
+                              out_image_shape[1],
+                              cl_image2d_row_pitch,
+                              cl_image2d_slice_pitch,
+                              IoDirection::DtoH);
+  VLOG(1) << "out_image_data ..... ";
+  for (size_t i = 0; i < out_image_data.size(); i++) {
+    VLOG(10) << Half2Float(out_image_data[i]);
+  }
+  std::vector<float> out_data(out_image_shape.production() * 4);
+  default_converter.ImageToNCHW(
+      out_image_data.data(), out_data.data(), out_image_shape, out_dim);
+  VLOG(1) << "out_data ..... ";
+  for (int i = 0; i < out_dim.production(); i++) {
+    VLOG(10) << out_data[i];
+  }
+  for (int i = 0; i < out_dim.production(); i++) {
+    // std::fabs: an unqualified abs() may bind to the integer overload and
+    // truncate the difference.
+    const float abs_diff = std::fabs(out_data[i] - out_data_v[i]);
+    auto relative_diff = COMPUTE_RELATIVE_DIFF(out_data[i], out_data_v[i]);
+    const bool within_tolerance =
+        (relative_diff <= FP16_MAX_DIFF) || (abs_diff <= FP16_MAX_DIFF);
+    EXPECT_TRUE(within_tolerance);
+    if (!within_tolerance) {
+      LOG(ERROR) << "error idx:" << i << " out_data[" << i
+                 << "]:" << out_data[i] << " out_ref[" << i
+                 << "]:" << out_data_v[i] << " abs_diff:" << abs_diff
+                 << " relative_diff:" << relative_diff
+                 << " FP16_MAX_DIFF:" << FP16_MAX_DIFF;
+    }
+  }
+}
+// Runs the OpenCL image2d "expand" kernel on a 1x3x2x3 input filled with
+// 0..17 (NCHW order) and expand_times {1, 1, 2, 1}, so only H grows and the
+// output is 1x3x4x3. The output image is copied back to the host, converted
+// to NCHW and compared element-wise with out_data_v; an element passes when
+// either its absolute or its relative difference is within FP16_MAX_DIFF.
+// NOTE(review): the reference lists every input row twice in a row
+// (r0, r0, r1, r1) rather than in tiled order (r0, r1, r0, r1) -- confirm
+// this is the intended expand semantics.
+TEST(expand_c3hw_image2d, compute) {
+  LOG(INFO) << "create kernel ...";
+  auto kernels = KernelRegistry::Global().Create(
+      "expand", TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault));
+  ASSERT_FALSE(kernels.empty());
+  const int INPUT_N = 1;
+  const int INPUT_C = 3;
+  const int INPUT_H = 2;
+  const int INPUT_W = 3;
+  const int EXPAND_N = 1;
+  const int EXPAND_C = 1;
+  const int EXPAND_H = 2;
+  const int EXPAND_W = 1;
+  auto kernel = std::move(kernels.front());
+  LOG(INFO) << "prepare to test kernel ====> " << kernel->doc();
+  lite::Tensor x, out;
+  operators::ExpandParam param;
+  param.X = &x;
+  param.Out = &out;
+  param.expand_times = {EXPAND_N, EXPAND_C, EXPAND_H, EXPAND_W};
+  std::unique_ptr<KernelContext> context(new KernelContext);
+  context->As<OpenCLContext>().InitOnce();
+  kernel->SetParam(param);
+  // The kernel gets its own context that shares state with `context`.
+  std::unique_ptr<KernelContext> expand_context(new KernelContext);
+  context->As<OpenCLContext>().CopySharedTo(
+      &(expand_context->As<OpenCLContext>()));
+  kernel->SetContext(std::move(expand_context));
+  const DDim in_dim =
+      DDim(std::vector<DDim::value_type>{INPUT_N, INPUT_C, INPUT_H, INPUT_W});
+  const DDim out_dim = DDim(std::vector<DDim::value_type>{INPUT_N * EXPAND_N,
+                                                          INPUT_C * EXPAND_C,
+                                                          INPUT_H * EXPAND_H,
+                                                          INPUT_W * EXPAND_W});
+  LOG(INFO) << "in_dim: " << in_dim;
+  // Separate the factors so the log does not read as one number.
+  LOG(INFO) << "expand_times: " << EXPAND_N << " " << EXPAND_C << " "
+            << EXPAND_H << " " << EXPAND_W;
+  LOG(INFO) << "out_dim: " << out_dim;
+  x.Resize(in_dim);
+  out.Resize(out_dim);
+  // Deterministic input: element k holds the value k.
+  std::vector<float> input_v(INPUT_N * INPUT_C * INPUT_H * INPUT_W);
+  int index = 0;
+  for (auto& i : input_v) {
+    i = index++;
+  }
+  VLOG(1) << "input_v ..... ";
+  for (size_t i = 0; i < input_v.size(); i++) {
+    VLOG(10) << input_v[i];
+  }
+  LOG(INFO) << "prepare input";
+  // Held by unique_ptr so the converter is released on every exit path.
+  std::unique_ptr<CLImageConverterDefault> default_converter(
+      new CLImageConverterDefault());
+  DDim x_image_shape = default_converter->InitImageDimInfoWith(in_dim);
+  LOG(INFO) << "x_image_shape = " << x_image_shape[0] << " "
+            << x_image_shape[1];
+  std::vector<half_t> x_image_data(x_image_shape.production() * 4);  // 4 : RGBA
+  default_converter->NCHWToImage(input_v.data(), x_image_data.data(), in_dim);
+  // The returned image pointer is not needed by the test itself.
+  x.mutable_data<half_t, cl::Image2D>(
+      x_image_shape[0], x_image_shape[1], x_image_data.data());
+  VLOG(1) << "x_image_data ..... ";
+  for (size_t i = 0; i < x_image_data.size(); i++) {
+    VLOG(10) << Half2Float(x_image_data[i]);
+  }
+  DDim out_image_shape = default_converter->InitImageDimInfoWith(out_dim);
+  LOG(INFO) << "out_image_shape = " << out_image_shape[0] << " "
+            << out_image_shape[1];
+  auto* out_image = out.mutable_data<half_t, cl::Image2D>(out_image_shape[0],
+                                                          out_image_shape[1]);
+  kernel->Launch();
+  CLRuntime::Global()->command_queue().finish();
+  std::vector<float> out_data_v{0,  1,  2,  0,  1,  2,  3,  4,  5,  3,  4,  5,
+                                6,  7,  8,  6,  7,  8,  9,  10, 11, 9,  10, 11,
+                                12, 13, 14, 12, 13, 14, 15, 16, 17, 15, 16, 17};
+  // Guard the comparison loop against reading past the reference data.
+  ASSERT_EQ(out_data_v.size(), static_cast<size_t>(out_dim.production()));
+  const size_t cl_image2d_row_pitch{0};
+  const size_t cl_image2d_slice_pitch{0};
+  // Host buffers are vectors so nothing leaks when an assertion fails.
+  std::vector<half_t> out_image_data(out_image_shape.production() * 4);
+  TargetWrapperCL::ImgcpySync(out_image_data.data(),
+                              out_image,
+                              out_image_shape[0],
+                              out_image_shape[1],
+                              cl_image2d_row_pitch,
+                              cl_image2d_slice_pitch,
+                              IoDirection::DtoH);
+  VLOG(1) << "out_image_data ..... ";
+  for (size_t i = 0; i < out_image_data.size(); i++) {
+    VLOG(10) << Half2Float(out_image_data[i]);
+  }
+  std::vector<float> out_data(out_image_shape.production() * 4);
+  default_converter->ImageToNCHW(
+      out_image_data.data(), out_data.data(), out_image_shape, out_dim);
+  VLOG(1) << "out_data ..... ";
+  for (size_t i = 0; i < out_data_v.size(); i++) {
+    VLOG(10) << out_data[i];
+  }
+  for (size_t i = 0; i < out_data_v.size(); i++) {
+    // Computed by hand: an unqualified abs() may resolve to the integer
+    // overload and silently truncate the float difference.
+    const float diff = out_data[i] - out_data_v[i];
+    const float abs_diff = diff < 0.f ? -diff : diff;
+    auto relative_diff = COMPUTE_RELATIVE_DIFF(out_data[i], out_data_v[i]);
+    const bool within_tolerance =
+        (relative_diff <= FP16_MAX_DIFF) || (abs_diff <= FP16_MAX_DIFF);
+    EXPECT_TRUE(within_tolerance);
+    if (!within_tolerance) {
+      LOG(ERROR) << "error idx:" << i << " out_data[" << i
+                 << "]:" << out_data[i] << " out_ref[" << i
+                 << "]:" << out_data_v[i] << " abs_diff:" << abs_diff
+                 << " relative_diff:" << relative_diff
+                 << " FP16_MAX_DIFF:" << FP16_MAX_DIFF;
+    }
+  }
+}
+// Runs the OpenCL image2d "expand" kernel on a 1x4x2x1 input filled with
+// 0..7 (NCHW order) and expand_times {1, 1, 2, 1}, so only H grows and the
+// output is 1x4x4x1. The output image is copied back to the host, converted
+// to NCHW and compared element-wise with out_data_v; an element passes when
+// either its absolute or its relative difference is within FP16_MAX_DIFF.
+// NOTE(review): the reference lists every input value twice in a row
+// (0, 0, 1, 1, ...) rather than in tiled order (0, 1, 0, 1, ...) -- confirm
+// this is the intended expand semantics.
+TEST(expand_c4hw_image2d, compute) {
+  LOG(INFO) << "create kernel ...";
+  auto kernels = KernelRegistry::Global().Create(
+      "expand", TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault));
+  ASSERT_FALSE(kernels.empty());
+  const int INPUT_N = 1;
+  const int INPUT_C = 4;
+  const int INPUT_H = 2;
+  const int INPUT_W = 1;
+  const int EXPAND_N = 1;
+  const int EXPAND_C = 1;
+  const int EXPAND_H = 2;
+  const int EXPAND_W = 1;
+  auto kernel = std::move(kernels.front());
+  LOG(INFO) << "prepare to test kernel ====> " << kernel->doc();
+  lite::Tensor x, out;
+  operators::ExpandParam param;
+  param.X = &x;
+  param.Out = &out;
+  param.expand_times = {EXPAND_N, EXPAND_C, EXPAND_H, EXPAND_W};
+  std::unique_ptr<KernelContext> context(new KernelContext);
+  context->As<OpenCLContext>().InitOnce();
+  kernel->SetParam(param);
+  // The kernel gets its own context that shares state with `context`.
+  std::unique_ptr<KernelContext> expand_context(new KernelContext);
+  context->As<OpenCLContext>().CopySharedTo(
+      &(expand_context->As<OpenCLContext>()));
+  kernel->SetContext(std::move(expand_context));
+  const DDim in_dim =
+      DDim(std::vector<DDim::value_type>{INPUT_N, INPUT_C, INPUT_H, INPUT_W});
+  const DDim out_dim = DDim(std::vector<DDim::value_type>{INPUT_N * EXPAND_N,
+                                                          INPUT_C * EXPAND_C,
+                                                          INPUT_H * EXPAND_H,
+                                                          INPUT_W * EXPAND_W});
+  LOG(INFO) << "in_dim: " << in_dim;
+  // Separate the factors so the log does not read as one number.
+  LOG(INFO) << "expand_times: " << EXPAND_N << " " << EXPAND_C << " "
+            << EXPAND_H << " " << EXPAND_W;
+  LOG(INFO) << "out_dim: " << out_dim;
+  x.Resize(in_dim);
+  out.Resize(out_dim);
+  // Deterministic input: element k holds the value k.
+  std::vector<float> input_v(INPUT_N * INPUT_C * INPUT_H * INPUT_W);
+  int index = 0;
+  for (auto& i : input_v) {
+    i = index++;
+  }
+  VLOG(1) << "input_v ..... ";
+  for (size_t i = 0; i < input_v.size(); i++) {
+    VLOG(10) << input_v[i];
+  }
+  LOG(INFO) << "prepare input";
+  // Held by unique_ptr so the converter is released on every exit path.
+  std::unique_ptr<CLImageConverterDefault> default_converter(
+      new CLImageConverterDefault());
+  DDim x_image_shape = default_converter->InitImageDimInfoWith(in_dim);
+  LOG(INFO) << "x_image_shape = " << x_image_shape[0] << " "
+            << x_image_shape[1];
+  std::vector<half_t> x_image_data(x_image_shape.production() * 4);  // 4 : RGBA
+  default_converter->NCHWToImage(input_v.data(), x_image_data.data(), in_dim);
+  // The returned image pointer is not needed by the test itself.
+  x.mutable_data<half_t, cl::Image2D>(
+      x_image_shape[0], x_image_shape[1], x_image_data.data());
+  VLOG(1) << "x_image_data ..... ";
+  for (size_t i = 0; i < x_image_data.size(); i++) {
+    VLOG(10) << Half2Float(x_image_data[i]);
+  }
+  DDim out_image_shape = default_converter->InitImageDimInfoWith(out_dim);
+  LOG(INFO) << "out_image_shape = " << out_image_shape[0] << " "
+            << out_image_shape[1];
+  auto* out_image = out.mutable_data<half_t, cl::Image2D>(out_image_shape[0],
+                                                          out_image_shape[1]);
+  kernel->Launch();
+  CLRuntime::Global()->command_queue().finish();
+  std::vector<float> out_data_v{0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7};
+  // Guard the comparison loop against reading past the reference data.
+  ASSERT_EQ(out_data_v.size(), static_cast<size_t>(out_dim.production()));
+  const size_t cl_image2d_row_pitch{0};
+  const size_t cl_image2d_slice_pitch{0};
+  // Host buffers are vectors so nothing leaks when an assertion fails.
+  std::vector<half_t> out_image_data(out_image_shape.production() * 4);
+  TargetWrapperCL::ImgcpySync(out_image_data.data(),
+                              out_image,
+                              out_image_shape[0],
+                              out_image_shape[1],
+                              cl_image2d_row_pitch,
+                              cl_image2d_slice_pitch,
+                              IoDirection::DtoH);
+  VLOG(1) << "out_image_data ..... ";
+  for (size_t i = 0; i < out_image_data.size(); i++) {
+    VLOG(10) << Half2Float(out_image_data[i]);
+  }
+  std::vector<float> out_data(out_image_shape.production() * 4);
+  default_converter->ImageToNCHW(
+      out_image_data.data(), out_data.data(), out_image_shape, out_dim);
+  VLOG(1) << "out_data ..... ";
+  for (size_t i = 0; i < out_data_v.size(); i++) {
+    VLOG(10) << out_data[i];
+  }
+  for (size_t i = 0; i < out_data_v.size(); i++) {
+    // Computed by hand: an unqualified abs() may resolve to the integer
+    // overload and silently truncate the float difference.
+    const float diff = out_data[i] - out_data_v[i];
+    const float abs_diff = diff < 0.f ? -diff : diff;
+    auto relative_diff = COMPUTE_RELATIVE_DIFF(out_data[i], out_data_v[i]);
+    const bool within_tolerance =
+        (relative_diff <= FP16_MAX_DIFF) || (abs_diff <= FP16_MAX_DIFF);
+    EXPECT_TRUE(within_tolerance);
+    if (!within_tolerance) {
+      LOG(ERROR) << "error idx:" << i << " out_data[" << i
+                 << "]:" << out_data[i] << " out_ref[" << i
+                 << "]:" << out_data_v[i] << " abs_diff:" << abs_diff
+                 << " relative_diff:" << relative_diff
+                 << " FP16_MAX_DIFF:" << FP16_MAX_DIFF;
+    }
+  }
+}
+// Runs the OpenCL image2d "expand" kernel on a 1x1x2x3 input filled with
+// 0..5 (NCHW order) and expand_times {2, 1, 2, 3}, so N, H and W all grow
+// and the output is 2x1x4x9. The output image is copied back to the host,
+// converted to NCHW and compared element-wise with out_data_v; an element
+// passes when either its absolute or its relative difference is within
+// FP16_MAX_DIFF.
+// NOTE(review): along W and H the reference repeats each element/row
+// consecutively (0, 0, 0, 1, 1, 1, ...) rather than tiling the whole axis
+// (0, 1, 2, 0, 1, 2, ...) -- confirm this is the intended expand semantics.
+TEST(expand_n_image2d, compute) {
+  LOG(INFO) << "create kernel ...";
+  auto kernels = KernelRegistry::Global().Create(
+      "expand", TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault));
+  ASSERT_FALSE(kernels.empty());
+  const int INPUT_N = 1;
+  const int INPUT_C = 1;
+  const int INPUT_H = 2;
+  const int INPUT_W = 3;
+  const int EXPAND_N = 2;
+  const int EXPAND_C = 1;
+  const int EXPAND_H = 2;
+  const int EXPAND_W = 3;
+  auto kernel = std::move(kernels.front());
+  LOG(INFO) << "prepare to test kernel ====> " << kernel->doc();
+  lite::Tensor x, out;
+  operators::ExpandParam param;
+  param.X = &x;
+  param.Out = &out;
+  param.expand_times = {EXPAND_N, EXPAND_C, EXPAND_H, EXPAND_W};
+  std::unique_ptr<KernelContext> context(new KernelContext);
+  context->As<OpenCLContext>().InitOnce();
+  kernel->SetParam(param);
+  // The kernel gets its own context that shares state with `context`.
+  std::unique_ptr<KernelContext> expand_context(new KernelContext);
+  context->As<OpenCLContext>().CopySharedTo(
+      &(expand_context->As<OpenCLContext>()));
+  kernel->SetContext(std::move(expand_context));
+  const DDim in_dim =
+      DDim(std::vector<DDim::value_type>{INPUT_N, INPUT_C, INPUT_H, INPUT_W});
+  const DDim out_dim = DDim(std::vector<DDim::value_type>{INPUT_N * EXPAND_N,
+                                                          INPUT_C * EXPAND_C,
+                                                          INPUT_H * EXPAND_H,
+                                                          INPUT_W * EXPAND_W});
+  LOG(INFO) << "in_dim: " << in_dim;
+  // Separate the factors so the log does not read as one number.
+  LOG(INFO) << "expand_times: " << EXPAND_N << " " << EXPAND_C << " "
+            << EXPAND_H << " " << EXPAND_W;
+  LOG(INFO) << "out_dim: " << out_dim;
+  x.Resize(in_dim);
+  out.Resize(out_dim);
+  // Deterministic input: element k holds the value k.
+  std::vector<float> input_v(INPUT_N * INPUT_C * INPUT_H * INPUT_W);
+  int index = 0;
+  for (auto& i : input_v) {
+    i = index++;
+  }
+  VLOG(1) << "input_v ..... ";
+  for (size_t i = 0; i < input_v.size(); i++) {
+    VLOG(10) << input_v[i];
+  }
+  LOG(INFO) << "prepare input";
+  // Held by unique_ptr so the converter is released on every exit path.
+  std::unique_ptr<CLImageConverterDefault> default_converter(
+      new CLImageConverterDefault());
+  DDim x_image_shape = default_converter->InitImageDimInfoWith(in_dim);
+  LOG(INFO) << "x_image_shape = " << x_image_shape[0] << " "
+            << x_image_shape[1];
+  std::vector<half_t> x_image_data(x_image_shape.production() * 4);  // 4 : RGBA
+  default_converter->NCHWToImage(input_v.data(), x_image_data.data(), in_dim);
+  // The returned image pointer is not needed by the test itself.
+  x.mutable_data<half_t, cl::Image2D>(
+      x_image_shape[0], x_image_shape[1], x_image_data.data());
+  VLOG(1) << "x_image_data ..... ";
+  for (size_t i = 0; i < x_image_data.size(); i++) {
+    VLOG(10) << Half2Float(x_image_data[i]);
+  }
+  DDim out_image_shape = default_converter->InitImageDimInfoWith(out_dim);
+  LOG(INFO) << "out_image_shape = " << out_image_shape[0] << " "
+            << out_image_shape[1];
+  auto* out_image = out.mutable_data<half_t, cl::Image2D>(out_image_shape[0],
+                                                          out_image_shape[1]);
+  kernel->Launch();
+  CLRuntime::Global()->command_queue().finish();
+  std::vector<float> out_data_v{
+      0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4,
+      5, 5, 5, 3, 3, 3, 4, 4, 4, 5, 5, 5, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0,
+      1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 3, 3, 3, 4, 4, 4, 5, 5, 5};
+  // Guard the comparison loop against reading past the reference data.
+  ASSERT_EQ(out_data_v.size(), static_cast<size_t>(out_dim.production()));
+  const size_t cl_image2d_row_pitch{0};
+  const size_t cl_image2d_slice_pitch{0};
+  // Host buffers are vectors so nothing leaks when an assertion fails.
+  std::vector<half_t> out_image_data(out_image_shape.production() * 4);
+  TargetWrapperCL::ImgcpySync(out_image_data.data(),
+                              out_image,
+                              out_image_shape[0],
+                              out_image_shape[1],
+                              cl_image2d_row_pitch,
+                              cl_image2d_slice_pitch,
+                              IoDirection::DtoH);
+  VLOG(1) << "out_image_data ..... ";
+  for (size_t i = 0; i < out_image_data.size(); i++) {
+    VLOG(10) << Half2Float(out_image_data[i]);
+  }
+  std::vector<float> out_data(out_image_shape.production() * 4);
+  default_converter->ImageToNCHW(
+      out_image_data.data(), out_data.data(), out_image_shape, out_dim);
+  VLOG(1) << "out_data ..... ";
+  for (size_t i = 0; i < out_data_v.size(); i++) {
+    VLOG(10) << out_data[i];
+  }
+  for (size_t i = 0; i < out_data_v.size(); i++) {
+    // Computed by hand: an unqualified abs() may resolve to the integer
+    // overload and silently truncate the float difference.
+    const float diff = out_data[i] - out_data_v[i];
+    const float abs_diff = diff < 0.f ? -diff : diff;
+    auto relative_diff = COMPUTE_RELATIVE_DIFF(out_data[i], out_data_v[i]);
+    const bool within_tolerance =
+        (relative_diff <= FP16_MAX_DIFF) || (abs_diff <= FP16_MAX_DIFF);
+    EXPECT_TRUE(within_tolerance);
+    if (!within_tolerance) {
+      LOG(ERROR) << "error idx:" << i << " out_data[" << i
+                 << "]:" << out_data[i] << " out_ref[" << i
+                 << "]:" << out_data_v[i] << " abs_diff:" << abs_diff
+                 << " relative_diff:" << relative_diff
+                 << " FP16_MAX_DIFF:" << FP16_MAX_DIFF;
+    }
+  }
+}
+}  // namespace lite
+}  // namespace paddle
+USE_LITE_KERNEL(expand, kOpenCL, kFP16, kImageDefault, image2d);  // NOTE(review): presumably makes the "expand" kernel that the tests request from KernelRegistry available at link time -- confirm against the macro definition