[LITE][OPENCL] Add nearest_interp kernel of OpenCL Image2D format and UT. test=develop(#2838)

028a35b1 · xiaogang · GitHub · 88abc6ff · 028a35b1 · 028a35b1
4 changed file
--- a/lite/backends/opencl/cl_kernel/image/nearest_interp_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/nearest_interp_kernel.cl
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+__kernel void nearest_interp(__read_only image2d_t input, __write_only image2d_t output,
+                             __private const float scale_h, __private const float scale_w,
+                             __private const int in_dims_h, __private const int out_dims_h,
+                             __private const int in_dims_w, __private const int out_dims_w) {
+                             const int c = get_global_id(0);
+                             const int w = get_global_id(1);
+                             const int nh = get_global_id(2);
+                             int2 output_pos;
+                             output_pos.x = c * out_dims_w + w;
+                             output_pos.y = nh;
+                             int out_n = nh / out_dims_h;
+                             int out_h = nh % out_dims_h;
+                             int2 input_pos;
+                             input_pos.x = c * in_dims_w + w / scale_w;
+                             input_pos.y = out_n * in_dims_h + out_h / scale_h;
+
+                             const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                                                             CLK_ADDRESS_CLAMP |
+                                                             CLK_FILTER_NEAREST;
+                             half4 input_data = read_imageh(input, sampler, (int2)(input_pos.x, input_pos.y));
+                             write_imageh(output, (int2)(output_pos.x , output_pos.y), input_data);
+}
--- a/lite/kernels/opencl/CMakeLists.txt
+++ b/lite/kernels/opencl/CMakeLists.txt
@@ -19,6 +19,7 @@ add_kernel(depthwise_conv2d_opencl OPENCL basic SRCS depthwise_conv2d_compute.cc
 add_kernel(reshape_opencl OPENCL basic SRCS reshape_compute.cc DEPS ${cl_kernel_deps})
 add_kernel(conv_opencl OPENCL basic SRCS conv_compute.cc DEPS ${cl_kernel_deps} cl_image_converter)
 add_kernel(layout_opencl OPENCL basic SRCS layout_compute.cc DEPS ${cl_kernel_deps})
+add_kernel(nearest_interp_opencl OPENCL basic SRCS nearest_interp_compute.cc DEPS ${cl_kernel_deps})

 lite_cc_test(test_elementwise_add_opencl SRCS elementwise_add_compute_test.cc
             DEPS elementwise_add_opencl fusion_elementwise_add_activation_opencl op_registry program context
@@ -75,5 +76,8 @@ lite_cc_test(test_conv_image2d_opencl SRCS conv_image2d_compute_test.cc
        ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)

 lite_cc_test(test_layout_opencl SRCS layout_compute_test.cc
-             DEPS layout_opencl op_registry program context
-             ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
+        DEPS layout_opencl op_registry program context cl_image_converter
+        ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
+lite_cc_test(test_nearest_interp_opencl SRCS nearest_interp_compute_test.cc
+        DEPS nearest_interp_opencl layout_opencl op_registry program context cl_image_converter
+        ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
--- a/lite/kernels/opencl/nearest_interp_compute.cc
+++ b/lite/kernels/opencl/nearest_interp_compute.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/backends/opencl/cl_include.h"
+#include "lite/core/kernel.h"
+#include "lite/core/op_registry.h"
+#include "lite/kernels/opencl/image_helper.h"
+#include "lite/operators/op_params.h"
+#include "lite/utils/replace_stl/stream.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace opencl {
+
+class NearestInterpComputeFloatImageDefault
+    : public KernelLite<TARGET(kOpenCL),
+                        PRECISION(kFloat),
+                        DATALAYOUT(kImageDefault)> {
+ public:
+  using param_t = operators::InterpolateParam;
+
+  std::string doc() const override {
+    return "NearestInterp using cl::Image2D(ImageDefault/RGBA), kFloat";
+  }
+
+  void PrepareForRun() override {
+    auto& context = ctx_->As<OpenCLContext>();
+    context.cl_context()->AddKernel(
+        kernel_func_name_, "image/nearest_interp_kernel.cl", build_options_);
+  }
+
+  void Run() override {
+    auto& param = *param_.get_mutable<param_t>();
+    const auto& x_dims = param.X->dims();
+    auto* x_buf = param.X->data<float, cl::Image2D>();
+    auto* out_buf =
+        param.Out->mutable_data<float, cl::Image2D>(param.out_w, param.out_h);
+    const auto& y_dims = param.Out->dims();  // useless: check dim only
+    float scale_h = y_dims[2] / x_dims[2];
+    float scale_w = y_dims[3] / x_dims[3];
+    int in_dims_h = x_dims[2];
+    int out_dims_h = y_dims[2];
+    int in_dims_w = x_dims[3];
+    int out_dims_w = y_dims[3];
+
+    auto& context = ctx_->As<OpenCLContext>();
+    CHECK(context.cl_context() != nullptr);
+    STL::stringstream kernel_key;
+    kernel_key << kernel_func_name_ << build_options_;
+    auto kernel = context.cl_context()->GetKernel(kernel_key.str());
+
+    int arg_idx = 0;
+    cl_int status = kernel.setArg(arg_idx, *x_buf);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, *out_buf);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, static_cast<const float>(scale_h));
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, static_cast<const float>(scale_w));
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, static_cast<const int>(in_dims_h));
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, static_cast<const int>(out_dims_h));
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, static_cast<const int>(in_dims_w));
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, static_cast<const int>(out_dims_w));
+    CL_CHECK_FATAL(status);
+
+    paddle::lite::CLImageConverterDefault default_convertor;
+    auto y_img_shape = default_convertor.InitImageDimInfoWith(y_dims);  // w, h
+    auto y_img_width = y_img_shape[0];
+    LOG(INFO) << "y_img_width:" << y_img_width;
+
+    auto global_work_size =
+        cl::NDRange{static_cast<cl::size_type>(y_img_width / y_dims[3]),
+                    static_cast<cl::size_type>(y_dims[3]),
+                    static_cast<cl::size_type>(y_dims[0] * y_dims[2])};
+    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
+        kernel,
+        cl::NullRange,
+        global_work_size,
+        cl::NullRange,
+        nullptr,
+        event_.get());
+    CL_CHECK_FATAL(status);
+    // TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`
+    // context.cl_wait_list()->emplace(out_buf, event_);
+    context.cl_context()->GetCommandQueue().finish();
+  }
+
+ private:
+  std::string kernel_func_name_{"nearest_interp"};
+  std::string build_options_{"-DCL_DTYPE_float "};
+  std::shared_ptr<cl::Event> event_{new cl::Event};
+};
+
+class NearestInterpComputeFP16ImageDefault
+    : public KernelLite<TARGET(kOpenCL),
+                        PRECISION(kFP16),
+                        DATALAYOUT(kImageDefault)> {
+ public:
+  using param_t = operators::InterpolateParam;
+
+  std::string doc() const override {
+    return "NearestInterp using cl::Image2D(ImageDefault/RGBA), kFP16";
+  }
+
+  void PrepareForRun() override {
+    auto& context = ctx_->As<OpenCLContext>();
+    context.cl_context()->AddKernel(
+        kernel_func_name_, "image/nearest_interp_kernel.cl", build_options_);
+  }
+
+  void Run() override {
+    auto& param = *param_.get_mutable<param_t>();
+    const auto& x_dims = param.X->dims();
+    auto* x_buf =
+        param.X->data<int16_t,
+                      cl::Image2D>();  // use int16_t represents half float
+    auto image_shape = InitImageDimInfoWith(x_dims);
+    auto* out_buf =
+        param.Out->mutable_data<int16_t, cl::Image2D>(  // use int16_t
+                                                        // represents half float
+            image_shape["width"],
+            image_shape["height"]);
+    const auto& y_dims = param.Out->dims();  // useless: check dim only
+    float scale_h = y_dims[2] / x_dims[2];
+    float scale_w = y_dims[3] / x_dims[3];
+    int in_dims_h = x_dims[2];
+    int out_dims_h = y_dims[2];
+    int in_dims_w = x_dims[3];
+    int out_dims_w = y_dims[3];
+
+    auto& context = ctx_->As<OpenCLContext>();
+    CHECK(context.cl_context() != nullptr);
+    STL::stringstream kernel_key;
+    kernel_key << kernel_func_name_ << build_options_;
+    auto kernel = context.cl_context()->GetKernel(kernel_key.str());
+
+    int arg_idx = 0;
+    cl_int status = kernel.setArg(arg_idx, *x_buf);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, *out_buf);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, static_cast<const float>(scale_h));
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, static_cast<const float>(scale_w));
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, static_cast<const int>(in_dims_h));
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, static_cast<const int>(out_dims_h));
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, static_cast<const int>(in_dims_w));
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, static_cast<const int>(out_dims_w));
+    CL_CHECK_FATAL(status);
+
+    VLOG(4) << TargetToStr(param.X->target());
+    VLOG(4) << TargetToStr(param.Out->target());
+    VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
+            << image_shape["height"];
+    VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
+            << x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
+    VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
+            << y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
+
+    auto global_work_size =
+        cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
+                    static_cast<cl::size_type>(image_shape["height"])};
+    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
+        kernel,
+        cl::NullRange,
+        global_work_size,
+        cl::NullRange,
+        nullptr,
+        event_.get());
+    CL_CHECK_FATAL(status);
+    // TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`
+    // context.cl_wait_list()->emplace(out_buf, event_);
+    context.cl_context()->GetCommandQueue().finish();
+  }
+
+ private:
+  std::string kernel_func_name_{"nearest_interp"};
+  std::string build_options_{"-DCL_DTYPE_half"};
+  std::shared_ptr<cl::Event> event_{new cl::Event};
+};
+
+}  // namespace opencl
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(
+    nearest_interp,
+    kOpenCL,
+    kFloat,
+    kImageDefault,
+    paddle::lite::kernels::opencl::NearestInterpComputeFloatImageDefault,
+    ImageDefault)
+    .BindInput("X",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFloat),
+                                      DATALAYOUT(kImageDefault))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                       PRECISION(kFloat),
+                                       DATALAYOUT(kImageDefault))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(
+    nearest_interp,
+    kOpenCL,
+    kFP16,
+    kImageDefault,
+    paddle::lite::kernels::opencl::NearestInterpComputeFP16ImageDefault,
+    ImageDefault)
+    .BindInput("X",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kImageDefault))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                       PRECISION(kFP16),
+                                       DATALAYOUT(kImageDefault))})
+    .Finalize();
--- a/lite/kernels/opencl/nearest_interp_compute_test.cc
+++ b/lite/kernels/opencl/nearest_interp_compute_test.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gtest/gtest.h>
+#include <random>
+#include "lite/backends/opencl/target_wrapper.h"
+#include "lite/core/op_registry.h"
+#include "lite/core/tensor.h"
+#include "lite/kernels/opencl/image_helper.h"
+
+namespace paddle {
+namespace lite {
+
+template <typename dtype>
+void nearest_interp_compute_ref(const dtype *src,
+                                int w_in,
+                                int h_in,
+                                dtype *dst,
+                                int w_out,
+                                int h_out,
+                                float scale_x,
+                                float scale_y,
+                                bool with_align = false) {
+  float scale_w_new = (with_align)
+                          ? (static_cast<float>(w_in - 1) / (w_out - 1))
+                          : (static_cast<float>(w_in) / (w_out));
+  float scale_h_new = (with_align)
+                          ? (static_cast<float>(h_in - 1) / (h_out - 1))
+                          : (static_cast<float>(h_in) / (h_out));
+  if (with_align) {
+    for (int h = 0; h < h_out; ++h) {
+      dtype *dst_p = dst + h * w_out;
+      int near_y = static_cast<int>(scale_h_new * h + 0.5);
+      for (int w = 0; w < w_out; ++w) {
+        int near_x = static_cast<int>(scale_w_new * w + 0.5);
+        *dst_p++ = src[near_y * w_in + near_x];
+      }
+    }
+  } else {
+    for (int h = 0; h < h_out; ++h) {
+      dtype *dst_p = dst + h * w_out;
+      int near_y = static_cast<int>(scale_h_new * h);
+      for (int w = 0; w < w_out; ++w) {
+        int near_x = static_cast<int>(scale_w_new * w);
+        *dst_p++ = src[near_y * w_in + near_x];
+      }
+    }
+  }
+}
+// #define LOOP_TEST
+// #define PRINT_RESULT
+TEST(nearest_interp_image2d_fp32, compute) {
+  LOG(INFO) << "main steps of test: host -> layout(buf2img) -> "
+               "nearest_interp(img) -> "
+               "layout(img2buf) "
+               "-> host";
+
+#ifdef LOOP_TEST
+  for (int n : {1, 3}) {
+    for (auto c : {1, 3}) {
+      for (int h : {12, 20, 50, 112}) {
+        for (int w : {12, 20, 50, 112}) {
+          for (int out_h : {36, 60, 90, 224}) {
+            for (int out_w : {36, 60, 90, 224}) {
+              if (out_w < w || out_h < h) {
+                continue;
+              }
+#else
+  const int n = 1;
+  const int c = 2;
+  const int h = 3;
+  const int w = 4;
+  const int out_h = 6;
+  const int out_w = 8;
+#endif  // LOOP_TEST
+
+              float scale_x = out_w / w;
+              float scale_y = out_h / h;
+
+              LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c
+                        << " " << h << " " << w << " ========" << out_h << " "
+                        << out_w;
+              // set layout kernels
+              auto buf_to_img_kernels =
+                  KernelRegistry::Global().Create("layout",
+                                                  TARGET(kOpenCL),
+                                                  PRECISION(kAny),
+                                                  DATALAYOUT(kImageDefault));
+              auto img_to_buf_kernels =
+                  KernelRegistry::Global().Create("layout",
+                                                  TARGET(kOpenCL),
+                                                  PRECISION(kAny),
+                                                  DATALAYOUT(kNCHW));
+              auto nearest_interp_img_kernels =
+                  KernelRegistry::Global().Create("nearest_interp",
+                                                  TARGET(kOpenCL),
+                                                  PRECISION(kFloat),
+                                                  DATALAYOUT(kImageDefault));
+              ASSERT_FALSE(buf_to_img_kernels.empty());
+              ASSERT_FALSE(buf_to_img_kernels.empty());
+              ASSERT_FALSE(nearest_interp_img_kernels.empty());
+
+              auto buf_to_img_kernel = std::move(buf_to_img_kernels.front());
+              auto img_to_buf_kernel = std::move(img_to_buf_kernels.front());
+              auto nearest_interp_img_kernel =
+                  std::move(nearest_interp_img_kernels.front());
+              LOG(INFO) << "get 1st kernel: " << buf_to_img_kernel->doc();
+              LOG(INFO) << "get 2nd kernel: " << img_to_buf_kernel->doc();
+              LOG(INFO) << "get 3rd kernel: "
+                        << nearest_interp_img_kernel->doc();
+
+              // set tensors about op param
+              LOG(INFO) << "set tensors about op param";
+              // layout(buf->img): x -> nearest_interp_in
+              // nearest_interp(img): nearest_interp_in -> nearest_interp_out
+              // layout(img->buf): nearest_interp_out -> y
+              lite::Tensor x, y, nearest_interp_in, nearest_interp_out, y_ref;
+              operators::LayoutParam BufferToImageParam;
+              operators::LayoutParam ImageToBufferParam;
+              BufferToImageParam.x = &x;
+              BufferToImageParam.y = &nearest_interp_in;
+              ImageToBufferParam.x = &nearest_interp_out;
+              ImageToBufferParam.y = &y;
+              operators::InterpolateParam NearestInterpParam;
+              NearestInterpParam.X = &nearest_interp_in;
+              NearestInterpParam.Out = &nearest_interp_out;
+              NearestInterpParam.out_h = out_h;
+              NearestInterpParam.out_w = out_w;
+
+              const DDim x_dim =
+                  DDim(std::vector<DDim::value_type>{n, c, h, w});
+              const DDim y_dim =
+                  DDim(std::vector<DDim::value_type>{n, c, out_h, out_w});
+              x.Resize(x_dim);
+              y.Resize(y_dim);
+              nearest_interp_in.Resize(x_dim);
+              nearest_interp_out.Resize(y_dim);
+              y_ref.Resize(y_dim);
+              auto nearest_interp_image2d_shape =
+                  paddle::lite::kernels::opencl::InitImageDimInfoWith(x_dim);
+
+              // initialize tensors
+              LOG(INFO) << "initialize tensors";
+              auto *x_data = x.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
+              auto *y_data = y.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
+              auto *y_data_ref = y_ref.mutable_data<float>(TARGET(kARM));
+              auto *mapped_x = static_cast<float *>(TargetWrapperCL::Map(
+                  x_data, 0, sizeof(float) * x_dim.production()));
+              auto *mapped_y = static_cast<float *>(TargetWrapperCL::Map(
+                  y_data, 0, sizeof(float) * y_dim.production()));
+              for (int i = 0; i < x_dim.production(); ++i) {
+                mapped_x[i] = static_cast<int>(i) - x_dim.production() / 2;
+              }
+              for (int i = 0; i < y_dim.production(); ++i) {
+                mapped_y[i] = static_cast<int>(0);
+              }
+              auto *nearest_interp_in_data =
+                  nearest_interp_in.mutable_data<float, cl::Image2D>(
+                      nearest_interp_image2d_shape["width"],
+                      nearest_interp_image2d_shape["height"]);
+              auto *nearest_interp_out_data =
+                  nearest_interp_out.mutable_data<float, cl::Image2D>(y_dim[3],
+                                                                      y_dim[2]);
+
+              // set context and kernel args
+              LOG(INFO) << "set context and kernel args";
+              std::unique_ptr<KernelContext> context(new KernelContext);
+              context->As<OpenCLContext>().InitOnce();
+
+              buf_to_img_kernel->SetParam(BufferToImageParam);
+              std::unique_ptr<KernelContext> buf_to_img_context(
+                  new KernelContext);
+              context->As<OpenCLContext>().CopySharedTo(
+                  &(buf_to_img_context->As<OpenCLContext>()));
+              buf_to_img_kernel->SetContext(std::move(buf_to_img_context));
+
+              img_to_buf_kernel->SetParam(ImageToBufferParam);
+              std::unique_ptr<KernelContext> img_to_buf_context(
+                  new KernelContext);
+              context->As<OpenCLContext>().CopySharedTo(
+                  &(img_to_buf_context->As<OpenCLContext>()));
+              img_to_buf_kernel->SetContext(std::move(img_to_buf_context));
+
+              nearest_interp_img_kernel->SetParam(NearestInterpParam);
+              std::unique_ptr<KernelContext> nearest_interp_img_context(
+                  new KernelContext);
+              context->As<OpenCLContext>().CopySharedTo(
+                  &(nearest_interp_img_context->As<OpenCLContext>()));
+              nearest_interp_img_kernel->SetContext(
+                  std::move(nearest_interp_img_context));
+
+              // run kernels
+              LOG(INFO) << "run kernel: buf_to_img_kernel";
+              buf_to_img_kernel->Launch();
+              LOG(INFO) << "run kernel: nearest_interp_img_kernel";
+              nearest_interp_img_kernel->Launch();
+              LOG(INFO) << "run kernel: img_to_buf_kernel";
+              img_to_buf_kernel->Launch();
+
+              // compute ref cpu
+              for (int nid = 0; nid < x_dim[0]; ++nid) {
+                for (int cid = 0; cid < x_dim[1]; ++cid) {
+                  float *x_nc =
+                      mapped_x + (nid * x_dim[1] + cid) * x_dim[3] * x_dim[2];
+                  float *y_nc =
+                      y_data_ref + (nid * x_dim[1] + cid) * y_dim[3] * y_dim[2];
+                  nearest_interp_compute_ref<float>(x_nc,
+                                                    x_dim[3],
+                                                    x_dim[2],
+                                                    y_nc,
+                                                    y_dim[3],
+                                                    y_dim[2],
+                                                    1 / scale_x,
+                                                    1 / scale_y);
+                }
+              }
+// result
+#ifdef PRINT_RESULT
+              LOG(INFO) << "---- print kernel result (input -> output) ----";
+              for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
+                std::cout << mapped_x[eidx] << " ";
+              }
+              std::cout << std::endl;
+              for (int eidx = 0; eidx < y_dim.production(); ++eidx) {
+                std::cout << mapped_y[eidx] << " ";
+              }
+              std::cout << std::endl;
+              for (int eidx = 0; eidx < y_dim.production(); ++eidx) {
+                std::cout << y_data_ref[eidx] << " ";
+              }
+              std::cout << std::endl;
+#endif  // PRINT_RESULT
+
+              // check result: compare kernel output and cpu output(y_data_ref)
+              for (int eidx = 0; eidx < y_dim.production(); eidx++) {
+                EXPECT_NEAR(y_data_ref[eidx], mapped_y[eidx], 1e-6);
+                if (abs(y_data_ref[eidx] - mapped_y[eidx]) > 1e-6) {
+                  LOG(FATAL) << "1st diff in this case at eidx[from 0]:" << eidx
+                             << " / " << x_dim.production() << ", y_data_ref["
+                             << eidx << "]:" << y_data_ref[eidx]
+                             << ", mapped_y[" << eidx << "]:" << mapped_y[eidx];
+                  break;
+                }
+              }
+
+              // free
+              LOG(INFO) << "free: unmap x, y";
+              TargetWrapperCL::Unmap(x_data, mapped_x);
+              TargetWrapperCL::Unmap(y_data, mapped_y);
+#ifdef LOOP_TEST
+            }
+          }
+        }  // w
+      }    // h
+    }      // c
+  }        // n
+#else
+// nothing to do.
+#endif
+}
+
+}  // namespace lite
+}  // namespace paddle
+
+// nearest_interp buffer
+// USE_LITE_KERNEL(nearest_interp, kOpenCL, kFloat, kNCHW, def);
+
+// nearest_interp image2d fp32
+USE_LITE_KERNEL(layout, kOpenCL, kAny, kImageDefault, NCHW_to_ImageDefault);
+USE_LITE_KERNEL(layout, kOpenCL, kAny, kNCHW, ImageDefault_to_NCHW);
+USE_LITE_KERNEL(nearest_interp, kOpenCL, kFloat, kImageDefault, ImageDefault);
+
+// nearest_interp image2d fp16
+USE_LITE_KERNEL(nearest_interp, kOpenCL, kFP16, kImageDefault, ImageDefault);