Unverified commit 64cbe711, authored by HappyAngel, committed by GitHub

[opencl] Add lrn OP (#3104)

* add lrn op

* fix v7 build error, test=develop
Parent dead5163
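For reference, the across-channel LRN that the new kernel and the test's lrn_ref both evaluate is, with p = (local_size - 1) / 2, in LaTeX:

out_{c,h,w} = \frac{in_{c,h,w}}{\left(k + \alpha \sum_{c'=\max(0,\,c-p)}^{\min(C-1,\,c+p)} in_{c',h,w}^{2}\right)^{\beta}}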
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void lrn(__read_only image2d_t input,
                  __write_only image2d_t output,
                  __private const int out_C,
                  __private const int out_W,
                  __private const int local_size,
                  __private const float k,
                  __private const float alpha,
                  __private const float beta) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);

  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
                            CLK_ADDRESS_CLAMP |
                            CLK_FILTER_NEAREST;

  // Each work-item handles the four consecutive channels packed into one
  // RGBA pixel of the ImageDefault layout.
  const int out_c0 = out_c * 4;
  const int out_c1 = out_c0 + 1;
  const int out_c2 = out_c0 + 2;
  const int out_c3 = out_c0 + 3;
  const int pad = (local_size - 1) / 2;

  float square0 = 0.0f;
  float square1 = 0.0f;
  float square2 = 0.0f;
  float square3 = 0.0f;

  // Sum of squares over the channel window centered at out_c0, clamped to
  // the valid range [0, out_C - 1].
  int start = max(out_c0 - pad, 0);
  int end = min(out_c0 + pad, out_C - 1);
  for (int i = start; i <= end; i++) {
    int input_c_block = i / 4;
    int2 input_pos;
    input_pos.x = input_c_block * out_W + out_w;
    input_pos.y = out_nh;
    CL_DTYPE4 input_data =
        READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos);
    switch (i % 4) {
      case 0:
        square0 += input_data.x * input_data.x;
        break;
      case 1:
        square0 += input_data.y * input_data.y;
        break;
      case 2:
        square0 += input_data.z * input_data.z;
        break;
      case 3:
        square0 += input_data.w * input_data.w;
        break;
    }
  }

  // Window centered at out_c1.
  start = max(out_c1 - pad, 0);
  end = min(out_c1 + pad, out_C - 1);
  for (int i = start; i <= end; i++) {
    int input_c_block = i / 4;
    int2 input_pos;
    input_pos.x = input_c_block * out_W + out_w;
    input_pos.y = out_nh;
    CL_DTYPE4 input_data =
        READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos);
    switch (i % 4) {
      case 0:
        square1 += input_data.x * input_data.x;
        break;
      case 1:
        square1 += input_data.y * input_data.y;
        break;
      case 2:
        square1 += input_data.z * input_data.z;
        break;
      case 3:
        square1 += input_data.w * input_data.w;
        break;
    }
  }

  // Window centered at out_c2.
  start = max(out_c2 - pad, 0);
  end = min(out_c2 + pad, out_C - 1);
  for (int i = start; i <= end; i++) {
    int input_c_block = i / 4;
    int2 input_pos;
    input_pos.x = input_c_block * out_W + out_w;
    input_pos.y = out_nh;
    CL_DTYPE4 input_data =
        READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos);
    switch (i % 4) {
      case 0:
        square2 += input_data.x * input_data.x;
        break;
      case 1:
        square2 += input_data.y * input_data.y;
        break;
      case 2:
        square2 += input_data.z * input_data.z;
        break;
      case 3:
        square2 += input_data.w * input_data.w;
        break;
    }
  }

  // Window centered at out_c3.
  start = max(out_c3 - pad, 0);
  end = min(out_c3 + pad, out_C - 1);
  for (int i = start; i <= end; i++) {
    int input_c_block = i / 4;
    int2 input_pos;
    input_pos.x = input_c_block * out_W + out_w;
    input_pos.y = out_nh;
    CL_DTYPE4 input_data =
        READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos);
    switch (i % 4) {
      case 0:
        square3 += input_data.x * input_data.x;
        break;
      case 1:
        square3 += input_data.y * input_data.y;
        break;
      case 2:
        square3 += input_data.z * input_data.z;
        break;
      case 3:
        square3 += input_data.w * input_data.w;
        break;
    }
  }

  int2 out_pos;
  out_pos.x = out_c * out_W + out_w;
  out_pos.y = out_nh;
  CL_DTYPE4 in_val = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, out_pos);

  // Normalize each lane; lanes past out_C keep the zero default.
  float4 out_val = (float4)(0.0f);
  out_val.x = in_val.x / pow(k + alpha * square0, beta);
  if (out_c1 < out_C) {
    out_val.y = in_val.y / pow(k + alpha * square1, beta);
  }
  if (out_c2 < out_C) {
    out_val.z = in_val.z / pow(k + alpha * square2, beta);
  }
  if (out_c3 < out_C) {
    out_val.w = in_val.w / pow(k + alpha * square3, beta);
  }

  CL_DTYPE4 out_data = CONVERT_TYPE_TO(out_val, CL_DTYPE4);
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, out_pos, out_data);
}
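The indexing in the kernel above relies on Paddle-Lite's ImageDefault layout, where four consecutive channels share one RGBA pixel. A minimal host-side sketch of that mapping, assuming the layout produced by CLImageConverterDefault (the helper below is hypothetical and not part of this patch):

// Pixel coordinates of NCHW element (n, c, h, w) in an ImageDefault
// cl::Image2D of width ceil(C / 4) * W and height N * H.
struct ImagePos {
  int x;          // column in the image
  int y;          // row in the image
  int component;  // RGBA lane within the pixel
};

inline ImagePos NCHWToImageDefault(
    int n, int c, int h, int w, int C, int H, int W) {
  ImagePos p;
  p.x = (c / 4) * W + w;  // each block of 4 channels spans a W-wide band
  p.y = n * H + h;        // batches are stacked vertically
  p.component = c % 4;    // matches the switch on i % 4 in the kernel
  return p;
}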
@@ -23,7 +23,9 @@ add_kernel(concat_opencl OPENCL basic SRCS concat_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(nearest_interp_opencl OPENCL basic SRCS nearest_interp_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(scale_opencl OPENCL basic SRCS scale_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(grid_sampler_opencl OPENCL basic SRCS grid_sampler_image_compute.cc DEPS ${cl_kernel_deps})
+add_kernel(lrn_opencl OPENCL basic SRCS lrn_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(bilinear_interp_opencl OPENCL basic SRCS bilinear_interp_image_compute.cc DEPS ${cl_kernel_deps})
# extra
# wait to add ...
@@ -68,6 +70,9 @@ lite_cc_test(test_elementwise_add_image_opencl SRCS elementwise_add_image_comput
lite_cc_test(test_grid_sampler_image_opencl SRCS grid_sampler_image_compute_test.cc
DEPS grid_sampler_opencl op_registry program context)
+lite_cc_test(test_lrn_image_opencl SRCS lrn_image_compute_test.cc
+DEPS lrn_opencl op_registry program context)
lite_cc_test(test_bilinear_interp_image_opencl SRCS bilinear_interp_image_compute_test.cc
DEPS bilinear_interp_opencl op_registry program context)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <string>
#include "lite/backends/opencl/cl_half.h"
#include "lite/backends/opencl/cl_image_converter.h"
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/logging.h"
#include "lite/utils/replace_stl/stream.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
class LrnImageCompute : public KernelLite<TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::LrnParam;
std::string doc() const override {
return "Lrn using cl::Image2D(ImageDefault/RGBA), kFP16";
}
void PrepareForRun() override {
lrn_param_ = param_.get_mutable<param_t>();
auto& context = ctx_->As<OpenCLContext>();
n_ = lrn_param_->n;
k_ = lrn_param_->k;
alpha_ = lrn_param_->alpha;
beta_ = lrn_param_->beta;
norm_region_ = lrn_param_->norm_region;
context.cl_context()->AddKernel(
kernel_func_name_, "image/lrn_kernel.cl", build_options_);
VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
}
void Run() override {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
auto* x = lrn_param_->X;
auto* out = lrn_param_->Out;
if (norm_region_ != "AcrossChannels") {
LOG(FATAL) << "norm_region " << norm_region_
<< " is not supported; only AcrossChannels is implemented.";
return;
}
auto out_dims = out->dims();
auto in_dims = x->dims();
VLOG(4) << "x->target(): " << TargetToStr(x->target());
VLOG(4) << "out->target(): " << TargetToStr(out->target());
VLOG(4) << "x->dims(): " << in_dims;
VLOG(4) << "lrn param: ";
VLOG(4) << "n: " << n_;
VLOG(4) << "k: " << k_;
VLOG(4) << "alpha: " << alpha_;
VLOG(4) << "beta: " << beta_;
VLOG(4) << "norm_region: " << norm_region_;
auto out_image_shape = InitImageDimInfoWith(out_dims);
auto* x_img = x->data<half_t, cl::Image2D>();
// VLOG(4) << "x_image: " << x_img;
auto* out_img = out->mutable_data<half_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
// VLOG(4) << "out_image" << out_img;
VLOG(4) << "out_image_shape[w,h]:" << out_image_shape["width"] << " "
<< out_image_shape["height"];
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int arg_idx = 0;
int out_channel = out_dims[1];
int out_width = out_dims[3];
auto default_work_size =
DefaultWorkSize(out_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
VLOG(4) << "default_work_size: " << default_work_size[0] << ", "
<< default_work_size[1] << ", " << default_work_size[2];
cl_int status = kernel.setArg(arg_idx++, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, *out_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, out_channel);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, out_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, n_);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, k_);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, alpha_);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, beta_);
CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(default_work_size[0]),
static_cast<cl::size_type>(default_work_size[1]),
static_cast<cl::size_type>(default_work_size[2])};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_img, event_);
VLOG(4) << "global_work_size:[3D]:" << global_work_size[0] << " "
<< global_work_size[1] << " " << global_work_size[2];
}
protected:
param_t* lrn_param_{nullptr};
int n_{5};
float alpha_{1e-4};
float beta_{0.75};
float k_{1.};
std::string norm_region_{"AcrossChannels"};
std::string kernel_func_name_{"lrn"};
std::string build_options_{"-DCL_DTYPE_half"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
namespace ocl = paddle::lite::kernels::opencl;
REGISTER_LITE_KERNEL(
lrn, kOpenCL, kFP16, kImageDefault, ocl::LrnImageCompute, ImageDefault)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <cmath>
#include <memory>
#include <random>
#include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/kernels/opencl/test_helper.h"
#define FP16_MAX_DIFF (5e-1)
namespace paddle {
namespace lite {
float lrn_square(const float* din,
int c,
int offset,
int channel,
int height,
int width,
int local_size) {
int pre_pad = (local_size - 1) / 2;
float sum = 0.f;
int start = c - pre_pad;
int end = c + pre_pad;
start = start < 0 ? 0 : start;
end = end < channel - 1 ? end : channel - 1;
for (int i = start; i <= end; i++) {
sum += din[i * height * width + offset] *
din[i * height * width + offset];
}
return sum;
}
void lrn_ref(const float* din,
const DDim& in_dims,
float* output,
int local_size,
float k,
float alpha,
float beta,
std::string norm_region) {
int num = in_dims[0];
int channel = in_dims[1];
int height = in_dims[2];
int width = in_dims[3];
if (norm_region == "AcrossChannels") {
for (int b = 0; b < num; b++) {
const float* din_batch = din + b * channel * height * width;
float* dout_batch = output + b * channel * height * width;
int offset_num = b * channel * height * width;
for (int c = 0; c < channel; c++) {
for (int h = 0; h < height; ++h) {
for (int w = 0; w < width; ++w) {
int offset_within_channel = h * width + w;
int dst_id = c * height * width + offset_within_channel;
float square = lrn_square(din_batch,
c,
offset_within_channel,
channel,
height,
width,
local_size);
dout_batch[dst_id] =
din_batch[dst_id] * pow(k + alpha * square, -beta);
}
}
}
}
}
}
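// As a quick sanity check of this reference: with local_size = 5, k = 1.0,
// alpha = 1e-4, beta = 0.75 and an all-ones input, every interior channel
// sees a window sum of 5, so the expected output is
// 1 / (1 + 5e-4)^0.75 ≈ 0.99963.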
// #define LRN_FP16_LOOP_TEST
// #define LRN_FP16_PRINT_RESULT
TEST(lrn_image2d, compute) {
#ifdef LRN_FP16_LOOP_TEST
for (int n = 1; n <= 100; n += 33) {
for (auto c : {1, 3, 8, 23, 32}) {
for (int h = 12; h <= 100; h += 13) {
for (int w = 12; w <= 100; w += 25) {
for (auto num : {3, 5, 9}) {
for (auto k : {1.0, 1.5}) {
for (auto alpha : {1e-4}) {
for (auto beta : {0.5, 0.75}) {
for (auto norm_region : {"AcrossChannels"}) {
#else
const int n = 1;
const int c = 5;
const int h = 2;
const int w = 4;
const int num = 5;
const float k = 1.0;
const float alpha = 1e-4;
const float beta = 0.75;
const std::string norm_region = "AcrossChannels";
#endif  // LRN_FP16_LOOP_TEST
LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " "
<< c << " " << h << " " << w << " ========";
LOG(INFO) << "LRN parameters: ";
LOG(INFO) << "num: " << num << ", k: " << k
<< ", alpha: " << alpha << ", beta: " << beta
<< ", norm_region: " << norm_region;
auto kernels = KernelRegistry::Global().Create(
"lrn",
TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty());
auto kernel = std::move(kernels.front());
LOG(INFO) << "get kernel:" << kernel->doc();
lite::Tensor x, out;
operators::LrnParam param;
param.X = &x;
param.Out = &out;
param.n = num;
param.k = k;
param.alpha = alpha;
param.beta = beta;
param.norm_region = norm_region;
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
kernel->SetParam(param);
std::unique_ptr<KernelContext> lrn_context(
new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(lrn_context->As<OpenCLContext>()));
kernel->SetContext(std::move(lrn_context));
const DDim in_dim =
DDim(std::vector<DDim::value_type>{n, c, h, w});
const DDim out_dim =
DDim(std::vector<DDim::value_type>{n, c, h, w});
x.Resize(in_dim);
out.Resize(out_dim);
std::default_random_engine engine;
std::uniform_real_distribution<float> dist(-1, 1);
const int in_elems = n * c * h * w;
std::vector<float> input_v(in_elems);
for (auto& i : input_v) {
i = dist(engine);
}
LOG(INFO) << "prepare input";
CLImageConverterDefault* default_converter =
new CLImageConverterDefault();
DDim x_image_shape =
default_converter->InitImageDimInfoWith(in_dim);
LOG(INFO) << "x_image_shape = " << x_image_shape[0] << " "
<< x_image_shape[1];
std::vector<half_t> x_image_data(
x_image_shape.production() * 4); // 4 : RGBA
default_converter->NCHWToImage(
input_v.data(), x_image_data.data(), in_dim);
auto* x_image = x.mutable_data<half_t, cl::Image2D>(
x_image_shape[0],
x_image_shape[1],
x_image_data.data());
// LOG(INFO) << "x_image:" << x_image;
DDim out_image_shape =
default_converter->InitImageDimInfoWith(out_dim);
LOG(INFO) << "out_image_shape = " << out_image_shape[0]
<< " " << out_image_shape[1];
auto* out_image = out.mutable_data<half_t, cl::Image2D>(
out_image_shape[0], out_image_shape[1]);
// LOG(INFO) << "out_image:" << out_image;
kernel->Launch();
auto* wait_list =
context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = param.Out->data<half_t, cl::Image2D>();
auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) {
VLOG(4) << "--- Find the sync event for the target cl "
"tensor. ---";
auto& event = *(it->second);
event.wait();
} else {
LOG(FATAL) << "Could not find the sync event for the "
"target cl tensor.";
}
std::unique_ptr<float[]> out_ref(
new float[out_dim.production()]);
lrn_ref(input_v.data(),
in_dim,
out_ref.get(),
num,
k,
alpha,
beta,
norm_region);
const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0};
half_t* out_image_data =
new half_t[out_image_shape.production() * 4];  // 4: RGBA
TargetWrapperCL::ImgcpySync(out_image_data,
out_image,
out_image_shape[0],
out_image_shape[1],
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
IoDirection::DtoH);
float* out_data = new float[out_dim.production()];
default_converter->ImageToNCHW(
out_image_data, out_data, out_image_shape, out_dim);
// result
#ifdef LRN_FP16_PRINT_RESULT
LOG(INFO)
<< "---- print kernel result (input -> output) ----";
for (int eidx = 0; eidx < in_dim.production(); ++eidx) {
std::cout << input_v[eidx] << " -> " << out_data[eidx]
<< std::endl;
}
#endif // LRN_FP16_PRINT_RESULT
for (int i = 0; i < out_dim.production(); i++) {
auto abs_diff = std::abs(out_data[i] - out_ref[i]);
auto relative_diff =
COMPUTE_RELATIVE_DIFF(out_data[i], out_ref[i]);
EXPECT_EQ((relative_diff <= FP16_MAX_DIFF) ||
(abs_diff <= FP16_MAX_DIFF),
true);
if ((relative_diff > FP16_MAX_DIFF) &&
(abs_diff > FP16_MAX_DIFF)) {
LOG(ERROR) << "error idx: " << i << ", input_v[" << i
<< "]: " << input_v[i] << ", output_data["
<< i << "]: " << out_data[i] << ", out_ref["
<< i << "]:" << out_ref[i]
<< " abs_diff:" << abs_diff
<< " relative_diff:" << relative_diff
<< " FP16_MAX_DIFF:" << FP16_MAX_DIFF;
}
}
#ifdef LRN_FP16_LOOP_TEST
} // norm_region
} // beta
} // alpha
} // k
} // num
} // w
} // h
} // c
} // n
#else
// nothing to do.
#endif
}
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(lrn, kOpenCL, kFP16, kImageDefault, ImageDefault);
@@ -236,10 +236,10 @@ void resize(const uint8_t* src,
"vorr.s32 q10, q12, q12 \n"
"vorr.s32 q11, q12, q12 \n"
-"vmull.s16 q0, d2, %[_b0] \n"
-"vmull.s16 q1, d3, %[_b0] \n"
-"vmull.s16 q2, d6, %[_b1] \n"
-"vmull.s16 q3, d7, %[_b1] \n"
+"vmull.s16 q0, d2, %e[_b0] \n"
+"vmull.s16 q1, d3, %e[_b0] \n"
+"vmull.s16 q2, d6, %e[_b1] \n"
+"vmull.s16 q3, d7, %e[_b1] \n"
"vsra.s32 q10, q0, #16 \n"
"vsra.s32 q11, q1, #16 \n"
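Context for the hunk above (the "fix v7 build error" part of this commit): on armv7, vmull.s16 takes 64-bit D registers as source operands, so when the inline-asm constraints bind _b0/_b1 to 128-bit Q registers, the plain %[_b0] substitution emits a Q register name and the assembler rejects it. The %e template modifier selects the low D half of a Q register (%f selects the high half), which is why the %e[_b0]/%e[_b1] forms assemble on v7.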