diff --git a/lite/backends/opencl/cl_kernel/image/lrn_kernel.cl b/lite/backends/opencl/cl_kernel/image/lrn_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..655a2657e07c419d4e50aed0e78cb8c37afa4b2a --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/lrn_kernel.cl @@ -0,0 +1,159 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +__kernel void lrn(__read_only image2d_t input, + __write_only image2d_t output, + __private const int out_C, + __private const int out_W, + __private const int local_size, + __private const float k, + __private const float alpha, + __private const float beta){ + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + const int out_c0 = out_c * 4; + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + const int out_c1 = out_c0 + 1; + const int out_c2 = out_c0 + 2; + const int out_c3 = out_c0 + 3; + + const int pad = (local_size - 1) / 2; + const int start = out_c0 - pad; + const int end = out_c0 + pad; + start = start > 0 ? start : 0; + end = end < out_C - 1 ? end : out_C - 1; + float square0 = 0.0; + float square1 = 0.0; + float square2 = 0.0; + float square3 = 0.0; + for (int i = start; i <= end; i++){ + int input_c0 = i / 4; + int2 input_pos; + input_pos.x = input_c0 * out_C + out_w; + input_pos.y = out_nh; + CL_DTYPE4 input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos); + int num = i % 4; + switch (num){ + case 0: + square0 += input_data.x * input_data.x; + break; + case 1: + square0 += input_data.y * input_data.y; + break; + case 2: + square0 += input_data.z * input_data.z; + break; + case 3: + square0 += input_data.w * input_data.w; + break; + } + } + start = out_c1 - pad; + end = out_c1 + pad; + for (int i = start; i <= end; i++){ + int input_c0 = i / 4; + int2 input_pos; + input_pos.x = input_c0 * out_C + out_w; + input_pos.y = out_nh; + CL_DTYPE4 input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos); + int num = i % 4; + switch (num){ + case 0: + square1 += input_data.x * input_data.x; + break; + case 1: + square1 += input_data.y * input_data.y; + break; + case 2: + square1 += input_data.z * input_data.z; + break; + case 3: + square1 += input_data.w * input_data.w; + break; + } + } + start = out_c2 - pad; + end = out_c2 + pad; + for (int i = start; i <= end; i++){ + int input_c0 = i / 4; + int2 input_pos; + input_pos.x = input_c0 * out_C + out_w; + input_pos.y = out_nh; + CL_DTYPE4 input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos); + int num = i % 4; + switch (num){ + case 0: + square2 += input_data.x * input_data.x; + break; + case 1: + square2 += input_data.y * input_data.y; + break; + case 2: + square2 += input_data.z * input_data.z; + break; + case 3: + square2 += input_data.w * input_data.w; + break; + } + } + start = out_c3 - pad; + end = out_c3 + pad; + for (int i = start; i <= end; i++){ + int input_c0 = i / 4; + int2 input_pos; + input_pos.x = input_c0 * out_C + out_w; + input_pos.y = out_nh; + CL_DTYPE4 input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos); + int num = i % 4; + switch (num){ + case 0: + square3 += input_data.x * input_data.x; + break; + case 1: + square3 += input_data.y * input_data.y; + break; + case 2: + square3 += input_data.z * input_data.z; + break; + case 3: + square3 += input_data.w * input_data.w; + break; + } + } + int2 out_pos; + out_pos.x = out_c * out_W + out_w; + out_pos.y = out_nh; + CL_DTYPE4 input = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, out_pos); + + float4 out_val; + out_val.x = input.x / (pow(k + alpha * (square0), beta)); + if (out_c1 < out_C){ + out_val.y = input.y / (pow(k + alpha * (square1), beta)); + } + if (out_c2 < out_C){ + out_val.z = input.z / (pow(k + alpha * (square1), beta)); + } + if (out_c3 < out_C){ + out_val.w = input.w / (pow(k + alpha * (square1), beta)); + } + CL_DTYPE4 out_data = CONVERT_TYPE_TO(out_val, CL_DTYPE4); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, out_pos, out_data); +} diff --git a/lite/kernels/opencl/CMakeLists.txt b/lite/kernels/opencl/CMakeLists.txt index c11653f7212941c739f0e0b2152bd96d2fa1b11c..5d00e05f69549c58876a1dd9786f09b54d8cf6eb 100644 --- a/lite/kernels/opencl/CMakeLists.txt +++ b/lite/kernels/opencl/CMakeLists.txt @@ -23,7 +23,9 @@ add_kernel(concat_opencl OPENCL basic SRCS concat_image_compute.cc DEPS ${cl_ker add_kernel(nearest_interp_opencl OPENCL basic SRCS nearest_interp_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(scale_opencl OPENCL basic SRCS scale_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(grid_sampler_opencl OPENCL basic SRCS grid_sampler_image_compute.cc DEPS ${cl_kernel_deps}) +add_kernel(lrn_opencl OPENCL basic SRCS lrn_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(bilinear_interp_opencl OPENCL basic SRCS bilinear_interp_image_compute.cc DEPS ${cl_kernel_deps}) + # extra # wait to add ... @@ -68,6 +70,9 @@ lite_cc_test(test_elementwise_add_image_opencl SRCS elementwise_add_image_comput lite_cc_test(test_grid_sampler_image_opencl SRCS grid_sampler_image_compute_test.cc DEPS grid_sampler_opencl op_registry program context) + +lite_cc_test(test_lrn_image_opencl SRCS lrn_image_compute_test.cc + DEPS lrn_opencl op_registry program context) lite_cc_test(test_bilinear_interp_image_opencl SRCS bilinear_interp_image_compute_test.cc DEPS bilinear_interp_opencl op_registry program context) diff --git a/lite/kernels/opencl/lrn_image_compute.cc b/lite/kernels/opencl/lrn_image_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..bb19e044ae4a7b296fbace00797b0c05521c8adb --- /dev/null +++ b/lite/kernels/opencl/lrn_image_compute.cc @@ -0,0 +1,166 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "lite/backends/opencl/cl_half.h" +#include "lite/backends/opencl/cl_image_converter.h" +#include "lite/backends/opencl/cl_include.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/opencl/image_helper.h" +#include "lite/operators/op_params.h" +#include "lite/utils/logging.h" +#include "lite/utils/replace_stl/stream.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace opencl { +class LrnImageCompute : public KernelLite { + public: + using param_t = operators::LrnParam; + + std::string doc() const override { + return "Lrn using cl::Image2D(ImageDefault/RGBA), kFP16"; + } + + void PrepareForRun() override { + lrn_param_ = param_.get_mutable(); + + auto& context = ctx_->As(); + n_ = lrn_param_->n; + k_ = lrn_param_->k; + alpha_ = lrn_param_->alpha; + beta_ = lrn_param_->beta; + norm_region_ = lrn_param_->norm_region; + context.cl_context()->AddKernel( + kernel_func_name_, "image/lrn_kernel.cl", build_options_); + VLOG(1) << "kernel_func_name_:" << kernel_func_name_; + } + + void Run() override { + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); + + auto* x = lrn_param_->X; + auto* out = lrn_param_->Out; + if (norm_region_ != "AcrossChannels") { + LOG(FATAL) << "This norm_region_: " << norm_region_ << "doesn't support"; + return; + } + auto out_dims = out->dims(); + auto in_dims = x->dims(); + + VLOG(4) << "x->target(): " << TargetToStr(x->target()); + VLOG(4) << "out->target(): " << TargetToStr(out->target()); + VLOG(4) << "x->dims(): " << in_dims; + VLOG(4) << "lrn param: "; + VLOG(4) << "n: " << n_; + VLOG(4) << "k: " << k_; + VLOG(4) << "alpha: " << alpha_; + VLOG(4) << "beta: " << beta_; + VLOG(4) << "norm_region: " << norm_region_; + + auto out_image_shape = InitImageDimInfoWith(out_dims); + auto* x_img = x->data(); + // VLOG(4) << "x_image: " << x_img; + + auto* out_img = out->mutable_data( + out_image_shape["width"], out_image_shape["height"]); + // VLOG(4) << "out_image" << out_img; + VLOG(4) << "out_image_shape[w,h]:" << out_image_shape["width"] << " " + << out_image_shape["height"]; + + STL::stringstream kernel_key; + kernel_key << kernel_func_name_ << build_options_; + auto kernel = context.cl_context()->GetKernel(kernel_key.str()); + + int arg_idx = 0; + int out_channel = out_dims[1]; + int out_width = out_dims[3]; + auto default_work_size = + DefaultWorkSize(out_dims, + DDim(std::vector{ + static_cast(out_image_shape["width"]), + static_cast(out_image_shape["height"])})); + VLOG(4) << "default_work_size: " << default_work_size[0] << ", " + << default_work_size[1] << ", " << default_work_size[3]; + cl_int status = kernel.setArg(arg_idx++, *x_img); + CL_CHECK_FATAL(status); + status = kernel.setArg(arg_idx++, *out_img); + CL_CHECK_FATAL(status); + status = kernel.setArg(arg_idx++, out_channel); + CL_CHECK_FATAL(status); + status = kernel.setArg(arg_idx++, out_width); + CL_CHECK_FATAL(status); + status = kernel.setArg(arg_idx++, n_); + CL_CHECK_FATAL(status); + status = kernel.setArg(arg_idx++, k_); + CL_CHECK_FATAL(status); + status = kernel.setArg(arg_idx++, alpha_); + CL_CHECK_FATAL(status); + status = kernel.setArg(arg_idx++, beta_); + CL_CHECK_FATAL(status); + + auto global_work_size = + cl::NDRange{static_cast(default_work_size[0]), + static_cast(default_work_size[1]), + static_cast(default_work_size[2])}; + + status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( + kernel, + cl::NullRange, + global_work_size, + cl::NullRange, + nullptr, + event_.get()); + CL_CHECK_FATAL(status); + context.cl_wait_list()->emplace(out_img, event_); + + VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " " + << global_work_size[1] << " " << global_work_size[2]; + } + + protected: + param_t* lrn_param_{nullptr}; + int n_{5}; + float alpha_{1e-4}; + float beta_{0.75}; + float k_{1.}; + std::string norm_region_{"AcrossChannels"}; + std::string kernel_func_name_{"lrn"}; + std::string build_options_{"-DCL_DTYPE_half"}; + std::shared_ptr event_{new cl::Event}; +}; + +} // namespace opencl +} // namespace kernels +} // namespace lite +} // namespace paddle + +namespace ocl = paddle::lite::kernels::opencl; +REGISTER_LITE_KERNEL( + lrn, kOpenCL, kFP16, kImageDefault, ocl::LrnImageCompute, ImageDefault) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .Finalize(); diff --git a/lite/kernels/opencl/lrn_image_compute_test.cc b/lite/kernels/opencl/lrn_image_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..9a0fbabbe5f538b09e8ac6e694e96aa512ea6aa3 --- /dev/null +++ b/lite/kernels/opencl/lrn_image_compute_test.cc @@ -0,0 +1,270 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "lite/backends/opencl/target_wrapper.h" +#include "lite/core/op_registry.h" +#include "lite/core/tensor.h" +#include "lite/kernels/opencl/test_helper.h" + +#define FP16_MAX_DIFF (5e-1) + +namespace paddle { +namespace lite { +float lrn_square(const float* din, + int c, + int offset, + int channel, + int height, + int width, + int local_size) { + int pre_pad = (local_size - 1) / 2; + float sum = 0.f; + int start = c - pre_pad; + int end = c + pre_pad; + start = start < 0 ? 0 : start; + end = end < channel - 1 ? end : channel - 1; + for (int i = start; i <= end; i++) { + sum += din[i * height * width] * din[i * height * width]; + } + return sum; +} +void lrn_ref(const float* din, + const DDim& in_dims, + float* output, + int local_size, + float k, + float alpha, + float beta, + std::string norm_region) { + int num = in_dims[0]; + int channel = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + + if (norm_region == "AcrossChannels") { + for (int b = 0; b < num; b++) { + const float* din_batch = din + b * channel * height * width; + float* dout_batch = output + b * channel * height * width; + int offset_num = b * channel * height * width; + for (int c = 0; c < channel; c++) { + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + int offset_within_channel = h * width + w; + int dst_id = c * height * width + offset_within_channel; + float square = lrn_square(din_batch, + c, + offset_within_channel, + channel, + height, + width, + local_size); + dout_batch[dst_id] = + din_batch[dst_id] * pow(k + alpha * square, -beta); + } + } + } + } + } +} +// #define LRN_FP16_LOOP_TEST +// #define LRN_FP16_PRINT_RESULT +TEST(lrn_image2d, compute) { +#ifdef LRN_FP16_LOOP_TEST + for (int n = 1; n <= 100; n += 33) { + for (auto c : {1, 3, 8, 23, 32}) { + for (int h = 12; h <= 100; h += 13) { + for (int w = 12; w <= 100; w += 25) { + for (auto num : {3, 5, 9}) { + for (auto k : {1.0, 1.5}) { + for (auto alpha : {1e-4}) { + for (auto beta : {0.5, 0.75}) { + for (auto norm_region : {"AcrossChannels"}) { +#else + const int n = 1; + const int c = 5; + const int h = 2; + const int w = 4; + const int num = 5; + const float k = 1.0; + const float alpha = 1e-4; + const float beta = 0.75; + const std::string norm_region = "AcrossChannels"; +#endif // GRID_FP16_LOOP_TEST + + LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " + << c << " " << h << " " << w << " ========"; + LOG(INFO) << "LRN parameters: "; + LOG(INFO) << "num: " << num << ", k: " << k + << ", alpha: " << alpha << ", beta: " << beta + << ", norm_region: " << norm_region; + auto kernels = KernelRegistry::Global().Create( + "lrn", + TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault)); + ASSERT_FALSE(kernels.empty()); + auto kernel = std::move(kernels.front()); + LOG(INFO) << "get kernel:" << kernel->doc(); + + lite::Tensor x, out; + operators::LrnParam param; + param.X = &x; + param.Out = &out; + param.n = num; + param.k = k; + param.alpha = alpha; + param.beta = beta; + param.norm_region = norm_region; + + std::unique_ptr context(new KernelContext); + context->As().InitOnce(); + + kernel->SetParam(param); + std::unique_ptr lrn_context( + new KernelContext); + context->As().CopySharedTo( + &(lrn_context->As())); + kernel->SetContext(std::move(lrn_context)); + + const DDim in_dim = + DDim(std::vector{n, c, h, w}); + const DDim out_dim = + DDim(std::vector{n, c, h, w}); + x.Resize(in_dim); + out.Resize(out_dim); + + std::default_random_engine engine; + std::uniform_real_distribution dist(-1, 1); + int sum = n * c * h * w; + std::vector input_v(sum); + for (auto& i : input_v) { + i = dist(engine); + } + + LOG(INFO) << "prepare input"; + CLImageConverterDefault* default_converter = + new CLImageConverterDefault(); + DDim x_image_shape = + default_converter->InitImageDimInfoWith(in_dim); + LOG(INFO) << "x_image_shape = " << x_image_shape[0] << " " + << x_image_shape[1]; + std::vector x_image_data( + x_image_shape.production() * 4); // 4 : RGBA + default_converter->NCHWToImage( + input_v.data(), x_image_data.data(), in_dim); + auto* x_image = x.mutable_data( + x_image_shape[0], + x_image_shape[1], + x_image_data.data()); + // LOG(INFO) << "x_image:" << x_image; + + DDim out_image_shape = + default_converter->InitImageDimInfoWith(out_dim); + LOG(INFO) << "out_image_shape = " << out_image_shape[0] + << " " << out_image_shape[1]; + auto* out_image = out.mutable_data( + out_image_shape[0], out_image_shape[1]); + // LOG(INFO) << "out_image:" << out_image; + kernel->Launch(); + + auto* wait_list = + context->As().cl_wait_list(); + auto* out_ptr = param.Out->data(); + auto it = wait_list->find(out_ptr); + if (it != wait_list->end()) { + VLOG(4) << "--- Find the sync event for the target cl " + "tensor. ---"; + auto& event = *(it->second); + event.wait(); + } else { + LOG(FATAL) << "Could not find the sync event for the " + "target cl tensor."; + } + + std::unique_ptr out_ref( + new float[out_dim.production()]); + lrn_ref(input_v.data(), + in_dim, + out_ref.get(), + num, + k, + alpha, + beta, + norm_region); + + const size_t cl_image2d_row_pitch{0}; + const size_t cl_image2d_slice_pitch{0}; + half_t* out_image_data = + new half_t[40000]; // out_image_shape.production() * + // 4]; + TargetWrapperCL::ImgcpySync(out_image_data, + out_image, + out_image_shape[0], + out_image_shape[1], + cl_image2d_row_pitch, + cl_image2d_slice_pitch, + IoDirection::DtoH); + float* out_data = + new float[40000]; // out_image_shape.production() * 4]; + default_converter->ImageToNCHW( + out_image_data, out_data, out_image_shape, out_dim); +// result +#ifdef LRN_FP16_PRINT_RESULT + LOG(INFO) + << "---- print kernel result (input -> output) ----"; + for (int eidx = 0; eidx < in_dim.production(); ++eidx) { + std::cout << input_v[eidx] << " -> " << out_data[eidx] + << std::endl; + } +#endif // LRN_FP16_PRINT_RESULT + for (int i = 0; i < out_dim.production(); i++) { + auto abs_diff = abs(out_data[i] - out_ref[i]); + auto relative_diff = + COMPUTE_RELATIVE_DIFF(out_data[i], out_ref[i]); + EXPECT_EQ((relative_diff <= FP16_MAX_DIFF) || + (abs_diff <= FP16_MAX_DIFF), + true); + if ((relative_diff > FP16_MAX_DIFF) && + (abs_diff > FP16_MAX_DIFF)) { + LOG(ERROR) << "error idx: " << i << ", input_v[" << i + << "]: " << input_v[i] << ", output_data[" + << i << "]: " << out_data[i] << ", out_ref[" + << i << "]:" << out_ref[i] + << " abs_diff:" << abs_diff + << " relative_diff:" << relative_diff + << " FP16_MAX_DIFF:" << FP16_MAX_DIFF; + } + } +#ifdef LRN_FP16_LOOP_TEST + } // norm_region + } // beta + } // alpha + } // k + } // num + } // w + } // h + } // c + } // n +#else +// nothing to do. +#endif +} + +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(lrn, kOpenCL, kFP16, kImageDefault, ImageDefault); diff --git a/lite/utils/cv/image_resize.cc b/lite/utils/cv/image_resize.cc index 39c50e78dd76a47e9e0789b91e615f20297d9f70..1baef9de2e636ade8630d76dce14e7cfc1ee25f5 100644 --- a/lite/utils/cv/image_resize.cc +++ b/lite/utils/cv/image_resize.cc @@ -236,10 +236,10 @@ void resize(const uint8_t* src, "vorr.s32 q10, q12, q12 \n" "vorr.s32 q11, q12, q12 \n" - "vmull.s16 q0, d2, %[_b0] \n" - "vmull.s16 q1, d3, %[_b0] \n" - "vmull.s16 q2, d6, %[_b1] \n" - "vmull.s16 q3, d7, %[_b1] \n" + "vmull.s16 q0, d2, %e[_b0] \n" + "vmull.s16 q1, d3, %e[_b0] \n" + "vmull.s16 q2, d6, %e[_b1] \n" + "vmull.s16 q3, d7, %e[_b1] \n" "vsra.s32 q10, q0, #16 \n" "vsra.s32 q11, q1, #16 \n"