diff --git a/lite/backends/opencl/cl_kernel/image/box_coder_kernel.cl b/lite/backends/opencl/cl_kernel/image/box_coder_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..72b0b66f9737ce0ca9c740e6d4e399d06eaf2cd8 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/box_coder_kernel.cl @@ -0,0 +1,152 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +__kernel void decode_center_size(__read_only image2d_t prior_box_image, + __read_only image2d_t prior_box_var_image, + __read_only image2d_t target_box_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H){ + const int out_c = get_global_id(0); + const int out_nh = get_global_id(1); + const int out_h = out_nh % out_H; + const int out_n = 1; + + const int prior_box_n = 1; + const int prior_box_c = 0; + const int prior_box_h = out_h; + + const int prior_box_var_n = 1; + const int prior_box_var_c = 0; + const int prior_box_var_h = out_h; + + const int target_box_n = 1; + const int target_box_c = out_c; + const int target_box_h = out_h; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + int2 prior_box_pos; + int2 prior_box_var_pos; + int2 target_box_pos; + int2 output_pos; + + prior_box_pos.x = prior_box_c * 4; + prior_box_pos.y = prior_box_n * prior_box_h; + + prior_box_var_pos.x = prior_box_var_c * 4; + prior_box_var_pos.y = prior_box_var_n * prior_box_var_h; + + target_box_pos.x = target_box_c * 4; + target_box_pos.y = target_box_n * target_box_h; + + output_pos.x = out_c * 4; + output_pos.y = out_n * out_h; + + CL_DTYPE4 prior_box_input[4]; + CL_DTYPE4 prior_box_var_input[4]; + CL_DTYPE4 target_box_input[4]; + + prior_box_input[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_image, sampler, + (int2)(prior_box_pos.x + 0, prior_box_pos.y)); + prior_box_input[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_image, sampler, + (int2)(prior_box_pos.x + 1, prior_box_pos.y)); + prior_box_input[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_image, sampler, + (int2)(prior_box_pos.x + 2, prior_box_pos.y)); + prior_box_input[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_image, sampler, + (int2)(prior_box_pos.x + 3, prior_box_pos.y)); + + prior_box_var_input[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_var_image, sampler, + (int2)(prior_box_var_pos.x + 0, prior_box_var_pos.y)); + prior_box_var_input[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_var_image, sampler, + (int2)(prior_box_var_pos.x + 1, prior_box_var_pos.y)); + prior_box_var_input[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_var_image, sampler, + (int2)(prior_box_var_pos.x + 2, prior_box_var_pos.y)); + prior_box_var_input[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_var_image, sampler, + (int2)(prior_box_var_pos.x + 3, prior_box_var_pos.y)); + + target_box_input[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, target_box_image, sampler, + (int2)(target_box_pos.x + 0,target_box_pos.y)); + target_box_input[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, target_box_image, sampler, + (int2)(target_box_pos.x + 1, target_box_pos.y)); + target_box_input[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, target_box_image, sampler, + (int2)(target_box_pos.x + 2, target_box_pos.y)); + target_box_input[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, target_box_image, sampler, + (int2)(target_box_pos.x + 3, target_box_pos.y)); + + CL_DTYPE prior_box_width = prior_box_input[2].x - prior_box_input[0].x; + CL_DTYPE prior_box_height = prior_box_input[3].x - prior_box_input[1].x; + CL_DTYPE prior_box_center_x = (prior_box_input[2].x + prior_box_input[0].x)/(CL_DTYPE)2; + CL_DTYPE prior_box_center_y = (prior_box_input[3].x + prior_box_input[1].x)/(CL_DTYPE)2; + + CL_DTYPE4 target_box_center_x; + CL_DTYPE4 target_box_center_y; + CL_DTYPE4 target_box_width; + CL_DTYPE4 target_box_height; + CL_DTYPE4 output[4]; + + output[0] = 0.0f; + output[1] = 0.0f; + output[2] = 0.0f; + output[3] = 0.0f; + + target_box_center_x.x = prior_box_var_input[0].x * target_box_input[0].x * prior_box_width + prior_box_center_x; + target_box_center_y.x = prior_box_var_input[1].x * target_box_input[1].x * prior_box_height + prior_box_center_y; + target_box_width.x = exp(prior_box_var_input[2].x * target_box_input[2].x) * prior_box_width; + target_box_height.x = exp(prior_box_var_input[3].x * target_box_input[3].x) * prior_box_height; + + output[0].x = target_box_center_x.x - target_box_width.x/(half)2; + output[1].x = target_box_center_y.x - target_box_height.x/(half)2; + output[2].x = target_box_center_x.x + target_box_width.x/(half)2; + output[3].x = target_box_center_y.x + target_box_height.x/(half)2; + + if(out_C - out_c * 4 >= 2){ + target_box_center_x.y = prior_box_var_input[0].x * target_box_input[0].y * prior_box_width + prior_box_center_x; + target_box_center_y.y = prior_box_var_input[1].x * target_box_input[1].y * prior_box_height + prior_box_center_y; + target_box_width.y = exp(prior_box_var_input[2].x * target_box_input[2].y) * prior_box_width; + target_box_height.y = exp(prior_box_var_input[3].x * target_box_input[3].y) * prior_box_height; + output[0].y = target_box_center_x.y - target_box_width.y/(half)2; + output[1].y = target_box_center_y.y - target_box_height.y/(half)2; + output[2].y = target_box_center_x.y + target_box_width.y/(half)2; + output[3].y = target_box_center_y.y + target_box_height.y/(half)2; + } + if(out_C - out_c * 4 >= 3){ + target_box_center_x.z = prior_box_var_input[0].x * target_box_input[0].z * prior_box_width + prior_box_center_x; + target_box_center_y.z = prior_box_var_input[1].x * target_box_input[1].z * prior_box_height + prior_box_center_y; + target_box_width.z = exp(prior_box_var_input[2].x * target_box_input[2].z) * prior_box_width; + target_box_height.z = exp(prior_box_var_input[3].x * target_box_input[3].z) * prior_box_height; + output[0].z = target_box_center_x.z - target_box_width.z/(half)2; + output[1].z = target_box_center_y.z - target_box_height.z/(half)2; + output[2].z = target_box_center_x.z + target_box_width.z/(half)2; + output[3].z = target_box_center_y.z + target_box_height.z/(half)2; + } + if(out_C - out_c * 4 >= 4){ + target_box_center_x.w = prior_box_var_input[0].x * target_box_input[0].w * prior_box_width + prior_box_center_x; + target_box_center_y.w = prior_box_var_input[1].x * target_box_input[1].w * prior_box_height + prior_box_center_y; + target_box_width.w = exp(prior_box_var_input[2].x * target_box_input[2].w) * prior_box_width; + target_box_height.w = exp(prior_box_var_input[3].x * target_box_input[3].w) * prior_box_height; + output[0].w = target_box_center_x.w - target_box_width.w/(half)2; + output[1].w = target_box_center_y.w - target_box_height.w/(half)2; + output[2].w = target_box_center_x.w + target_box_width.w/(half)2; + output[3].w = target_box_center_y.w + target_box_height.w/(half)2; + } + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(output_pos.x + 0, output_pos.y), output[0]); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(output_pos.x + 1, output_pos.y), output[1]); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(output_pos.x + 2, output_pos.y), output[2]); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(output_pos.x + 3, output_pos.y), output[3]); +} diff --git a/lite/kernels/opencl/box_coder_image_compute.cc b/lite/kernels/opencl/box_coder_image_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..295317c34498adc5f152277aae898b95e6abe89d --- /dev/null +++ b/lite/kernels/opencl/box_coder_image_compute.cc @@ -0,0 +1,169 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "lite/backends/opencl/cl_half.h" +#include "lite/backends/opencl/cl_image_converter.h" +#include "lite/backends/opencl/cl_include.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/opencl/image_helper.h" +#include "lite/operators/op_params.h" +#include "lite/utils/logging.h" +#include "lite/utils/replace_stl/stream.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace opencl { + class BoxCoderComputeImage : public KernelLite { + public: + using param_t = operators::BoxCoderParam; + + void PrepareForRun() override { + auto& context = ctx_->As(); + boxcoder_param_ = param_.get_mutable(); + if (boxcoder_param_->code_type == "decode_center_size" && + boxcoder_param_->box_normalized == true) { + kernel_func_name_ = "decode_center_size"; + } else { + printf("This code_type %s doesn't support \n", boxcoder_param_->code_type.c_str()); + return; + } + CHECK(context.cl_context() != nullptr); + VLOG(1) << "kernel_func_name_:" << kernel_func_name_; + context.cl_context()->AddKernel( + kernel_func_name_, "image/box_coder_kernel.cl", build_options_); + } + + void Run() override { + boxcoder_param_ = param_.get_mutable(); + const auto& out_dims = boxcoder_param_->proposals->dims(); + auto image_shape = InitImageDimInfoWith(out_dims); + + auto* out_buf = boxcoder_param_->proposals->mutable_data( + image_shape["width"], image_shape["height"]); + +#ifndef LITE_SHUTDOWN_LOG + VLOG(4) << "boxcoder input shape: "; + +#endif + const auto* input_priorbox = boxcoder_param_->prior_box; + const auto* input_priorboxvar = boxcoder_param_->prior_box_var; + const auto* input_targetbox = boxcoder_param_->target_box; + const auto& code_type = boxcoder_param_->code_type; + if (code_type == "decode_center_size") { + auto* prior_box_image = input_priorbox->data(); + auto* prior_box_var_image = input_priorboxvar->data(); + auto* target_box_image = input_targetbox->data(); + + int new_dims[4] = {1, 1, 1, 1}; + for (int i = 0; i < out_dims.size(); i++) { + new_dims[4 - out_dims.size() + i] = out_dims[i]; + } + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); + STL::stringstream kernel_key; + kernel_key << kernel_func_name_ << build_options_; + auto kernel = context.cl_context()->GetKernel(kernel_key.str()); + + auto default_work_size = DefaultWorkSize(out_dims, + DDim(std::vector{ + static_cast(image_shape["width"]), + static_cast(image_shape["height"])})); + + int out_C = new_dims[1]; + int out_H = new_dims[2]; +#ifndef LITE_SHUTDOWN_LOG + VLOG(4) << TargetToStr(boxcoder_param_->proposals->target()); + VLOG(4) << "output shape: " << out_dims[0] << ", " << + out_dims[1] << ", " << + out_dims[2] << ", " << + out_dims[3]; + VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " " + << image_shape["height"]; + VLOG(4) << "out_C = " << out_C; + VLOG(4) << "out_H = " << out_H; + VLOG(4) << "default_work_size = " << default_work_size[0] << ", " + << default_work_size[1] << ", " << default_work_size[2]; +#endif + int arg_idx = 0; + cl_int status = kernel.setArg(arg_idx++, *prior_box_image); + CL_CHECK_FATAL(status); + status = kernel.setArg(arg_idx++, *prior_box_var_image); + CL_CHECK_FATAL(status); + status = kernel.setArg(arg_idx++, *target_box_image); + CL_CHECK_FATAL(status); + status = kernel.setArg(arg_idx++, *out_buf); + CL_CHECK_FATAL(status); + status = kernel.setArg(arg_idx++, out_C); + CL_CHECK_FATAL(status); + status = kernel.setArg(arg_idx++, out_H); + CL_CHECK_FATAL(status); + auto global_work_size = + cl::NDRange{static_cast(default_work_size[0]), + static_cast(default_work_size[2])}; + + status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( + kernel, + cl::NullRange, + global_work_size, + cl::NullRange, + nullptr, + event_.get()); + CL_CHECK_FATAL(status); + context.cl_wait_list()->emplace(out_buf, event_); + +#ifndef LITE_SHUTDOWN_LOG + VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " " + << global_work_size[1]; +#endif + } + } + std::string doc() { return "Boxcoder using cl::Image, kFP16"; } + + param_t* boxcoder_param_{nullptr}; + std::string kernel_func_name_{}; + std::string build_options_{" -DCL_DTYPE_half"}; + std::shared_ptr event_{new cl::Event}; +}; + +} // namespace opencl +} // namespace kernels +} // namespace lite +} // namespace paddle +typedef paddle::lite::kernels::opencl::BoxCoderComputeImage BoxCoder_image; + +REGISTER_LITE_KERNEL( + box_coder, kOpenCL, kFP16, kImageDefault, BoxCoder_image, ImageDefault) + .BindInput("PriorBox", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .BindInput("PriorBoxVar", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .BindInput("TargetBox", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .BindOutput("OutputBox", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .Finalize(); diff --git a/lite/kernels/opencl/box_coder_image_compute_test.cc b/lite/kernels/opencl/box_coder_image_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..426ecbbca46bb24b915f9b6c15374aef3e9b8efe --- /dev/null +++ b/lite/kernels/opencl/box_coder_image_compute_test.cc @@ -0,0 +1,279 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "lite/backends/opencl/target_wrapper.h" +#include "lite/core/op_registry.h" +#include "lite/core/tensor.h" +#include "lite/kernels/opencl/test_helper.h" + +#define FP16_MAX_DIFF (5e-1) +namespace paddle { +namespace lite { +void box_coder_ref(float* proposals_data, + const float* anchors_data, + const float* bbox_deltas_data, + const float* variances_data, + int axis, + bool box_normalized, + std::string code_type, + int row, + int col) { + if (code_type == "decode_center_size") { + int anchor_len = 4; + int out_len = 4; + int var_len = 4; + int delta_len = 4; + float normalized = !box_normalized ? 1.f : 0; + + for (int64_t row_id = 0; row_id < row; ++row_id) { + for (int64_t col_id = 0; col_id < col; ++col_id) { + size_t delta_offset = row_id * col * delta_len + col_id * delta_len; + size_t out_offset = row_id * col * out_len + col_id * out_len; + int prior_box_offset = + axis == 0 ? col_id * anchor_len : row_id * anchor_len; + int var_offset = axis == 0 ? col_id * var_len : row_id * var_len; + auto anchor_data_tmp = anchors_data + prior_box_offset; + auto bbox_deltas_data_tmp = bbox_deltas_data + delta_offset; + auto proposals_data_tmp = proposals_data + out_offset; + auto anchor_width = + anchor_data_tmp[2] - anchor_data_tmp[0] + normalized; + auto anchor_height = + anchor_data_tmp[3] - anchor_data_tmp[1] + normalized; + auto anchor_center_x = anchor_data_tmp[0] + 0.5 * anchor_width; + auto anchor_center_y = anchor_data_tmp[1] + 0.5 * anchor_height; + float bbox_center_x = 0, bbox_center_y = 0; + float bbox_width = 0, bbox_height = 0; + + auto variances_data_tmp = variances_data + var_offset; + bbox_center_x = + variances_data_tmp[0] * bbox_deltas_data_tmp[0] * anchor_width + + anchor_center_x; + bbox_center_y = + variances_data_tmp[1] * bbox_deltas_data_tmp[1] * anchor_height + + anchor_center_y; + bbox_width = std::exp(variances_data_tmp[2] * bbox_deltas_data_tmp[2]) * + anchor_width; + bbox_height = + std::exp(variances_data_tmp[3] * bbox_deltas_data_tmp[3]) * + anchor_height; + proposals_data_tmp[0] = bbox_center_x - bbox_width / 2; + proposals_data_tmp[1] = bbox_center_y - bbox_height / 2; + proposals_data_tmp[2] = bbox_center_x + bbox_width / 2 - normalized; + proposals_data_tmp[3] = bbox_center_y + bbox_height / 2 - normalized; + } + } + } else if (code_type == "encode_center_size") { + LOG(FATAL) << "not implemented type: " << code_type; + } else { + LOG(FATAL) << "not supported type: " << code_type; + } +} +// #define BOXCODER_FP16_LOOP_TEST +// #define BOXCODER_FP16_PRINT_RESULT +TEST(box_coder_image2d, compute) { +#ifdef BOXCODER_FP16_LOOP_TEST + for (auto n : {1, 2, 3, 4}) { + for (auto m : {1, 3, 4, 8}) { + for (auto norm : {true}) { + for (auto code_type : {"decode_center_size"}) { + for (auto axis : {0}) { +#else + const int n = 1; + const int m = 1; + const bool norm = true; + const std::string code_type = "decode_center_size"; + const int axis = 0; +#endif // BOXCODER_FP16_LOOP_TEST + + LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << m + << " ========"; + LOG(INFO) << "======== parameters: norm = " << norm + << ", axis = " << axis << "code_type: " << code_type; + + auto kernels = KernelRegistry::Global().Create( + "box_coder", + TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault)); + ASSERT_FALSE(kernels.empty()); + auto kernel = std::move(kernels.front()); + LOG(INFO) << "get kernel:" << kernel->doc(); + + lite::Tensor prior_box, prior_box_var, target_box, output_box; + operators::BoxCoderParam param; + param.prior_box = &prior_box; + param.prior_box_var = &prior_box_var; + param.target_box = &target_box; + param.proposals = &output_box; + param.axis = axis; + param.box_normalized = norm; + param.code_type = code_type; + + std::unique_ptr context(new KernelContext); + context->As().InitOnce(); + + kernel->SetParam(param); + std::unique_ptr boxcoder_context( + new KernelContext); + context->As().CopySharedTo( + &(boxcoder_context->As())); + kernel->SetContext(std::move(boxcoder_context)); + + const DDim prior_box_dims = + DDim(std::vector{1, 1, m, 4}); + const DDim prior_box_var_dims = DDim(std::vector{1, 1, m, 4}); + const DDim target_box_dims = DDim(std::vector{1, n, m, 4}); + + const DDim out_dim = + DDim(std::vector{1, n, m, 4}); + prior_box.Resize(prior_box_dims); + prior_box_var.Resize(prior_box_var_dims); + target_box.Resize(target_box_dims); + output_box.Resize(out_dim); + + std::vector prior_box_data(prior_box_dims.production()); + std::vector prior_box_var_data(prior_box_var_dims.production()); + std::vector target_box_data(target_box_dims.production()); + for (int i = 0; i < prior_box_dims.production(); i++) { + prior_box_data[i] = i * 1.1 / prior_box_dims.production(); + } + for (int i = 0; i < prior_box_var_dims.production(); i++) { + prior_box_var_data[i] = i * 1.2 / prior_box_var_dims.production(); + } + for (int i = 0; i < target_box_dims.production(); i++) { + target_box_data[i] = i * 1.3 / target_box_dims.production(); + } + + LOG(INFO) << "prepare input"; + CLImageConverterDefault* default_converter = + new CLImageConverterDefault(); + DDim prior_box_image_shape = + default_converter->InitImageDimInfoWith(prior_box_dims); + LOG(INFO) << "prior_box_image_shape = " << prior_box_image_shape[0] << " " + << prior_box_image_shape[1]; + std::vector prior_box_image_data(prior_box_image_shape.production() * + 4); // 4 : RGBA + default_converter->NCHWToImage( + prior_box_data.data(), prior_box_image_data.data(), prior_box_dims); + auto* prior_box_image = prior_box.mutable_data( + prior_box_image_shape[0], prior_box_image_shape[1], prior_box_image_data.data()); + + DDim prior_box_var_image_shape = + default_converter->InitImageDimInfoWith(prior_box_var_dims); + LOG(INFO) << "prior_box_var_image_shape = " << prior_box_var_image_shape[0] << " " + << prior_box_var_image_shape[1]; + std::vector prior_box_var_image_data(prior_box_var_image_shape.production() * + 4); // 4 : RGBA + default_converter->NCHWToImage( + prior_box_var_data.data(), prior_box_var_image_data.data(), prior_box_var_dims); + auto* prior_box_var_image = prior_box_var.mutable_data( + prior_box_var_image_shape[0], prior_box_var_image_shape[1], prior_box_var_image_data.data()); + + DDim target_box_image_shape = + default_converter->InitImageDimInfoWith(target_box_dims); + LOG(INFO) << "target_box_image_shape = " << target_box_image_shape[0] << " " + << target_box_image_shape[1]; + std::vector target_box_image_data(target_box_image_shape.production() * + 4); // 4 : RGBA + default_converter->NCHWToImage( + target_box_data.data(), target_box_image_data.data(), target_box_dims); + auto* target_box_image = target_box.mutable_data( + target_box_image_shape[0], target_box_image_shape[1], target_box_image_data.data()); + + DDim out_image_shape = + default_converter->InitImageDimInfoWith(out_dim); + LOG(INFO) << "out_image_shape = " << out_image_shape[0] << " " + << out_image_shape[1]; + auto* out_image = output_box.mutable_data( + out_image_shape[0], out_image_shape[1]); + kernel->Launch(); + + auto* wait_list = context->As().cl_wait_list(); + auto* out_ptr = param.proposals->data(); + auto it = wait_list->find(out_ptr); + if (it != wait_list->end()) { + VLOG(4) << "--- Find the sync event for the target cl " + "tensor. ---"; + auto& event = *(it->second); + event.wait(); + } else { + LOG(FATAL) << "Could not find the sync event for the " + "target cl tensor."; + } + + lite::Tensor out_ref_tensor; + out_ref_tensor.Resize(out_dim); + box_coder_ref(out_ref_tensor.mutable_data(), prior_box_data.data(), target_box_data.data(), + prior_box_var_data.data(), axis, norm, code_type, target_box_dims[0], target_box_dims[1]); + + const size_t cl_image2d_row_pitch{0}; + const size_t cl_image2d_slice_pitch{0}; + half_t* out_image_data = + new half_t[out_image_shape.production() * 4]; + TargetWrapperCL::ImgcpySync(out_image_data, + out_image, + out_image_shape[0], + out_image_shape[1], + cl_image2d_row_pitch, + cl_image2d_slice_pitch, + IoDirection::DtoH); + float* out_data = new float[out_image_shape.production() * 4]; + default_converter->ImageToNCHW( + out_image_data, out_data, out_image_shape, out_dim); +// result +#ifdef BOXCODER_FP16_PRINT_RESULT + LOG(INFO) + << "---- print kernel result (input -> output) ----"; + for (int eidx = 0; eidx < out_dim.production(); ++eidx) { + std::cout << target_box_data[eidx] << " -> " << out_data[eidx] + << std::endl; + } +#endif // BOXCODER_FP16_PRINT_RESULT + const float* out_ref = out_ref_tensor.data(); + for (int i = 0; i < out_dim.production(); i++) { + auto abs_diff = abs(out_data[i] - out_ref[i]); + auto relative_diff = + COMPUTE_RELATIVE_DIFF(out_data[i], out_ref[i]); + EXPECT_EQ((relative_diff <= FP16_MAX_DIFF) || + (abs_diff <= FP16_MAX_DIFF), + true); + if ((relative_diff > FP16_MAX_DIFF) && + (abs_diff > FP16_MAX_DIFF)) { + LOG(ERROR) << "error idx:" << i << ", in_data[" << i + << "]: " << target_box_data[i] << ", out_data[" << i + << "]: " << out_data[i] << ", out_ref[" << i + << "]: " << out_ref[i] + << ", abs_diff: " << abs_diff + << ", relative_diff: " << relative_diff + << ", FP16_MAX_DIFF: " << FP16_MAX_DIFF; + } + } +#ifdef BOXCODER_FP16_LOOP_TEST + } // axis + } // code_type + } // norm + } // m + } // n +#else +// nothing to do. +#endif +} + +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(box_coder, kOpenCL, kFP16, kImageDefault, ImageDefault);