提交 d501487a 编写于 作者: H HappyAngel 提交者: GitHub

【OpenCL】add bilinear_interp op (#3088)



* fix

* fix format. test=develop

* fix format, test=develop

* fix resize error

* fix format. test=develop

* fix format, test=develop

* fix format

* note log(4) << x_image. test=develop

* note x_image printf. test=develop
上级 cc227f27
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cl_common.h>
__kernel void bilinear_interp(__read_only image2d_t input,
__write_only image2d_t output,
__private const float scale_h,
__private const float scale_w,
__private const float align_delta,
__private const int in_dims_h,
__private const int in_dims_w,
__private const int out_dims_h,
__private const int out_dims_w){
const int c = get_global_id(0);
const int w = get_global_id(1);
const int nh = get_global_id(2);
int2 output_pos;
output_pos.x = c * out_dims_w + w;
output_pos.y = nh;
// calculate center pixel's pos
int out_n = nh / out_dims_h;
int out_h = nh % out_dims_h;
float center_w = (w + align_delta) * scale_w - align_delta;
float center_h = (out_h + align_delta) * scale_h - align_delta;
int floor_w = (int)center_w;
int floor_h = (int)center_h;
int ceil_w = floor_w + 1;
int ceil_h = floor_h + 1;
if (floor_w < 0){
floor_w = 0;
}
if (floor_h < 0){
floor_h = 0;
}
if (ceil_w > in_dims_w - 1) {
ceil_w = in_dims_w - 1;
}
if (ceil_h > in_dims_h - 1) {
ceil_h = in_dims_h- 1;
}
float wight0_w = center_w - floor_w;
float wight0_h = center_h - floor_h;
float wight1_w = 1.0 - wight0_w;
float wight1_h = 1.0 - wight0_h;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
// get left up pixel data
int2 left_up;
left_up.x = c * in_dims_w + floor_w;
left_up.y = out_n * in_dims_h + ceil_h;
CL_DTYPE4 left_up_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, left_up);
// get left down pixel data
int2 left_down;
left_down.x = c * in_dims_w + floor_w;
left_down.y = out_n * in_dims_h + floor_h;
CL_DTYPE4 left_down_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, left_down);
// get right up pixel data
int2 right_up;
right_up.x = c * in_dims_w + ceil_w;
right_up.y = out_n * in_dims_h + ceil_h;
CL_DTYPE4 right_up_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, right_up);
// get right down pixel's data
int2 right_down;
right_down.x = c * in_dims_w + ceil_w;
right_down.y = out_n * in_dims_h + floor_h;
CL_DTYPE4 right_down_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, right_down);
// calculate output data
CL_DTYPE4 out = (left_down_data * wight1_w + right_down_data * wight0_w) * wight1_h
+ (left_up_data * wight1_w + right_up_data * wight0_w) * wight0_h;
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, out);
}
...@@ -28,7 +28,7 @@ OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/armeabi-v7a/include ...@@ -28,7 +28,7 @@ OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/armeabi-v7a/include
CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include
#CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYSTEM_LIBS) #CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS)
############################################################### ###############################################################
# How to use one of static libaray: # # How to use one of static libaray: #
......
...@@ -28,7 +28,7 @@ OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/arm64-v8a/include ...@@ -28,7 +28,7 @@ OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/arm64-v8a/include
CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include
#CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYSTEM_LIBS) #CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS)
############################################################### ###############################################################
# How to use one of static libaray: # # How to use one of static libaray: #
# `libpaddle_api_full_bundled.a` # # `libpaddle_api_full_bundled.a` #
......
...@@ -17,7 +17,7 @@ example: ...@@ -17,7 +17,7 @@ example:
wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz
tar zxvf mobilenet_v1.tar.gz tar zxvf mobilenet_v1.tar.gz
./lite/tools/build.sh build_optimize_tool ./lite/tools/build.sh build_optimize_tool
./build.model_optimize_tool/lite/api/model_optimize_tool ./build.opt/lite/api/opt
--optimize_out_type=naive_buffer --optimize_out_type=naive_buffer
--optimize_out=model_dir --optimize_out=model_dir
--model_dir=model_dir --model_dir=model_dir
......
...@@ -23,7 +23,7 @@ add_kernel(concat_opencl OPENCL basic SRCS concat_image_compute.cc DEPS ${cl_ker ...@@ -23,7 +23,7 @@ add_kernel(concat_opencl OPENCL basic SRCS concat_image_compute.cc DEPS ${cl_ker
add_kernel(nearest_interp_opencl OPENCL basic SRCS nearest_interp_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(nearest_interp_opencl OPENCL basic SRCS nearest_interp_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(scale_opencl OPENCL basic SRCS scale_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(scale_opencl OPENCL basic SRCS scale_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(grid_sampler_opencl OPENCL basic SRCS grid_sampler_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(grid_sampler_opencl OPENCL basic SRCS grid_sampler_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(bilinear_interp_opencl OPENCL basic SRCS bilinear_interp_image_compute.cc DEPS ${cl_kernel_deps})
# extra # extra
# wait to add ... # wait to add ...
...@@ -67,9 +67,11 @@ lite_cc_test(test_elementwise_add_image_opencl SRCS elementwise_add_image_comput ...@@ -67,9 +67,11 @@ lite_cc_test(test_elementwise_add_image_opencl SRCS elementwise_add_image_comput
DEPS elementwise_add_opencl fusion_elementwise_add_activation_opencl op_registry program context) DEPS elementwise_add_opencl fusion_elementwise_add_activation_opencl op_registry program context)
lite_cc_test(test_grid_sampler_image_opencl SRCS grid_sampler_image_compute_test.cc lite_cc_test(test_grid_sampler_image_opencl SRCS grid_sampler_image_compute_test.cc
DEPS grid_sampler_opencl op_registry program context DEPS grid_sampler_opencl op_registry program context)
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_bilinear_interp_image_opencl SRCS bilinear_interp_image_compute_test.cc
DEPS bilinear_interp_opencl op_registry program context)
###################### ######################
# buffer kernel # # buffer kernel #
###################### ######################
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <string>
#include "lite/backends/opencl/cl_half.h"
#include "lite/backends/opencl/cl_image_converter.h"
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/logging.h"
#include "lite/utils/replace_stl/stream.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
class BilinearInterpImageCompute
: public KernelLite<TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::InterpolateParam;
std::string doc() const override {
return "BilinearInterp using cl::Image2D(ImageDefault/RGBA), kFP16";
}
void PrepareForRun() override {
bilinear_interp_param_ = param_.get_mutable<param_t>();
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/bilinear_interp_kernel.cl", build_options_);
VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
}
void Run() override {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
auto* x = bilinear_interp_param_->X;
auto* out = bilinear_interp_param_->Out;
float scale_h = 0.0;
float scale_w = 0.0;
auto out_dims = out->dims();
auto in_dims = x->dims();
if (bilinear_interp_param_->align_corners) {
scale_h = (in_dims[2] - 1.0f) / (out_dims[2] - 1.0f);
scale_w = (in_dims[3] - 1.0f) / (out_dims[3] - 1.0f);
} else {
scale_h = in_dims[2] / static_cast<float>(out_dims[2]);
scale_w = in_dims[3] / static_cast<float>(out_dims[3]);
}
float align_delta = 0.0f;
if (!bilinear_interp_param_->align_corners &&
bilinear_interp_param_->align_mode == 0) {
align_delta = 0.5f;
}
int in_h = in_dims[2];
int in_w = in_dims[3];
int out_h = out_dims[2];
int out_w = out_dims[3];
VLOG(4) << "x->target():" << TargetToStr(x->target());
VLOG(4) << "out->target():" << TargetToStr(out->target());
VLOG(4) << "x->dims():" << in_dims;
VLOG(4) << "out->dims():" << out_dims;
auto out_image_shape = InitImageDimInfoWith(out_dims);
auto* x_img = x->data<half_t, cl::Image2D>();
// VLOG(4) << "x_image: " << x_img;
auto* out_img = out->mutable_data<half_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
// VLOG(4) << "out_image: " << out_img;
VLOG(4) << "out_image_shape[w,h]: " << out_image_shape["width"] << " "
<< out_image_shape["height"];
VLOG(4) << "scale_h: " << scale_h << ", scale_w: " << scale_w
<< ", align_delta: " << align_delta;
VLOG(4) << "in_h: " << in_h << ", in_w: " << in_w;
VLOG(4) << "out_h: " << out_h << ", out_w: " << out_w;
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int arg_idx = 0;
auto default_work_size =
DefaultWorkSize(out_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
VLOG(4) << "default_work_size: " << default_work_size[0] << ", "
<< default_work_size[1] << ", " << default_work_size[2];
cl_int status = kernel.setArg(arg_idx++, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, *out_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, scale_h);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, scale_w);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, align_delta);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, in_h);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, in_w);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, out_h);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, out_w);
CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(default_work_size[0]),
static_cast<cl::size_type>(default_work_size[1]),
static_cast<cl::size_type>(default_work_size[2])};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_img, event_);
VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " "
<< global_work_size[1] << " " << global_work_size[2];
}
protected:
param_t* bilinear_interp_param_{nullptr};
std::string kernel_func_name_{"bilinear_interp"};
std::string build_options_{"-DCL_DTYPE_half"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
namespace ocl = paddle::lite::kernels::opencl;
REGISTER_LITE_KERNEL(bilinear_interp,
kOpenCL,
kFP16,
kImageDefault,
ocl::BilinearInterpImageCompute,
ImageDefault)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <memory>
#include <random>
#include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/kernels/opencl/test_helper.h"
#define FP16_MAX_DIFF (5e-1)
namespace paddle {
namespace lite {
void bilinear_interp_ref(const float* din,
const DDim& x_dims,
float* dout,
const DDim& out_dims,
bool align_corners,
int align_mode) {
int batch_size = x_dims[0];
int channel_size = x_dims[1];
auto in_h = x_dims[2];
auto in_w = x_dims[3];
int out_h = out_dims[2];
int out_w = out_dims[3];
// copy from x if no change
if (in_h == out_h && in_w == out_w) {
memcpy(dout, din, sizeof(float) * x_dims.production());
return;
}
float ratio_h = 0.f;
float ratio_w = 0.f;
if (out_h > 1) {
ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
: static_cast<float>(in_h) / out_h;
}
if (out_w > 1) {
ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
: static_cast<float>(in_w) / out_w;
}
// naive bilinear interpolation
bool align_flag = (align_mode == 0 && !align_corners);
for (int n = 0; n < batch_size; n++) {
float* dout_data = dout + n * channel_size * out_h * out_w;
const float* din_data = din + n * channel_size * in_h * in_w;
for (int c = 0; c < channel_size; c++) {
float* dout_data_c = dout_data + c * out_h * out_w;
const float* din_data_c = din_data + c * in_h * in_w;
for (int h = 0; h < out_h; h++) {
float center_h = align_flag ? (ratio_h * (h + 0.5) - 0.5) : ratio_h * h;
int floor_h = static_cast<int>(center_h);
int ceil_h = floor_h + 1;
floor_h = floor_h > 0 ? floor_h : 0;
ceil_h = ceil_h > in_h - 1 ? in_h - 1 : ceil_h;
float hs = center_h - floor_h;
float he = 1.0 - hs;
for (int w = 0; w < out_w; w++) {
float center_w =
align_flag ? (ratio_w * (w + 0.5) - 0.5) : ratio_w * w;
int floor_w = static_cast<int>(center_w);
int ceil_w = floor_w + 1;
floor_w = floor_w > 0 ? floor_w : 0;
ceil_w = ceil_w > in_w - 1 ? in_w - 1 : ceil_w;
float ws = center_w - floor_w;
float we = 1.0 - ws;
float left_up = din_data_c[ceil_h * in_w + floor_w] * we * hs;
float left_down = din_data_c[floor_h * in_w + floor_w] * we * he;
float right_up = din_data_c[ceil_h * in_w + ceil_w] * ws * hs;
float right_down = din_data_c[floor_h * in_w + ceil_w] * ws * he;
dout_data_c[h * out_w + w] =
left_up + left_down + right_up + right_down;
}
}
}
}
}
// #define BILINEAR_FP16_LOOP_TEST
// #define BILINEAR_FP16_PRINT_RESULT
TEST(bilinear_interp_image2d, compute) {
#ifdef BILINEAR_FP16_LOOP_TEST
for (auto n : {1, 3}) {
for (auto c : {1, 3, 8, 23, 32}) {
for (auto h : {2, 20, 64, 112}) {
for (auto w : {2, 20, 64, 112}) {
for (auto out_h : {4, 32, 96, 224}) {
for (auto out_w : {4, 32, 96, 224}) {
for (auto align_corners : {true, false}) {
for (auto align_mode : {0, 1}) {
#else
const int n = 1;
const int c = 1;
const int h = 2;
const int w = 2;
const int out_h = 4;
const int out_w = 4;
const bool align_corners = true;
const int align_mode = 0;
#endif // BILINEAR_FP16_LOOP_TEST
LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c
<< " " << h << " " << w << " ========";
LOG(INFO) << "======== parameters: out_h = " << out_h
<< ", out_w = " << out_w;
LOG(INFO) << "align_corners: " << align_corners
<< ", align_mode: " << align_mode;
auto kernels = KernelRegistry::Global().Create(
"bilinear_interp",
TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty());
auto kernel = std::move(kernels.front());
LOG(INFO) << "get kernel:" << kernel->doc();
lite::Tensor x, out;
operators::InterpolateParam param;
param.X = &x;
param.Out = &out;
param.align_corners = align_corners;
param.align_mode = align_mode;
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
kernel->SetParam(param);
std::unique_ptr<KernelContext> bilinear_context(
new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(bilinear_context->As<OpenCLContext>()));
kernel->SetContext(std::move(bilinear_context));
const DDim in_dim =
DDim(std::vector<DDim::value_type>{n, c, h, w});
const DDim out_dim =
DDim(std::vector<DDim::value_type>{n, c, out_h, out_w});
x.Resize(in_dim);
out.Resize(out_dim);
std::default_random_engine engine;
std::uniform_real_distribution<float> dist(-1, 1);
int sum = n * c * h * w;
std::vector<float> input_v(sum);
for (auto& i : input_v) {
i = dist(engine);
}
LOG(INFO) << "prepare input";
CLImageConverterDefault* default_converter =
new CLImageConverterDefault();
DDim x_image_shape =
default_converter->InitImageDimInfoWith(in_dim);
LOG(INFO) << "x_image_shape = " << x_image_shape[0] << " "
<< x_image_shape[1];
std::vector<half_t> x_image_data(x_image_shape.production() *
4); // 4 : RGBA
default_converter->NCHWToImage(
input_v.data(), x_image_data.data(), in_dim);
auto* x_image = x.mutable_data<half_t, cl::Image2D>(
x_image_shape[0], x_image_shape[1], x_image_data.data());
// LOG(INFO) << "x_image:" << x_image;
DDim out_image_shape =
default_converter->InitImageDimInfoWith(out_dim);
LOG(INFO) << "out_image_shape = " << out_image_shape[0] << " "
<< out_image_shape[1];
auto* out_image = out.mutable_data<half_t, cl::Image2D>(
out_image_shape[0], out_image_shape[1]);
// LOG(INFO) << "out_image:" << out_image;
kernel->Launch();
auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = param.Out->data<half_t, cl::Image2D>();
auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) {
VLOG(4) << "--- Find the sync event for the target cl "
"tensor. ---";
auto& event = *(it->second);
event.wait();
} else {
LOG(FATAL) << "Could not find the sync event for the "
"target cl tensor.";
}
std::unique_ptr<float[]> out_ref(
new float[out_dim.production()]);
bilinear_interp_ref(input_v.data(),
in_dim,
out_ref.get(),
out_dim,
align_corners,
align_mode);
const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0};
half_t* out_image_data =
new half_t[40000]; // out_image_shape.production() * 4
TargetWrapperCL::ImgcpySync(out_image_data,
out_image,
out_image_shape[0],
out_image_shape[1],
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
IoDirection::DtoH);
float* out_data = new float[out_image_shape.production() * 4];
default_converter->ImageToNCHW(
out_image_data, out_data, out_image_shape, out_dim);
// result
#ifdef BILINEAR_FP16_PRINT_RESULT
LOG(INFO)
<< "---- print kernel result (input -> output) ----";
for (int eidx = 0; eidx < in_dim.production(); ++eidx) {
std::cout << input_v[eidx] << " -> " << out_data[eidx]
<< std::endl;
}
#endif // BILINEAR_FP16_PRINT_RESULT
for (int i = 0; i < out_dim.production(); i++) {
auto abs_diff = abs(out_data[i] - out_ref[i]);
auto relative_diff =
COMPUTE_RELATIVE_DIFF(out_data[i], out_ref[i]);
EXPECT_EQ((relative_diff <= FP16_MAX_DIFF) ||
(abs_diff <= FP16_MAX_DIFF),
true);
if ((relative_diff > FP16_MAX_DIFF) &&
(abs_diff > FP16_MAX_DIFF)) {
LOG(ERROR) << "error idx:" << i << ", in_data[" << i
<< "]: " << input_v[i] << ", out_data[" << i
<< "]: " << out_data[i] << ", out_ref[" << i
<< "]: " << out_ref[i]
<< ", abs_diff: " << abs_diff
<< ", relative_diff: " << relative_diff
<< ", FP16_MAX_DIFF: " << FP16_MAX_DIFF;
}
}
#ifdef BILINEAR_FP16_LOOP_TEST
} // mode
} // corners
} // out_w
} // out_h
} // w
} // h
} // c
} // n
#else
// nothing to do.
#endif
}
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(bilinear_interp, kOpenCL, kFP16, kImageDefault, ImageDefault);
...@@ -35,7 +35,7 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL), ...@@ -35,7 +35,7 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL),
using param_t = operators::GridSamplerParam; using param_t = operators::GridSamplerParam;
std::string doc() const override { std::string doc() const override {
return "GridSampler using cl::Image2D(ImageDefault/RGBA), kFP32"; return "GridSampler using cl::Image2D(ImageDefault/RGBA), kFP16";
} }
void PrepareForRun() override { void PrepareForRun() override {
...@@ -44,7 +44,7 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL), ...@@ -44,7 +44,7 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL),
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel( context.cl_context()->AddKernel(
kernel_func_name_, "image/grid_sampler_kernel.cl", build_options_); kernel_func_name_, "image/grid_sampler_kernel.cl", build_options_);
VLOG(1) << "kernel_func_name_:" << kernel_func_name_; VLOG(4) << "kernel_func_name_:" << kernel_func_name_;
} }
void Run() override { void Run() override {
...@@ -64,14 +64,14 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL), ...@@ -64,14 +64,14 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL),
auto out_image_shape = InitImageDimInfoWith(out_dims); auto out_image_shape = InitImageDimInfoWith(out_dims);
auto* x_img = x->data<half_t, cl::Image2D>(); auto* x_img = x->data<half_t, cl::Image2D>();
VLOG(4) << "x_image: " << x_img; // VLOG(4) << "x_image: " << x_img;
auto* grid_img = x->data<half_t, cl::Image2D>(); auto* grid_img = x->data<half_t, cl::Image2D>();
VLOG(4) << "grid_img: " << grid_img; // VLOG(4) << "grid_img: " << grid_img;
auto* out_img = out->mutable_data<half_t, cl::Image2D>( auto* out_img = out->mutable_data<half_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]); out_image_shape["width"], out_image_shape["height"]);
VLOG(4) << "out_image" << out_img; // VLOG(4) << "out_image" << out_img;
VLOG(4) << "out_image_shape[w,h]:" << out_image_shape["width"] << " " VLOG(4) << "out_image_shape[w,h]:" << out_image_shape["width"] << " "
<< out_image_shape["height"]; << out_image_shape["height"];
...@@ -87,7 +87,8 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL), ...@@ -87,7 +87,8 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL),
DDim(std::vector<DDim::value_type>{ DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]), static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])})); static_cast<int64_t>(out_image_shape["height"])}));
VLOG(4) << "default_work_size: " << default_work_size[0] << ", "
<< default_work_size[1] << ", " << default_work_size[2];
cl_int status = kernel.setArg(arg_idx++, *x_img); cl_int status = kernel.setArg(arg_idx++, *x_img);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, *grid_img); status = kernel.setArg(arg_idx++, *grid_img);
...@@ -101,8 +102,8 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL), ...@@ -101,8 +102,8 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL),
auto global_work_size = auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(default_work_size[0]), cl::NDRange{static_cast<cl::size_type>(default_work_size[0]),
static_cast<cl::size_type>(default_work_size[2]), static_cast<cl::size_type>(default_work_size[1]),
static_cast<cl::size_type>(default_work_size[3] / 4)}; static_cast<cl::size_type>(default_work_size[2] / 4)};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel, kernel,
......
...@@ -168,7 +168,7 @@ TEST(grid_samler_image2d, compute) { ...@@ -168,7 +168,7 @@ TEST(grid_samler_image2d, compute) {
input_v.data(), x_image_data.data(), in_dim); input_v.data(), x_image_data.data(), in_dim);
auto* x_image = x.mutable_data<half_t, cl::Image2D>( auto* x_image = x.mutable_data<half_t, cl::Image2D>(
x_image_shape[0], x_image_shape[1], x_image_data.data()); x_image_shape[0], x_image_shape[1], x_image_data.data());
LOG(INFO) << "x_image:" << x_image; // LOG(INFO) << "x_image:" << x_image;
DDim grid_image_shape = DDim grid_image_shape =
default_converter->InitImageDimInfoWith(grid_dim); default_converter->InitImageDimInfoWith(grid_dim);
...@@ -180,7 +180,7 @@ TEST(grid_samler_image2d, compute) { ...@@ -180,7 +180,7 @@ TEST(grid_samler_image2d, compute) {
grid_v.data(), grid_image_data.data(), grid_dim); grid_v.data(), grid_image_data.data(), grid_dim);
auto* grid_image = grid.mutable_data<half_t, cl::Image2D>( auto* grid_image = grid.mutable_data<half_t, cl::Image2D>(
grid_image_shape[0], grid_image_shape[1], grid_image_data.data()); grid_image_shape[0], grid_image_shape[1], grid_image_data.data());
LOG(INFO) << "grid_image:" << grid_image; // LOG(INFO) << "grid_image:" << grid_image;
DDim out_image_shape = DDim out_image_shape =
default_converter->InitImageDimInfoWith(out_dim); default_converter->InitImageDimInfoWith(out_dim);
...@@ -188,7 +188,7 @@ TEST(grid_samler_image2d, compute) { ...@@ -188,7 +188,7 @@ TEST(grid_samler_image2d, compute) {
<< out_image_shape[1]; << out_image_shape[1];
auto* out_image = out.mutable_data<half_t, cl::Image2D>( auto* out_image = out.mutable_data<half_t, cl::Image2D>(
out_image_shape[0], out_image_shape[1]); out_image_shape[0], out_image_shape[1]);
LOG(INFO) << "out_image:" << out_image; // LOG(INFO) << "out_image:" << out_image;
kernel->Launch(); kernel->Launch();
auto* wait_list = context->As<OpenCLContext>().cl_wait_list(); auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
......
...@@ -206,3 +206,38 @@ REGISTER_LITE_KERNEL(reshape2, ...@@ -206,3 +206,38 @@ REGISTER_LITE_KERNEL(reshape2,
PRECISION(kFP16), PRECISION(kFP16),
DATALAYOUT(kImageDefault))}) DATALAYOUT(kImageDefault))})
.Finalize(); .Finalize();
REGISTER_LITE_KERNEL(flatten,
kOpenCL,
kFP16,
kImageDefault,
paddle::lite::kernels::opencl::ReshapeComputeFloatImage,
image2d)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.BindInput("Shape", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.Finalize();
REGISTER_LITE_KERNEL(flatten2,
kOpenCL,
kFP16,
kImageDefault,
paddle::lite::kernels::opencl::ReshapeComputeFloatImage,
image2d)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.BindInput("Shape", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.Finalize();
...@@ -46,7 +46,20 @@ bool Pad2dOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) { ...@@ -46,7 +46,20 @@ bool Pad2dOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
scope->FindVar(op_desc.Output("Out").front())->GetMutable<Tensor>(); scope->FindVar(op_desc.Output("Out").front())->GetMutable<Tensor>();
param_.mode = op_desc.GetAttr<std::string>("mode"); param_.mode = op_desc.GetAttr<std::string>("mode");
param_.pad_value = op_desc.GetAttr<float>("pad_value"); param_.pad_value = op_desc.GetAttr<float>("pad_value");
param_.paddings = op_desc.GetAttr<std::vector<int>>("paddings"); if (op_desc.HasAttr("variable_padding") &&
op_desc.GetAttr<bool>("variable_paddings")) {
auto Paddings =
scope->FindVar(op_desc.Input("Paddings").front())->GetMutable<Tensor>();
auto ptr = Paddings->data<int>();
if (Paddings->dims().size() < 4) {
printf("Paddings size must be four: %d \n",
static_cast<int>(Paddings->dims().size()));
return false;
}
param_.paddings = {ptr[0], ptr[1], ptr[2], ptr[3]};
} else {
param_.paddings = op_desc.GetAttr<std::vector<int>>("paddings");
}
param_.data_format = op_desc.GetAttr<std::string>("data_format"); param_.data_format = op_desc.GetAttr<std::string>("data_format");
return true; return true;
} }
......
...@@ -51,6 +51,7 @@ void compute_xy(int srcw, ...@@ -51,6 +51,7 @@ void compute_xy(int srcw,
int srch, int srch,
int dstw, int dstw,
int dsth, int dsth,
int num,
double scale_x, double scale_x,
double scale_y, double scale_y,
int* xofs, int* xofs,
...@@ -77,8 +78,8 @@ void resize(const uint8_t* src, ...@@ -77,8 +78,8 @@ void resize(const uint8_t* src,
memcpy(dst, src, sizeof(uint8_t) * size); memcpy(dst, src, sizeof(uint8_t) * size);
return; return;
} }
double scale_x = static_cast<double>(srcw / dstw); double scale_x = static_cast<double>(srcw) / dstw;
double scale_y = static_cast<double>(srch / dsth); double scale_y = static_cast<double>(srch) / dsth;
int* buf = new int[dstw * 2 + dsth * 2]; int* buf = new int[dstw * 2 + dsth * 2];
...@@ -87,9 +88,6 @@ void resize(const uint8_t* src, ...@@ -87,9 +88,6 @@ void resize(const uint8_t* src,
int16_t* ialpha = reinterpret_cast<int16_t*>(buf + dstw + dsth); int16_t* ialpha = reinterpret_cast<int16_t*>(buf + dstw + dsth);
int16_t* ibeta = reinterpret_cast<int16_t*>(buf + 2 * dstw + dsth); int16_t* ibeta = reinterpret_cast<int16_t*>(buf + 2 * dstw + dsth);
compute_xy(
srcw, srch, dstw, dsth, scale_x, scale_y, xofs, yofs, ialpha, ibeta);
int w_out = dstw; int w_out = dstw;
int w_in = srcw; int w_in = srcw;
int num = 1; int num = 1;
...@@ -111,6 +109,9 @@ void resize(const uint8_t* src, ...@@ -111,6 +109,9 @@ void resize(const uint8_t* src,
num = 4; num = 4;
} }
compute_xy(
srcw, srch, dstw, dsth, num, scale_x, scale_y, xofs, yofs, ialpha, ibeta);
int* xofs1 = nullptr; int* xofs1 = nullptr;
int* yofs1 = nullptr; int* yofs1 = nullptr;
int16_t* ialpha1 = nullptr; int16_t* ialpha1 = nullptr;
...@@ -124,6 +125,7 @@ void resize(const uint8_t* src, ...@@ -124,6 +125,7 @@ void resize(const uint8_t* src,
srch / 2, srch / 2,
w, w,
tmp, tmp,
num,
scale_x, scale_x,
scale_y, scale_y,
xofs1, xofs1,
...@@ -134,6 +136,7 @@ void resize(const uint8_t* src, ...@@ -134,6 +136,7 @@ void resize(const uint8_t* src,
int cnt = w_out >> 3; int cnt = w_out >> 3;
int remain = w_out % 8; int remain = w_out % 8;
int32x4_t _v2 = vdupq_n_s32(2); int32x4_t _v2 = vdupq_n_s32(2);
int prev_sy1 = -1;
#pragma omp parallel for #pragma omp parallel for
for (int dy = 0; dy < dsth; dy++) { for (int dy = 0; dy < dsth; dy++) {
int16_t* rowsbuf0 = new int16_t[w_out]; int16_t* rowsbuf0 = new int16_t[w_out];
...@@ -144,27 +147,20 @@ void resize(const uint8_t* src, ...@@ -144,27 +147,20 @@ void resize(const uint8_t* src,
yofs = yofs1; yofs = yofs1;
ialpha = ialpha1; ialpha = ialpha1;
} }
if (sy < 0) { if (sy == prev_sy1) {
memset(rowsbuf0, 0, sizeof(uint16_t) * w_out); memset(rowsbuf0, 0, sizeof(uint16_t) * w_out);
const uint8_t* S1 = src + srcw * (sy + 1); const uint8_t* S1 = src + srcw * (sy + 1);
const int16_t* ialphap = ialpha; const int16_t* ialphap = ialpha;
int16_t* rows1p = rowsbuf1; int16_t* rows1p = rowsbuf1;
for (int dx = 0; dx < dstw; dx++) { for (int dx = 0; dx < dstw; dx++) {
int sx = xofs[dx] * num; // num = 4 int sx = xofs[dx];
int16_t a0 = ialphap[0]; int16_t a0 = ialphap[0];
int16_t a1 = ialphap[1]; int16_t a1 = ialphap[1];
const uint8_t* S1pl = S1 + sx; const uint8_t* S1pl = S1 + sx;
const uint8_t* S1pr = S1 + sx + num; const uint8_t* S1pr = S1 + sx + num;
if (sx < 0) {
S1pl = S1;
}
for (int i = 0; i < num; i++) { for (int i = 0; i < num; i++) {
if (sx < 0) { *rows1p++ = ((*S1pl++) * a0 + (*S1pr++) * a1) >> 4;
*rows1p++ = ((*S1pl++) * a1) >> 4;
} else {
*rows1p++ = ((*S1pl++) * a0 + (*S1pr++) * a1) >> 4;
}
} }
ialphap += 2; ialphap += 2;
} }
...@@ -176,7 +172,7 @@ void resize(const uint8_t* src, ...@@ -176,7 +172,7 @@ void resize(const uint8_t* src,
int16_t* rows0p = rowsbuf0; int16_t* rows0p = rowsbuf0;
int16_t* rows1p = rowsbuf1; int16_t* rows1p = rowsbuf1;
for (int dx = 0; dx < dstw; dx++) { for (int dx = 0; dx < dstw; dx++) {
int sx = xofs[dx] * num; // num = 4 int sx = xofs[dx];
int16_t a0 = ialphap[0]; int16_t a0 = ialphap[0];
int16_t a1 = ialphap[1]; int16_t a1 = ialphap[1];
...@@ -184,32 +180,21 @@ void resize(const uint8_t* src, ...@@ -184,32 +180,21 @@ void resize(const uint8_t* src,
const uint8_t* S0pr = S0 + sx + num; const uint8_t* S0pr = S0 + sx + num;
const uint8_t* S1pl = S1 + sx; const uint8_t* S1pl = S1 + sx;
const uint8_t* S1pr = S1 + sx + num; const uint8_t* S1pr = S1 + sx + num;
if (sx < 0) {
S0pl = S0;
S1pl = S1;
}
for (int i = 0; i < num; i++) { for (int i = 0; i < num; i++) {
if (sx < 0) { *rows0p++ = ((*S0pl++) * a0 + (*S0pr++) * a1) >> 4;
*rows0p = ((*S0pl++) * a1) >> 4; *rows1p++ = ((*S1pl++) * a0 + (*S1pr++) * a1) >> 4;
*rows1p = ((*S1pl++) * a1) >> 4;
rows0p++;
rows1p++;
} else {
*rows0p++ = ((*S0pl++) * a0 + (*S0pr++) * a1) >> 4;
*rows1p++ = ((*S1pl++) * a0 + (*S1pr++) * a1) >> 4;
}
} }
ialphap += 2; ialphap += 2;
} }
} }
int ind = dy * 2; prev_sy1 = sy + 1;
int16_t b0 = ibeta[ind]; int16_t b0 = ibeta[0];
int16_t b1 = ibeta[ind + 1]; int16_t b1 = ibeta[1];
int16x8_t _b0 = vdupq_n_s16(b0);
int16x8_t _b1 = vdupq_n_s16(b1);
uint8_t* dp_ptr = dst + dy * w_out; uint8_t* dp_ptr = dst + dy * w_out;
int16_t* rows0p = rowsbuf0; int16_t* rows0p = rowsbuf0;
int16_t* rows1p = rowsbuf1; int16_t* rows1p = rowsbuf1;
int16x8_t _b0 = vdupq_n_s16(b0);
int16x8_t _b1 = vdupq_n_s16(b1);
int re_cnt = cnt; int re_cnt = cnt;
if (re_cnt > 0) { if (re_cnt > 0) {
#ifdef __aarch64__ #ifdef __aarch64__
...@@ -295,6 +280,7 @@ void resize(const uint8_t* src, ...@@ -295,6 +280,7 @@ void resize(const uint8_t* src,
(int16_t)((b1 * (int16_t)(*rows1p++)) >> 16) + 2) >> (int16_t)((b1 * (int16_t)(*rows1p++)) >> 16) + 2) >>
2); 2);
} }
ibeta += 2;
} }
delete[] buf; delete[] buf;
} }
...@@ -303,6 +289,7 @@ void compute_xy(int srcw, ...@@ -303,6 +289,7 @@ void compute_xy(int srcw,
int srch, int srch,
int dstw, int dstw,
int dsth, int dsth,
int num,
double scale_x, double scale_x,
double scale_y, double scale_y,
int* xofs, int* xofs,
...@@ -334,7 +321,7 @@ void compute_xy(int srcw, ...@@ -334,7 +321,7 @@ void compute_xy(int srcw,
fx = 1.f; fx = 1.f;
} }
xofs[dx] = sx; xofs[dx] = sx * num;
float a0 = (1.f - fx) * resize_coef_scale; float a0 = (1.f - fx) * resize_coef_scale;
float a1 = fx * resize_coef_scale; float a1 = fx * resize_coef_scale;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册