Unverified commit 18d974c0, authored by HappyAngel, committed by GitHub

[opencl] add grid_sampler op (#3075)


* add grid sampler

* reset act

* fix conflict and readme, test=develop

* fix iOS v8 run error

* fix grid_sampler compute error. test=develop

* fix exp act run error, test=develop

* fix format, test=develop
Parent 98f37b08
@@ -96,7 +96,8 @@ enum class ActivationType : int {
  kLeakyRelu = 4,
  kSigmoid = 5,
  kTanh = 6,
- kSwish = 7
+ kSwish = 7,
+ kExp = 8
};
static size_t PrecisionTypeLength(PrecisionType type) {
......
@@ -2203,7 +2203,7 @@ inline void act_switch_c8_fp32(const float* din_ptr,
    [cnt] "+r"(cnt_loop)
    :
    : "cc",
-     "meemory",
+     "memory",
      "q0",
      "q1",
      "q2",
......
@@ -97,7 +97,7 @@ __kernel void leaky_relu(__read_only image2d_t input,
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
}
-__kernel void tanhAct(__read_only image2d_t input,
+__kernel void tanh_act(__read_only image2d_t input,
                       __write_only image2d_t output,
                       __private const float threshold,
                       __private const float scale) {
@@ -113,3 +113,38 @@ __kernel void tanhAct(__read_only image2d_t input,
  CL_DTYPE4 out = (exp(in) - exp(-in)) / (exp(in) + exp(-in));
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out);
}
__kernel void exp_act(__read_only image2d_t input,
__write_only image2d_t output,
__private const float threshold,
__private const float scale) {
const int x = get_global_id(0); // image_width
const int y = get_global_id(1); // image_height
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
CL_DTYPE4 out = exp(in);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out);
}
__kernel void swish(__read_only image2d_t input,
__write_only image2d_t output,
__private const float threshold,
__private const float scale) {
const int x = get_global_id(0); // image_width
const int y = get_global_id(1); // image_height
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
CL_DTYPE4 out = in / (1 + exp(-scale * in));
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out);
}
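For reference, both new activation kernels are elementwise over the RGBA texels: exp_act computes e^x per channel, and swish computes x * sigmoid(scale * x), where the host side passes the op's Swish_beta as the scale argument (see the compute changes below). A minimal scalar sketch of the same math in plain C, with hypothetical helper names used only for illustration:

#include <math.h>

/* Scalar reference for the exp_act and swish kernels above. */
static inline float exp_act_ref(float x) { return expf(x); }

static inline float swish_ref(float x, float beta) {
  /* swish(x) = x * sigmoid(beta * x) = x / (1 + exp(-beta * x)) */
  return x / (1.0f + expf(-beta * x));
}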
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cl_common.h>
__kernel void grid_sampler(__read_only image2d_t input,
__read_only image2d_t grid,
__write_only image2d_t output,
__private const int out_height,
__private const int out_width) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2) * 4;
const int out_n = out_nh / out_height;
const int out_h = out_nh % out_height;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coords1, coords2, outpoints;
coords1.x = out_h / 4 * 2;
coords1.y = out_n * out_width + out_w;
coords2.x = coords1.x + 1;
coords2.y = coords1.y;
outpoints.x = out_c * out_width + out_w;
outpoints.y = out_n * out_height + out_h;
CL_DTYPE4 g1 = READ_IMG_TYPE(CL_DTYPE_CHAR, grid, sampler, coords1);
CL_DTYPE4 g2 = READ_IMG_TYPE(CL_DTYPE_CHAR, grid, sampler, coords2);
// x
float x = (g1.x + 1) * (out_width - 1) * 0.5;
float y = (g2.x + 1) * (out_height - 1) * 0.5;
int x0 = floor(x);
int y0 = floor(y);
int x_p = out_c * out_width + x0;
int y_p = out_n * out_height + y0;
float xs = x - x0;
float xe = x0 + 1 - x;
float ys = y - y0;
float ye = y0 + 1 - y;
CL_DTYPE4 input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p, y_p));
CL_DTYPE4 input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p + 1, y_p));
CL_DTYPE4 input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p, y_p + 1));
CL_DTYPE4 input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p + 1, y_p + 1));
if (x0 < 0 || x0 > out_width - 1 || y0 < 0 || y0 > out_height - 1){
input0 = (CL_DTYPE4)(0.0);
}
if (x0 + 1 < 0 || x0 + 1 > out_width - 1 || y0 < 0 || y0 > out_height - 1){
input1 = (CL_DTYPE4)(0.0);
}
if (x0 < 0 || x0 > out_width - 1 || y0 + 1 < 0 || y0 + 1 > out_height - 1){
input2 = (CL_DTYPE4)(0.0);
}
if (x0 + 1 < 0 || x0 + 1 > out_width - 1 || y0 + 1 < 0 || y0 + 1 > out_height - 1){
input3 = (CL_DTYPE4)(0.0);
}
CL_DTYPE4 out_val = input0 * xe * ye + input1 * xs * ye + input2 * xe * ys + input3 * xs * ys;
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, outpoints, out_val);
// y
x = (g1.y + 1) * (out_width - 1) / 2;
y = (g2.y + 1) * (out_height - 1) / 2;
x0 = floor(x);
y0 = floor(y);
x_p = out_c * out_width + x0;
y_p = out_n * out_height + y0;
xs = x - x0;
xe = x0 + 1 - x;
ys = y - y0;
ye = y0 + 1 - y;
input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p, y_p));
input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p + 1, y_p));
input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p, y_p + 1));
input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p + 1, y_p + 1));
if (x0 < 0 || x0 > out_width - 1 || y0 < 0 || y0 > out_height - 1){
input0 = (CL_DTYPE4)(0.0);
}
if (x0 + 1 < 0 || x0 + 1 > out_width - 1 || y0 < 0 || y0 > out_height - 1){
input1 = (CL_DTYPE4)(0.0);
}
if (x0 < 0 || x0 > out_width - 1 || y0 + 1 < 0 || y0 + 1 > out_height - 1){
input2 = (CL_DTYPE4)(0.0);
}
if (x0 + 1 < 0 || x0 + 1 > out_width - 1 || y0 + 1 < 0 || y0 + 1 > out_height - 1){
input3 = (CL_DTYPE4)(0.0);
}
out_val = input0 * xe * ye + input1 * xs * ye + input2 * xe * ys + input3 * xs * ys;
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(outpoints.x, outpoints.y + 1), out_val);
// z
x = (g1.z + 1) * (out_width - 1) / 2;
y = (g2.z + 1) * (out_height - 1) / 2;
x0 = floor(x);
y0 = floor(y);
x_p = out_c * out_width + x0;
y_p = out_n * out_height + y0;
xs = x - x0;
xe = x0 + 1 - x;
ys = y - y0;
ye = y0 + 1 - y;
input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p, y_p));
input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p + 1, y_p));
input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p, y_p + 1));
input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p + 1, y_p + 1));
if (x0 < 0 || x0 > out_width - 1 || y0 < 0 || y0 > out_height - 1){
input0 = (CL_DTYPE4)(0.0);
}
if (x0 + 1 < 0 || x0 + 1 > out_width - 1 || y0 < 0 || y0 > out_height - 1){
input1 = (CL_DTYPE4)(0.0);
}
if (x0 < 0 || x0 > out_width - 1 || y0 + 1 < 0 || y0 + 1 > out_height - 1){
input2 = (CL_DTYPE4)(0.0);
}
if (x0 + 1 < 0 || x0 + 1 > out_width - 1 || y0 + 1 < 0 || y0 + 1 > out_height - 1){
input3 = (CL_DTYPE4)(0.0);
}
out_val = input0 * xe * ye + input1 * xs * ye + input2 * xe * ys + input3 * xs * ys;
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(outpoints.x, outpoints.y + 2), out_val);
// w
x = (g1.w + 1) * (out_width - 1) / 2;
y = (g2.w + 1) * (out_height - 1) / 2;
x0 = floor(x);
y0 = floor(y);
x_p = out_c * out_width + x0;
y_p = out_n * out_height + y0;
xs = x - x0;
xe = x0 + 1 - x;
ys = y - y0;
ye = y0 + 1 - y;
input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p, y_p));
input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p + 1, y_p));
input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p, y_p + 1));
input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p + 1, y_p + 1));
if (x0 < 0 || x0 > out_width - 1 || y0 < 0 || y0 > out_height - 1){
input0 = (CL_DTYPE4)(0.0);
}
if (x0 + 1 < 0 || x0 + 1 > out_width - 1 || y0 < 0 || y0 > out_height - 1){
input1 = (CL_DTYPE4)(0.0);
}
if (x0 < 0 || x0 > out_width - 1 || y0 + 1 < 0 || y0 + 1 > out_height - 1){
input2 = (CL_DTYPE4)(0.0);
}
if (x0 + 1 < 0 || x0 + 1 > out_width - 1 || y0 + 1 < 0 || y0 + 1 > out_height - 1){
input3 = (CL_DTYPE4)(0.0);
}
out_val = input0 * xe * ye + input1 * xs * ye + input2 * xe * ys + input3 * xs * ys;
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(outpoints.x, outpoints.y + 3), out_val);
}
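Each work-item of the grid_sampler kernel above covers four consecutive output rows: g1 packs the four grid x-coordinates and g2 the four y-coordinates, each is mapped from [-1, 1] into pixel space with the same (g + 1) * (size - 1) / 2 unnormalization used above, and the result is a four-tap bilinear blend with out-of-range taps forced to zero. A scalar C sketch of that per-point math, with a hypothetical helper name, for illustration only:

#include <math.h>

/* Bilinearly sample a W x H single-channel image at normalized (gx, gy).
 * Out-of-range taps contribute zero, mirroring the kernel's bound checks. */
static float grid_sample_point(const float* in, int W, int H,
                               float gx, float gy) {
  float x = (gx + 1.0f) * (W - 1) * 0.5f; /* unnormalize to pixel coords */
  float y = (gy + 1.0f) * (H - 1) * 0.5f;
  int x0 = (int)floorf(x);
  int y0 = (int)floorf(y);
  float xs = x - x0, xe = 1.0f - xs; /* weights toward x0 + 1 and x0 */
  float ys = y - y0, ye = 1.0f - ys;
  float acc = 0.0f;
  for (int dy = 0; dy < 2; ++dy) {
    for (int dx = 0; dx < 2; ++dx) {
      int xi = x0 + dx, yi = y0 + dy;
      if (xi < 0 || xi >= W || yi < 0 || yi >= H) continue; /* zero tap */
      acc += ((dx ? xs : xe) * (dy ? ys : ye)) * in[yi * W + xi];
    }
  }
  return acc;
}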
@@ -22,6 +22,7 @@ add_kernel(layout_opencl OPENCL basic SRCS layout_image_compute.cc DEPS ${cl_ker
add_kernel(concat_opencl OPENCL basic SRCS concat_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(nearest_interp_opencl OPENCL basic SRCS nearest_interp_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(scale_opencl OPENCL basic SRCS scale_image_compute.cc DEPS ${cl_kernel_deps})
+add_kernel(grid_sampler_opencl OPENCL basic SRCS grid_sampler_image_compute.cc DEPS ${cl_kernel_deps})
# extra
# wait to add ...
@@ -76,6 +77,9 @@ lite_cc_test(test_elementwise_add_image_opencl SRCS elementwise_add_image_comput
    DEPS elementwise_add_opencl fusion_elementwise_add_activation_opencl op_registry program context
    ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
+lite_cc_test(test_grid_sampler_image_opencl SRCS grid_sampler_image_compute_test.cc
+    DEPS grid_sampler_opencl op_registry program context
+    ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
######################
# buffer kernel #
......
@@ -56,7 +56,14 @@ class ActivationComputeImageDefault
        kernel_func_name_ = "sigmoid";
        break;
      case 6:
-       kernel_func_name_ = "tanhAct";
+       kernel_func_name_ = "tanh_act";
+       break;
+     case 7:
+       kernel_func_name_ = "swish";
+       scale_ = act_param_->Swish_beta;
+       break;
+     case 8:
+       kernel_func_name_ = "exp_act";
        break;
      default:
        printf("This act type: %d doesn't support \n", act_type);
@@ -80,7 +87,6 @@ class ActivationComputeImageDefault
    STL::stringstream kernel_key;
    kernel_key << kernel_func_name_ << build_options_;
    auto kernel = context.cl_context()->GetKernel(kernel_key.str());
-
    int arg_idx = 0;
    cl_int status = kernel.setArg(arg_idx, *x_img);
    CL_CHECK_FATAL(status);
@@ -147,9 +153,45 @@ REGISTER_LITE_KERNEL(
                     DATALAYOUT(kImageDefault))})
    .Finalize();
// swish
REGISTER_LITE_KERNEL(
swish,
kOpenCL,
kFP16,
kImageDefault,
paddle::lite::kernels::opencl::ActivationComputeImageDefault,
ImageDefault)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.Finalize();
// exp
REGISTER_LITE_KERNEL(
exp_act,
kOpenCL,
kFP16,
kImageDefault,
paddle::lite::kernels::opencl::ActivationComputeImageDefault,
ImageDefault)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.Finalize();
// tanh
REGISTER_LITE_KERNEL(
-   tanhAct,
+   tanh_act,
    kOpenCL,
    kFP16,
    kImageDefault,
@@ -164,6 +206,7 @@ REGISTER_LITE_KERNEL(
                     PRECISION(kFP16),
                     DATALAYOUT(kImageDefault))})
    .Finalize();
// Relu
REGISTER_LITE_KERNEL(
    relu,
......
@@ -51,13 +51,19 @@ void act_compute_ref(const dtype *x_data,
        out_data[i] = (expf(x_data[i]) - expf(-x_data[i])) /
                      (expf(x_data[i]) + expf(-x_data[i]));
        break;
case 7: // swish
out_data[i] = x_data[i] / (1 + expf(-x_data[i] * scale));
break;
case 8: // exp
out_data[i] = expf(x_data[i]);
break;
      default:
        break;
    }
  }
}
// #define ACT_FP16_LOOP_TEST
// #define ACT_FP16_PRINT_RESULT
TEST(act_image2d_fp16, compute) {
  LOG(INFO) << "main steps of test: host -> layout(buf2img) -> relu(img) -> "
@@ -69,7 +75,7 @@ TEST(act_image2d_fp16, compute) {
  for (auto c : {1, 3, 8, 23, 32}) {
    for (int h = 12; h <= 100; h += 13) {
      for (int w = 12; w <= 100; w += 25) {
-       for (auto act_type : {1, 2, 4, 5, 6}) {
+       for (auto act_type : {1, 2, 4, 5, 6, 7, 8}) {
          for (auto scale : {0.5, 0.8}) {
            for (auto threshold : {6.0}) {
#else
@@ -103,7 +109,13 @@ TEST(act_image2d_fp16, compute) {
            func_name = "sigmoid";
            break;
          case 6:  // tanh
-           func_name = "tanhAct";
+           func_name = "tanh_act";
+           break;
+         case 7:  // swish
+           func_name = "swish";
+           break;
+         case 8:  // exp
+           func_name = "exp_act";
            break;
        }
        LOG(INFO) << "func_name: " << func_name;
@@ -153,6 +165,7 @@ TEST(act_image2d_fp16, compute) {
            (paddle::lite_api::ActivationType)act_type;
        actParam.Relu_clipped_coef = threshold;
        actParam.Leaky_relu_alpha = scale;
+       actParam.Swish_beta = scale;
        const DDim x_dim =
            DDim(std::vector<DDim::value_type>{n, c, h, w});
@@ -175,9 +188,11 @@ TEST(act_image2d_fp16, compute) {
            x_data, 0, sizeof(float) * x_dim.production()));
        auto *mapped_y = static_cast<float *>(TargetWrapperCL::Map(
            y_data, 0, sizeof(float) * x_dim.production()));
+       std::default_random_engine engine;
+       std::uniform_real_distribution<float> dist(-1, 1);
        for (int i = 0; i < x_dim.production(); ++i) {
-         mapped_x[i] = static_cast<int>(i) - x_dim.production() / 2;
-         mapped_y[i] = static_cast<int>(0);
+         mapped_x[i] = dist(engine);
+         mapped_y[i] = 0.0f;
        }
        auto *act_in_data = act_in.mutable_data<half_t, cl::Image2D>(
            act_image2d_shape["width"], act_image2d_shape["height"]);
@@ -290,11 +305,18 @@ TEST(act_image2d_fp16, compute) {
// layout
USE_LITE_KERNEL(layout, kOpenCL, kAny, kImageDefault, NCHW_to_ImageDefault);
USE_LITE_KERNEL(layout, kOpenCL, kAny, kNCHW, ImageDefault_to_NCHW);
-// leakyRelu
+// exp
+USE_LITE_KERNEL(exp_act, kOpenCL, kFP16, kImageDefault, ImageDefault);
+// swish
+USE_LITE_KERNEL(swish, kOpenCL, kFP16, kImageDefault, ImageDefault);
+// leaky_relu
USE_LITE_KERNEL(leaky_relu, kOpenCL, kFP16, kImageDefault, ImageDefault);
-// tanh
-USE_LITE_KERNEL(tanhAct, kOpenCL, kFP16, kImageDefault, ImageDefault);
+// tanh act
+USE_LITE_KERNEL(tanh_act, kOpenCL, kFP16, kImageDefault, ImageDefault);
// relu image2d fp16
USE_LITE_KERNEL(relu, kOpenCL, kFP16, kImageDefault, ImageDefault);
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <string>
#include "lite/backends/opencl/cl_half.h"
#include "lite/backends/opencl/cl_image_converter.h"
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/logging.h"
#include "lite/utils/replace_stl/stream.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::GridSamplerParam;
std::string doc() const override {
return "GridSampler using cl::Image2D(ImageDefault/RGBA), kFP32";
}
void PrepareForRun() override {
grid_param_ = param_.get_mutable<param_t>();
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/grid_sampler_kernel.cl", build_options_);
}
void Run() override {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
auto* x = grid_param_->x;
auto* out = grid_param_->out;
auto* grid = grid_param_->grid;
auto out_dims = out->dims();
auto in_dims = x->dims();
VLOG(4) << "x->target():" << TargetToStr(x->target());
VLOG(4) << "out->target():" << TargetToStr(out->target());
VLOG(4) << "x->dims():" << in_dims;
VLOG(4) << "out->dims():" << out_dims;
auto out_image_shape = InitImageDimInfoWith(out_dims);
auto* x_img = x->data<half_t, cl::Image2D>();
VLOG(4) << "x_image: " << x_img;
auto* grid_img = grid->data<half_t, cl::Image2D>();
VLOG(4) << "grid_img: " << grid_img;
auto* out_img = out->mutable_data<half_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
VLOG(4) << "out_image" << out_img;
VLOG(4) << "out_image_shape[w,h]:" << out_image_shape["width"] << " "
<< out_image_shape["height"];
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int arg_idx = 0;
int out_height = out_dims[2];
int out_width = out_dims[3];
auto default_work_size =
DefaultWorkSize(out_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
cl_int status = kernel.setArg(arg_idx++, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, *grid_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, *out_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, out_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, out_width);
CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(default_work_size[0]),
static_cast<cl::size_type>(default_work_size[1]),
static_cast<cl::size_type>(default_work_size[2] / 4)};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_img, event_);
VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " "
<< global_work_size[1] << " " << global_work_size[2];
}
protected:
param_t* grid_param_{nullptr};
std::string kernel_func_name_{"grid_sampler"};
std::string build_options_{"-DCL_DTYPE_half"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
namespace ocl = paddle::lite::kernels::opencl;
REGISTER_LITE_KERNEL(grid_sampler,
kOpenCL,
kFP16,
kImageDefault,
ocl::GridSamplerImageCompute,
ImageDefault)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.BindInput("Grid",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.BindOutput("Output",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <memory>
#include <random>
#include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/kernels/opencl/test_helper.h"
#define FP16_MAX_DIFF (5e-1)
namespace paddle {
namespace lite {
void grid_sampler_ref(const float* din,
const DDim& in_dims,
const float* grid,
float* output) {
int num = in_dims[0];
int channel = in_dims[1];
int height = in_dims[2];
int width = in_dims[3];
int spatial_size = height * width;
auto inbound = [](int x, int y, float x_max, float y_max) {
if (x < 0 || x > x_max || y < 0 || y > y_max) {
return false;
}
return true;
};
for (int n = 0; n < num; ++n) {
const float* x_n = din + n * channel * height * width;
float* out_n = output + n * channel * height * width;
const float* grid_n = grid + n * height * width * 2;
for (int c = 0; c < channel; ++c) {
const float* x_c = x_n + c * spatial_size;
float* out_c = out_n + c * spatial_size;
for (int s = 0; s < spatial_size; ++s) {
float x = grid_n[s * 2];
float y = grid_n[s * 2 + 1];
float xwf = (x + 1.f) * 0.5 * (width - 1);
float ynf = (y + 1.f) * 0.5 * (height - 1);
int xw = floor(xwf);
int xe = xw + 1;
int yn = floor(ynf);
int ys = yn + 1;
float dw = xwf - xw;
float de = xe - xwf;
float dn = ynf - yn;
float ds = ys - ynf;
float wn = inbound(xw,
yn,
static_cast<float>(width - 1),
static_cast<float>(height - 1))
? x_c[yn * width + xw]
: 0.f;
float en = inbound(xe,
yn,
static_cast<float>(width - 1),
static_cast<float>(height - 1))
? x_c[yn * width + xe]
: 0.f;
float ws = inbound(xw,
ys,
static_cast<float>(width - 1),
static_cast<float>(height - 1))
? x_c[ys * width + xw]
: 0.f;
float es = inbound(xe,
ys,
static_cast<float>(width - 1),
static_cast<float>(height - 1))
? x_c[ys * width + xe]
: 0.f;
out_c[s] = wn * de * ds + en * dw * ds + ws * de * dn + es * dw * dn;
}
}
}
}
// #define GRID_FP16_LOOP_TEST
// #define GRID_FP16_PRINT_RESULT
TEST(grid_sampler_image2d, compute) {
#ifdef GRID_FP16_LOOP_TEST
for (int n = 1; n <= 100; n += 33) {
for (auto c : {1, 3, 8, 23, 32}) {
for (int h = 12; h <= 100; h += 13) {
for (int w = 12; w <= 100; w += 25) {
#else
const int n = 1;
const int c = 2;
const int h = 3;
const int w = 4;
#endif // GRID_FP16_LOOP_TEST
LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c << " "
<< h << " " << w << " ========";
auto kernels =
KernelRegistry::Global().Create("grid_sampler",
TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty());
auto kernel = std::move(kernels.front());
LOG(INFO) << "get kernel:" << kernel->doc();
lite::Tensor x, grid, out;
operators::GridSamplerParam param;
param.x = &x;
param.grid = &grid;
param.out = &out;
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
kernel->SetParam(param);
std::unique_ptr<KernelContext> grid_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(grid_context->As<OpenCLContext>()));
kernel->SetContext(std::move(grid_context));
const DDim in_dim = DDim(std::vector<DDim::value_type>{n, c, h, w});
const DDim grid_dim = DDim(std::vector<DDim::value_type>{n, h, w, 2});
const DDim out_dim = DDim(std::vector<DDim::value_type>{n, c, h, w});
x.Resize(in_dim);
grid.Resize(grid_dim);
out.Resize(out_dim);
std::default_random_engine engine;
std::uniform_real_distribution<float> dist(-1, 1);
int sum = n * c * h * w;
int sum2 = n * h * w * 2;
std::vector<float> input_v(sum);
std::vector<float> grid_v(sum2);
for (auto& i : input_v) {
i = dist(engine);
}
for (auto& i : grid_v) {
i = dist(engine);
}
LOG(INFO) << "prepare input";
CLImageConverterDefault* default_converter =
new CLImageConverterDefault();
DDim x_image_shape = default_converter->InitImageDimInfoWith(in_dim);
LOG(INFO) << "x_image_shape = " << x_image_shape[0] << " "
<< x_image_shape[1];
std::vector<half_t> x_image_data(x_image_shape.production() *
4); // 4 : RGBA
default_converter->NCHWToImage(
input_v.data(), x_image_data.data(), in_dim);
auto* x_image = x.mutable_data<half_t, cl::Image2D>(
x_image_shape[0], x_image_shape[1], x_image_data.data());
LOG(INFO) << "x_image:" << x_image;
DDim grid_image_shape =
default_converter->InitImageDimInfoWith(grid_dim);
LOG(INFO) << "grid_image_shape = " << grid_image_shape[0] << " "
<< grid_image_shape[1];
std::vector<half_t> grid_image_data(grid_image_shape.production() *
4); // 4 : RGBA
default_converter->NCHWToImage(
grid_v.data(), grid_image_data.data(), grid_dim);
auto* grid_image = grid.mutable_data<half_t, cl::Image2D>(
grid_image_shape[0], grid_image_shape[1], grid_image_data.data());
LOG(INFO) << "grid_image:" << grid_image;
DDim out_image_shape =
default_converter->InitImageDimInfoWith(out_dim);
LOG(INFO) << "out_image_shape = " << out_image_shape[0] << " "
<< out_image_shape[1];
auto* out_image = out.mutable_data<half_t, cl::Image2D>(
out_image_shape[0], out_image_shape[1]);
LOG(INFO) << "out_image:" << out_image;
kernel->Launch();
auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = param.out->data<half_t, cl::Image2D>();
auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) {
VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
auto& event = *(it->second);
event.wait();
} else {
LOG(FATAL)
<< "Could not find the sync event for the target cl tensor.";
}
std::unique_ptr<float[]> out_ref(new float[out_dim.production()]);
grid_sampler_ref(
input_v.data(), in_dim, grid_v.data(), out_ref.get());
const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0};
half_t* out_image_data = new half_t[out_image_shape.production() * 4];
TargetWrapperCL::ImgcpySync(out_image_data,
out_image,
out_image_shape[0],
out_image_shape[1],
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
IoDirection::DtoH);
float* out_data = new float[out_image_shape.production() * 4];
default_converter->ImageToNCHW(
out_image_data, out_data, out_image_shape, out_dim);
// result
#ifdef GRID_FP16_PRINT_RESULT
LOG(INFO) << "---- print kernel result (input -> output) ----";
for (int eidx = 0; eidx < in_dim.production(); ++eidx) {
std::cout << input_v[eidx] << " -> " << out_data[eidx] << std::endl;
}
#endif // GRID_FP16_PRINT_RESULT
for (int i = 0; i < out_dim.production(); i++) {
auto abs_diff = fabs(out_data[i] - out_ref[i]);
auto relative_diff = COMPUTE_RELATIVE_DIFF(out_data[i], out_ref[i]);
EXPECT_EQ(
(relative_diff <= FP16_MAX_DIFF) || (abs_diff <= FP16_MAX_DIFF),
true);
if ((relative_diff > FP16_MAX_DIFF) && (abs_diff > FP16_MAX_DIFF)) {
LOG(ERROR) << "error idx:" << i << " out_data[" << i
<< "]:" << out_data[i] << " "
"out_ref["
<< i << "]:" << out_ref[i] << " abs_diff:" << abs_diff
<< " relative_diff:" << relative_diff
<< " FP16_MAX_DIFF:" << FP16_MAX_DIFF;
}
}
#ifdef GRID_FP16_LOOP_TEST
} // w
} // h
} // c
} // n
#else
// nothing to do.
#endif
}
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(grid_sampler, kOpenCL, kFP16, kImageDefault, ImageDefault);
@@ -142,15 +142,15 @@ void gray_to_tensor(const uint8_t* src,
  "ucvtf v14.4s, v8.4s                 \n"
  "ucvtf v15.4s, v9.4s                 \n"
  // sub -mean
- "fsub v12.4s, v12.4s, %w[vmean].4s   \n"
- "fsub v13.4s, v13.4s, %w[vmean].4s   \n"
- "fsub v14.4s, v14.4s, %w[vmean].4s   \n"
- "fsub v15.4s, v15.4s, %w[vmean].4s   \n"
+ "fsub v12.4s, v12.4s, %[vmean].4s    \n"
+ "fsub v13.4s, v13.4s, %[vmean].4s    \n"
+ "fsub v14.4s, v14.4s, %[vmean].4s    \n"
+ "fsub v15.4s, v15.4s, %[vmean].4s    \n"
  // mul * scale
- "fmul v6.4s, v12.4s, %w[vscale].4s   \n"
- "fmul v7.4s, v13.4s, %w[vscale].4s   \n"
- "fmul v8.4s, v14.4s, %w[vscale].4s   \n"
- "fmul v9.4s, v15.4s, %w[vscale].4s   \n"
+ "fmul v6.4s, v12.4s, %[vscale].4s    \n"
+ "fmul v7.4s, v13.4s, %[vscale].4s    \n"
+ "fmul v8.4s, v14.4s, %[vscale].4s    \n"
+ "fmul v9.4s, v15.4s, %[vscale].4s    \n"
  // store
  "st1 {v6.4s}, [%[outr0]], #16        \n"
  "subs %w[cnt], %w[cnt], #1           \n"
@@ -301,19 +301,19 @@ void bgr_to_tensor_chw(const uint8_t* src,
  "ucvtf v16.4s, v10.4s                \n"
  "ucvtf v17.4s, v11.4s                \n"
  // sub -mean
- "fsub v12.4s, v12.4s, %w[vbmean].4s  \n"
- "fsub v13.4s, v13.4s, %w[vbmean].4s  \n"
- "fsub v14.4s, v14.4s, %w[vgmean].4s  \n"
- "fsub v15.4s, v15.4s, %w[vgmean].4s  \n"
- "fsub v16.4s, v16.4s, %w[vrmean].4s  \n"
- "fsub v17.4s, v17.4s, %w[vrmean].4s  \n"
+ "fsub v12.4s, v12.4s, %[vbmean].4s   \n"
+ "fsub v13.4s, v13.4s, %[vbmean].4s   \n"
+ "fsub v14.4s, v14.4s, %[vgmean].4s   \n"
+ "fsub v15.4s, v15.4s, %[vgmean].4s   \n"
+ "fsub v16.4s, v16.4s, %[vrmean].4s   \n"
+ "fsub v17.4s, v17.4s, %[vrmean].4s   \n"
  // mul * scale
- "fmul v6.4s, v12.4s, %w[vbscale].4s  \n"
- "fmul v7.4s, v13.4s, %w[vbscale].4s  \n"
- "fmul v8.4s, v14.4s, %w[vgscale].4s  \n"
- "fmul v9.4s, v15.4s, %w[vgscale].4s  \n"
- "fmul v10.4s, v16.4s, %w[vrscale].4s \n"
- "fmul v11.4s, v17.4s, %w[vrscale].4s \n"
+ "fmul v6.4s, v12.4s, %[vbscale].4s   \n"
+ "fmul v7.4s, v13.4s, %[vbscale].4s   \n"
+ "fmul v8.4s, v14.4s, %[vgscale].4s   \n"
+ "fmul v9.4s, v15.4s, %[vgscale].4s   \n"
+ "fmul v10.4s, v16.4s, %[vrscale].4s  \n"
+ "fmul v11.4s, v17.4s, %[vrscale].4s  \n"
  // store
  "st1 {v6.4s}, [%[outr0]], #16        \n"
  "st1 {v8.4s}, [%[outr1]], #16        \n"
......
@@ -829,15 +829,9 @@ void hwc3_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
  uint8x8_t vb = vdup_n_u8(b);
  uint8x8_t vg = vdup_n_u8(g);
  uint8x8_t vr = vdup_n_u8(r);
-#ifdef __aarch64__
- uint8x16_t vb1 = vdupq_n_u8(b);
- uint8x16_t vg1 = vdupq_n_u8(g);
- uint8x16_t vr1 = vdupq_n_u8(r);
-#else
  uint8_t vb_array[8] = {b, b, b, b, b, b, b, b};
  uint8_t vg_array[8] = {g, g, g, g, g, g, g, g};
  uint8_t vr_array[8] = {r, r, r, r, r, r, r, r};
-#endif
  int cnt_pro = srcw >> 3;
  int remain_pro = srcw % 8;
  int win = srcw * 3;
@@ -866,6 +860,9 @@ void hwc3_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
  "prfm pldl1keep, [%[inptr2], #128]   \n"
  "prfm pldl1keep, [%[inptr3]]         \n"
  "prfm pldl1keep, [%[inptr3], #128]   \n"
+ "ld1 {v21.8b}, [%[vb]]               \n"
+ "ld1 {v22.8b}, [%[vg]]               \n"
+ "ld1 {v23.8b}, [%[vr]]               \n"
"1: \n" "1: \n"
"ld3 {v0.8b - v2.8b}, [%[inptr0]], #24 \n" // d8 = y0y3y6y9.. d9 = "ld3 {v0.8b - v2.8b}, [%[inptr0]], #24 \n" // d8 = y0y3y6y9.. d9 =
// y1y4y7... // y1y4y7...
...@@ -876,20 +873,20 @@ void hwc3_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) { ...@@ -876,20 +873,20 @@ void hwc3_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
"ld3 {v9.8b - v11.8b}, [%[inptr3]], #24 \n" // d8 = y0y3y6y9.. d9 = "ld3 {v9.8b - v11.8b}, [%[inptr3]], #24 \n" // d8 = y0y3y6y9.. d9 =
// y1y4y7... // y1y4y7...
// mul b // mul b
"umull v12.8h, v0.8b, %w[vb].8b \n" // v0 * vb "umull v12.8h, v0.8b, v21.8b \n" // v0 * vb
"umull v13.8h, v3.8b, %w[vb].8b \n" // v0 * vb "umull v13.8h, v3.8b, v21.8b \n" // v0 * vb
"umull v14.8h, v6.8b, %w[vb].8b \n" // v0 * vb "umull v14.8h, v6.8b, v21.8b \n" // v0 * vb
"umull v15.8h, v9.8b, %w[vb].8b \n" // v0 * vb "umull v15.8h, v9.8b, v21.8b \n" // v0 * vb
// mul g // mul g
"umull v16.8h, v1.8b, %w[vg].8b \n" // v0 * vb "umull v16.8h, v1.8b, v22.8b \n" // v0 * vb
"umull v17.8h, v4.8b, %w[vg].8b \n" // v0 * vb "umull v17.8h, v4.8b, v22.8b \n" // v0 * vb
"umull v18.8h, v7.8b, %w[vg].8b \n" // v0 * vb "umull v18.8h, v7.8b, v22.8b \n" // v0 * vb
"umull v19.8h, v10.8b, %w[vg].8b \n" // v0 * vb "umull v19.8h, v10.8b, v22.8b \n" // v0 * vb
// mul r // mul r
"umlal v12.8h, v2.8b, %w[vr].8b \n" // v0 * vb "umlal v12.8h, v2.8b, v23.8b \n" // v0 * vb
"umlal v13.8h, v5.8b, %w[vr].8b \n" // v0 * vb "umlal v13.8h, v5.8b, v23.8b \n" // v0 * vb
"umlal v14.8h, v8.8b, %w[vr].8b \n" // v0 * vb "umlal v14.8h, v8.8b, v23.8b \n" // v0 * vb
"umlal v15.8h, v11.8b, %w[vr].8b \n" // v0 * vb "umlal v15.8h, v11.8b, v23.8b \n" // v0 * vb
// 16->32 // 16->32
"uaddl v0.4s, v16.4h, v12.4h \n" "uaddl v0.4s, v16.4h, v12.4h \n"
"uaddl2 v1.4s, v16.8h, v12.8h \n" "uaddl2 v1.4s, v16.8h, v12.8h \n"
@@ -928,7 +925,7 @@ void hwc3_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
    [outr2] "+r"(outr2),
    [outr3] "+r"(outr3),
    [cnt] "+r"(cnt)
- : [vb] "w"(vb1), [vg] "w"(vg1), [vr] "w"(vr1)
+ : [vb] "r"(vb_array), [vg] "r"(vg_array), [vr] "r"(vr_array)
  : "cc",
    "memory",
    "v0",
@@ -951,7 +948,10 @@ void hwc3_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
    "v17",
    "v18",
    "v19",
-   "v20");
+   "v20",
+   "v21",
+   "v22",
+   "v23");
#else
  asm volatile(
  "pld [%[inptr0]] @ preload a, 64byte\n"
@@ -1106,15 +1106,9 @@ void hwc4_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
  uint8x8_t vb = vdup_n_u8(b);
  uint8x8_t vg = vdup_n_u8(g);
  uint8x8_t vr = vdup_n_u8(r);
-#ifdef __aarch64__
- uint8x16_t vb1 = vdupq_n_u8(b);
- uint8x16_t vg1 = vdupq_n_u8(g);
- uint8x16_t vr1 = vdupq_n_u8(r);
-#else
  uint8_t vb_array[8] = {b, b, b, b, b, b, b, b};
  uint8_t vg_array[8] = {g, g, g, g, g, g, g, g};
  uint8_t vr_array[8] = {r, r, r, r, r, r, r, r};
-#endif
  int cnt_pro = srcw >> 3;
  int remain_pro = srcw % 8;
  int win = srcw * 4;
@@ -1143,6 +1137,9 @@ void hwc4_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
  "prfm pldl1keep, [%[inptr2], #128]   \n"
  "prfm pldl1keep, [%[inptr3]]         \n"
  "prfm pldl1keep, [%[inptr3], #128]   \n"
+ "ld1 {v21.8b}, [%[vb]]               \n"
+ "ld1 {v22.8b}, [%[vg]]               \n"
+ "ld1 {v23.8b}, [%[vr]]               \n"
"1: \n" "1: \n"
"ld4 {v0.8b - v3.8b}, [%[inptr0]], #32 \n" // d8 = y0y3y6y9.. d9 = "ld4 {v0.8b - v3.8b}, [%[inptr0]], #32 \n" // d8 = y0y3y6y9.. d9 =
// y1y4y7... // y1y4y7...
...@@ -1153,20 +1150,20 @@ void hwc4_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) { ...@@ -1153,20 +1150,20 @@ void hwc4_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
"ld4 {v12.8b - v15.8b}, [%[inptr3]], #32 \n" // d8 = y0y3y6y9.. d9 = "ld4 {v12.8b - v15.8b}, [%[inptr3]], #32 \n" // d8 = y0y3y6y9.. d9 =
// y1y4y7... // y1y4y7...
// mul b // mul b
"umull v13.8h, v0.8b, %w[vb].8b \n" // v0 * vb "umull v13.8h, v0.8b, v21.8b \n" // v0 * vb
"umull v14.8h, v4.8b, %w[vb].8b \n" // v0 * vb "umull v14.8h, v4.8b, v21.8b \n" // v0 * vb
"umull v15.8h, v8.8b, %w[vb].8b \n" // v0 * vb "umull v15.8h, v8.8b, v21.8b \n" // v0 * vb
"umull v16.8h, v12.8b, %w[vb].8b \n" // v0 * vb "umull v16.8h, v12.8b, v21.8b \n" // v0 * vb
// mul g // mul g
"umull v17.8h, v1.8b, %w[vg].8b \n" // v0 * vb "umull v17.8h, v1.8b, v22.8b \n" // v0 * vb
"umull v18.8h, v5.8b, %w[vg].8b \n" // v0 * vb "umull v18.8h, v5.8b, v22.8b \n" // v0 * vb
"umull v19.8h, v9.8b, %w[vg].8b \n" // v0 * vb "umull v19.8h, v9.8b, v22.8b \n" // v0 * vb
"umull v20.8h, v13.8b, %w[vg].8b \n" // v0 * vb "umull v20.8h, v13.8b, v22.8b \n" // v0 * vb
// mul r // mul r
"umlal v13.8h, v2.8b, %w[vr].8b \n" // v0 * vb "umlal v13.8h, v2.8b, v23.8b \n" // v0 * vb
"umlal v14.8h, v6.8b, %w[vr].8b \n" // v0 * vb "umlal v14.8h, v6.8b, v23.8b \n" // v0 * vb
"umlal v15.8h, v10.8b, %w[vr].8b \n" // v0 * vb "umlal v15.8h, v10.8b, v23.8b \n" // v0 * vb
"umlal v16.8h, v14.8b, %w[vr].8b \n" // v0 * vb "umlal v16.8h, v14.8b, v23.8b \n" // v0 * vb
// 16->32 // 16->32
"uaddl v0.4s, v17.4h, v13.4h \n" "uaddl v0.4s, v17.4h, v13.4h \n"
"uaddl2 v1.4s, v17.8h, v13.8h \n" "uaddl2 v1.4s, v17.8h, v13.8h \n"
@@ -1205,7 +1202,7 @@ void hwc4_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
    [outr2] "+r"(outr2),
    [outr3] "+r"(outr3),
    [cnt] "+r"(cnt)
- : [vb] "w"(vb1), [vg] "w"(vg1), [vr] "w"(vr1)
+ : [vb] "r"(vb_array), [vg] "r"(vg_array), [vr] "r"(vr_array)
  : "cc",
    "memory",
    "v0",
@@ -1228,7 +1225,10 @@ void hwc4_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
    "v17",
    "v18",
    "v19",
-   "v20");
+   "v20",
+   "v21",
+   "v22",
+   "v23");
#else
  asm volatile(
  "pld [%[inptr0]] @ preload a, 64byte\n"
......
@@ -217,12 +217,12 @@ void resize(const uint8_t* src,
  "1:                                  \n"
  "ld1 {v0.8h}, [%[rows0p]], #16       \n"
  "ld1 {v1.8h}, [%[rows1p]], #16       \n"
- "orr v6.16b, %w[_v2].16b, %w[_v2].16b \n"
- "orr v7.16b, %w[_v2].16b, %w[_v2].16b \n"
- "smull v2.4s, v0.4h, %w[_b0].4h      \n"
- "smull2 v4.4s, v0.8h, %w[_b0].8h     \n"
- "smull v3.4s, v1.4h, %w[_b1].4h      \n"
- "smull2 v5.4s, v1.8h, %w[_b1].8h     \n"
+ "orr v6.16b, %[_v2].16b, %[_v2].16b  \n"
+ "orr v7.16b, %[_v2].16b, %[_v2].16b  \n"
+ "smull v2.4s, v0.4h, %[_b0].4h       \n"
+ "smull2 v4.4s, v0.8h, %[_b0].8h      \n"
+ "smull v3.4s, v1.4h, %[_b1].4h       \n"
+ "smull2 v5.4s, v1.8h, %[_b1].8h      \n"
  "ssra v6.4s, v2.4s, #16              \n"
  "ssra v7.4s, v4.4s, #16              \n"
......