Commit 0ade1bc5 authored by HappyAngel, committed by Yuan Shuai

[LITE][ARM]add cv image process (#2402)

* add cv image process

* fix arm linux build error

* add LITE_WITH_CV define to make cv, test=develop

* fix cv format, and add description in utils/cv

* delete some meaningless comments, test=develop

* set LITE_WITH_CV=OFF in build.sh, test=develop

* delete cv_enum.h in utils/cv, move its contents into paddle_image_preprocess.h, test=develop

* redefine paddle_image_preprocess.h according to review comments, test=develop

* add detailed note on FlipParam, test=develop

* fix format in paddle_image_preprocess.h, test=develop

* fix error when building for x86, test=develop

* LITE_WITH_X86 does not contain LITE_WITH_CV
Parent 26470600
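For orientation, here is a minimal usage sketch of the preprocessing API this change adds, using the types and method names exercised by image_convert_test.cc further down in this diff. The sizes, buffer allocation, and the helper function name are illustrative assumptions, not part of the change; see the test for the full flow including image2Tensor.

#include <cstdint>
#include "lite/utils/cv/paddle_image_preprocess.h"

using paddle::lite::utils::cv::FlipParam;
using paddle::lite::utils::cv::ImageFormat;
using paddle::lite::utils::cv::ImagePreprocess;
using paddle::lite::utils::cv::TransParam;

// Buffers are assumed to be allocated by the caller: nv12 holds
// 1.5 * 1920 * 1080 bytes, rgb holds 3 * 1920 * 1080 bytes, and
// resized holds 3 * 960 * 540 bytes.
void preprocess_example(const uint8_t* nv12, uint8_t* rgb, uint8_t* resized) {
  TransParam tp;
  tp.ih = 1920;                  // input height
  tp.iw = 1080;                  // input width
  tp.oh = 960;                   // output height
  tp.ow = 540;                   // output width
  tp.rotate_param = 90;          // rotation angle
  tp.flip_param = FlipParam::X;  // flip along the x axis
  ImagePreprocess proc(ImageFormat::NV12, ImageFormat::RGB, tp);
  proc.imageCovert(nv12, rgb);     // NV12 -> RGB ("imageCovert" is the API spelling)
  proc.imageResize(rgb, resized);  // 1080x1920 -> 540x960
}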
@@ -72,6 +72,9 @@ lite_option(LITE_ON_MODEL_OPTIMIZE_TOOL "Build the model optimize tool" OFF)
# publish options
lite_option(LITE_BUILD_EXTRA "Enable extra algorithm support in Lite, both kernels and operators" OFF)
lite_option(LITE_BUILD_TAILOR "Enable tailoring library according to model" OFF)
# cv build options
lite_option(LITE_WITH_CV "Enable building cv image processing in lite" OFF IF NOT LITE_WITH_ARM)
# TODO(Superjomn) Remove WITH_ANAKIN option if not needed latter.
if(ANDROID OR IOS OR ARMLINUX)
@@ -181,7 +184,7 @@ include(external/xxhash) # download install xxhash needed for x86 jit
include(cudnn)
include(configure) # add paddle env configuration
if(LITE_WITH_CUDA)
include(cuda)
endif()
......
@@ -117,8 +117,12 @@ endif()
if (LITE_WITH_ARM)
add_definitions("-DLITE_WITH_ARM")
if (LITE_WITH_CV)
add_definitions("-DLITE_WITH_CV")
endif()
endif()
if (WITH_ARM_DOTPROD)
add_definitions("-DWITH_ARM_DOTPROD")
endif()
......
@@ -43,6 +43,11 @@ function (lite_deps TARGET)
foreach(var ${lite_deps_ARM_DEPS})
set(deps ${deps} ${var})
endforeach(var)
if(LITE_WITH_CV)
foreach(var ${lite_cv_deps})
set(deps ${deps} ${var})
endforeach(var)
endif()
endif()
if(LITE_WITH_PROFILE)
@@ -341,7 +346,7 @@ function(add_kernel TARGET device level)
file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
nv_library(${TARGET} SRCS ${args_SRCS} DEPS ${args_DEPS})
return()
endif()
# the source list will collect for paddle_use_kernel.h code generation.
......
@@ -9,6 +9,7 @@ message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}")
message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}")
message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}")
message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}")
message(STATUS "LITE_WITH_CV:\t${LITE_WITH_CV}")
set(LITE_MODEL_DIR "${THIRD_PARTY_PATH}/install")
set(LITE_ON_MOBILE ${LITE_WITH_LIGHT_WEIGHT_FRAMEWORK})
@@ -129,6 +130,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
#COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/model_optimize_tool" "${INFER_LITE_PUBLISH_ROOT}/bin"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/gen_code/paddle_code_generator" "${INFER_LITE_PUBLISH_ROOT}/bin"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/test_model_bin" "${INFER_LITE_PUBLISH_ROOT}/bin"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/utils/cv/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
)
if(NOT IOS)
#add_dependencies(publish_inference_cxx_lib model_optimize_tool)
@@ -136,10 +138,10 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
add_dependencies(publish_inference_cxx_lib bundle_full_api)
add_dependencies(publish_inference_cxx_lib bundle_light_api)
add_dependencies(publish_inference_cxx_lib test_model_bin)
if (ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux")
add_dependencies(publish_inference_cxx_lib paddle_full_api_shared)
add_dependencies(publish_inference paddle_light_api_shared)
add_custom_command(TARGET publish_inference_cxx_lib
COMMAND cp ${CMAKE_BINARY_DIR}/lite/api/*.so ${INFER_LITE_PUBLISH_ROOT}/cxx/lib)
endif()
add_dependencies(publish_inference publish_inference_cxx_lib)
@@ -155,6 +157,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/include"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/include"
COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/lib"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/utils/cv/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
)
add_dependencies(tiny_publish_lib bundle_light_api)
add_dependencies(publish_inference tiny_publish_lib)
@@ -166,6 +169,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/libpaddle_light_api_shared.so" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/utils/cv/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
)
add_dependencies(tiny_publish_cxx_lib paddle_light_api_shared)
add_dependencies(publish_inference tiny_publish_cxx_lib)
......
add_subdirectory(kernels)
add_subdirectory(math)
add_subdirectory(cv)
if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND LITE_WITH_ARM)
lite_cc_test(image_convert_test SRCS image_convert_test.cc DEPS paddle_cv_arm paddle_api_light ${lite_cv_deps} ${arm_kernels} ${lite_ops} ${host_kernels})
endif()
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/utils/cv/paddle_image_preprocess.h"
typedef paddle::lite::utils::cv::ImageFormat ImageFormat;
typedef paddle::lite::utils::cv::FlipParam FlipParam;
typedef paddle::lite::Tensor Tensor;
typedef paddle::lite_api::DataLayoutType LayoutType;
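// NV12/NV21 (YUV420sp) to BGR reference conversion. The usual full-range
// YUV -> RGB formula
//   R = Y + 1.402 * (V - 128)
//   G = Y - 0.344 * (U - 128) - 0.714 * (V - 128)
//   B = Y + 1.772 * (U - 128)
// is approximated below with 7-bit fixed point: 179/128, 44/128, 91/128 and
// 227/128. v_num/u_num give the byte order of the interleaved chroma plane
// (NV12 stores UV, so u_num = 0 and v_num = 1; NV21 stores VU).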
void nv2bgr(const uint8_t* in_data,
uint8_t* out_data,
int srcw,
int srch,
int v_num,
int u_num) {
int size = srch * srcw;
const uint8_t* y_ptr = in_data;
const uint8_t* uv_ptr = in_data + size;
for (int i = 0; i < srch; i++) {
int j = 0;
const uint8_t* ptr_y1 = y_ptr + i * srcw;
const uint8_t* ptr_vu = uv_ptr + (i / 2) * srcw;
uint8_t* ptr_bgr1 = out_data + i * 3 * srcw;
for (; j < srcw; j += 2) {
uint8_t _y0 = ptr_y1[0];
uint8_t _y1 = ptr_y1[1];
uint8_t _v = ptr_vu[v_num];
uint8_t _u = ptr_vu[u_num];
int ra = floor((179 * (_v - 128)) >> 7);
int ga = floor((44 * (_u - 128) + 91 * (_v - 128)) >> 7);
int ba = floor((227 * (_u - 128)) >> 7);
int r = _y0 + ra;
int g = _y0 - ga;
int b = _y0 + ba;
int r1 = _y1 + ra;
int g1 = _y1 - ga;
int b1 = _y1 + ba;
r = r < 0 ? 0 : (r > 255) ? 255 : r;
g = g < 0 ? 0 : (g > 255) ? 255 : g;
b = b < 0 ? 0 : (b > 255) ? 255 : b;
r1 = r1 < 0 ? 0 : (r1 > 255) ? 255 : r1;
g1 = g1 < 0 ? 0 : (g1 > 255) ? 255 : g1;
b1 = b1 < 0 ? 0 : (b1 > 255) ? 255 : b1;
*ptr_bgr1++ = b;
*ptr_bgr1++ = g;
*ptr_bgr1++ = r;
*ptr_bgr1++ = b1;
*ptr_bgr1++ = g1;
*ptr_bgr1++ = r1;
ptr_y1 += 2;
ptr_vu += 2;
}
if (j < srcw) {
uint8_t _y = ptr_y1[0];
uint8_t _v = ptr_vu[v_num];
uint8_t _u = ptr_vu[u_num];
int r = _y + ((179 * (_v - 128)) >> 7);
      int g = _y - ((44 * (_u - 128) + 91 * (_v - 128)) >> 7);
int b = _y + ((227 * (_u - 128)) >> 7);
r = r < 0 ? 0 : (r > 255) ? 255 : r;
g = g < 0 ? 0 : (g > 255) ? 255 : g;
b = b < 0 ? 0 : (b > 255) ? 255 : b;
ptr_bgr1[0] = b;
ptr_bgr1[1] = g;
ptr_bgr1[2] = r;
}
}
}
void nv2bgra(const uint8_t* in_data,
uint8_t* out_data,
int srcw,
int srch,
int v_num,
int u_num) {
int size = srch * srcw;
const uint8_t* y_ptr = in_data;
const uint8_t* uv_ptr = in_data + size;
for (int i = 0; i < srch; i++) {
int j = 0;
const uint8_t* ptr_y1 = y_ptr + i * srcw;
const uint8_t* ptr_vu = uv_ptr + (i / 2) * srcw;
uint8_t* ptr_bgr1 = out_data + i * 4 * srcw;
for (; j < srcw; j += 2) {
uint8_t _y0 = ptr_y1[0];
uint8_t _y1 = ptr_y1[1];
uint8_t _v = ptr_vu[v_num];
uint8_t _u = ptr_vu[u_num];
int ra = floor((179 * (_v - 128)) >> 7);
int ga = floor((44 * (_u - 128) + 91 * (_v - 128)) >> 7);
int ba = floor((227 * (_u - 128)) >> 7);
int r = _y0 + ra;
int g = _y0 - ga;
int b = _y0 + ba;
int r1 = _y1 + ra;
int g1 = _y1 - ga;
int b1 = _y1 + ba;
r = r < 0 ? 0 : (r > 255) ? 255 : r;
g = g < 0 ? 0 : (g > 255) ? 255 : g;
b = b < 0 ? 0 : (b > 255) ? 255 : b;
r1 = r1 < 0 ? 0 : (r1 > 255) ? 255 : r1;
g1 = g1 < 0 ? 0 : (g1 > 255) ? 255 : g1;
b1 = b1 < 0 ? 0 : (b1 > 255) ? 255 : b1;
*ptr_bgr1++ = b;
*ptr_bgr1++ = g;
*ptr_bgr1++ = r;
*ptr_bgr1++ = 255;
*ptr_bgr1++ = b1;
*ptr_bgr1++ = g1;
*ptr_bgr1++ = r1;
*ptr_bgr1++ = 255;
ptr_y1 += 2;
ptr_vu += 2;
}
if (j < srcw) {
uint8_t _y = ptr_y1[0];
uint8_t _v = ptr_vu[v_num];
uint8_t _u = ptr_vu[u_num];
int r = _y + ((179 * (_v - 128)) >> 7);
      int g = _y - ((44 * (_u - 128) + 91 * (_v - 128)) >> 7);
int b = _y + ((227 * (_u - 128)) >> 7);
r = r < 0 ? 0 : (r > 255) ? 255 : r;
g = g < 0 ? 0 : (g > 255) ? 255 : g;
b = b < 0 ? 0 : (b > 255) ? 255 : b;
ptr_bgr1[0] = b;
ptr_bgr1[1] = g;
ptr_bgr1[2] = r;
ptr_bgr1[3] = 255;
}
}
}
void nv12_bgr_basic(const uint8_t* in_data,
uint8_t* out_data,
int srcw,
int srch) {
nv2bgr(in_data, out_data, srcw, srch, 1, 0);
}
void nv21_bgr_basic(const uint8_t* in_data,
uint8_t* out_data,
int srcw,
int srch) {
nv2bgr(in_data, out_data, srcw, srch, 0, 1);
}
void nv12_bgra_basic(const uint8_t* in_data,
uint8_t* out_data,
int srcw,
int srch) {
nv2bgra(in_data, out_data, srcw, srch, 1, 0);
}
void nv21_bgra_basic(const uint8_t* in_data,
uint8_t* out_data,
int srcw,
int srch) {
nv2bgra(in_data, out_data, srcw, srch, 0, 1);
}
/*
  Using CV_BGR2GRAY, the conversion formula is Gray = 0.1140*B + 0.5870*G + 0.2989*R
  Using CV_RGB2GRAY, the conversion formula is Gray = 0.1140*R + 0.5870*G + 0.2989*B
  Fixed-point weights (scaled by 128):
    b = 0.1140 * 128 = 14.592 ~= 15
    g = 0.5870 * 128 = 75.136 ~= 75
    r = 0.2989 * 128 = 38.259 ~= 38
  Gray = (15*B + 75*G + 38*R) / 128
  bgr2gray, rgb2gray
*/
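// Sanity check: the integer weights sum to 15 + 75 + 38 = 128, so a uniform
// pixel with B = G = R = v maps to Gray = 128 * v / 128 = v exactly.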
void bgr_gray_basic(const uint8_t* in_data,
uint8_t* out_data,
int srcw,
int srch) {
for (int i = 0; i < srch; i++) {
const uint8_t* din_ptr = in_data + i * 3 * srcw;
uint8_t* dout_ptr = out_data + i * srcw;
for (int j = 0; j < srcw; j++) {
int sum = din_ptr[0] * 15 + din_ptr[1] * 75 + din_ptr[2] * 38;
sum = sum >> 7;
*dout_ptr++ = sum;
din_ptr += 3;
}
}
}
void gray_bgr_basic(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
for (int i = 0; i < srch; i++) {
for (int j = 0; j < srcw; j++) {
*dst++ = *src;
*dst++ = *src;
*dst++ = *src;
src++;
}
}
}
// bgr2bgra, rgb2rgba
void hwc3_to_hwc4_basic(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
for (int i = 0; i < srch; i++) {
for (int j = 0; j < srcw; j++) {
*dst++ = *src++;
*dst++ = *src++;
*dst++ = *src++;
*dst++ = 255;
}
}
}
// bgra2bgr, rgba2rgb
void hwc4_to_hwc3_basic(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
for (int i = 0; i < srch; i++) {
for (int j = 0; j < srcw; j++) {
*dst++ = *src++;
*dst++ = *src++;
*dst++ = *src++;
src++;
}
}
}
// bgr2rgb, rgb2bgr
void hwc3_trans_basic(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
for (int i = 0; i < srch; i++) {
for (int j = 0; j < srcw; j++) {
*dst++ = src[2]; // r
*dst++ = src[1]; // g
*dst++ = src[0]; // b
src += 3;
}
}
}
// bgra2rgba, rgba2bgra
void hwc4_trans_basic(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
for (int i = 0; i < srch; i++) {
for (int j = 0; j < srcw; j++) {
*dst++ = src[2]; // r
*dst++ = src[1]; // g
*dst++ = src[0]; // b
*dst++ = src[3]; // a
src += 4;
}
}
}
// bgra2rgb, rgba2bgr
void hwc4_trans_hwc3_basic(const uint8_t* src,
uint8_t* dst,
int srcw,
int srch) {
for (int i = 0; i < srch; i++) {
for (int j = 0; j < srcw; j++) {
*dst++ = src[2]; // r
*dst++ = src[1]; // g
*dst++ = src[0]; // b
// *dst++ = src[4];//a
src += 4;
}
}
}
// bgr2rgba, rgb2bgra
void hwc3_trans_hwc4_basic(const uint8_t* src,
uint8_t* dst,
int srcw,
int srch) {
for (int i = 0; i < srch; i++) {
for (int j = 0; j < srcw; j++) {
*dst++ = src[2]; // r
*dst++ = src[1]; // g
*dst++ = src[0]; // b
*dst++ = 255; // a
src += 3;
}
}
}
void image_convert_basic(const uint8_t* in_data,
uint8_t* out_data,
ImageFormat srcFormat,
ImageFormat dstFormat,
int srcw,
int srch,
int out_size) {
if (srcFormat == dstFormat) {
// copy
memcpy(out_data, in_data, sizeof(uint8_t) * out_size);
return;
} else {
if (srcFormat == ImageFormat::NV12 &&
(dstFormat == ImageFormat::BGR || dstFormat == ImageFormat::RGB)) {
nv12_bgr_basic(in_data, out_data, srcw, srch);
} else if (srcFormat == ImageFormat::NV21 &&
(dstFormat == ImageFormat::BGR ||
dstFormat == ImageFormat::RGB)) {
nv21_bgr_basic(in_data, out_data, srcw, srch);
} else if (srcFormat == ImageFormat::NV12 &&
(dstFormat == ImageFormat::BGRA ||
dstFormat == ImageFormat::RGBA)) {
nv12_bgra_basic(in_data, out_data, srcw, srch);
} else if (srcFormat == ImageFormat::NV21 &&
(dstFormat == ImageFormat::BGRA ||
dstFormat == ImageFormat::RGBA)) {
nv21_bgra_basic(in_data, out_data, srcw, srch);
} else if ((srcFormat == ImageFormat::RGB &&
dstFormat == ImageFormat::GRAY) ||
(srcFormat == ImageFormat::BGR &&
dstFormat == ImageFormat::GRAY)) {
bgr_gray_basic(in_data, out_data, srcw, srch);
} else if ((srcFormat == ImageFormat::GRAY &&
dstFormat == ImageFormat::RGB) ||
(srcFormat == ImageFormat::GRAY &&
dstFormat == ImageFormat::BGR)) {
gray_bgr_basic(in_data, out_data, srcw, srch);
} else if ((srcFormat == ImageFormat::RGBA &&
dstFormat == ImageFormat::RGB) ||
(srcFormat == ImageFormat::BGRA &&
dstFormat == ImageFormat::BGR)) {
hwc4_to_hwc3_basic(in_data, out_data, srcw, srch);
} else if ((srcFormat == ImageFormat::RGB &&
dstFormat == ImageFormat::RGBA) ||
(srcFormat == ImageFormat::BGR &&
dstFormat == ImageFormat::BGRA)) {
hwc3_to_hwc4_basic(in_data, out_data, srcw, srch);
} else if ((srcFormat == ImageFormat::RGB &&
dstFormat == ImageFormat::BGR) ||
(srcFormat == ImageFormat::BGR &&
dstFormat == ImageFormat::RGB)) {
hwc3_trans_basic(in_data, out_data, srcw, srch);
} else if ((srcFormat == ImageFormat::RGBA &&
dstFormat == ImageFormat::BGRA) ||
(srcFormat == ImageFormat::BGRA &&
dstFormat == ImageFormat::RGBA)) {
hwc4_trans_basic(in_data, out_data, srcw, srch);
} else if ((srcFormat == ImageFormat::RGBA &&
dstFormat == ImageFormat::BGR) ||
(srcFormat == ImageFormat::BGRA &&
dstFormat == ImageFormat::RGB)) {
hwc4_trans_hwc3_basic(in_data, out_data, srcw, srch);
} else if ((srcFormat == ImageFormat::RGB &&
dstFormat == ImageFormat::BGRA) ||
(srcFormat == ImageFormat::BGR &&
dstFormat == ImageFormat::RGBA)) {
hwc3_trans_hwc4_basic(in_data, out_data, srcw, srch);
} else {
printf("srcFormat: %d, dstFormat: %d does not support! \n",
srcFormat,
dstFormat);
}
// for (int i = 0; i < out_size; i++){
// printf("%d ", *out_data++);
// if ((i+1) % 10 == 0){
// printf("\n");
// }
// }
}
}
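// Build the bilinear-resize lookup tables: for every destination column/row,
// the left/top source index (xofs/yofs, clamped to the source image) and the
// pair of interpolation weights (ialpha = {1 - fx, fx}, ibeta = {1 - fy, fy}),
// using half-pixel-center alignment: fx = (dx + 0.5) * scale_x - 0.5.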
void compute_xy(int srcw,
int srch,
int dstw,
int dsth,
double scale_x,
double scale_y,
int* xofs,
int* yofs,
float* ialpha,
float* ibeta) {
float fy = 0.f;
float fx = 0.f;
int sy = 0;
int sx = 0;
const int resize_coef_bits = 11;
const int resize_coef_scale = 1 << resize_coef_bits;
for (int dx = 0; dx < dstw; dx++) {
fx = static_cast<float>((dx + 0.5) * scale_x - 0.5);
sx = floor(fx);
fx -= sx;
if (sx < 0) {
sx = 0;
fx = 0.f;
}
if (sx >= srcw - 1) {
sx = srcw - 2;
fx = 1.f;
}
xofs[dx] = sx;
float a0 = (1.f - fx);
float a1 = fx;
ialpha[dx * 2] = a0;
ialpha[dx * 2 + 1] = a1;
}
for (int dy = 0; dy < dsth; dy++) {
fy = static_cast<float>((dy + 0.5) * scale_y - 0.5);
sy = floor(fy);
fy -= sy;
if (sy < 0) {
sy = 0;
fy = 0.f;
}
if (sy >= srch - 1) {
sy = srch - 2;
fy = 1.f;
}
yofs[dy] = sy;
float b0 = (1.f - fy);
float b1 = fy;
ibeta[dy * 2] = b0;
ibeta[dy * 2 + 1] = b1;
}
}
void image_resize_basic(const uint8_t* in_data,
uint8_t* out_data,
ImageFormat srcFormat,
int srcw,
int srch,
int dstw,
int dsth) {
int size = srcw * srch;
if (srcw == dstw && srch == dsth) {
if (srcFormat == ImageFormat::NV12 || srcFormat == ImageFormat::NV21) {
size = srcw * (ceil(1.5 * srch));
} else if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) {
size = 3 * srcw * srch;
} else if (srcFormat == ImageFormat::BGRA ||
srcFormat == ImageFormat::RGBA) {
size = 4 * srcw * srch;
}
memcpy(out_data, in_data, sizeof(uint8_t) * size);
return;
}
  double scale_x = static_cast<double>(srcw) / dstw;
  double scale_y = static_cast<double>(srch) / dsth;
int* buf = new int[dstw + dsth];
int* xofs = buf;
int* yofs = buf + dstw;
float* ialpha = new float[dstw * 2];
float* ibeta = new float[dsth * 2];
int w_in = srcw;
int w_out = dstw;
int num = 1;
int orih = dsth;
compute_xy(
srcw, srch, dstw, dsth, scale_x, scale_y, xofs, yofs, ialpha, ibeta);
if (srcFormat == ImageFormat::GRAY) {
num = 1;
} else if (srcFormat == ImageFormat::NV12 || srcFormat == ImageFormat::NV21) {
int hout = static_cast<int>(0.5 * dsth);
// uv todo
w_out = dstw;
num = 1;
dsth += hout;
} else if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) {
w_in = srcw * 3;
w_out = dstw * 3;
num = 3;
} else if (srcFormat == ImageFormat::BGRA || srcFormat == ImageFormat::RGBA) {
w_in = srcw * 4;
w_out = dstw * 4;
num = 4;
}
float* ialpha1 = nullptr;
int* xofs1 = nullptr;
int* yofs1 = nullptr;
if (orih < dsth) {
int tmp = dsth - orih;
    // tables for the half-resolution UV rows, assigned to the outer pointers
    // that are used inside the row loop below
    ialpha1 = new float[dstw];
    xofs1 = new int[srcw];
    yofs1 = new int[tmp];
compute_xy(srcw / 2,
srch / 2,
dstw / 2,
tmp,
scale_x,
scale_y,
xofs1,
yofs1,
ialpha1,
ibeta + dsth);
}
#pragma omp parallel for
for (int dy = 0; dy < dsth; dy++) {
uint8_t* out_ptr = out_data + dy * w_out;
int y_in_start = yofs[dy];
int y_in_end = y_in_start + 1;
int y_flag = 0; // only one line
if (y_in_start < 0) {
y_flag = 1;
}
float b0 = ibeta[dy * 2];
float b1 = ibeta[dy * 2 + 1];
if (dy >= orih) {
num = 2; // uv
ialpha = ialpha1;
xofs = xofs1;
yofs = yofs1;
}
for (int dx = 0; dx < w_out; dx += num) {
int tmp = dx / num;
int x_in_start = xofs[tmp] * num; // 0
int x_in_end = x_in_start + num; // 2
int x_flag = 0;
if (x_in_start < 0) {
x_flag = 1;
x_in_end = 0;
}
// printf("x_in: %d, y_in: %d \n", x_in_start, y_in_start);
float a0 = ialpha[tmp * 2];
float a1 = ialpha[tmp * 2 + 1];
int tl_index = y_in_start * w_in + x_in_start; // 0
int tr_index = y_in_start * w_in + x_in_end; // 2
int bl_index = y_in_end * w_in + x_in_start;
int br_index = y_in_end * w_in + x_in_end;
int ind = dx;
for (int i = 0; i < num; i++) {
int tl = in_data[tl_index];
int tr = in_data[tr_index];
int bl = in_data[bl_index];
int br = in_data[br_index];
if (y_flag == 1) {
tl = 0;
tr = 0;
}
if (x_flag == 1) {
tl = 0;
bl = 0;
}
tl_index++;
tr_index++;
bl_index++;
br_index++;
float outval = (tl * a0 + tr * a1) * b0 + (bl * a0 + br * a1) * b1;
// printf("tl: %d, tr: %d, bl: %d, br: %d \n", tl, tr, bl, br);
// printf("br_index: %d, a0: %f, b1: %f, out: %f \n", br_index, a0, b1,
// outval);
out_ptr[ind++] = ceil(outval);
}
}
}
}
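// Rotate 90 degrees clockwise: out(y, w_out - 1 - x) = in(x, y). The output is
// w_in x h_in pixels and `num` is the number of interleaved channels per pixel.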
void rotate90_basic(const uint8_t* in_data,
int h_in,
int w_in,
uint8_t* out_data,
int h_out,
int w_out,
int num) {
int win = w_in * num;
int wout = w_out * num;
for (int x = 0; x < h_in; x++) {
for (int y = 0; y < w_in; y++) {
int tmpy = y * num;
int tmpx = (w_out - 1 - x) * num; // x
for (int i = 0; i < num; i++) {
out_data[y * wout + tmpx] = in_data[x * win + tmpy];
tmpx++;
tmpy++;
}
}
}
}
void rotate180_basic(const uint8_t* in_data,
int h_in,
int w_in,
uint8_t* out_data,
int h_out,
int w_out,
int num) {
int win = w_in * num;
int h = h_in - 1;
int w = win - 1;
for (int x = 0; x < h_in; x++) {
for (int y = 0; y < w_in; y++) {
int tmpy = y * num;
int tmp = tmpy + (num - 1);
for (int i = 0; i < num; i++) {
out_data[(h - x) * win + w - tmp] = in_data[x * win + tmpy];
tmpy++;
tmp--;
}
}
}
}
void rotate270_basic(const uint8_t* in_data,
int h_in,
int w_in,
uint8_t* out_data,
int h_out,
int w_out,
int num) {
int win = w_in * num;
int wout = w_out * num;
int h = h_out - 1;
for (int x = 0; x < h_in; x++) {
for (int y = 0; y < w_in; y++) {
int tmpy = y * num;
int tmpx = x * num;
for (int i = 0; i < num; i++) {
        out_data[(h - y) * wout + tmpx] =
            in_data[x * win + tmpy];  // out(w_in - 1 - y, x) = in(x, y)
tmpx++;
tmpy++;
}
}
}
}
void image_rotate_basic(const uint8_t* in_data,
uint8_t* out_data,
ImageFormat srcFormat,
int srcw,
int srch,
float rotate) {
int num = 1;
if (srcFormat == ImageFormat::GRAY) {
num = 1;
} else if (srcFormat == ImageFormat::NV12 || srcFormat == ImageFormat::NV21) {
num = 1; // todo
return;
} else if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) {
num = 3;
} else if (srcFormat == ImageFormat::BGRA || srcFormat == ImageFormat::RGBA) {
num = 4;
}
if (rotate == 90) {
rotate90_basic(in_data, srch, srcw, out_data, srcw, srch, num);
} else if (rotate == 180) {
rotate180_basic(in_data, srch, srcw, out_data, srch, srcw, num);
} else if (rotate == 270) {
rotate270_basic(in_data, srch, srcw, out_data, srcw, srch, num);
}
}
void flipx_basic(
const uint8_t* in_data, int h_in, int w_in, uint8_t* out_data, int num) {
int h = h_in - 1;
int w = w_in * num;
for (int x = 0; x < h_in; x++) {
for (int y = 0; y < w_in; y++) {
int tmpy = y * num;
for (int i = 0; i < num; i++) {
        out_data[(h - x) * w + tmpy] =
            in_data[x * w + tmpy];  // out(h_in - 1 - x, y) = in(x, y)
tmpy++;
}
}
}
}
void flipy_basic(
const uint8_t* in_data, int h_in, int w_in, uint8_t* out_data, int num) {
int w = w_in * num - 1;
for (int x = 0; x < h_in; x++) {
for (int y = 0; y < w_in; y++) {
int tmpy = y * num;
int tmp = tmpy + (num - 1);
for (int i = 0; i < num; i++) {
        out_data[x * w_in * num + w - tmp] =
            in_data[x * w_in * num + tmpy];  // out(x, w_in - 1 - y) = in(x, y)
tmpy++;
tmp--;
}
}
}
}
void flipxy_basic(
const uint8_t* in_data, int h_in, int w_in, uint8_t* out_data, int num) {
int win = w_in * num;
int h = h_in - 1;
int w = win - 1;
for (int x = 0; x < h_in; x++) {
for (int y = 0; y < w_in; y++) {
int tmpy = y * num;
int tmp = tmpy + (num - 1);
for (int i = 0; i < num; i++) {
        out_data[(h - x) * win + w - tmp] =
            in_data[x * win + tmpy];  // out(h_in - 1 - x, w_in - 1 - y) = in(x, y)
tmpy++;
tmp--;
}
}
}
}
void image_flip_basic(const uint8_t* in_data,
uint8_t* out_data,
ImageFormat srcFormat,
int srcw,
int srch,
FlipParam flip) {
int num = 1;
if (srcFormat == ImageFormat::GRAY) {
num = 1;
} else if (srcFormat == ImageFormat::NV12 || srcFormat == ImageFormat::NV21) {
num = 1; // todo
return;
} else if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) {
num = 3;
} else if (srcFormat == ImageFormat::BGRA || srcFormat == ImageFormat::RGBA) {
num = 4;
}
// printf("image_flip_basic: %d \n", flip);
if (flip == FlipParam::X) {
flipx_basic(in_data, srch, srcw, out_data, num);
} else if (flip == FlipParam::Y) {
flipy_basic(in_data, srch, srcw, out_data, num);
} else if (flip == FlipParam::XY) {
flipxy_basic(in_data, srch, srcw, out_data, num);
}
}
void bgr_to_tensor_chw_basic(const uint8_t* bgr,
float* output,
int width,
int height,
float* means,
float* scales,
int num) {
int size = width * height;
  // means/scales are taken in the same order as the image channels (B, G, R
  // for BGR input), matching the arm implementation in utils/cv
  float b_means = means[0];
  float g_means = means[1];
  float r_means = means[2];
  float b_scales = scales[0];
  float g_scales = scales[1];
  float r_scales = scales[2];
for (int h = 0; h < height; h++) {
const uint8_t* ptr_bgr = bgr + h * width * num;
float* ptr_b = output + h * width;
float* ptr_g = ptr_b + size;
float* ptr_r = ptr_g + size;
for (int i = 0; i < width; i++) {
*ptr_b++ = (ptr_bgr[0] - b_means) * b_scales;
*ptr_g++ = (ptr_bgr[1] - g_means) * g_scales;
*ptr_r++ = (ptr_bgr[2] - r_means) * r_scales;
ptr_bgr += num;
}
}
}
void bgr_to_tensor_hwc_basic(const uint8_t* bgr,
float* output,
int width,
int height,
float* means,
float* scales,
int num) {
int size = width * height;
  // means/scales are taken in the same order as the image channels (B, G, R
  // for BGR input), matching the arm implementation in utils/cv
  float b_means = means[0];
  float g_means = means[1];
  float r_means = means[2];
  float b_scales = scales[0];
  float g_scales = scales[1];
  float r_scales = scales[2];
for (int h = 0; h < height; h++) {
const uint8_t* ptr_bgr = bgr + h * width * num;
float* out_bgr = output + h * width * num;
for (int i = 0; i < width; i++) {
*out_bgr++ = (ptr_bgr[0] - b_means) * b_scales;
*out_bgr++ = (ptr_bgr[1] - g_means) * g_scales;
*out_bgr++ = (ptr_bgr[2] - r_means) * r_scales;
ptr_bgr += num;
}
}
}
void image_to_tensor_basic(const uint8_t* in_data,
Tensor* dst,
ImageFormat srcFormat,
LayoutType layout,
int srcw,
int srch,
float* means,
float* scales) {
float* output = dst->mutable_data<float>();
if (layout == LayoutType::kNCHW &&
(srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB)) {
bgr_to_tensor_chw_basic(in_data, output, srcw, srch, means, scales, 3);
} else if (layout == LayoutType::kNHWC &&
(srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB)) {
bgr_to_tensor_hwc_basic(in_data, output, srcw, srch, means, scales, 3);
} else if (layout == LayoutType::kNCHW && (srcFormat == ImageFormat::BGRA ||
srcFormat == ImageFormat::RGBA)) {
bgr_to_tensor_chw_basic(in_data, output, srcw, srch, means, scales, 4);
} else if (layout == LayoutType::kNHWC && (srcFormat == ImageFormat::BGRA ||
srcFormat == ImageFormat::RGBA)) {
bgr_to_tensor_hwc_basic(in_data, output, srcw, srch, means, scales, 4);
}
}
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <math.h>
#include <random>
#include "lite/core/context.h"
#include "lite/tests/cv/cv_basic.h"
#include "lite/tests/utils/timer.h"
#include "lite/utils/cv/paddle_image_preprocess.h"
DEFINE_int32(cluster, 3, "cluster id");
DEFINE_int32(threads, 1, "threads num");
DEFINE_int32(warmup, 0, "warmup times");
DEFINE_int32(repeats, 1, "repeats times");
DEFINE_bool(basic_test, false, "do all tests");
DEFINE_bool(check_result, true, "check the result");
DEFINE_int32(srcFormat, 0, "input image format");
DEFINE_int32(dstFormat, 1, "output image format");
DEFINE_int32(srch, 1920, "input height");
DEFINE_int32(srcw, 1080, "input width");
DEFINE_int32(dsth, 960, "output height");
DEFINE_int32(dstw, 540, "output width");
DEFINE_int32(angle, 90, "rotate angle");
DEFINE_int32(flip_num, 0, "flip x");
DEFINE_int32(layout, 0, "layout nchw");
typedef paddle::lite::utils::cv::ImageFormat ImageFormat;
typedef paddle::lite::utils::cv::FlipParam FlipParam;
typedef paddle::lite_api::DataLayoutType LayoutType;
typedef paddle::lite::utils::cv::TransParam TransParam;
typedef paddle::lite::utils::cv::ImagePreprocess ImagePreprocess;
typedef paddle::lite_api::Tensor Tensor_api;
typedef paddle::lite::Tensor Tensor;
using paddle::lite::Timer;
void fill_tensor_host_rand(uint8_t* dio, int64_t size) {
  unsigned int seed = 256;
for (int64_t i = 0; i < size; ++i) {
dio[i] = rand_r(&seed) % 256; // -128;
}
}
void print_int8(uint8_t* ptr, int size, int width) {
for (int i = 0; i < size; i++) {
printf("%d ", *ptr++);
if ((i + 1) % width == 0) {
printf("\n");
}
}
printf("\n");
}
void print_int(int* ptr, int size, int width) {
int j = 0;
for (int i = 0; i < size; i++) {
printf("%d ", *ptr++);
if ((i + 1) % width == 0) {
printf("\n");
}
}
printf("\n");
}
void print_ff(const float* ptr, int size, int width) {
int j = 0;
for (int i = 0; i < size; i++) {
printf("%f ", *ptr++);
if ((i + 1) % width == 0) {
printf("\n");
}
}
printf("\n");
}
#ifdef LITE_WITH_ARM
void test_img(const std::vector<int>& cluster_id,
const std::vector<int>& thread_num,
int srcw,
int srch,
int dstw,
int dsth,
ImageFormat srcFormat,
ImageFormat dstFormat,
float rotate,
FlipParam flip,
LayoutType layout,
int test_iter = 1) {
#ifdef LITE_WITH_ARM
paddle::lite::DeviceInfo::Init();
#endif
for (auto& cls : cluster_id) {
for (auto& th : thread_num) {
std::unique_ptr<paddle::lite::KernelContext> ctx1(
new paddle::lite::KernelContext);
auto& ctx = ctx1->As<paddle::lite::ARMContext>();
ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(cls), th);
LOG(INFO) << "cluster: " << cls << ", threads: " << th;
LOG(INFO) << " input tensor size, num= " << 1 << ", channel= " << 1
<< ", height= " << srch << ", width= " << srcw
<< ", srcFormat= " << (ImageFormat)srcFormat;
// RGBA = 0, BGRA, RGB, BGR, GRAY, NV21 = 11, NV12,
if (srcFormat == ImageFormat::NV21) {
LOG(INFO) << "srcFormat: NV21";
}
if (srcFormat == ImageFormat::NV12) {
LOG(INFO) << "srcFormat: NV12";
}
if (srcFormat == ImageFormat::GRAY) {
LOG(INFO) << "srcFormat: GRAY";
}
if (srcFormat == ImageFormat::BGRA) {
LOG(INFO) << "srcFormat: BGRA";
}
if (srcFormat == ImageFormat::BGR) {
LOG(INFO) << "srcFormat: BGR";
}
if (srcFormat == ImageFormat::RGBA) {
LOG(INFO) << "srcFormat: RGBA";
}
if (srcFormat == ImageFormat::RGB) {
LOG(INFO) << "srcFormat: RGB";
}
LOG(INFO) << " output tensor size, num=" << 1 << ", channel=" << 1
<< ", height=" << dsth << ", width=" << dstw
<< ", dstFormat= " << (ImageFormat)dstFormat;
if (dstFormat == ImageFormat::NV21) {
LOG(INFO) << "dstFormat: NV21";
}
if (dstFormat == ImageFormat::NV12) {
LOG(INFO) << "dstFormat: NV12";
}
if (dstFormat == ImageFormat::GRAY) {
LOG(INFO) << "dstFormat: GRAY";
}
if (dstFormat == ImageFormat::BGRA) {
LOG(INFO) << "dstFormat: BGRA";
}
if (dstFormat == ImageFormat::BGR) {
LOG(INFO) << "dstFormat: BGR";
}
if (dstFormat == ImageFormat::RGBA) {
LOG(INFO) << "dstFormat: RGBA";
}
if (dstFormat == ImageFormat::RGB) {
LOG(INFO) << "dstFormat: RGB";
}
LOG(INFO) << "Rotate = " << rotate << ", Flip = " << flip
<< ", Layout = " << static_cast<int>(layout);
int size = 3 * srch * srcw;
if (srcFormat == ImageFormat::NV12 || srcFormat == ImageFormat::NV21) {
size = ceil(1.5 * srch) * srcw;
} else if (srcFormat == ImageFormat::BGRA ||
srcFormat == ImageFormat::RGBA) {
size = 4 * srch * srcw;
} else if (srcFormat == ImageFormat::GRAY) {
size = srch * srcw;
}
uint8_t* src = new uint8_t[size];
fill_tensor_host_rand(src, size);
int out_size = srch * srcw;
int resize = dstw * dsth;
if (dstFormat == ImageFormat::NV12 || dstFormat == ImageFormat::NV21) {
out_size = ceil(1.5 * srch) * srcw;
resize = ceil(1.5 * dsth) * dstw;
} else if (dstFormat == ImageFormat::BGR ||
dstFormat == ImageFormat::RGB) {
out_size = 3 * srch * srcw;
resize = 3 * dsth * dstw;
} else if (dstFormat == ImageFormat::BGRA ||
dstFormat == ImageFormat::RGBA) {
out_size = 4 * srch * srcw;
resize = 4 * dsth * dstw;
} else if (dstFormat == ImageFormat::GRAY) {
out_size = srch * srcw;
resize = dsth * dstw;
}
// out
uint8_t* basic_dst = new uint8_t[out_size];
uint8_t* lite_dst = new uint8_t[out_size];
// resize
uint8_t* resize_basic = new uint8_t[resize];
uint8_t* resize_tmp = new uint8_t[resize];
uint8_t* tv_out_ratote_basic = new uint8_t[resize];
uint8_t* tv_out_ratote = new uint8_t[resize];
uint8_t* tv_out_flip_basic = new uint8_t[resize];
uint8_t* tv_out_flip = new uint8_t[resize];
std::vector<int64_t> shape_out = {1, 3, dsth, dstw};
Tensor tensor;
Tensor tensor_basic;
tensor.Resize(shape_out);
tensor_basic.Resize(shape_out);
tensor.set_precision(PRECISION(kFloat));
tensor_basic.set_precision(PRECISION(kFloat));
float means[3] = {127.5f, 127.5f, 127.5f};
float scales[3] = {1 / 127.5f, 1 / 127.5f, 1 / 127.5f};
if (FLAGS_check_result) {
LOG(INFO) << "image convert basic compute";
image_convert_basic(src,
basic_dst,
(ImageFormat)srcFormat,
(ImageFormat)dstFormat,
srcw,
srch,
out_size);
LOG(INFO) << "image resize basic compute";
image_resize_basic(basic_dst,
resize_basic,
(ImageFormat)dstFormat,
srcw,
srch,
dstw,
dsth);
LOG(INFO) << "image rotate basic compute";
image_rotate_basic(resize_basic,
tv_out_ratote_basic,
(ImageFormat)dstFormat,
dstw,
dsth,
rotate);
LOG(INFO) << "image flip basic compute";
image_flip_basic(resize_basic,
tv_out_flip_basic,
(ImageFormat)dstFormat,
dstw,
dsth,
flip);
LOG(INFO) << "image to tensor basic compute";
image_to_tensor_basic(resize_basic,
&tensor_basic,
(ImageFormat)dstFormat,
layout,
dstw,
dsth,
means,
scales);
}
Timer t1;
LOG(INFO) << "saber cv compute";
double to = 0;
double min_time = 100000;
TransParam tparam;
tparam.ih = srch;
tparam.iw = srcw;
tparam.oh = dsth;
tparam.ow = dstw;
tparam.flip_param = flip;
tparam.rotate_param = rotate;
Tensor_api dst_tensor(&tensor);
dst_tensor.Resize(shape_out);
ImagePreprocess image_preprocess(srcFormat, dstFormat, tparam);
for (int i = 0; i < test_iter; ++i) {
t1.clear();
t1.start();
LOG(INFO) << "image convert saber compute";
        // Option 1: image_preprocess.imageCovert(src, lite_dst);
image_preprocess.imageCovert(
src, lite_dst, (ImageFormat)srcFormat, (ImageFormat)dstFormat);
LOG(INFO) << "image resize saber compute";
        // Option 1: image_preprocess.imageResize(lite_dst, resize_tmp);
image_preprocess.imageResize(lite_dst,
resize_tmp,
(ImageFormat)dstFormat,
srcw,
srch,
dstw,
dsth);
LOG(INFO) << "image rotate saber compute";
        // Option 1: image_preprocess.imageRotate(resize_tmp, tv_out_ratote);
image_preprocess.imageRotate(resize_tmp,
tv_out_ratote,
(ImageFormat)dstFormat,
dstw,
dsth,
rotate);
LOG(INFO) << "image flip saber compute";
        // Option 1: image_preprocess.imageFlip(resize_tmp, tv_out_flip);
image_preprocess.imageFlip(
resize_tmp, tv_out_flip, (ImageFormat)dstFormat, dstw, dsth, flip);
LOG(INFO) << "image to tensor compute";
        // Option 1: image_preprocess.image2Tensor(
        //     resize_tmp, &dst_tensor, layout, means, scales);
image_preprocess.image2Tensor(resize_tmp,
&dst_tensor,
(ImageFormat)dstFormat,
dstw,
dsth,
layout,
means,
scales);
t1.end();
double tdiff = t1.get_average_ms();
to += tdiff;
if (tdiff < min_time) {
min_time = tdiff;
}
}
LOG(INFO) << "image trans total time : " << to
<< ", avg time : " << to / test_iter;
double max_ratio = 0;
double max_diff = 0;
const double eps = 1e-6f;
if (FLAGS_check_result) {
LOG(INFO) << "diff, image convert size: " << out_size;
uint8_t* diff_v = new uint8_t[out_size];
for (int i = 0; i < out_size; i++) {
uint8_t a = lite_dst[i];
uint8_t b = basic_dst[i];
          int diff1 = a - b;
          int diff = diff1 < 0 ? -diff1 : diff1;
diff_v[i] = diff;
if (max_diff < diff) {
max_diff = diff;
max_ratio = 2.0 * max_diff / (a + b + eps);
}
}
if (std::abs(max_ratio) >= 1e-5f) {
int width = size / srch;
printf("din: \n");
print_int8(src, size, width);
width = out_size / srch;
printf("saber result: \n");
print_int8(lite_dst, out_size, width);
printf("basic result: \n");
print_int8(basic_dst, out_size, width);
printf("diff result: \n");
print_int8(diff_v, out_size, width);
}
delete[] diff_v;
LOG(INFO) << "compare result, max diff: " << max_diff
<< ", max ratio: " << max_ratio;
bool rst = std::abs(max_ratio) < 1e-5f;
CHECK_EQ(rst, true) << "compute result error";
}
LOG(INFO) << "image convert end";
if (FLAGS_check_result) {
max_ratio = 0;
max_diff = 0;
// const double eps = 1e-6f;
int* diff_v = new int[resize];
LOG(INFO) << "diff, image resize size: " << resize;
for (int i = 0; i < resize; i++) {
uint8_t a = resize_tmp[i];
uint8_t b = resize_basic[i];
int diff1 = a - b;
          int diff = 0;  // basic resize and saber resize may differ by at most
                         // 1 due to float -> int rounding, so |diff| <= 1 is ignored
if (diff1 < -1 || diff1 > 1) diff = diff1 < 0 ? -diff1 : diff1;
diff_v[i] = diff;
if (max_diff < diff) {
max_diff = diff;
max_ratio = 2.0 * max_diff / (a + b + eps);
}
}
if (std::abs(max_ratio) >= 1e-5f) {
int width = out_size / srch;
printf("din: \n");
print_int8(lite_dst, out_size, width);
width = resize / dsth;
printf("saber result: \n");
print_int8(resize_tmp, resize, width);
printf("basic result: \n");
print_int8(resize_basic, resize, width);
printf("diff result: \n");
print_int(diff_v, resize, width);
}
delete[] diff_v;
// printf("\n");
LOG(INFO) << "compare result, max diff: " << max_diff
<< ", max ratio: " << max_ratio;
bool rst = std::abs(max_ratio) < 1e-5f;
CHECK_EQ(rst, true) << "compute result error";
}
delete[] lite_dst;
delete[] basic_dst;
LOG(INFO) << "image resize end";
if (FLAGS_check_result) {
max_ratio = 0;
max_diff = 0;
int* diff_v = new int[resize];
LOG(INFO) << "diff, image rotate size: " << resize;
for (int i = 0; i < resize; i++) {
int a = tv_out_ratote[i];
int b = tv_out_ratote_basic[i];
int diff1 = a - b;
int diff = 0;
if (diff1 < -1 || diff1 > 1) diff = diff1 < 0 ? -diff1 : diff1;
diff_v[i] = diff;
if (max_diff < diff) {
max_diff = diff;
max_ratio = 2.0 * max_diff / (a + b + eps);
}
}
if (std::abs(max_ratio) >= 1e-5f) {
int width = resize / dsth;
printf("din: \n");
print_int8(resize_tmp, resize, width);
printf("saber result: \n");
print_int8(tv_out_ratote, resize, width);
printf("basic result: \n");
print_int8(tv_out_ratote_basic, resize, width);
printf("diff result: \n");
print_int(diff_v, resize, width);
}
delete[] diff_v;
LOG(INFO) << "compare result, max diff: " << max_diff
<< ", max ratio: " << max_ratio;
bool rst = std::abs(max_ratio) < 1e-5f;
CHECK_EQ(rst, true) << "compute result error";
}
delete[] tv_out_ratote;
delete[] tv_out_ratote_basic;
LOG(INFO) << "image rotate end";
if (FLAGS_check_result) {
max_ratio = 0;
max_diff = 0;
int* diff_v = new int[resize];
LOG(INFO) << "diff, image flip size: " << resize;
for (int i = 0; i < resize; i++) {
int a = tv_out_flip[i];
int b = tv_out_flip_basic[i];
int diff1 = a - b;
int diff = 0;
if (diff1 < -1 || diff1 > 1) diff = diff1 < 0 ? -diff1 : diff1;
diff_v[i] = diff;
if (max_diff < diff) {
max_diff = diff;
max_ratio = 2.0 * max_diff / (a + b + eps);
}
}
if (std::abs(max_ratio) >= 1e-5f) {
int width = resize / dsth;
printf("din: \n");
print_int8(resize_tmp, resize, width);
printf("saber result: \n");
print_int8(tv_out_flip, resize, width);
printf("basic result: \n");
print_int8(tv_out_flip_basic, resize, width);
printf("diff result: \n");
print_int(diff_v, resize, width);
}
delete[] diff_v;
LOG(INFO) << "compare result, max diff: " << max_diff
<< ", max ratio: " << max_ratio;
bool rst = std::abs(max_ratio) < 1e-5f;
CHECK_EQ(rst, true) << "compute result error";
}
delete[] tv_out_flip;
delete[] tv_out_flip_basic;
delete[] resize_tmp;
delete[] resize_basic;
LOG(INFO) << "image flip end";
if (FLAGS_check_result) {
max_ratio = 0;
max_diff = 0;
LOG(INFO) << "diff, iamge to tensor size: " << tensor.numel();
const float* ptr_a = tensor.data<float>();
const float* ptr_b = tensor_basic.data<float>();
int ss = tensor.numel();
float* diff_v = new float[ss];
for (int i = 0; i < ss; i++) {
int a = ptr_a[i];
int b = ptr_b[i];
int diff1 = a - b;
int diff = 0;
if (diff1 < -1 || diff1 > 1) diff = diff1 < 0 ? -diff1 : diff1;
diff_v[i] = diff;
if (max_diff < diff) {
max_diff = diff;
max_ratio = 2.0 * max_diff / (a + b + eps);
}
}
if (std::abs(max_ratio) >= 1e-5f) {
int width = resize / srch;
printf("din: \n");
print_int8(resize_tmp, resize, width);
printf("saber result: \n");
print_ff(ptr_a, resize, width);
printf("basic result: \n");
print_ff(ptr_b, resize, width);
printf("diff result: \n");
print_ff(diff_v, resize, width);
        }
        delete[] diff_v;
LOG(INFO) << "compare result, max diff: " << max_diff
<< ", max ratio: " << max_ratio;
bool rst = std::abs(max_ratio) < 1e-5f;
CHECK_EQ(rst, true) << "compute result error";
LOG(INFO) << "iamge to tensor end";
}
}
}
}
#if 1
TEST(TestImageConvertRand, test_func_image_convert_preprocess) {
if (FLAGS_basic_test) {
for (auto w : {1, 4, 8, 16, 112, 224, 1092}) {
for (auto h : {1, 4, 16, 112, 224}) {
for (auto ww : {66}) {
for (auto hh : {12}) {
for (auto rotate : {180}) {
for (auto flip : {0}) {
for (auto srcFormat : {0, 1, 2, 3, 4, 11, 12}) {
for (auto dstFormat : {0, 1, 2, 3}) {
for (auto layout : {1}) {
if ((dstFormat == ImageFormat::GRAY &&
(srcFormat == ImageFormat::RGBA ||
srcFormat == ImageFormat::BGRA)) ||
(srcFormat == ImageFormat::GRAY &&
(dstFormat == ImageFormat::RGBA ||
dstFormat == ImageFormat::BGRA)) ||
(srcFormat == ImageFormat::NV12 ||
srcFormat == ImageFormat::NV21) &&
(dstFormat == ImageFormat::GRAY ||
dstFormat == ImageFormat::RGBA ||
dstFormat == ImageFormat::BGRA)) {
continue;
}
if (srcFormat == ImageFormat::NV12 ||
srcFormat == ImageFormat::NV21) {
                            if (w % 2) {  // width must be even: two Y rows
                                          // share one UV row
continue;
}
}
test_img({FLAGS_cluster},
{1},
w,
h,
ww,
hh,
(ImageFormat)srcFormat,
(ImageFormat)dstFormat,
rotate,
(FlipParam)flip,
(LayoutType)layout);
}
}
}
}
}
}
}
}
}
}
}
#endif
#if 1
TEST(TestImageConvertRand, test_func_image_resize_preprocess) {
if (FLAGS_basic_test) {
for (auto w : {1, 4, 8, 16, 112, 224, 1092}) {
for (auto h : {1, 4, 16, 112, 224}) {
for (auto ww : {1, 2, 8, 32, 112}) {
for (auto hh : {1, 2, 8, 112}) {
for (auto rotate : {180}) {
for (auto flip : {0}) {
for (auto srcFormat : {0, 1, 2, 3, 4, 11, 12}) {
for (auto dstFormat : {0, 1, 2, 3}) {
for (auto layout : {1}) {
if (dstFormat == ImageFormat::NV12 ||
dstFormat == ImageFormat::NV21 ||
(dstFormat == ImageFormat::GRAY &&
(srcFormat == ImageFormat::RGBA ||
srcFormat == ImageFormat::BGRA)) ||
(srcFormat == ImageFormat::GRAY &&
(dstFormat == ImageFormat::RGBA ||
dstFormat == ImageFormat::BGRA)) ||
(srcFormat == ImageFormat::NV12 ||
srcFormat == ImageFormat::NV21) &&
(dstFormat == ImageFormat::GRAY ||
dstFormat == ImageFormat::RGBA ||
dstFormat == ImageFormat::BGRA)) {
continue;
}
if (srcFormat == ImageFormat::NV12 ||
srcFormat == ImageFormat::NV21) {
                            if (w % 2) {  // width must be even: two Y rows
                                          // share one UV row
continue;
}
}
test_img({FLAGS_cluster},
{1, 2, 4},
w,
h,
ww,
hh,
(ImageFormat)srcFormat,
(ImageFormat)dstFormat,
rotate,
(FlipParam)flip,
(LayoutType)layout);
}
}
}
}
}
}
}
}
}
}
}
#endif
#if 1
TEST(TestImageConvertRand, test_func_image_trans_preprocess) {
if (FLAGS_basic_test) {
for (auto w : {1, 8, 16, 112, 224, 1092}) {
for (auto h : {1, 16, 112, 224}) {
for (auto ww : {32, 112}) {
for (auto hh : {112}) {
for (auto rotate : {90, 180, 270}) {
for (auto flip : {0, 1, 2}) {
for (auto srcFormat : {11}) {
for (auto dstFormat : {3}) {
for (auto layout : {1, 3}) {
if (dstFormat == ImageFormat::NV12 ||
dstFormat == ImageFormat::NV21 ||
(dstFormat == ImageFormat::GRAY &&
(srcFormat == ImageFormat::RGBA ||
srcFormat == ImageFormat::BGRA)) ||
(srcFormat == ImageFormat::GRAY &&
(dstFormat == ImageFormat::RGBA ||
dstFormat == ImageFormat::BGRA)) ||
(srcFormat == ImageFormat::NV12 ||
srcFormat == ImageFormat::NV21) &&
(dstFormat == ImageFormat::GRAY ||
dstFormat == ImageFormat::RGBA ||
dstFormat == ImageFormat::BGRA)) {
continue;
}
if (srcFormat == ImageFormat::NV12 ||
srcFormat == ImageFormat::NV21) {
                            if (w % 2) {  // width must be even: two Y rows
                                          // share one UV row
continue;
}
}
test_img({FLAGS_cluster},
{1, 2, 4},
w,
h,
ww,
hh,
(ImageFormat)srcFormat,
(ImageFormat)dstFormat,
rotate,
(FlipParam)flip,
(LayoutType)layout);
}
}
}
}
}
}
}
}
}
}
}
#endif
#if 1
TEST(TestImageConvertCustom, test_func_image_preprocess_custom) {
test_img({FLAGS_cluster},
{1, 2, 4},
FLAGS_srcw,
FLAGS_srch,
FLAGS_dstw,
FLAGS_dsth,
(ImageFormat)FLAGS_srcFormat,
(ImageFormat)FLAGS_dstFormat,
FLAGS_angle,
(FlipParam)FLAGS_flip_num,
(LayoutType)FLAGS_layout);
}
#endif
#endif
@@ -19,6 +19,7 @@ BUILD_PYTHON=OFF
BUILD_DIR=$(pwd)
OPTMODEL_DIR=""
BUILD_TAILOR=OFF
BUILD_CV=OFF
readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz
@@ -96,6 +97,7 @@ function make_tiny_publish_so {
-DLITE_ON_TINY_PUBLISH=ON \
-DANDROID_STL_TYPE=$android_stl \
-DLITE_BUILD_EXTRA=$BUILD_EXTRA \
-DLITE_WITH_CV=$BUILD_CV \
-DLITE_BUILD_TAILOR=$BUILD_TAILOR \
-DLITE_OPTMODEL_DIR=$OPTMODEL_DIR \
-DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang}
@@ -122,7 +124,7 @@ function make_full_publish_so {
fi
mkdir -p $build_directory
cd $build_directory
if [ ${os} == "armlinux" ]; then
BUILD_JAVA=OFF
fi
@@ -137,6 +139,7 @@ function make_full_publish_so {
-DLITE_SHUTDOWN_LOG=ON \
-DANDROID_STL_TYPE=$android_stl \
-DLITE_BUILD_EXTRA=$BUILD_EXTRA \
-DLITE_WITH_CV=$BUILD_CV \
-DLITE_BUILD_TAILOR=$BUILD_TAILOR \
-DLITE_OPTMODEL_DIR=$OPTMODEL_DIR \
-DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang}
@@ -166,6 +169,7 @@ function make_all_tests {
${CMAKE_COMMON_OPTIONS} \
-DWITH_TESTING=ON \
-DLITE_BUILD_EXTRA=$BUILD_EXTRA \
-DLITE_WITH_CV=$BUILD_CV \
-DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang}
make lite_compile_deps -j$NUM_PROC
@@ -201,6 +205,7 @@ function make_ios {
-DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \
-DARM_TARGET_ARCH_ABI=$abi \
-DLITE_BUILD_EXTRA=$BUILD_EXTRA \
-DLITE_WITH_CV=$BUILD_CV \
-DARM_TARGET_OS=$os
make -j4 publish_inference
@@ -362,11 +367,11 @@ function main {
shift
;;
tiny_publish)
make_tiny_publish_so $ARM_OS $ARM_ABI $ARM_LANG $ANDROID_STL
shift
;;
full_publish)
make_full_publish_so $ARM_OS $ARM_ABI $ARM_LANG $ANDROID_STL
shift
;;
test)
@@ -382,7 +387,7 @@ function main {
shift
;;
cuda)
make_cuda
shift
;;
x86)
......
@@ -24,3 +24,5 @@ if(LITE_ON_TINY_PUBLISH OR LITE_ON_MODEL_OPTIMIZE_TOOL)
else()
lite_cc_library(utils SRCS string.cc DEPS ${utils_DEPS} any)
endif()
add_subdirectory(cv)
if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND LITE_WITH_ARM)
set(lite_cv_deps)
lite_cc_library(paddle_cv_arm SRCS
image_convert.cc
paddle_image_preprocess.cc
image2tensor.cc
image_flip.cc
image_rotate.cc
image_resize.cc
DEPS ${lite_cv_deps} paddle_api_light)
endif()
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/utils/cv/image2tensor.h"
#include <arm_neon.h>
namespace paddle {
namespace lite {
namespace utils {
namespace cv {
void bgr_to_tensor_chw(const uint8_t* src,
float* output,
int width,
int height,
float* means,
float* scales);
void bgra_to_tensor_chw(const uint8_t* src,
float* output,
int width,
int height,
float* means,
float* scales);
void bgr_to_tensor_hwc(const uint8_t* src,
float* output,
int width,
int height,
float* means,
float* scales);
void bgra_to_tensor_hwc(const uint8_t* src,
float* output,
int width,
int height,
float* means,
float* scales);
/*
 * Convert image data to tensor data.
 * Supported image formats are BGR(RGB) and BGRA(RGBA); supported data layouts
 * are NHWC and NCHW.
 * param src: input image data
 * param dstTensor: output tensor data
 * param srcFormat: input image format, supports BGR(RGB) and BGRA(RGBA)
 * param srcw: input image width
 * param srch: input image height
 * param layout: output tensor layout, supports NHWC and NCHW
 * param means: per-channel means of the image
 * param scales: per-channel scales of the image
 */
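/* For example, with means = {127.5f, 127.5f, 127.5f} and
 * scales = {1/127.5f, 1/127.5f, 1/127.5f} (the values used by the test added
 * in this change), each uint8 channel value v is mapped to
 * (v - 127.5) / 127.5, i.e. roughly the range [-1, 1]. */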
void Image2Tensor::choose(const uint8_t* src,
Tensor* dst,
ImageFormat srcFormat,
LayoutType layout,
int srcw,
int srch,
float* means,
float* scales) {
float* output = dst->mutable_data<float>();
if (layout == LayoutType::kNCHW && (srcFormat == BGR || srcFormat == RGB)) {
impl_ = bgr_to_tensor_chw;
} else if (layout == LayoutType::kNHWC &&
(srcFormat == BGR || srcFormat == RGB)) {
impl_ = bgr_to_tensor_hwc;
} else if (layout == LayoutType::kNCHW &&
(srcFormat == BGRA || srcFormat == RGBA)) {
impl_ = bgra_to_tensor_chw;
} else if (layout == LayoutType::kNHWC &&
(srcFormat == BGRA || srcFormat == RGBA)) {
impl_ = bgra_to_tensor_hwc;
} else {
printf("this layout: %d or image format: %d not support \n",
static_cast<int>(layout),
srcFormat);
return;
}
impl_(src, output, srcw, srch, means, scales);
}
void bgr_to_tensor_chw(const uint8_t* src,
float* output,
int width,
int height,
float* means,
float* scales) {
int size = width * height;
float b_means = means[0];
float g_means = means[1];
float r_means = means[2];
float b_scales = scales[0];
float g_scales = scales[1];
float r_scales = scales[2];
float* ptr_b = output;
float* ptr_g = ptr_b + size;
float* ptr_r = ptr_g + size;
int dim8 = width >> 3;
int remain = width % 8;
float32x4_t vbmean = vdupq_n_f32(b_means);
float32x4_t vgmean = vdupq_n_f32(g_means);
float32x4_t vrmean = vdupq_n_f32(r_means);
float32x4_t vbscale = vdupq_n_f32(b_scales);
float32x4_t vgscale = vdupq_n_f32(g_scales);
float32x4_t vrscale = vdupq_n_f32(r_scales);
#pragma omp parallel for
for (int i = 0; i < height; i += 1) {
const uint8_t* din_ptr = src + i * 3 * width;
float* ptr_b_h = ptr_b + i * width;
float* ptr_g_h = ptr_g + i * width;
float* ptr_r_h = ptr_r + i * width;
int cnt = dim8;
if (cnt > 0) {
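      // Each iteration handles 8 BGR pixels: ld3/vld3 de-interleaves the three
      // channels, the values are widened u8 -> u16 -> u32 and converted to
      // f32, the per-channel mean is subtracted and the per-channel scale is
      // applied, then 8 floats are stored into each planar B, G, R output row.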
#ifdef __aarch64__
asm volatile(
"prfm pldl1keep, [%[inptr0]] \n"
"prfm pldl1keep, [%[inptr0], #64] \n"
"prfm pldl1keep, [%[inptr0], #128] \n"
"prfm pldl1keep, [%[inptr0], #192] \n"
"1: \n"
"ld3 {v0.8b, v1.8b, v2.8b}, [%[inptr0]], #24 \n" // d8 = y0y3y6y9..
// d9 = y1y4y7..."
// 8->16
"ushll v3.8h, v0.8b, #0 \n"
"ushll v4.8h, v1.8b, #0 \n"
"ushll v5.8h, v2.8b, #0 \n"
// 16->32
"ushll v6.4s, v3.4h, #0 \n"
"ushll2 v7.4s, v3.8h, #0 \n"
"ushll v8.4s, v4.4h, #0 \n"
"ushll2 v9.4s, v4.8h, #0 \n"
"ushll v10.4s, v5.4h, #0 \n"
"ushll2 v11.4s, v5.8h, #0 \n"
// int32->fp32
"ucvtf v12.4s, v6.4s \n"
"ucvtf v13.4s, v7.4s \n"
"ucvtf v14.4s, v8.4s \n"
"ucvtf v15.4s, v9.4s \n"
"ucvtf v16.4s, v10.4s \n"
"ucvtf v17.4s, v11.4s \n"
// sub -mean
"fsub v12.4s, v12.4s, %w[vbmean].4s \n"
"fsub v13.4s, v13.4s, %w[vbmean].4s \n"
"fsub v14.4s, v14.4s, %w[vgmean].4s \n"
"fsub v15.4s, v15.4s, %w[vgmean].4s \n"
"fsub v16.4s, v16.4s, %w[vrmean].4s \n"
"fsub v17.4s, v17.4s, %w[vrmean].4s \n"
// mul * scale
"fmul v6.4s, v12.4s, %w[vbscale].4s \n"
"fmul v7.4s, v13.4s, %w[vbscale].4s \n"
"fmul v8.4s, v14.4s, %w[vgscale].4s \n"
"fmul v9.4s, v15.4s, %w[vgscale].4s \n"
"fmul v10.4s, v16.4s, %w[vrscale].4s \n"
"fmul v11.4s, v17.4s, %w[vrscale].4s \n"
// store
"st1 {v6.4s}, [%[outr0]], #16 \n"
"st1 {v8.4s}, [%[outr1]], #16 \n"
"st1 {v10.4s}, [%[outr2]], #16 \n"
"subs %w[cnt], %w[cnt], #1 \n"
"st1 {v7.4s}, [%[outr0]], #16 \n"
"st1 {v9.4s}, [%[outr1]], #16 \n"
"st1 {v11.4s}, [%[outr2]], #16 \n"
"bne 1b \n"
: [inptr0] "+r"(din_ptr),
[outr0] "+r"(ptr_b_h),
[outr1] "+r"(ptr_g_h),
[outr2] "+r"(ptr_r_h),
[cnt] "+r"(cnt)
: [vbmean] "w"(vbmean),
[vgmean] "w"(vgmean),
[vrmean] "w"(vrmean),
[vbscale] "w"(vbscale),
[vgscale] "w"(vgscale),
[vrscale] "w"(vrscale)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20");
#else
asm volatile(
"pld [%[inptr0]] @ preload a, 64byte\n"
"pld [%[inptr0], #64] @ preload a, 64byte\n"
"pld [%[inptr0], #128] @ preload a, 64byte\n"
"pld [%[inptr0], #192] @ preload a, 64byte\n"
"1: \n"
"vld3.8 {d12, d13, d14}, [%[inptr0]]! \n"
// 8->16
"vmovl.u8 q8, d12 \n"
"vmovl.u8 q9, d13 \n"
"vmovl.u8 q10, d14 \n"
// 16->32
"vmovl.u16 q11, d16 \n"
"vmovl.u16 q12, d17 \n"
"vmovl.u16 q13, d18 \n"
"vmovl.u16 q14, d19 \n"
"vmovl.u16 q15, d20 \n"
"vmovl.u16 q6, d21 \n"
// int32->fp32
"vcvt.f32.u32 q7, q11 \n"
"vcvt.f32.u32 q8, q12 \n"
"vcvt.f32.u32 q9, q13 \n"
"vcvt.f32.u32 q10, q14 \n"
"vcvt.f32.u32 q11, q15 \n"
"vcvt.f32.u32 q12, q6 \n"
// sub -mean
"vsub.f32 q7, q7, %q[vbmean] \n"
"vsub.f32 q8, q8, %q[vbmean] \n"
"vsub.f32 q9, q9, %q[vgmean] \n"
"vsub.f32 q10, q10, %q[vgmean] \n"
"vsub.f32 q11, q11, %q[vrmean] \n"
"vsub.f32 q12, q12, %q[vrmean] \n"
// mul *scale
"vmul.f32 q13, q7, %q[vbscale] \n"
"vmul.f32 q14, q8, %q[vbscale] \n"
"vmul.f32 q15, q9, %q[vgscale] \n"
"vmul.f32 q6, q10, %q[vgscale] \n"
"vmul.f32 q7, q11, %q[vrscale] \n"
"vmul.f32 q8, q12, %q[vrscale] \n"
// store
"vst1.32 {d26 - d27}, [%[outr0]]! \n"
"vst1.32 {d30 - d31}, [%[outr1]]! \n"
"vst1.32 {d14 - d15}, [%[outr2]]! \n"
"subs %[cnt], #1 \n"
"vst1.32 {d28 - d29}, [%[outr0]]! \n"
"vst1.32 {d12 - d13}, [%[outr1]]! \n"
"vst1.32 {d16 - d17}, [%[outr2]]! \n"
"bne 1b"
: [inptr0] "+r"(din_ptr),
[outr0] "+r"(ptr_b_h),
[outr1] "+r"(ptr_g_h),
[outr2] "+r"(ptr_r_h),
[cnt] "+r"(cnt)
: [vbmean] "w"(vbmean),
[vgmean] "w"(vgmean),
[vrmean] "w"(vrmean),
[vbscale] "w"(vbscale),
[vgscale] "w"(vgscale),
[vrscale] "w"(vrscale)
: "cc",
"memory",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12",
"q13",
"q14",
"q15");
#endif
}
for (int j = 0; j < remain; j++) {
*ptr_b_h++ = (*din_ptr - b_means) * b_scales;
din_ptr++;
*ptr_g_h++ = (*din_ptr - g_means) * g_scales;
din_ptr++;
*ptr_r_h++ = (*din_ptr - r_means) * r_scales;
din_ptr++;
}
}
}
void bgra_to_tensor_chw(const uint8_t* src,
float* output,
int width,
int height,
float* means,
float* scales) {
int size = width * height;
float b_means = means[0];
float g_means = means[1];
float r_means = means[2];
float b_scales = scales[0];
float g_scales = scales[1];
float r_scales = scales[2];
float* ptr_b = output;
float* ptr_g = ptr_b + size;
float* ptr_r = ptr_g + size;
int dim8 = width >> 3;
int remain = width % 8;
float32x4_t vbmean = vdupq_n_f32(b_means);
float32x4_t vgmean = vdupq_n_f32(g_means);
float32x4_t vrmean = vdupq_n_f32(r_means);
float32x4_t vbscale = vdupq_n_f32(b_scales);
float32x4_t vgscale = vdupq_n_f32(g_scales);
float32x4_t vrscale = vdupq_n_f32(r_scales);
#pragma omp parallel for
for (int i = 0; i < height; i += 1) {
const uint8_t* din_ptr = src + i * 4 * width;
float* ptr_b_h = ptr_b + i * width;
float* ptr_g_h = ptr_g + i * width;
float* ptr_r_h = ptr_r + i * width;
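    // NEON intrinsics path: vld4_u8 de-interleaves 8 BGRA pixels (alpha is
    // ignored), B/G/R are widened to f32, normalized as (v - mean) * scale,
    // and written to the three planar output rows.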
for (int j = 0; j < dim8; j++) {
uint8x8x4_t v_bgr = vld4_u8(din_ptr);
uint16x8_t vb_16 = vmovl_u8(v_bgr.val[0]);
uint16x8_t vg_16 = vmovl_u8(v_bgr.val[1]);
uint16x8_t vr_16 = vmovl_u8(v_bgr.val[2]);
uint32x4_t vb_low_32 = vmovl_u16(vget_low_u16(vb_16));
uint32x4_t vg_low_32 = vmovl_u16(vget_low_u16(vg_16));
uint32x4_t vr_low_32 = vmovl_u16(vget_low_u16(vr_16));
uint32x4_t vb_high_32 = vmovl_u16(vget_high_u16(vb_16));
uint32x4_t vg_high_32 = vmovl_u16(vget_high_u16(vg_16));
uint32x4_t vr_high_32 = vmovl_u16(vget_high_u16(vr_16));
float32x4_t vb_low_f32 = vcvtq_f32_u32(vb_low_32);
float32x4_t vr_low_f32 = vcvtq_f32_u32(vr_low_32);
float32x4_t vg_low_f32 = vcvtq_f32_u32(vg_low_32);
float32x4_t vb_high_f32 = vcvtq_f32_u32(vb_high_32);
float32x4_t vg_high_f32 = vcvtq_f32_u32(vg_high_32);
float32x4_t vr_high_f32 = vcvtq_f32_u32(vr_high_32);
vb_low_f32 = vsubq_f32(vb_low_f32, vbmean);
vg_low_f32 = vsubq_f32(vg_low_f32, vgmean);
vr_low_f32 = vsubq_f32(vr_low_f32, vrmean);
vb_high_f32 = vsubq_f32(vb_high_f32, vbmean);
vg_high_f32 = vsubq_f32(vg_high_f32, vgmean);
vr_high_f32 = vsubq_f32(vr_high_f32, vrmean);
vb_low_f32 = vmulq_f32(vb_low_f32, vbscale);
vg_low_f32 = vmulq_f32(vg_low_f32, vgscale);
vr_low_f32 = vmulq_f32(vr_low_f32, vrscale);
vb_high_f32 = vmulq_f32(vb_high_f32, vbscale);
vg_high_f32 = vmulq_f32(vg_high_f32, vgscale);
vr_high_f32 = vmulq_f32(vr_high_f32, vrscale);
vst1q_f32(ptr_b_h, vb_low_f32);
vst1q_f32(ptr_g_h, vg_low_f32);
vst1q_f32(ptr_r_h, vr_low_f32);
din_ptr += 32;
vst1q_f32(ptr_b_h + 4, vb_high_f32);
vst1q_f32(ptr_g_h + 4, vg_high_f32);
vst1q_f32(ptr_r_h + 4, vr_high_f32);
ptr_b_h += 8;
ptr_g_h += 8;
ptr_r_h += 8;
}
for (int j = 0; j < remain; j++) {
*ptr_b_h++ = (*din_ptr - b_means) * b_scales;
din_ptr++;
*ptr_g_h++ = (*din_ptr - g_means) * g_scales;
din_ptr++;
*ptr_r_h++ = (*din_ptr - r_means) * r_scales;
din_ptr++;
din_ptr++; // a
}
}
}
void bgr_to_tensor_hwc(const uint8_t* src,
float* output,
int width,
int height,
float* means,
float* scales) {
int size = width * height;
float b_means = means[0];
float g_means = means[1];
float r_means = means[2];
float b_scales = scales[0];
float g_scales = scales[1];
float r_scales = scales[2];
float* dout = output;
int dim8 = width >> 3;
int remain = width % 8;
float32x4_t vbmean = vdupq_n_f32(b_means);
float32x4_t vgmean = vdupq_n_f32(g_means);
float32x4_t vrmean = vdupq_n_f32(r_means);
float32x4_t vbscale = vdupq_n_f32(b_scales);
float32x4_t vgscale = vdupq_n_f32(g_scales);
float32x4_t vrscale = vdupq_n_f32(r_scales);
#pragma omp parallel for
for (int i = 0; i < height; i += 1) {
const uint8_t* din_ptr = src + i * 3 * width;
float* dout_ptr = dout + i * 3 * width;
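    // NEON intrinsics path: vld3_u8 de-interleaves 8 BGR pixels, each channel
    // is widened to f32 and normalized as (v - mean) * scale, then vst3q_f32
    // re-interleaves the result so the output stays in HWC layout.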
for (int j = 0; j < dim8; j++) {
uint8x8x3_t v_bgr = vld3_u8(din_ptr);
uint16x8_t vb_16 = vmovl_u8(v_bgr.val[0]);
uint16x8_t vg_16 = vmovl_u8(v_bgr.val[1]);
uint16x8_t vr_16 = vmovl_u8(v_bgr.val[2]);
uint32x4_t vb_low_32 = vmovl_u16(vget_low_u16(vb_16));
uint32x4_t vg_low_32 = vmovl_u16(vget_low_u16(vg_16));
uint32x4_t vr_low_32 = vmovl_u16(vget_low_u16(vr_16));
uint32x4_t vb_high_32 = vmovl_u16(vget_high_u16(vb_16));
uint32x4_t vg_high_32 = vmovl_u16(vget_high_u16(vg_16));
uint32x4_t vr_high_32 = vmovl_u16(vget_high_u16(vr_16));
float32x4_t vb_low_f32 = vcvtq_f32_u32(vb_low_32);
float32x4_t vr_low_f32 = vcvtq_f32_u32(vr_low_32);
float32x4_t vg_low_f32 = vcvtq_f32_u32(vg_low_32);
float32x4_t vb_high_f32 = vcvtq_f32_u32(vb_high_32);
float32x4_t vg_high_f32 = vcvtq_f32_u32(vg_high_32);
float32x4_t vr_high_f32 = vcvtq_f32_u32(vr_high_32);
vb_low_f32 = vsubq_f32(vb_low_f32, vbmean);
vg_low_f32 = vsubq_f32(vg_low_f32, vgmean);
vr_low_f32 = vsubq_f32(vr_low_f32, vrmean);
vb_high_f32 = vsubq_f32(vb_high_f32, vbmean);
vg_high_f32 = vsubq_f32(vg_high_f32, vgmean);
vr_high_f32 = vsubq_f32(vr_high_f32, vrmean);
vb_low_f32 = vmulq_f32(vb_low_f32, vbscale);
vg_low_f32 = vmulq_f32(vg_low_f32, vgscale);
vr_low_f32 = vmulq_f32(vr_low_f32, vrscale);
vb_high_f32 = vmulq_f32(vb_high_f32, vbscale);
vg_high_f32 = vmulq_f32(vg_high_f32, vgscale);
vr_high_f32 = vmulq_f32(vr_high_f32, vrscale);
float32x4x3_t val;
val.val[0] = vb_low_f32;
val.val[1] = vg_low_f32;
val.val[2] = vr_low_f32;
vst3q_f32(dout_ptr, val);
din_ptr += 24;
dout_ptr += 12;
val.val[0] = vb_high_f32;
val.val[1] = vg_high_f32;
val.val[2] = vr_high_f32;
vst3q_f32(dout_ptr, val);
dout_ptr += 12;
}
for (int j = 0; j < remain; j++) {
*dout_ptr++ = (*din_ptr - b_means) * b_scales;
din_ptr++;
*dout_ptr++ = (*din_ptr - g_means) * g_scales;
din_ptr++;
*dout_ptr++ = (*din_ptr - r_means) * r_scales;
din_ptr++;
}
}
}
void bgra_to_tensor_hwc(const uint8_t* src,
float* output,
int width,
int height,
float* means,
float* scales) {
int size = width * height;
float b_means = means[0];
float g_means = means[1];
float r_means = means[2];
float b_scales = scales[0];
float g_scales = scales[1];
float r_scales = scales[2];
float* dout = output;
int dim8 = width >> 3;
int remain = width % 8;
float32x4_t vbmean = vdupq_n_f32(b_means);
float32x4_t vgmean = vdupq_n_f32(g_means);
float32x4_t vrmean = vdupq_n_f32(r_means);
float32x4_t vbscale = vdupq_n_f32(b_scales);
float32x4_t vgscale = vdupq_n_f32(g_scales);
float32x4_t vrscale = vdupq_n_f32(r_scales);
#pragma omp parallel for
for (int i = 0; i < height; i += 1) {
const uint8_t* din_ptr = src + i * 4 * width;
float* dout_ptr = dout + i * 3 * width;
for (int j = 0; j < dim8; j++) {
uint8x8x4_t v_bgr = vld4_u8(din_ptr);
uint16x8_t vb_16 = vmovl_u8(v_bgr.val[0]);
uint16x8_t vg_16 = vmovl_u8(v_bgr.val[1]);
uint16x8_t vr_16 = vmovl_u8(v_bgr.val[2]);
// uint16x8_t va_16 = vmovl_u8(v_bgr.val[3]);
uint32x4_t vb_low_32 = vmovl_u16(vget_low_u16(vb_16));
uint32x4_t vg_low_32 = vmovl_u16(vget_low_u16(vg_16));
uint32x4_t vr_low_32 = vmovl_u16(vget_low_u16(vr_16));
uint32x4_t vb_high_32 = vmovl_u16(vget_high_u16(vb_16));
uint32x4_t vg_high_32 = vmovl_u16(vget_high_u16(vg_16));
uint32x4_t vr_high_32 = vmovl_u16(vget_high_u16(vr_16));
float32x4_t vb_low_f32 = vcvtq_f32_u32(vb_low_32);
float32x4_t vr_low_f32 = vcvtq_f32_u32(vr_low_32);
float32x4_t vg_low_f32 = vcvtq_f32_u32(vg_low_32);
float32x4_t vb_high_f32 = vcvtq_f32_u32(vb_high_32);
float32x4_t vg_high_f32 = vcvtq_f32_u32(vg_high_32);
float32x4_t vr_high_f32 = vcvtq_f32_u32(vr_high_32);
vb_low_f32 = vsubq_f32(vb_low_f32, vbmean);
vg_low_f32 = vsubq_f32(vg_low_f32, vgmean);
vr_low_f32 = vsubq_f32(vr_low_f32, vrmean);
vb_high_f32 = vsubq_f32(vb_high_f32, vbmean);
vg_high_f32 = vsubq_f32(vg_high_f32, vgmean);
vr_high_f32 = vsubq_f32(vr_high_f32, vrmean);
vb_low_f32 = vmulq_f32(vb_low_f32, vbscale);
vg_low_f32 = vmulq_f32(vg_low_f32, vgscale);
vr_low_f32 = vmulq_f32(vr_low_f32, vrscale);
vb_high_f32 = vmulq_f32(vb_high_f32, vbscale);
vg_high_f32 = vmulq_f32(vg_high_f32, vgscale);
vr_high_f32 = vmulq_f32(vr_high_f32, vrscale);
float32x4x3_t val;
val.val[0] = vb_low_f32;
val.val[1] = vg_low_f32;
val.val[2] = vr_low_f32;
// val.val[3] = num_a;
vst3q_f32(dout_ptr, val);
din_ptr += 32;
dout_ptr += 12;
val.val[0] = vb_high_f32;
val.val[1] = vg_high_f32;
val.val[2] = vr_high_f32;
vst3q_f32(dout_ptr, val);
dout_ptr += 12;
}
for (int j = 0; j < remain; j++) {
*dout_ptr++ = (*din_ptr - b_means) * b_scales;
din_ptr++;
*dout_ptr++ = (*din_ptr - g_means) * g_scales;
din_ptr++;
*dout_ptr++ = (*din_ptr - r_means) * r_scales;
din_ptr++;
din_ptr++; // a
// *dout_ptr++ = 255;
}
}
}
} // namespace cv
} // namespace utils
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
#include "lite/utils/cv/paddle_image_preprocess.h"
namespace paddle {
namespace lite {
namespace utils {
namespace cv {
typedef void (*tensor_func)(const uint8_t* src,
float* dst,
int srcw,
int srch,
float* means,
float* scales);
class Image2Tensor {
public:
void choose(const uint8_t* src,
Tensor* dst,
ImageFormat srcFormat,
LayoutType layout,
int srcw,
int srch,
float* means,
float* scales);
private:
tensor_func impl_{nullptr};
};
} // namespace cv
} // namespace utils
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/utils/cv/image_convert.h"
#include <arm_neon.h>
#include <math.h>
#include <string.h>
namespace paddle {
namespace lite {
namespace utils {
namespace cv {
void nv_to_bgr(
const uint8_t* src, uint8_t* dst, int srcw, int srch, int x_num, int y_num);
void nv_to_bgra(
const uint8_t* src, uint8_t* dst, int srcw, int srch, int x_num, int y_num);
void nv21_to_bgr(const uint8_t* src, uint8_t* dst, int srcw, int srch);
void nv21_to_bgra(const uint8_t* src, uint8_t* dst, int srcw, int srch);
void nv12_to_bgr(const uint8_t* src, uint8_t* dst, int srcw, int srch);
void nv12_to_bgra(const uint8_t* src, uint8_t* dst, int srcw, int srch);
// bgr rgb to gray
void hwc3_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch);
// gray to bgr rgb
void hwc1_to_hwc3(const uint8_t* src, uint8_t* dst, int srcw, int srch);
// bgr to bgra or rgb to rgba
void hwc3_to_hwc4(const uint8_t* src, uint8_t* dst, int srcw, int srch);
// bgra to bgr or rgba to rgb
void hwc4_to_hwc3(const uint8_t* src, uint8_t* dst, int srcw, int srch);
// bgr to rgb or rgb to bgr
void hwc3_trans(const uint8_t* src, uint8_t* dst, int srcw, int srch);
// bgra to rgba or rgba to bgra
void hwc4_trans(const uint8_t* src, uint8_t* dst, int srcw, int srch);
// bgra to rgb or rgba to bgr
void hwc4_trans_hwc3(const uint8_t* src, uint8_t* dst, int srcw, int srch);
// bgr to rgba or rgb to bgra
void hwc3_trans_hwc4(const uint8_t* src, uint8_t* dst, int srcw, int srch);
/*
 * image color convert
 * supports NV12/NV21 to BGR(RGB), NV12/NV21 to BGRA(RGBA),
 * BGR(RGB) and BGRA(RGBA) transform,
 * BGR(RGB) and RGB(BGR) transform,
 * BGR(RGB) and RGBA(BGRA) transform,
 * BGR(RGB) and GRAY transform
 * param src: input image data
 * param dst: output image data
 * param srcFormat: input image format, supports GRAY, NV12(NV21),
 * BGR(RGB) and BGRA(RGBA)
 * param dstFormat: output image format, supports GRAY, BGR(RGB) and
 * BGRA(RGBA)
 * a minimal usage sketch follows ImageConvert::choose below
*/
void ImageConvert::choose(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
ImageFormat dstFormat,
int srcw,
int srch) {
if (srcFormat == dstFormat) {
// copy
int size = srcw * srch;
if (srcFormat == NV12 || srcFormat == NV21) {
size = srcw * (ceil(1.5 * srch));
} else if (srcFormat == BGR || srcFormat == RGB) {
size = 3 * srcw * srch;
} else if (srcFormat == BGRA || srcFormat == RGBA) {
size = 4 * srcw * srch;
}
memcpy(dst, src, sizeof(uint8_t) * size);
return;
} else {
if (srcFormat == NV12 && (dstFormat == BGR || dstFormat == RGB)) {
impl_ = nv12_to_bgr;
} else if (srcFormat == NV21 && (dstFormat == BGR || dstFormat == RGB)) {
impl_ = nv21_to_bgr;
} else if (srcFormat == NV12 && (dstFormat == BGRA || dstFormat == RGBA)) {
impl_ = nv12_to_bgra;
} else if (srcFormat == NV21 && (dstFormat == BGRA || dstFormat == RGBA)) {
impl_ = nv21_to_bgra;
} else if ((srcFormat == RGBA && dstFormat == RGB) ||
(srcFormat == BGRA && dstFormat == BGR)) {
impl_ = hwc4_to_hwc3;
} else if ((srcFormat == RGB && dstFormat == RGBA) ||
(srcFormat == BGR && dstFormat == BGRA)) {
impl_ = hwc3_to_hwc4;
} else if ((srcFormat == RGB && dstFormat == BGR) ||
(srcFormat == BGR && dstFormat == RGB)) {
impl_ = hwc3_trans;
} else if ((srcFormat == RGBA && dstFormat == BGRA) ||
(srcFormat == BGRA && dstFormat == RGBA)) {
impl_ = hwc4_trans;
} else if ((srcFormat == RGB && dstFormat == GRAY) ||
(srcFormat == BGR && dstFormat == GRAY)) {
impl_ = hwc3_to_hwc1;
} else if ((srcFormat == GRAY && dstFormat == RGB) ||
(srcFormat == GRAY && dstFormat == BGR)) {
impl_ = hwc1_to_hwc3;
} else if ((srcFormat == RGBA && dstFormat == BGR) ||
(srcFormat == BGRA && dstFormat == RGB)) {
impl_ = hwc4_trans_hwc3;
} else if ((srcFormat == RGB && dstFormat == BGRA) ||
(srcFormat == BGR && dstFormat == RGBA)) {
impl_ = hwc3_trans_hwc4;
    } else {
      printf("srcFormat: %d to dstFormat: %d is not supported!\n",
             srcFormat,
             dstFormat);
      return;
    }
  }
  impl_(src, dst, srcw, srch);
}
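// A minimal usage sketch of ImageConvert::choose (the image size and the
// example function name below are ours, for illustration only): convert a
// small BGR image to GRAY. Unsupported format pairs print a message and
// return without touching dst.
inline void example_convert_bgr_to_gray() {
  const int w = 8;
  const int h = 2;
  uint8_t bgr[w * h * 3] = {0};  // input, 3 bytes per pixel, HWC layout
  uint8_t gray[w * h] = {0};     // output, 1 byte per pixel
  ImageConvert converter;
  converter.choose(bgr, gray, BGR, GRAY, w, h);
}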
/*
nv21(yvu) to BGR: store hwc, dsth * dstw = srch * srcw
y_w = srcw, y_h = srch, uv_w = srcw, uv_h = 1/2 * srch
R = Y + 1.402*(V-128);
G = Y - 0.34414*(U-128) - 0.71414*(V-128);
B = Y + 1.772*(U-128);
The float multiplies are done in 7-bit fixed point (i.e. a*b = ((a << 7)*b) >> 7),
so each coefficient is scaled by 128 and rounded:
ra = 1.402 * 128 = 179.456 -> 179
ga = 0.34414 * 128 = 44.050 -> 44
gb = 0.71414 * 128 = 91.410 -> 91
ba = 1.772 * 128 = 226.816 -> 227
nv12tobgr, nv21tobgr; a scalar per-pixel reference follows nv_to_bgr below
*/
void nv_to_bgr(const uint8_t* src,
uint8_t* dst,
int srcw,
int srch,
int x_num,
int y_num) {
// nv21 x = 0, y = 1
// nv12 x = 1, y = 0
int y_h = srch;
int wout = srcw * 3;
const uint8_t* y = src;
const uint8_t* vu = src + y_h * srcw;
int16x8_t bias = vdupq_n_s16(128);
int16x8_t ga = vdupq_n_s16(44);
int16x8_t ra = vdupq_n_s16(179);
int16x8_t ba = vdupq_n_s16(227);
int16x8_t gb = vdupq_n_s16(91);
int16x8_t zero = vdupq_n_s16(0);
int16x8_t max = vdupq_n_s16(255);
uint8_t* zerobuf = new uint8_t[srcw];
uint8_t* writebuf = new uint8_t[wout];
memset(zerobuf, 0, sizeof(uint8_t) * srcw);
int i = 0;
#pragma omp parallel for
for (i = 0; i < y_h; i += 2) {
const uint8_t* ptr_y1 = y + i * srcw;
const uint8_t* ptr_y2 = ptr_y1 + srcw;
const uint8_t* ptr_vu = vu + (i / 2) * srcw;
uint8_t* ptr_bgr1 = dst + i * wout;
uint8_t* ptr_bgr2 = ptr_bgr1 + wout;
if (i + 2 > y_h) {
ptr_y2 = zerobuf;
ptr_bgr2 = writebuf;
}
int j = 0;
for (; j < srcw - 15; j += 16) {
uint8x8x2_t y1 = vld2_u8(ptr_y1); // d8 = y0y2y4y6...y14 d9 =
// y1y3y5...y15
uint8x8x2_t vu =
vld2_u8(ptr_vu); // d0 = v0v1v2v3v4v5...v7 d1 = u0u1u2...u7
uint8x8x2_t y2 = vld2_u8(ptr_y2);
uint16x8_t v = vmovl_u8(vu.val[x_num]);
uint16x8_t u = vmovl_u8(vu.val[y_num]);
int16x8_t v_s = vreinterpretq_s16_u16(v);
int16x8_t u_s = vreinterpretq_s16_u16(u);
int16x8_t v_bias = vsubq_s16(v_s, bias);
int16x8_t u_bias = vsubq_s16(u_s, bias);
// G = Y - 0.34414*(U-128) - 0.71414*(V-128);
int16x8_t g0 = vmulq_s16(ga, u_bias);
// R = Y + 1.402*(V-128);
int16x8_t r0 = vmulq_s16(ra, v_bias);
// B = Y + 1.772*(U-128);
int16x8_t b0 = vmulq_s16(ba, u_bias);
g0 = vmlaq_s16(g0, gb, v_bias);
int16x8_t y1_0_8 = vreinterpretq_s16_u16(vmovl_u8(y1.val[0]));
int16x8_t y1_1_8 = vreinterpretq_s16_u16(vmovl_u8(y1.val[1]));
int16x8_t y2_0_8 = vreinterpretq_s16_u16(vmovl_u8(y2.val[0]));
int16x8_t y2_1_8 = vreinterpretq_s16_u16(vmovl_u8(y2.val[1]));
int16x8_t r0_bias = vshrq_n_s16(r0, 7); // r0 / 128
int16x8_t b0_bias = vshrq_n_s16(b0, 7);
int16x8_t g0_bias = vshrq_n_s16(g0, 7);
int16x8_t r0_1 = vaddq_s16(y1_0_8, r0_bias);
int16x8_t b0_1 = vaddq_s16(y1_0_8, b0_bias);
int16x8_t g0_1 = vsubq_s16(y1_0_8, g0_bias); // g0_1 = y1_0_8 - g0_1
int16x8_t r0_2 = vaddq_s16(y1_1_8, r0_bias);
int16x8_t b0_2 = vaddq_s16(y1_1_8, b0_bias);
int16x8_t g0_2 = vsubq_s16(y1_1_8, g0_bias);
r0_1 = vmaxq_s16(r0_1, zero);
b0_1 = vmaxq_s16(b0_1, zero);
g0_1 = vmaxq_s16(g0_1, zero);
r0_2 = vmaxq_s16(r0_2, zero);
b0_2 = vmaxq_s16(b0_2, zero);
g0_2 = vmaxq_s16(g0_2, zero);
r0_1 = vminq_s16(r0_1, max);
b0_1 = vminq_s16(b0_1, max);
g0_1 = vminq_s16(g0_1, max);
r0_2 = vminq_s16(r0_2, max);
b0_2 = vminq_s16(b0_2, max);
g0_2 = vminq_s16(g0_2, max);
uint8x8_t r00 = vreinterpret_u8_s8(vmovn_s16(r0_1));
uint8x8_t b00 = vreinterpret_u8_s8(vmovn_s16(b0_1));
uint8x8_t g00 = vreinterpret_u8_s8(vmovn_s16(g0_1));
uint8x8_t r01 = vreinterpret_u8_s8(vmovn_s16(r0_2));
uint8x8_t b01 = vreinterpret_u8_s8(vmovn_s16(b0_2));
uint8x8_t g01 = vreinterpret_u8_s8(vmovn_s16(g0_2));
int16x8_t r1_1 = vaddq_s16(y2_0_8, r0_bias);
int16x8_t b1_1 = vaddq_s16(y2_0_8, b0_bias);
int16x8_t g1_1 = vsubq_s16(y2_0_8, g0_bias); // g0_1 = y1_0_8 - g0_1
int16x8_t r1_2 = vaddq_s16(y2_1_8, r0_bias);
int16x8_t b1_2 = vaddq_s16(y2_1_8, b0_bias);
int16x8_t g1_2 = vsubq_s16(y2_1_8, g0_bias);
uint8x8x2_t r00_0 = vtrn_u8(r00, r01); // 014589 236710
uint8x8x2_t b00_0 = vtrn_u8(b00, b01);
uint8x8x2_t g00_0 = vtrn_u8(g00, g01);
r1_1 = vmaxq_s16(r1_1, zero);
b1_1 = vmaxq_s16(b1_1, zero);
g1_1 = vmaxq_s16(g1_1, zero);
r1_2 = vmaxq_s16(r1_2, zero);
b1_2 = vmaxq_s16(b1_2, zero);
g1_2 = vmaxq_s16(g1_2, zero);
uint16x4_t r0_16 = vreinterpret_u16_u8(r00_0.val[0]);
uint16x4_t r1_16 = vreinterpret_u16_u8(r00_0.val[1]);
uint16x4_t b0_16 = vreinterpret_u16_u8(b00_0.val[0]);
uint16x4_t b1_16 = vreinterpret_u16_u8(b00_0.val[1]);
uint16x4_t g0_16 = vreinterpret_u16_u8(g00_0.val[0]);
uint16x4_t g1_16 = vreinterpret_u16_u8(g00_0.val[1]);
uint16x4x2_t r00_1 = vtrn_u16(r0_16, r1_16); // 012389 456710
uint16x4x2_t b00_1 = vtrn_u16(b0_16, b1_16);
uint16x4x2_t g00_1 = vtrn_u16(g0_16, g1_16);
r1_1 = vminq_s16(r1_1, max);
b1_1 = vminq_s16(b1_1, max);
g1_1 = vminq_s16(g1_1, max);
r1_2 = vminq_s16(r1_2, max);
b1_2 = vminq_s16(b1_2, max);
g1_2 = vminq_s16(g1_2, max);
uint32x2_t r0_32 = vreinterpret_u32_u16(r00_1.val[0]);
uint32x2_t r1_32 = vreinterpret_u32_u16(r00_1.val[1]);
uint32x2_t b0_32 = vreinterpret_u32_u16(b00_1.val[0]);
uint32x2_t b1_32 = vreinterpret_u32_u16(b00_1.val[1]);
uint32x2_t g0_32 = vreinterpret_u32_u16(g00_1.val[0]);
uint32x2_t g1_32 = vreinterpret_u32_u16(g00_1.val[1]);
uint32x2x2_t r00_2 = vtrn_u32(r0_32, r1_32); // 01234567 8910
uint32x2x2_t b00_2 = vtrn_u32(b0_32, b1_32);
uint32x2x2_t g00_2 = vtrn_u32(g0_32, g1_32);
r00 = vreinterpret_u8_s8(vmovn_s16(r1_1));
b00 = vreinterpret_u8_s8(vmovn_s16(b1_1));
g00 = vreinterpret_u8_s8(vmovn_s16(g1_1));
r01 = vreinterpret_u8_s8(vmovn_s16(r1_2));
b01 = vreinterpret_u8_s8(vmovn_s16(b1_2));
g01 = vreinterpret_u8_s8(vmovn_s16(g1_2));
uint8x8_t r0_8 = vreinterpret_u8_u32(r00_2.val[0]);
uint8x8_t b0_8 = vreinterpret_u8_u32(b00_2.val[0]);
uint8x8_t g0_8 = vreinterpret_u8_u32(g00_2.val[0]);
uint8x8_t r1_8 = vreinterpret_u8_u32(r00_2.val[1]);
uint8x8_t b1_8 = vreinterpret_u8_u32(b00_2.val[1]);
uint8x8_t g1_8 = vreinterpret_u8_u32(g00_2.val[1]);
uint8x8x3_t v_bgr;
v_bgr.val[0] = b0_8;
v_bgr.val[1] = g0_8;
v_bgr.val[2] = r0_8;
r00_0 = vtrn_u8(r00, r01); // 014589 236710
b00_0 = vtrn_u8(b00, b01);
g00_0 = vtrn_u8(g00, g01);
vst3_u8(ptr_bgr1, v_bgr);
r0_16 = vreinterpret_u16_u8(r00_0.val[0]);
r1_16 = vreinterpret_u16_u8(r00_0.val[1]);
b0_16 = vreinterpret_u16_u8(b00_0.val[0]);
b1_16 = vreinterpret_u16_u8(b00_0.val[1]);
g0_16 = vreinterpret_u16_u8(g00_0.val[0]);
g1_16 = vreinterpret_u16_u8(g00_0.val[1]);
ptr_bgr1 += 24;
uint8x8x3_t v_bgr1;
v_bgr1.val[0] = b1_8;
v_bgr1.val[1] = g1_8;
v_bgr1.val[2] = r1_8;
r00_1 = vtrn_u16(r0_16, r1_16); // 012389 456710
b00_1 = vtrn_u16(b0_16, b1_16);
g00_1 = vtrn_u16(g0_16, g1_16);
vst3_u8(ptr_bgr1, v_bgr1);
r0_32 = vreinterpret_u32_u16(r00_1.val[0]);
r1_32 = vreinterpret_u32_u16(r00_1.val[1]);
b0_32 = vreinterpret_u32_u16(b00_1.val[0]);
b1_32 = vreinterpret_u32_u16(b00_1.val[1]);
g0_32 = vreinterpret_u32_u16(g00_1.val[0]);
g1_32 = vreinterpret_u32_u16(g00_1.val[1]);
ptr_bgr1 += 24;
r00_2 = vtrn_u32(r0_32, r1_32); // 01234567 8910
b00_2 = vtrn_u32(b0_32, b1_32);
g00_2 = vtrn_u32(g0_32, g1_32);
ptr_vu += 16;
ptr_y1 += 16;
ptr_y2 += 16;
r0_8 = vreinterpret_u8_u32(r00_2.val[0]);
b0_8 = vreinterpret_u8_u32(b00_2.val[0]);
g0_8 = vreinterpret_u8_u32(g00_2.val[0]);
r1_8 = vreinterpret_u8_u32(r00_2.val[1]);
b1_8 = vreinterpret_u8_u32(b00_2.val[1]);
g1_8 = vreinterpret_u8_u32(g00_2.val[1]);
v_bgr.val[0] = b0_8;
v_bgr.val[1] = g0_8;
v_bgr.val[2] = r0_8;
v_bgr1.val[0] = b1_8;
v_bgr1.val[1] = g1_8;
v_bgr1.val[2] = r1_8;
vst3_u8(ptr_bgr2, v_bgr);
vst3_u8(ptr_bgr2 + 24, v_bgr1);
ptr_bgr2 += 48;
}
    // process the remaining pixels two at a time
for (; j < srcw; j += 2) {
uint8_t _y0 = ptr_y1[0];
uint8_t _y1 = ptr_y1[1];
uint8_t _v = ptr_vu[x_num];
uint8_t _u = ptr_vu[y_num];
uint8_t _y0_1 = ptr_y2[0];
uint8_t _y1_1 = ptr_y2[1];
int ra = floor((179 * (_v - 128)) >> 7);
int ga = floor((44 * (_u - 128) + 91 * (_v - 128)) >> 7);
int ba = floor((227 * (_u - 128)) >> 7);
int r = _y0 + ra;
int g = _y0 - ga;
int b = _y0 + ba;
int r1 = _y1 + ra;
int g1 = _y1 - ga;
int b1 = _y1 + ba;
r = r < 0 ? 0 : (r > 255) ? 255 : r;
g = g < 0 ? 0 : (g > 255) ? 255 : g;
b = b < 0 ? 0 : (b > 255) ? 255 : b;
r1 = r1 < 0 ? 0 : (r1 > 255) ? 255 : r1;
g1 = g1 < 0 ? 0 : (g1 > 255) ? 255 : g1;
b1 = b1 < 0 ? 0 : (b1 > 255) ? 255 : b1;
*ptr_bgr1++ = b;
*ptr_bgr1++ = g;
*ptr_bgr1++ = r;
int r2 = _y0_1 + ra;
int g2 = _y0_1 - ga;
int b2 = _y0_1 + ba;
int r3 = _y1_1 + ra;
int g3 = _y1_1 - ga;
int b3 = _y1_1 + ba;
r2 = r2 < 0 ? 0 : (r2 > 255) ? 255 : r2;
g2 = g2 < 0 ? 0 : (g2 > 255) ? 255 : g2;
b2 = b2 < 0 ? 0 : (b2 > 255) ? 255 : b2;
r3 = r3 < 0 ? 0 : (r3 > 255) ? 255 : r3;
g3 = g3 < 0 ? 0 : (g3 > 255) ? 255 : g3;
b3 = b3 < 0 ? 0 : (b3 > 255) ? 255 : b3;
*ptr_bgr1++ = b1;
*ptr_bgr1++ = g1;
*ptr_bgr1++ = r1;
*ptr_bgr2++ = b2;
*ptr_bgr2++ = g2;
*ptr_bgr2++ = r2;
ptr_y1 += 2;
ptr_y2 += 2;
ptr_vu += 2;
*ptr_bgr2++ = b3;
*ptr_bgr2++ = g3;
*ptr_bgr2++ = r3;
}
}
delete[] zerobuf;
delete[] writebuf;
}
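// A scalar reference for a single pixel, using the same 7-bit fixed-point
// constants (179, 44, 91, 227) as the vectorized path above. The helper name
// is ours; it is illustrative only and is not called by the library code.
inline void yuv_to_bgr_pixel_ref(uint8_t y,
                                 uint8_t u,
                                 uint8_t v,
                                 uint8_t* bgr) {
  int r = y + ((179 * (v - 128)) >> 7);
  int g = y - ((44 * (u - 128) + 91 * (v - 128)) >> 7);
  int b = y + ((227 * (u - 128)) >> 7);
  bgr[0] = static_cast<uint8_t>(b < 0 ? 0 : (b > 255 ? 255 : b));  // B
  bgr[1] = static_cast<uint8_t>(g < 0 ? 0 : (g > 255 ? 255 : g));  // G
  bgr[2] = static_cast<uint8_t>(r < 0 ? 0 : (r > 255 ? 255 : r));  // R
}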
// nv12tobgra, nv21tobgra
void nv_to_bgra(const uint8_t* src,
uint8_t* dst,
int srcw,
int srch,
int x_num,
int y_num) {
// nv21 x = 0, y = 1
// nv12 x = 1, y = 0
int y_h = srch;
  int vu_h = srch / 2;
const uint8_t* y = src;
const uint8_t* vu = src + y_h * srcw;
int wout = srcw * 4;
uint8_t* zerobuf = new uint8_t[srcw];
uint8_t* writebuf = new uint8_t[wout];
memset(zerobuf, 0, sizeof(uint8_t) * srcw);
int16x8_t bias = vdupq_n_s16(128);
int16x8_t ga = vdupq_n_s16(44);
int16x8_t ra = vdupq_n_s16(179);
int16x8_t ba = vdupq_n_s16(227);
int16x8_t gb = vdupq_n_s16(91);
int16x8_t zero = vdupq_n_s16(0);
int16x8_t max = vdupq_n_s16(255);
uint8x8_t a_8 = vdup_n_u8(255);
#pragma omp parallel for
for (int i = 0; i < y_h; i += 2) {
const uint8_t* ptr_y1 = y + i * srcw;
const uint8_t* ptr_y2 = ptr_y1 + srcw;
const uint8_t* ptr_vu = vu + (i / 2) * srcw;
uint8_t* ptr_bgr1 = dst + i * wout;
uint8_t* ptr_bgr2 = ptr_bgr1 + wout;
if (i + 2 > y_h) {
ptr_y2 = zerobuf;
ptr_bgr2 = writebuf;
}
int j = 0;
for (; j < srcw - 15; j += 16) {
uint8x8x2_t y1 = vld2_u8(ptr_y1); // d8 = y0y2y4y6...y14 d9 =
// y1y3y5...y15
uint8x8x2_t vu =
vld2_u8(ptr_vu); // d0 = v0v1v2v3v4v5...v7 d1 = u0u1u2...u7
uint8x8x2_t y2 = vld2_u8(ptr_y2);
uint16x8_t v = vmovl_u8(vu.val[x_num]);
uint16x8_t u = vmovl_u8(vu.val[y_num]);
int16x8_t v_s = vreinterpretq_s16_u16(v);
int16x8_t u_s = vreinterpretq_s16_u16(u);
int16x8_t v_bias = vsubq_s16(v_s, bias);
int16x8_t u_bias = vsubq_s16(u_s, bias);
// G = Y - 0.34414*(U-128) - 0.71414*(V-128);
int16x8_t g0 = vmulq_s16(ga, u_bias);
// R = Y + 1.402*(V-128);
int16x8_t r0 = vmulq_s16(ra, v_bias);
// B = Y + 1.772*(U-128);
int16x8_t b0 = vmulq_s16(ba, u_bias);
g0 = vmlaq_s16(g0, gb, v_bias);
int16x8_t y1_0_8 = vreinterpretq_s16_u16(vmovl_u8(y1.val[0]));
int16x8_t y1_1_8 = vreinterpretq_s16_u16(vmovl_u8(y1.val[1]));
int16x8_t y2_0_8 = vreinterpretq_s16_u16(vmovl_u8(y2.val[0]));
int16x8_t y2_1_8 = vreinterpretq_s16_u16(vmovl_u8(y2.val[1]));
int16x8_t r0_bias = vshrq_n_s16(r0, 7); // r0 / 128
int16x8_t b0_bias = vshrq_n_s16(b0, 7);
int16x8_t g0_bias = vshrq_n_s16(g0, 7);
int16x8_t r0_1 = vaddq_s16(y1_0_8, r0_bias);
int16x8_t b0_1 = vaddq_s16(y1_0_8, b0_bias);
int16x8_t g0_1 = vsubq_s16(y1_0_8, g0_bias); // g0_1 = y1_0_8 - g0_1
int16x8_t r0_2 = vaddq_s16(y1_1_8, r0_bias);
int16x8_t b0_2 = vaddq_s16(y1_1_8, b0_bias);
int16x8_t g0_2 = vsubq_s16(y1_1_8, g0_bias);
r0_1 = vmaxq_s16(r0_1, zero);
b0_1 = vmaxq_s16(b0_1, zero);
g0_1 = vmaxq_s16(g0_1, zero);
r0_2 = vmaxq_s16(r0_2, zero);
b0_2 = vmaxq_s16(b0_2, zero);
g0_2 = vmaxq_s16(g0_2, zero);
r0_1 = vminq_s16(r0_1, max);
b0_1 = vminq_s16(b0_1, max);
g0_1 = vminq_s16(g0_1, max);
r0_2 = vminq_s16(r0_2, max);
b0_2 = vminq_s16(b0_2, max);
g0_2 = vminq_s16(g0_2, max);
uint8x8_t r00 = vreinterpret_u8_s8(vmovn_s16(r0_1));
uint8x8_t b00 = vreinterpret_u8_s8(vmovn_s16(b0_1));
uint8x8_t g00 = vreinterpret_u8_s8(vmovn_s16(g0_1));
uint8x8_t r01 = vreinterpret_u8_s8(vmovn_s16(r0_2));
uint8x8_t b01 = vreinterpret_u8_s8(vmovn_s16(b0_2));
uint8x8_t g01 = vreinterpret_u8_s8(vmovn_s16(g0_2));
int16x8_t r1_1 = vaddq_s16(y2_0_8, r0_bias);
int16x8_t b1_1 = vaddq_s16(y2_0_8, b0_bias);
int16x8_t g1_1 = vsubq_s16(y2_0_8, g0_bias); // g0_1 = y1_0_8 - g0_1
int16x8_t r1_2 = vaddq_s16(y2_1_8, r0_bias);
int16x8_t b1_2 = vaddq_s16(y2_1_8, b0_bias);
int16x8_t g1_2 = vsubq_s16(y2_1_8, g0_bias);
uint8x8x2_t r00_0 = vtrn_u8(r00, r01); // 014589 236710
uint8x8x2_t b00_0 = vtrn_u8(b00, b01);
uint8x8x2_t g00_0 = vtrn_u8(g00, g01);
r1_1 = vmaxq_s16(r1_1, zero);
b1_1 = vmaxq_s16(b1_1, zero);
g1_1 = vmaxq_s16(g1_1, zero);
r1_2 = vmaxq_s16(r1_2, zero);
b1_2 = vmaxq_s16(b1_2, zero);
g1_2 = vmaxq_s16(g1_2, zero);
uint16x4_t r0_16 = vreinterpret_u16_u8(r00_0.val[0]);
uint16x4_t r1_16 = vreinterpret_u16_u8(r00_0.val[1]);
uint16x4_t b0_16 = vreinterpret_u16_u8(b00_0.val[0]);
uint16x4_t b1_16 = vreinterpret_u16_u8(b00_0.val[1]);
uint16x4_t g0_16 = vreinterpret_u16_u8(g00_0.val[0]);
uint16x4_t g1_16 = vreinterpret_u16_u8(g00_0.val[1]);
uint16x4x2_t r00_1 = vtrn_u16(r0_16, r1_16); // 012389 456710
uint16x4x2_t b00_1 = vtrn_u16(b0_16, b1_16);
uint16x4x2_t g00_1 = vtrn_u16(g0_16, g1_16);
r1_1 = vminq_s16(r1_1, max);
b1_1 = vminq_s16(b1_1, max);
g1_1 = vminq_s16(g1_1, max);
r1_2 = vminq_s16(r1_2, max);
b1_2 = vminq_s16(b1_2, max);
g1_2 = vminq_s16(g1_2, max);
uint32x2_t r0_32 = vreinterpret_u32_u16(r00_1.val[0]);
uint32x2_t r1_32 = vreinterpret_u32_u16(r00_1.val[1]);
uint32x2_t b0_32 = vreinterpret_u32_u16(b00_1.val[0]);
uint32x2_t b1_32 = vreinterpret_u32_u16(b00_1.val[1]);
uint32x2_t g0_32 = vreinterpret_u32_u16(g00_1.val[0]);
uint32x2_t g1_32 = vreinterpret_u32_u16(g00_1.val[1]);
uint32x2x2_t r00_2 = vtrn_u32(r0_32, r1_32); // 01234567 8910
uint32x2x2_t b00_2 = vtrn_u32(b0_32, b1_32);
uint32x2x2_t g00_2 = vtrn_u32(g0_32, g1_32);
r00 = vreinterpret_u8_s8(vmovn_s16(r1_1));
b00 = vreinterpret_u8_s8(vmovn_s16(b1_1));
g00 = vreinterpret_u8_s8(vmovn_s16(g1_1));
r01 = vreinterpret_u8_s8(vmovn_s16(r1_2));
b01 = vreinterpret_u8_s8(vmovn_s16(b1_2));
g01 = vreinterpret_u8_s8(vmovn_s16(g1_2));
uint8x8_t r0_8 = vreinterpret_u8_u32(r00_2.val[0]);
uint8x8_t b0_8 = vreinterpret_u8_u32(b00_2.val[0]);
uint8x8_t g0_8 = vreinterpret_u8_u32(g00_2.val[0]);
uint8x8_t r1_8 = vreinterpret_u8_u32(r00_2.val[1]);
uint8x8_t b1_8 = vreinterpret_u8_u32(b00_2.val[1]);
uint8x8_t g1_8 = vreinterpret_u8_u32(g00_2.val[1]);
uint8x8x4_t v_bgr;
v_bgr.val[0] = b0_8;
v_bgr.val[1] = g0_8;
v_bgr.val[2] = r0_8;
v_bgr.val[3] = a_8;
r00_0 = vtrn_u8(r00, r01); // 014589 236710
b00_0 = vtrn_u8(b00, b01);
g00_0 = vtrn_u8(g00, g01);
// ptr_bgr3 += 8;
// ptr_bgr1 += 8;
// ptr_bgr2 += 8;
// vst3_u8(ptr_bgr1, v_bgr);
vst4_u8(ptr_bgr1, v_bgr);
r0_16 = vreinterpret_u16_u8(r00_0.val[0]);
r1_16 = vreinterpret_u16_u8(r00_0.val[1]);
b0_16 = vreinterpret_u16_u8(b00_0.val[0]);
b1_16 = vreinterpret_u16_u8(b00_0.val[1]);
g0_16 = vreinterpret_u16_u8(g00_0.val[0]);
g1_16 = vreinterpret_u16_u8(g00_0.val[1]);
ptr_bgr1 += 32;
// uint8x8x3_t v_bgr1;
uint8x8x4_t v_bgr1;
v_bgr1.val[0] = b1_8;
v_bgr1.val[1] = g1_8;
v_bgr1.val[2] = r1_8;
v_bgr1.val[3] = a_8;
r00_1 = vtrn_u16(r0_16, r1_16); // 012389 456710
b00_1 = vtrn_u16(b0_16, b1_16);
g00_1 = vtrn_u16(g0_16, g1_16);
// vst3_u8(ptr_bgr1, v_bgr1);
vst4_u8(ptr_bgr1, v_bgr1);
r0_32 = vreinterpret_u32_u16(r00_1.val[0]);
r1_32 = vreinterpret_u32_u16(r00_1.val[1]);
b0_32 = vreinterpret_u32_u16(b00_1.val[0]);
b1_32 = vreinterpret_u32_u16(b00_1.val[1]);
g0_32 = vreinterpret_u32_u16(g00_1.val[0]);
g1_32 = vreinterpret_u32_u16(g00_1.val[1]);
// ptr_bgr1 += 24;
ptr_bgr1 += 32;
r00_2 = vtrn_u32(r0_32, r1_32); // 01234567 8910
b00_2 = vtrn_u32(b0_32, b1_32);
g00_2 = vtrn_u32(g0_32, g1_32);
ptr_vu += 16;
ptr_y1 += 16;
ptr_y2 += 16;
r0_8 = vreinterpret_u8_u32(r00_2.val[0]);
b0_8 = vreinterpret_u8_u32(b00_2.val[0]);
g0_8 = vreinterpret_u8_u32(g00_2.val[0]);
r1_8 = vreinterpret_u8_u32(r00_2.val[1]);
b1_8 = vreinterpret_u8_u32(b00_2.val[1]);
g1_8 = vreinterpret_u8_u32(g00_2.val[1]);
v_bgr.val[0] = b0_8;
v_bgr.val[1] = g0_8;
v_bgr.val[2] = r0_8;
v_bgr1.val[0] = b1_8;
v_bgr1.val[1] = g1_8;
v_bgr1.val[2] = r1_8;
// vst3_u8(ptr_bgr2, v_bgr);
// vst3_u8(ptr_bgr2 + 24, v_bgr1);
vst4_u8(ptr_bgr2, v_bgr);
vst4_u8(ptr_bgr2 + 32, v_bgr1);
ptr_bgr2 += 64;
}
    // process the remaining pixels two at a time
for (; j < srcw; j += 2) {
uint8_t _y0 = ptr_y1[0];
uint8_t _y1 = ptr_y1[1];
uint8_t _v = ptr_vu[x_num];
uint8_t _u = ptr_vu[y_num];
uint8_t _y0_1 = ptr_y2[0];
uint8_t _y1_1 = ptr_y2[1];
int ra = floor((179 * (_v - 128)) >> 7);
int ga = floor((44 * (_u - 128) + 91 * (_v - 128)) >> 7);
int ba = floor((227 * (_u - 128)) >> 7);
int r = _y0 + ra;
int g = _y0 - ga;
int b = _y0 + ba;
int r1 = _y1 + ra;
int g1 = _y1 - ga;
int b1 = _y1 + ba;
r = r < 0 ? 0 : (r > 255) ? 255 : r;
g = g < 0 ? 0 : (g > 255) ? 255 : g;
b = b < 0 ? 0 : (b > 255) ? 255 : b;
r1 = r1 < 0 ? 0 : (r1 > 255) ? 255 : r1;
g1 = g1 < 0 ? 0 : (g1 > 255) ? 255 : g1;
b1 = b1 < 0 ? 0 : (b1 > 255) ? 255 : b1;
// *ptr_bgr1++ = b;
// *ptr_bgr2++ = g;
// *ptr_bgr3++ = r;
*ptr_bgr1++ = b;
*ptr_bgr1++ = g;
*ptr_bgr1++ = r;
*ptr_bgr1++ = 255;
int r2 = _y0_1 + ra;
int g2 = _y0_1 - ga;
int b2 = _y0_1 + ba;
int r3 = _y1_1 + ra;
int g3 = _y1_1 - ga;
int b3 = _y1_1 + ba;
r2 = r2 < 0 ? 0 : (r2 > 255) ? 255 : r2;
g2 = g2 < 0 ? 0 : (g2 > 255) ? 255 : g2;
b2 = b2 < 0 ? 0 : (b2 > 255) ? 255 : b2;
r3 = r3 < 0 ? 0 : (r3 > 255) ? 255 : r3;
g3 = g3 < 0 ? 0 : (g3 > 255) ? 255 : g3;
b3 = b3 < 0 ? 0 : (b3 > 255) ? 255 : b3;
*ptr_bgr1++ = b1;
*ptr_bgr1++ = g1;
*ptr_bgr1++ = r1;
*ptr_bgr1++ = 255;
*ptr_bgr2++ = b2;
*ptr_bgr2++ = g2;
*ptr_bgr2++ = r2;
*ptr_bgr2++ = 255;
ptr_y1 += 2;
ptr_y2 += 2;
ptr_vu += 2;
*ptr_bgr2++ = b3;
*ptr_bgr2++ = g3;
*ptr_bgr2++ = r3;
*ptr_bgr2++ = 255;
}
}
delete[] zerobuf;
delete[] writebuf;
}
void nv21_to_bgr(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
nv_to_bgr(src, dst, srcw, srch, 0, 1);
}
// nv12(yuv) to BGR: store hwc, dsth * dstw = srch * srcw,
// y_w = srcw, y_h = srch, uv_w = srcw, uv_h = 1/2 * srch
void nv12_to_bgr(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
  // NV12 stores U before V, so the channel indices are swapped here
nv_to_bgr(src, dst, srcw, srch, 1, 0);
}
// nv21(yvu) to BGRA: store hwc, dsth * dstw = srch * srcw,
// y_w = srcw, y_h = srch, uv_w = srcw, uv_h = 1/2 * srch
void nv21_to_bgra(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
nv_to_bgra(src, dst, srcw, srch, 0, 1);
}
// nv12(yuv) to BGRA: store hwc, dsth * dstw = srch * srcw,
// y_w = srcw, y_h = srch, uv_w = srcw, uv_h = 1/2 * srch
void nv12_to_bgra(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
nv_to_bgra(src, dst, srcw, srch, 1, 0);
}
/*
Using CV_BGR2GRAY, the conversion is Gray = 0.1140*B + 0.5870*G + 0.2989*R
Using CV_RGB2GRAY, the conversion is Gray = 0.1140*R + 0.5870*G + 0.2989*B
The weights are scaled to 7-bit fixed point:
b = 0.1140 * 128 = 14.592 -> 15
g = 0.5870 * 128 = 75.136 -> 75
r = 0.2989 * 128 = 38.259 -> 38
Gray = (15*B + 75*G + 38*R) / 128
bgr2gray, rgb2gray; a scalar per-pixel reference follows hwc3_to_hwc1 below
*/
void hwc3_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
uint8_t b = 15;
uint8_t g = 75;
uint8_t r = 38;
uint8x8_t vb = vdup_n_u8(b);
uint8x8_t vg = vdup_n_u8(g);
uint8x8_t vr = vdup_n_u8(r);
#ifdef __aarch64__
#else
uint8_t vb_array[8] = {b, b, b, b, b, b, b, b};
uint8_t vg_array[8] = {g, g, g, g, g, g, g, g};
uint8_t vr_array[8] = {r, r, r, r, r, r, r, r};
#endif
int cnt_pro = srcw >> 3;
int remain_pro = srcw % 8;
int win = srcw * 3;
int i = 0;
#pragma omp parallel for
for (i = 0; i < srch - 3; i += 4) {
int j = 0;
const uint8_t* inptr0 = src + i * win;
const uint8_t* inptr1 = inptr0 + win;
const uint8_t* inptr2 = inptr1 + win;
const uint8_t* inptr3 = inptr2 + win;
uint8_t* outr0 = dst + i * srcw;
uint8_t* outr1 = outr0 + srcw;
uint8_t* outr2 = outr1 + srcw;
uint8_t* outr3 = outr2 + srcw;
int cnt = cnt_pro;
if (cnt > 0) {
#ifdef __aarch64__
asm volatile(
"prfm pldl1keep, [%[inptr0]] \n"
"prfm pldl1keep, [%[inptr0], #128] \n"
"prfm pldl1keep, [%[inptr1]] \n"
"prfm pldl1keep, [%[inptr1], #128] \n"
"prfm pldl1keep, [%[inptr2]] \n"
"prfm pldl1keep, [%[inptr2], #128] \n"
"prfm pldl1keep, [%[inptr3]] \n"
"prfm pldl1keep, [%[inptr3], #128] \n"
"1: \n"
"ld3 {v0.8b - v2.8b}, [%[inptr0]], #24 \n" // d8 = y0y3y6y9.. d9 =
// y1y4y7...
"ld3 {v3.8b - v5.8b}, [%[inptr1]], #24 \n" // d8 = y0y3y6y9.. d9 =
// y1y4y7...
"ld3 {v6.8b - v8.8b}, [%[inptr2]], #24 \n" // d8 = y0y3y6y9.. d9 =
// y1y4y7...
"ld3 {v9.8b - v11.8b}, [%[inptr3]], #24 \n" // d8 = y0y3y6y9.. d9 =
// y1y4y7...
// mul b
"umull v12.8h, v0.8b, %w[vb].8b \n" // v0 * vb
"umull v13.8h, v3.8b, %w[vb].8b \n" // v0 * vb
"umull v14.8h, v6.8b, %w[vb].8b \n" // v0 * vb
"umull v15.8h, v9.8b, %w[vb].8b \n" // v0 * vb
// mul g
"umull v16.8h, v1.8b, %w[vg].8b \n" // v0 * vb
"umull v17.8h, v4.8b, %w[vg].8b \n" // v0 * vb
"umull v18.8h, v7.8b, %w[vg].8b \n" // v0 * vb
"umull v19.8h, v10.8b, %w[vg].8b \n" // v0 * vb
// mul r
"umlal v12.8h, v2.8b, %w[vr].8b \n" // v0 * vb
"umlal v13.8h, v5.8b, %w[vr].8b \n" // v0 * vb
"umlal v14.8h, v8.8b, %w[vr].8b \n" // v0 * vb
"umlal v15.8h, v11.8b, %w[vr].8b \n" // v0 * vb
// 16->32
"uaddl v0.4s, v16.4h, v12.4h \n"
"uaddl2 v1.4s, v16.8h, v12.8h \n"
"uaddl v2.4s, v17.4h, v13.4h \n"
"uaddl2 v3.4s, v17.8h, v13.8h \n"
"uaddl v4.4s, v18.4h, v14.4h \n"
"uaddl2 v5.4s, v18.8h, v14.8h \n"
"uaddl v6.4s, v19.4h, v15.4h \n"
"uaddl2 v7.4s, v19.8h, v15.8h \n"
// 32->16 v0 >> 7
"shrn v12.4h, v0.4s, #7 \n"
"shrn2 v12.8h, v1.4s, #7 \n"
"shrn v13.4h, v2.4s, #7 \n"
"shrn2 v13.8h, v3.4s, #7 \n"
"shrn v14.4h, v4.4s, #7 \n"
"shrn2 v14.8h, v5.4s, #7 \n"
"shrn v15.4h, v6.4s, #7 \n"
"shrn2 v15.8h, v7.4s, #7 \n"
// 16->8
"xtn v0.8b, v12.8h \n"
"xtn v1.8b, v13.8h \n"
"xtn v2.8b, v14.8h \n"
"xtn v3.8b, v15.8h \n"
"subs %w[cnt], %w[cnt], #1 \n"
"st1 {v0.8b}, [%[outr0]], #8 \n"
"st1 {v1.8b}, [%[outr1]], #8 \n"
"st1 {v2.8b}, [%[outr2]], #8 \n"
"st1 {v3.8b}, [%[outr3]], #8 \n"
"bne 1b \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outr0] "+r"(outr0),
[outr1] "+r"(outr1),
[outr2] "+r"(outr2),
[outr3] "+r"(outr3),
[cnt] "+r"(cnt)
: [vb] "w"(vb), [vg] "w"(vg), [vr] "w"(vr)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20");
#else
asm volatile(
"pld [%[inptr0]] @ preload a, 64byte\n"
"pld [%[inptr0], #128] @ preload a, 64byte\n"
"pld [%[inptr1]] @ preload a, 64byte\n"
"pld [%[inptr1], #128] @ preload a, 64byte\n"
"pld [%[inptr2]] @ preload a, 64byte\n"
"pld [%[inptr2], #128] @ preload a, 64byte\n"
"pld [%[inptr3]] @ preload a, 64byte\n"
"pld [%[inptr3], #128] @ preload a, 64byte\n"
"vld1.8 d0, [%[vb]] \n"
"vld1.8 d1, [%[vg]] \n"
"vld1.8 d2, [%[vr]] \n"
"1: \n"
"vld3.8 {d3, d4, d5}, [%[inptr0]]! \n"
"vld3.8 {d6, d7, d8}, [%[inptr1]]! \n"
"vld3.8 {d9, d10, d11}, [%[inptr2]]! \n"
"vld3.8 {d12, d13, d14}, [%[inptr3]]! \n"
// vb
"vmull.u8 q8, d3, d0 \n"
"vmull.u8 q9, d6, d0 \n"
"vmull.u8 q10, d9, d0 \n"
"vmull.u8 q11, d12, d0 \n"
// vg
"vmull.u8 q12, d4, d1 \n"
"vmull.u8 q13, d7, d1 \n"
"vmull.u8 q14, d10, d1 \n"
"vmull.u8 q15, d13, d1 \n"
// vr
"vmlal.u8 q8, d5, d2 \n"
"vmlal.u8 q9, d6, d2 \n"
"vmlal.u8 q10, d11, d2 \n"
"vmlal.u8 q11, d14, d2 \n"
// 16->32
"vaddl.u16 q2, d24, d16 \n"
"vaddl.u16 q3, d25, d17 \n"
"vaddl.u16 q4, d26, d18 \n"
"vaddl.u16 q5, d27, d19 \n"
"vaddl.u16 q6, d28, d20 \n"
"vaddl.u16 q7, d29, d21 \n"
"vaddl.u16 q8, d30, d22 \n"
"vaddl.u16 q9, d31, d23 \n"
// 32->16 q2 >> 7
"vshrn.u32 d20, q2, #7 \n"
"vshrn.u32 d21, q3, #7 \n"
"vshrn.u32 d22, q4, #7 \n"
"vshrn.u32 d23, q5, #7 \n"
"vshrn.u32 d24, q6, #7 \n"
"vshrn.u32 d25, q7, #7 \n"
"vshrn.u32 d26, q8, #7 \n"
"vshrn.u32 d27, q8, #7 \n"
// 16->8
"vmovn.u16 d4, q10 \n"
"vmovn.u16 d5, q11 \n"
"vmovn.u16 d6, q12 \n"
"vmovn.u16 d7, q13 \n"
"subs %[cnt], #1 \n"
// store
"vst1.8 d4, [%[outr0]]! \n"
"vst1.8 d5, [%[outr1]]! \n"
"vst1.8 d6, [%[outr2]]! \n"
"vst1.8 d7, [%[outr3]]! \n"
"bne 1b \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outr0] "+r"(outr0),
[outr1] "+r"(outr1),
[outr2] "+r"(outr2),
[outr3] "+r"(outr3),
[cnt] "+r"(cnt)
: [vb] "r"(vb_array), [vg] "r"(vg_array), [vr] "r"(vr_array)
: "cc",
"memory",
"q0",
"q1",
"q2",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12",
"q13",
"q14",
"q15");
#endif
}
for (; j < remain_pro; j++) {
*outr0++ = (inptr0[0] * b + inptr0[1] * g + inptr0[2] * r) >> 7;
*outr1++ = (inptr1[0] * b + inptr1[1] * g + inptr1[2] * r) >> 7;
*outr2++ = (inptr2[0] * b + inptr2[1] * g + inptr2[2] * r) >> 7;
*outr3++ = (inptr3[0] * b + inptr3[1] * g + inptr3[2] * r) >> 7;
inptr0 += 3;
inptr1 += 3;
inptr2 += 3;
inptr3 += 3;
}
}
for (; i < srch; i++) {
int j = 0;
const uint8_t* inptr0 = src + i * win;
uint8_t* outr0 = dst + i * srcw;
for (j = 0; j < cnt_pro; j++) {
uint8x8x3_t y0 = vld3_u8(inptr0); // d8 = y0y3y6y9.. d9 = y1y4y7...y
uint16x8_t val0 = vmull_u8(y0.val[0], vb);
uint16x8_t val0_1 = vmull_u8(y0.val[1], vg);
val0 = vmlal_u8(val0, y0.val[2], vr);
uint32x4_t v0_sum0 = vaddl_u16(vget_low_u16(val0_1), vget_low_u16(val0));
uint32x4_t v0_sum1 =
vaddl_u16(vget_high_u16(val0_1), vget_high_u16(val0));
uint16x4_t v0_sum0_16 = vshrn_n_u32(v0_sum0, 7);
uint16x4_t v0_sum1_16 = vshrn_n_u32(v0_sum1, 7);
uint16x8_t v0_sum = vcombine_u16(v0_sum0_16, v0_sum1_16);
uint8x8_t vout0 = vmovn_u16(v0_sum);
inptr0 += 24;
vst1_u8(outr0, vout0);
outr0 += 8;
}
for (; j < srcw; j++) {
*outr0++ = (inptr0[0] * b + inptr0[1] * g + inptr0[2] * r) >> 7;
inptr0 += 3;
}
}
}
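// A scalar reference for one pixel with the same 7-bit fixed-point weights
// (15, 75, 38) as the vectorized path above. The helper name is ours; it is
// illustrative only and is not called by the library code.
inline uint8_t bgr_to_gray_pixel_ref(uint8_t b, uint8_t g, uint8_t r) {
  return static_cast<uint8_t>((15 * b + 75 * g + 38 * r) >> 7);
}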
/*
Using CV_GRAY2BGR, the conversion is B = G = R = Gray
Using CV_GRAY2RGB, the conversion is R = G = B = Gray
gray2bgr, gray2rgb
*/
void hwc1_to_hwc3(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
for (int i = 0; i < srch; i++) {
for (int j = 0; j < srcw; j++) {
*dst++ = *src;
*dst++ = *src;
*dst++ = *src;
src++;
}
}
}
// bgr2bgra, rgb2rgba
void hwc3_to_hwc4(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
for (int i = 0; i < srch; i++) {
for (int j = 0; j < srcw; j++) {
*dst++ = *src++;
*dst++ = *src++;
*dst++ = *src++;
*dst++ = 255;
}
}
}
// bgra2bgr, rgba2rgb
void hwc4_to_hwc3(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
for (int i = 0; i < srch; i++) {
for (int j = 0; j < srcw; j++) {
*dst++ = *src++;
*dst++ = *src++;
*dst++ = *src++;
src++;
}
}
}
// bgr2rgb, rgb2bgr
void hwc3_trans(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
for (int i = 0; i < srch; i++) {
for (int j = 0; j < srcw; j++) {
*dst++ = src[2]; // r
*dst++ = src[1]; // g
*dst++ = src[0]; // b
src += 3;
}
}
}
// bgra2rgba, rgba2bgra
void hwc4_trans(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
for (int i = 0; i < srch; i++) {
for (int j = 0; j < srcw; j++) {
*dst++ = src[2]; // r
*dst++ = src[1]; // g
*dst++ = src[0]; // b
*dst++ = src[3]; // a
src += 4;
}
}
}
// bgra2rgb, rgba2bgr
void hwc4_trans_hwc3(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
for (int i = 0; i < srch; i++) {
for (int j = 0; j < srcw; j++) {
*dst++ = src[2]; // r
*dst++ = src[1]; // g
*dst++ = src[0]; // b
// *dst++ = src[4];//a
src += 4;
}
}
}
// bgr2rgba, rgb2bgra
void hwc3_trans_hwc4(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
for (int i = 0; i < srch; i++) {
for (int j = 0; j < srcw; j++) {
*dst++ = src[2]; // r
*dst++ = src[1]; // g
*dst++ = src[0]; // b
*dst++ = 255; // a
src += 3;
}
}
}
} // namespace cv
} // namespace utils
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
#include <stdio.h>
#include "lite/utils/cv/paddle_image_preprocess.h"
namespace paddle {
namespace lite {
namespace utils {
namespace cv {
typedef void (*convert_func)(const uint8_t* src,
uint8_t* dst,
int srcw,
int srch);
class ImageConvert {
public:
void choose(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
ImageFormat dstFormat,
int srcw,
int srch);
private:
convert_func impl_{nullptr};
};
} // namespace cv
} // namespace utils
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/utils/cv/image_flip.h"
#include <math.h>
#include <string.h>
namespace paddle {
namespace lite {
namespace utils {
namespace cv {
// gray
void flip_hwc1_x(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
void flip_hwc1_y(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
void flip_hwc1_xy(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
// rgb bgr
void flip_hwc3_x(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
void flip_hwc3_y(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
void flip_hwc3_xy(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
// rgba bgra
void flip_hwc4_x(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
void flip_hwc4_y(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
void flip_hwc4_xy(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
void flip_hwc1(const uint8_t* src,
uint8_t* dst,
int srcw,
int srch,
FlipParam flip_param) {
if (flip_param == X) {
flip_hwc1_x(src, dst, srcw, srch);
} else if (flip_param == Y) {
flip_hwc1_y(src, dst, srcw, srch);
} else if (flip_param == XY) {
flip_hwc1_xy(src, dst, srcw, srch);
}
}
void flip_hwc3(const uint8_t* src,
uint8_t* dst,
int srcw,
int srch,
FlipParam flip_param) {
if (flip_param == X) {
flip_hwc3_x(src, dst, srcw, srch);
} else if (flip_param == Y) {
flip_hwc3_y(src, dst, srcw, srch);
} else if (flip_param == XY) {
flip_hwc3_xy(src, dst, srcw, srch);
}
}
void flip_hwc4(const uint8_t* src,
uint8_t* dst,
int srcw,
int srch,
FlipParam flip_param) {
if (flip_param == X) {
flip_hwc4_x(src, dst, srcw, srch);
} else if (flip_param == Y) {
flip_hwc4_y(src, dst, srcw, srch);
} else if (flip_param == XY) {
flip_hwc4_xy(src, dst, srcw, srch);
}
}
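// A minimal usage sketch of the flip dispatch (the 3x3 values and the example
// function name are ours): flipping a single-channel image along X reverses
// the row order, as in the diagram below.
inline void example_flip_gray_x() {
  uint8_t src[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
  uint8_t dst[9] = {0};
  flip_hwc1(src, dst, 3, 3, X);
  // dst now holds the rows {7, 8, 9}, {4, 5, 6}, {1, 2, 3}
}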
/*
1 2 3
4 5 6
7 8 9
flip x:
7 8 9
4 5 6
1 2 3
*/
void flip_hwc1_x(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
int h = h_in - 1;
uint8_t zerobuff[8] = {0, 0, 0, 0, 0, 0, 0, 0};
#pragma omp parallel for
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * w_in;
const uint8_t* inptr1 = inptr0 + w_in;
const uint8_t* inptr2 = inptr1 + w_in;
const uint8_t* inptr3 = inptr2 + w_in;
uint8_t* outptr0 = dst + (h - i) * w_in; // last
uint8_t* outptr1 = outptr0 - w_in;
uint8_t* outptr2 = outptr1 - w_in;
uint8_t* outptr3 = outptr2 - w_in;
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
outptr0 = zerobuff;
case 2:
inptr1 = zerobuff;
outptr1 = zerobuff;
case 1:
inptr2 = zerobuff;
outptr2 = zerobuff;
case 0:
inptr3 = zerobuff;
outptr3 = zerobuff;
default:
break;
}
}
int j = 0;
for (; j < w_in - 7; j += 8) {
#ifdef __aarch64__
asm volatile(
"ld1 {v0.8b}, [%[inptr0]], #8 \n" // v0={00,01,02, 03, 04, 05,
// 06, 07}"
"ld1 {v1.8b}, [%[inptr1]], #8 \n" // v0={10,11,12, 13, 14, 15,
// 16, 17}"
"ld1 {v2.8b}, [%[inptr2]], #8 \n" // v0={20,21,22, 23, 24, 25,
// 26, 27}"
"ld1 {v3.8b}, [%[inptr3]], #8 \n" // v0={30,31,32, 33, 34, 35,
// 36, 37}"
"st1 {v0.8b}, [%[outptr0]], #8 \n" // 00 10 20 30 04 14
// 24 34
"st1 {v1.8b}, [%[outptr1]], #8 \n" // 02 12 22 32
"st1 {v2.8b}, [%[outptr2]], #8 \n" // 01 11 21 31
"st1 {v3.8b}, [%[outptr3]], #8 \n" // 03 13 23 33
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3)
:
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
#else
asm volatile(
"vld1.8 {d0}, [%[inptr0]]! @ zip load r0, d0 =00 01 02 03 04 05 "
"06 07\n"
"vld1.8 {d4}, [%[inptr1]]! @ zip load r1, d2 =10 11 12 13 14 15 "
"16 17\n"
"vld1.8 {d8}, [%[inptr2]]! @ zip load r1, d4 =20 21 22 23 24 25 "
"26 27\n"
"vld1.8 {d12}, [%[inptr3]]! @ zip load r1, d6 = 30 31 32 33 34 35 "
"36 37\n"
"vst1.32 {d0}, [%[outptr0]]! @ write d0(q0,low),r00,r10 20 30\n"
"vst1.32 {d4}, [%[outptr1]]! @ write d4(q0,low),r01,r11 21 31\n"
"vst1.32 {d8}, [%[outptr2]]! @ write d4(q0,low),r01,r11 21 31\n"
"vst1.32 {d12}, [%[outptr3]]! @ write d4(q0,low),r01,r11 21 "
"31\n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3)
:
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
#endif
}
for (; j < w_in; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2++ = *inptr2++;
case 1:
*outptr1++ = *inptr1++;
case 2:
*outptr0++ = *inptr0++;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3++ = *inptr3++;
*outptr2++ = *inptr2++;
*outptr1++ = *inptr1++;
*outptr0++ = *inptr0++;
}
}
}
}
/*
1 2 3
4 5 6
7 8 9
flip y:
3 2 1
6 5 4
9 8 7
*/
void flip_hwc1_y(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
int64_t stride_w = 8;
uint8_t zerobuff[8] = {0, 0, 0, 0, 0, 0, 0, 0};
#pragma omp parallel for
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * w_in;
const uint8_t* inptr1 = inptr0 + w_in;
const uint8_t* inptr2 = inptr1 + w_in;
const uint8_t* inptr3 = inptr2 + w_in;
uint8_t* outptr0 = dst + (i + 1) * w_in - stride_w; // last col
uint8_t* outptr1 = outptr0 + w_in;
uint8_t* outptr2 = outptr1 + w_in;
uint8_t* outptr3 = outptr2 + w_in;
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
outptr0 = zerobuff;
case 2:
inptr1 = zerobuff;
outptr1 = zerobuff;
case 1:
inptr2 = zerobuff;
outptr2 = zerobuff;
case 0:
inptr3 = zerobuff;
outptr3 = zerobuff;
default:
break;
}
}
int j = 0;
for (; j < w_in - 7; j += 8) {
#ifdef __aarch64__
asm volatile(
"ld1 {v0.8b}, [%[inptr0]], #8 \n" // v0={00,01,02, 03, 04, 05,
// 06, 07}"
"ld1 {v1.8b}, [%[inptr1]], #8 \n" // v0={10,11,12, 13, 14, 15,
// 16, 17}"
"ld1 {v2.8b}, [%[inptr2]], #8 \n" // v0={20,21,22, 23, 24, 25,
// 26, 27}"
"ld1 {v3.8b}, [%[inptr3]], #8 \n" // v0={30,31,32, 33, 34, 35,
// 36, 37}"
"rev64 v4.8b, v0.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00
"rev64 v5.8b, v1.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00
"rev64 v6.8b, v2.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00
"rev64 v7.8b, v3.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00
"st1 {v4.8b}, [%[outptr0]] \n" // 00 10 20 30 04 14 24 34
"st1 {v5.8b}, [%[outptr1]] \n" // 02 12 22 32
"st1 {v6.8b}, [%[outptr2]] \n" // 01 11 21 31
"st1 {v7.8b}, [%[outptr3]] \n" // 03 13 23 33
"sub %[outptr0], %[outptr0], %[stride_w] \n" //@ ptr - stride_w
"sub %[outptr1], %[outptr1], %[stride_w] \n"
"sub %[outptr2], %[outptr2], %[stride_w] \n"
"sub %[outptr3], %[outptr3], %[stride_w] \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[stride_w] "+r"(stride_w)
:
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
#else
asm volatile(
"vld1.8 {d0}, [%[inptr0]]! @ zip load r0, d0 =00 01 02 03 04 05 "
"06 07\n"
"vld1.8 {d4}, [%[inptr1]]! @ zip load r1, d2 =10 11 12 13 14 15 "
"16 17\n"
"vld1.8 {d8}, [%[inptr2]]! @ zip load r1, d4 =20 21 22 23 24 25 "
"26 27\n"
"vld1.8 {d12}, [%[inptr3]]! @ zip load r1, d6 = 30 31 32 33 34 35 "
"36 37\n"
"vrev64.8 d1, d0 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d5, d4 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d9, d8 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d13, d12 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vst1.32 {d1}, [%[outptr0]] @ write d0(q0,low),r00,r10 20 30\n"
"vst1.32 {d5}, [%[outptr1]] @ write d4(q0,low),r01,r11 21 31\n"
"vst1.32 {d9}, [%[outptr2]] @ write d4(q0,low),r01,r11 21 31\n"
"vst1.32 {d13}, [%[outptr3]] @ write d4(q0,low),r01,r11 21 31\n"
"sub %[outptr0], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr1], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr2], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr3], %[stride_w] @ ptr - stride_w \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[stride_w] "+r"(stride_w)
:
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
#endif
}
outptr3 += stride_w - 1;
outptr2 += stride_w - 1;
outptr1 += stride_w - 1;
outptr0 += stride_w - 1;
for (; j < w_in; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2-- = *inptr2++;
case 1:
*outptr1-- = *inptr1++;
// inptr1 = zerobuff;
case 2:
*outptr0-- = *inptr0++;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3-- = *inptr3++;
*outptr2-- = *inptr2++;
*outptr1-- = *inptr1++;
*outptr0-- = *inptr0++;
}
}
}
}
/*
1 2 3
4 5 6
7 8 9
flip xy:
9 8 7
6 5 4
3 2 1
*/
void flip_hwc1_xy(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
int64_t stride_w = 8;
uint8_t zerobuff[8] = {0, 0, 0, 0, 0, 0, 0, 0};
#pragma omp parallel for
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * w_in;
const uint8_t* inptr1 = inptr0 + w_in;
const uint8_t* inptr2 = inptr1 + w_in;
const uint8_t* inptr3 = inptr2 + w_in;
uint8_t* outptr0 = dst + (h_in - i) * w_in - stride_w; // last col
uint8_t* outptr1 = outptr0 - w_in;
uint8_t* outptr2 = outptr1 - w_in;
uint8_t* outptr3 = outptr2 - w_in;
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
outptr0 = zerobuff;
case 2:
inptr1 = zerobuff;
outptr1 = zerobuff;
case 1:
inptr2 = zerobuff;
outptr2 = zerobuff;
case 0:
inptr3 = zerobuff;
outptr3 = zerobuff;
default:
break;
}
}
int j = 0;
for (; j < w_in - 7; j += 8) {
#ifdef __aarch64__
asm volatile(
"ld1 {v0.8b}, [%[inptr0]], #8 \n" // v0={00,01,02, 03, 04, 05,
// 06, 07}"
"ld1 {v1.8b}, [%[inptr1]], #8 \n" // v0={10,11,12, 13, 14, 15,
// 16, 17}"
"ld1 {v2.8b}, [%[inptr2]], #8 \n" // v0={20,21,22, 23, 24, 25,
// 26, 27}"
"ld1 {v3.8b}, [%[inptr3]], #8 \n" // v0={30,31,32, 33, 34, 35,
// 36, 37}"
"rev64 v4.8b, v0.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00
"rev64 v5.8b, v1.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00
"rev64 v6.8b, v2.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00
"rev64 v7.8b, v3.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00
"st1 {v4.8b}, [%[outptr0]] \n" // 00 10 20 30 04 14 24 34
"st1 {v5.8b}, [%[outptr1]] \n" // 02 12 22 32
"st1 {v6.8b}, [%[outptr2]] \n" // 01 11 21 31
"st1 {v7.8b}, [%[outptr3]] \n" // 03 13 23 33
"sub %[outptr0], %[outptr0], %[stride_w] \n" //@ ptr - stride_w
"sub %[outptr1], %[outptr1], %[stride_w] \n"
"sub %[outptr2], %[outptr2], %[stride_w] \n"
"sub %[outptr3], %[outptr3], %[stride_w] \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[stride_w] "+r"(stride_w)
:
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
#else
asm volatile(
"vld1.8 {d0}, [%[inptr0]]! @ zip load r0, d0 =00 01 02 03 04 05 "
"06 07\n"
"vld1.8 {d4}, [%[inptr1]]! @ zip load r1, d2 =10 11 12 13 14 15 "
"16 17\n"
"vld1.8 {d8}, [%[inptr2]]! @ zip load r1, d4 =20 21 22 23 24 25 "
"26 27\n"
"vld1.8 {d12}, [%[inptr3]]! @ zip load r1, d6 = 30 31 32 33 34 35 "
"36 37\n"
"vrev64.8 d1, d0 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d5, d4 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d9, d8 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d13, d12 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vst1.32 {d1}, [%[outptr0]] @ write d0(q0,low),r00,r10 20 30\n"
"vst1.32 {d5}, [%[outptr1]] @ write d4(q0,low),r01,r11 21 31\n"
"vst1.32 {d9}, [%[outptr2]] @ write d4(q0,low),r01,r11 21 31\n"
"vst1.32 {d13}, [%[outptr3]] @ write d4(q0,low),r01,r11 21 31\n"
"sub %[outptr0], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr1], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr2], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr3], %[stride_w] @ ptr - stride_w \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[stride_w] "+r"(stride_w)
:
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
#endif
}
outptr3 += stride_w - 1;
outptr2 += stride_w - 1;
outptr1 += stride_w - 1;
outptr0 += stride_w - 1;
for (; j < w_in; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2-- = *inptr2++;
case 1:
*outptr1-- = *inptr1++;
// inptr1 = zerobuff;
case 2:
*outptr0-- = *inptr0++;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3-- = *inptr3++;
*outptr2-- = *inptr2++;
*outptr1-- = *inptr1++;
*outptr0-- = *inptr0++;
}
}
}
}
void flip_hwc3_x(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
int h = h_in - 1;
int win = w_in * 3;
uint8_t zerobuff[30000];
memset(zerobuff, 0, win * sizeof(uint8_t));
uint8_t zerobuff2[30000];
memset(zerobuff2, 0, win * sizeof(uint8_t));
#pragma omp parallel for
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * win;
const uint8_t* inptr1 = inptr0 + win;
const uint8_t* inptr2 = inptr1 + win;
const uint8_t* inptr3 = inptr2 + win;
uint8_t* outptr0 = dst + (h - i) * win; // last
uint8_t* outptr1 = outptr0 - win;
uint8_t* outptr2 = outptr1 - win;
uint8_t* outptr3 = outptr2 - win;
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
outptr0 = zerobuff2;
case 2:
inptr1 = zerobuff;
outptr1 = zerobuff2;
case 1:
inptr2 = zerobuff;
outptr2 = zerobuff2;
case 0:
inptr3 = zerobuff;
outptr3 = zerobuff2;
default:
break;
}
}
int j = 0;
for (; j < w_in - 7; j += 8) {
#ifdef __aarch64__
asm volatile(
"ld3 {v0.8b, v1.8b, v2.8b}, [%[inptr0]], #24 \n" // v0={00,01,02,
// 03, 04, 05,
// 06, 07}"
"ld3 {v3.8b, v4.8b, v5.8b}, [%[inptr1]], #24 \n" // v0={10,11,12,
// 13, 14, 15,
// 16, 17}"
"ld3 {v6.8b, v7.8b, v8.8b}, [%[inptr2]], #24 \n" // v0={20,21,22,
// 23, 24, 25,
// 26, 27}"
"ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]], #24 \n" // v0={30,31,32,
// 33, 34, 35,
// 36, 37}"
"st3 {v0.8b, v1.8b, v2.8b}, [%[outptr0]], #24 \n" // 00
// 10
// 20
// 30
// 04
// 14
// 24
// 34
"st3 {v3.8b, v4.8b, v5.8b}, [%[outptr1]], #24 \n" // 02
// 12
// 22
// 32
"st3 {v6.8b, v7.8b, v8.8b}, [%[outptr2]], #24 \n" // 01
// 11
// 21
// 31
"st3 {v9.8b, v10.8b, v11.8b}, [%[outptr3]], #24 \n" // 03 13 23 33
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3)
:
: "v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11");
#else
asm volatile(
"vld3.8 {d0, d1, d2}, [%[inptr0]]! @ zip load r0, d0 =00 01 02 03 "
"04 05 06 07\n"
"vld3.8 {d3, d4, d5}, [%[inptr1]]! @ zip load r1, d2 =10 11 12 13 "
"14 15 16 17\n"
"vld3.8 {d6, d7, d8}, [%[inptr2]]! @ zip load r1, d4 =20 21 22 23 "
"24 25 26 27\n"
"vld3.8 {d9, d10, d11}, [%[inptr3]]! @ zip load r1, d6 = 30 31 32 "
"33 34 35 36 37\n"
"vst3.8 {d0, d1, d2}, [%[outptr0]]! @ write d0(q0,low),r00,r10 "
"20 30\n"
"vst3.8 {d3, d4, d5}, [%[outptr1]]! @ write d4(q0,low),r01,r11 "
"21 31\n"
"vst3.8 {d6, d7, d8}, [%[outptr2]]! @ write d4(q0,low),r01,r11 "
"21 31\n"
"vst3.8 {d9, d10, d11}, [%[outptr3]]! @ write "
"d4(q0,low),r01,r11 21 31\n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3)
:
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
#endif
}
for (; j < w_in; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
case 1:
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
case 2:
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
}
}
}
}
void flip_hwc3_y(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
int win = w_in * 3;
uint8_t zerobuff[30000];
memset(zerobuff, 0, win * sizeof(uint8_t));
uint8_t zerobuff2[30000];
memset(zerobuff2, 0, win * sizeof(uint8_t));
int64_t stride_w = 24;
#pragma omp parallel for
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * win;
const uint8_t* inptr1 = inptr0 + win;
const uint8_t* inptr2 = inptr1 + win;
const uint8_t* inptr3 = inptr2 + win;
uint8_t* outptr0 = dst + (i + 1) * win - stride_w; // last col
uint8_t* outptr1 = outptr0 + win;
uint8_t* outptr2 = outptr1 + win;
uint8_t* outptr3 = outptr2 + win;
int j = 0;
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
outptr0 = zerobuff2;
case 2:
inptr1 = zerobuff;
outptr1 = zerobuff2;
case 1:
inptr2 = zerobuff;
outptr2 = zerobuff2;
case 0:
inptr3 = zerobuff;
outptr3 = zerobuff2;
default:
break;
}
}
for (; j < w_in - 7; j += 8) {
#ifdef __aarch64__
asm volatile(
"ld3 {v0.8b, v1.8b, v2.8b}, [%[inptr0]], #24 \n" // v0={00,01,02,
// 03, 04, 05,
// 06, 07}"
"ld3 {v3.8b, v4.8b, v5.8b}, [%[inptr1]], #24 \n" // v0={10,11,12,
// 13, 14, 15,
// 16, 17}"
"ld3 {v6.8b, v7.8b, v8.8b}, [%[inptr2]], #24 \n" // v0={20,21,22,
// 23, 24, 25,
// 26, 27}"
"ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]], #24 \n" // v0={30,31,32,
// 33, 34, 35,
// 36, 37}"
"rev64 v12.8b, v0.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 b
"rev64 v13.8b, v1.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 g
"rev64 v14.8b, v2.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 r
"rev64 v15.8b, v3.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v16.8b, v4.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v17.8b, v5.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v18.8b, v6.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v19.8b, v7.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v20.8b, v8.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v21.8b, v9.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v22.8b, v10.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v23.8b, v11.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"st3 {v12.8b, v13.8b, v14.8b}, [%[outptr0]] \n" // 00 10
// 20 30
// 04 14
// 24 34
"st3 {v15.8b, v16.8b, v17.8b}, [%[outptr1]] \n" // 02 12
// 22 32
"st3 {v18.8b, v19.8b, v20.8b}, [%[outptr2]] \n" // 01 11
// 21 31
"st3 {v21.8b, v22.8b, v23.8b}, [%[outptr3]] \n" // 03 13
// 23 33
"sub %[outptr0], %[outptr0], %[stride_w] \n" //@ ptr - stride_w
"sub %[outptr1], %[outptr1], %[stride_w] \n"
"sub %[outptr2], %[outptr2], %[stride_w] \n"
"sub %[outptr3], %[outptr3], %[stride_w] \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[stride_w] "+r"(stride_w)
:
: "v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23");
#else
asm volatile(
"vld3.8 {d0, d1, d2}, [%[inptr0]]! @ zip load r0, d0 =00 01 02 03 "
"04 05 06 07\n"
"vld3.8 {d3, d4, d5}, [%[inptr1]]! @ zip load r1, d2 =10 11 12 13 "
"14 15 16 17\n"
"vld3.8 {d6, d7, d8}, [%[inptr2]]! @ zip load r1, d4 =20 21 22 23 "
"24 25 26 27\n"
"vld3.8 {d9, d10, d11}, [%[inptr3]]! @ zip load r1, d6 = 30 31 32 "
"33 34 35 36 37\n"
"vrev64.8 d12, d0 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d13, d1 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d14, d2 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d15, d3 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d16, d4 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d17, d5 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d18, d6 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d19, d7 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d20, d8 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d21, d9 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d22, d10 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d23, d11 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vst3.8 {d12, d13, d14}, [%[outptr0]] @ write "
"d0(q0,low),r00,r10 20 30\n"
"vst3.8 {d15, d16, d17}, [%[outptr1]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"vst3.8 {d18, d19, d20}, [%[outptr2]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"vst3.8 {d21, d22, d23}, [%[outptr3]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"sub %[outptr0], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr1], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr2], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr3], %[stride_w] @ ptr - stride_w \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[stride_w] "+r"(stride_w)
:
: "q0",
"q1",
"q2",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12");
#endif
}
outptr3 += stride_w - 3;
outptr2 += stride_w - 3;
outptr1 += stride_w - 3;
outptr0 += stride_w - 3;
for (; j < w_in; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 6;
case 1:
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 6;
case 2:
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 6;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
outptr3 -= 6;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 6;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 6;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 6;
}
}
}
}
void flip_hwc3_xy(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
int64_t stride_w = 24;
int win = w_in * 3;
uint8_t zerobuff[30000];
memset(zerobuff, 0, win * sizeof(uint8_t));
uint8_t zerobuff2[30000];
memset(zerobuff2, 0, win * sizeof(uint8_t));
#pragma omp parallel for
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * win;
const uint8_t* inptr1 = inptr0 + win;
const uint8_t* inptr2 = inptr1 + win;
const uint8_t* inptr3 = inptr2 + win;
uint8_t* outptr0 = dst + (h_in - i) * win - stride_w; // last col
uint8_t* outptr1 = outptr0 - win;
uint8_t* outptr2 = outptr1 - win;
uint8_t* outptr3 = outptr2 - win;
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
outptr0 = zerobuff2;
case 2:
inptr1 = zerobuff;
outptr1 = zerobuff2;
case 1:
inptr2 = zerobuff;
outptr2 = zerobuff2;
case 0:
inptr3 = zerobuff;
outptr3 = zerobuff2;
default:
break;
}
}
int j = 0;
for (; j < w_in - 7; j += 8) {
#ifdef __aarch64__
asm volatile(
"ld3 {v0.8b, v1.8b, v2.8b}, [%[inptr0]], #24 \n" // v0={00,01,02,
// 03, 04, 05,
// 06, 07}"
"ld3 {v3.8b, v4.8b, v5.8b}, [%[inptr1]], #24 \n" // v0={10,11,12,
// 13, 14, 15,
// 16, 17}"
"ld3 {v6.8b, v7.8b, v8.8b}, [%[inptr2]], #24 \n" // v0={20,21,22,
// 23, 24, 25,
// 26, 27}"
"ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]], #24 \n" // v0={30,31,32,
// 33, 34, 35,
// 36, 37}"
"rev64 v12.8b, v0.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 b
"rev64 v13.8b, v1.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 g
"rev64 v14.8b, v2.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 r
"rev64 v15.8b, v3.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v16.8b, v4.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v17.8b, v5.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v18.8b, v6.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v19.8b, v7.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v20.8b, v8.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v21.8b, v9.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v22.8b, v10.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v23.8b, v11.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"st3 {v12.8b, v13.8b, v14.8b}, [%[outptr0]] \n" // 00 10
// 20 30
// 04 14
// 24 34
"st3 {v15.8b, v16.8b, v17.8b}, [%[outptr1]] \n" // 02 12
// 22 32
"st3 {v18.8b, v19.8b, v20.8b}, [%[outptr2]] \n" // 01 11
// 21 31
"st3 {v21.8b, v22.8b, v23.8b}, [%[outptr3]] \n" // 03 13
// 23 33
"sub %[outptr0], %[outptr0], %[stride_w] \n" //@ ptr - stride_w
"sub %[outptr1], %[outptr1], %[stride_w] \n"
"sub %[outptr2], %[outptr2], %[stride_w] \n"
"sub %[outptr3], %[outptr3], %[stride_w] \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[stride_w] "+r"(stride_w)
:
: "v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23");
#else
asm volatile(
"vld3.8 {d0, d1, d2}, [%[inptr0]]! @ zip load r0, d0 =00 01 02 03 "
"04 05 06 07\n"
"vld3.8 {d3, d4, d5}, [%[inptr1]]! @ zip load r1, d2 =10 11 12 13 "
"14 15 16 17\n"
"vld3.8 {d6, d7, d8}, [%[inptr2]]! @ zip load r1, d4 =20 21 22 23 "
"24 25 26 27\n"
"vld3.8 {d9, d10, d11}, [%[inptr3]]! @ zip load r1, d6 = 30 31 32 "
"33 34 35 36 37\n"
"vrev64.8 d12, d0 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d13, d1 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d14, d2 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d15, d3 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d16, d4 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d17, d5 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d18, d6 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d19, d7 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d20, d8 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d21, d9 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d22, d10 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d23, d11 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vst3.8 {d12, d13, d14}, [%[outptr0]] @ write "
"d0(q0,low),r00,r10 20 30\n"
"vst3.8 {d15, d16, d17}, [%[outptr1]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"vst3.8 {d18, d19, d20}, [%[outptr2]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"vst3.8 {d21, d22, d23}, [%[outptr3]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"sub %[outptr0], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr1], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr2], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr3], %[stride_w] @ ptr - stride_w \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[stride_w] "+r"(stride_w)
:
: "q0",
"q1",
"q2",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12");
#endif
}
outptr3 += stride_w - 3;
outptr2 += stride_w - 3;
outptr1 += stride_w - 3;
outptr0 += stride_w - 3;
for (; j < w_in; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 6;
case 1:
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 6;
case 2:
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 6;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
outptr3 -= 6;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 6;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 6;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 6;
}
}
}
}
void flip_hwc4_x(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
int h = h_in - 1;
int win = w_in * 4;
uint8_t zerobuff[40000];
memset(zerobuff, 0, win * sizeof(uint8_t));
uint8_t zerobuff2[40000];
memset(zerobuff2, 0, win * sizeof(uint8_t));
#pragma omp parallel for
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * win;
const uint8_t* inptr1 = inptr0 + win;
const uint8_t* inptr2 = inptr1 + win;
const uint8_t* inptr3 = inptr2 + win;
uint8_t* outptr0 = dst + (h - i) * win; // last
uint8_t* outptr1 = outptr0 - win;
uint8_t* outptr2 = outptr1 - win;
uint8_t* outptr3 = outptr2 - win;
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
outptr0 = zerobuff2;
case 2:
inptr1 = zerobuff;
outptr1 = zerobuff2;
case 1:
inptr2 = zerobuff;
outptr2 = zerobuff2;
case 0:
inptr3 = zerobuff;
outptr3 = zerobuff2;
default:
break;
}
}
int j = 0;
for (; j < w_in - 7; j += 8) {
#ifdef __aarch64__
asm volatile(
"ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[inptr0]], #32 \n" // v0={00,01,02,
// 03,
// 04,
// 05,
// 06,
// 07}"
"ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [%[inptr1]], #32 \n" // v0={10,11,12,
// 13,
// 14,
// 15,
// 16,
// 17}"
"ld4 {v8.8b, v9.8b, v10.8b, v11.8b}, [%[inptr2]], #32 \n" // v0={20,21,22,
// 23,
// 24,
// 25,
// 26,
// 27}"
"ld4 {v12.8b, v13.8b, v14.8b, v15.8b}, [%[inptr3]], #32 \n" // v0={30,31,32,
// 33,
// 34,
// 35,
// 36,
// 37}"
"st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[outptr0]], #32 \n" // 00 10 20
// 30 04 14
// 24 34
"st4 {v4.8b, v5.8b, v6.8b, v7.8b}, [%[outptr1]], #32 \n" // 02 12 22 32
"st4 {v8.8b, v9.8b, v10.8b, v11.8b}, [%[outptr2]], #32 \n" // 01 11 21 31
"st4 {v12.8b, v13.8b, v14.8b, v15.8b}, [%[outptr3]], #32 "
" \n" // 03 13 23 33
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3)
:
: "v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15");
#else
asm volatile(
"vld4.8 {d0, d1, d2, d3}, [%[inptr0]]! @ zip load r0, d0 =00 01 "
"02 03 04 05 06 07\n"
"vld4.8 {d4, d5, d6, d7}, [%[inptr1]]! @ zip load r1, d2 =10 11 "
"12 13 14 15 16 17\n"
"vld4.8 {d8, d9, d10, d11}, [%[inptr2]]! @ zip load r1, d4 =20 21 "
"22 23 24 25 26 27\n"
"vld4.8 {d12, d13, d14, d15}, [%[inptr3]]! @ zip load r1, d6 = 30 "
"31 32 33 34 35 36 37\n"
"vst4.8 {d0, d1, d2, d3}, [%[outptr0]]! @ write "
"d0(q0,low),r00,r10 20 30\n"
"vst4.8 {d4, d5, d6, d7}, [%[outptr1]]! @ write "
"d4(q0,low),r01,r11 21 31\n"
"vst4.8 {d8, d9, d10, d11}, [%[outptr2]]! @ write "
"d4(q0,low),r01,r11 21 31\n"
"vst4.8 {d12, d13, d14, d15}, [%[outptr3]]! @ write "
"d4(q0,low),r01,r11 21 31\n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3)
:
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
#endif
}
for (; j < w_in; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
case 1:
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
case 2:
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
}
}
}
}
void flip_hwc4_y(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
int win = w_in * 4;
uint8_t zerobuff[40000];
memset(zerobuff, 0, win * sizeof(uint8_t));
uint8_t zerobuff2[40000];
memset(zerobuff2, 0, win * sizeof(uint8_t));
int64_t stride_w = 32;
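  // stride_w = 8 pixels * 4 channels, one NEON block in bytes; as in the
  // 3-channel case, the output pointers walk backwards from the row end so
  // each row comes out mirrored.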
#pragma omp parallel for
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * win;
const uint8_t* inptr1 = inptr0 + win;
const uint8_t* inptr2 = inptr1 + win;
const uint8_t* inptr3 = inptr2 + win;
uint8_t* outptr0 = dst + (i + 1) * win - stride_w; // last col
uint8_t* outptr1 = outptr0 + win;
uint8_t* outptr2 = outptr1 + win;
uint8_t* outptr3 = outptr2 + win;
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
outptr0 = zerobuff2;
case 2:
inptr1 = zerobuff;
outptr1 = zerobuff2;
case 1:
inptr2 = zerobuff;
outptr2 = zerobuff2;
case 0:
inptr3 = zerobuff;
outptr3 = zerobuff2;
default:
break;
}
}
int j = 0;
for (; j < w_in - 7; j += 8) {
#ifdef __aarch64__
asm volatile(
"ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[inptr0]], #32 \n" // v0={00,01,02,
// 03,
// 04,
// 05,
// 06,
// 07}"
"ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [%[inptr1]], #32 \n" // v0={10,11,12,
// 13,
// 14,
// 15,
// 16,
// 17}"
"ld4 {v8.8b, v9.8b, v10.8b, v11.8b}, [%[inptr2]], #32 \n" // v0={20,21,22,
// 23,
// 24,
// 25,
// 26,
// 27}"
"ld4 {v12.8b, v13.8b, v14.8b, v15.8b}, [%[inptr3]], #32 \n" // v0={30,31,32,
// 33,
// 34,
// 35,
// 36,
// 37}"
"rev64 v16.8b, v0.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 b
"rev64 v17.8b, v1.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 g
"rev64 v18.8b, v2.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 r
"rev64 v19.8b, v3.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v20.8b, v4.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v21.8b, v5.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v22.8b, v6.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v23.8b, v7.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v0.8b, v8.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v1.8b, v9.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v2.8b, v10.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v3.8b, v11.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v4.8b, v12.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v5.8b, v13.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v6.8b, v14.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v7.8b, v15.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[outptr0]] \n" // 00 10 20 30 04 14 24 34
"st4 {v20.8b, v21.8b, v22.8b, v23.8b}, [%[outptr1]] \n" // 02 12 22 32
"st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[outptr2]] \n" // 01
// 11
// 21
// 31
"st4 {v4.8b, v5.8b, v6.8b, v7.8b}, [%[outptr3]] \n" // 03 13 23 33
"sub %[outptr0], %[outptr0], %[stride_w] \n" //@ ptr -
// stride_w
"sub %[outptr1], %[outptr1], %[stride_w] \n"
"sub %[outptr2], %[outptr2], %[stride_w] \n"
"sub %[outptr3], %[outptr3], %[stride_w] \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[stride_w] "+r"(stride_w)
:
: "v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23");
#else
asm volatile(
"vld4.8 {d0, d1, d2, d3}, [%[inptr0]]! @ zip load r0, d0 =00 01 "
"02 03 04 05 06 07\n"
"vld4.8 {d4, d5, d6, d7}, [%[inptr1]]! @ zip load r1, d2 =10 11 "
"12 13 14 15 16 17\n"
"vld4.8 {d8, d9, d10, d11}, [%[inptr2]]! @ zip load r1, d4 =20 "
"21 22 23 24 25 26 27\n"
"vld4.8 {d12, d13, d14, d15}, [%[inptr3]]! @ zip load r1, d6 = "
"30 31 32 33 34 35 36 37\n"
"vrev64.8 d16, d0 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d17, d1 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d18, d2 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d19, d3 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d20, d4 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d21, d5 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d22, d6 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d23, d7 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d0, d8 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d1, d9 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d2, d10 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d3, d11 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d4, d12 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d5, d13 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d6, d14 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d7, d15 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vst4.8 {d16, d17, d18, d19}, [%[outptr0]] @ write "
"d0(q0,low),r00,r10 20 30\n"
"vst4.8 {d20, d21, d22, d23}, [%[outptr1]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"vst4.8 {d0, d1, d2, d3}, [%[outptr2]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"vst4.8 {d4, d5, d6, d7}, [%[outptr3]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"sub %[outptr0], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr1], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr2], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr3], %[stride_w] @ ptr - stride_w \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[stride_w] "+r"(stride_w)
:
: "q0",
"q1",
"q2",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12");
#endif
}
outptr3 += stride_w - 4;
outptr2 += stride_w - 4;
outptr1 += stride_w - 4;
outptr0 += stride_w - 4;
for (; j < w_in; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 8;
case 1:
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 8;
case 2:
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 8;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
outptr3 -= 8;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 8;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 8;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 8;
}
}
}
}
void flip_hwc4_xy(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
int64_t stride_w = 32;
int win = w_in * 4;
uint8_t zerobuff[40000];
memset(zerobuff, 0, win * sizeof(uint8_t));
uint8_t zerobuff2[40000];
memset(zerobuff2, 0, win * sizeof(uint8_t));
#pragma omp parallel for
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * win;
const uint8_t* inptr1 = inptr0 + win;
const uint8_t* inptr2 = inptr1 + win;
const uint8_t* inptr3 = inptr2 + win;
uint8_t* outptr0 = dst + (h_in - i) * win - stride_w; // last col
uint8_t* outptr1 = outptr0 - win;
uint8_t* outptr2 = outptr1 - win;
uint8_t* outptr3 = outptr2 - win;
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
outptr0 = zerobuff2;
case 2:
inptr1 = zerobuff;
outptr1 = zerobuff2;
case 1:
inptr2 = zerobuff;
outptr2 = zerobuff2;
case 0:
inptr3 = zerobuff;
outptr3 = zerobuff2;
default:
break;
}
}
int j = 0;
for (; j < w_in - 7; j += 8) {
#ifdef __aarch64__
asm volatile(
"ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[inptr0]], #32 \n" // v0={00,01,02,
// 03,
// 04,
// 05,
// 06,
// 07}"
"ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [%[inptr1]], #32 \n" // v0={10,11,12,
// 13,
// 14,
// 15,
// 16,
// 17}"
"ld4 {v8.8b, v9.8b, v10.8b, v11.8b}, [%[inptr2]], #32 \n" // v0={20,21,22,
// 23,
// 24,
// 25,
// 26,
// 27}"
"ld4 {v12.8b, v13.8b, v14.8b, v15.8b}, [%[inptr3]], #32 \n" // v0={30,31,32,
// 33,
// 34,
// 35,
// 36,
// 37}"
"rev64 v16.8b, v0.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 b
"rev64 v17.8b, v1.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 g
"rev64 v18.8b, v2.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 r
"rev64 v19.8b, v3.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v20.8b, v4.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v21.8b, v5.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v22.8b, v6.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v23.8b, v7.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v0.8b, v8.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v1.8b, v9.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v2.8b, v10.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v3.8b, v11.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v4.8b, v12.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v5.8b, v13.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v6.8b, v14.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v7.8b, v15.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[outptr0]] \n" // 00 10 20 30 04 14 24 34
"st4 {v20.8b, v21.8b, v22.8b, v23.8b}, [%[outptr1]] \n" // 02 12 22 32
"st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[outptr2]] \n" // 01
// 11
// 21
// 31
"st4 {v4.8b, v5.8b, v6.8b, v7.8b}, [%[outptr3]] \n" // 03
// 13
// 23
// 33
"sub %[outptr0], %[outptr0], %[stride_w] \n" //@ ptr -
// stride_w
"sub %[outptr1], %[outptr1], %[stride_w] \n"
"sub %[outptr2], %[outptr2], %[stride_w] \n"
"sub %[outptr3], %[outptr3], %[stride_w] \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[stride_w] "+r"(stride_w)
:
: "v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23");
#else
asm volatile(
"vld4.8 {d0, d1, d2, d3}, [%[inptr0]]! @ zip load r0, d0 =00 01 "
"02 03 04 05 06 07\n"
"vld4.8 {d4, d5, d6, d7}, [%[inptr1]]! @ zip load r1, d2 =10 11 "
"12 13 14 15 16 17\n"
"vld4.8 {d8, d9, d10, d11}, [%[inptr2]]! @ zip load r1, d4 =20 "
"21 22 23 24 25 26 27\n"
"vld4.8 {d12, d13, d14, d15}, [%[inptr3]]! @ zip load r1, d6 = "
"30 31 32 33 34 35 36 37\n"
"vrev64.8 d16, d0 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d17, d1 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d18, d2 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d19, d3 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d20, d4 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d21, d5 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d22, d6 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d23, d7 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d0, d8 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d1, d9 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d2, d10 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d3, d11 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d4, d12 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d5, d13 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d6, d14 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d7, d15 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vst4.8 {d16, d17, d18, d19}, [%[outptr0]] @ write "
"d0(q0,low),r00,r10 20 30\n"
"vst4.8 {d20, d21, d22, d23}, [%[outptr1]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"vst4.8 {d0, d1, d2, d3}, [%[outptr2]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"vst4.8 {d4, d5, d6, d7}, [%[outptr3]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"sub %[outptr0], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr1], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr2], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr3], %[stride_w] @ ptr - stride_w \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[stride_w] "+r"(stride_w)
:
: "q0",
"q1",
"q2",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12");
#endif
}
outptr3 += stride_w - 4;
outptr2 += stride_w - 4;
outptr1 += stride_w - 4;
outptr0 += stride_w - 4;
for (; j < w_in; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 8;
case 1:
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 8;
case 2:
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 8;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
outptr3 -= 8;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 8;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 8;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 8;
}
}
}
}
} // namespace cv
} // namespace utils
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
#include <vector>
#include "lite/utils/cv/paddle_image_preprocess.h"
namespace paddle {
namespace lite {
namespace utils {
namespace cv {
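// Flip a single-channel (hwc1), 3-channel (hwc3) or 4-channel (hwc4) HWC
// image of size srcw x srch into dst, flipping about the x axis, the y axis,
// or both, as selected by flip_param (see paddle_image_preprocess.h).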
void flip_hwc1(
const uint8_t* src, uint8_t* dst, int srcw, int srch, FlipParam flip_param);
void flip_hwc3(
const uint8_t* src, uint8_t* dst, int srcw, int srch, FlipParam flip_param);
void flip_hwc4(
const uint8_t* src, uint8_t* dst, int srcw, int srch, FlipParam flip_param);
} // namespace cv
} // namespace utils
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// ncnn license
// Tencent is pleased to support the open source community by making ncnn
// available.
//
// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this
// file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software
// distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
#include "lite/utils/cv/image_resize.h"
#include <arm_neon.h>
#include <math.h>
#include <algorithm>
namespace paddle {
namespace lite {
namespace utils {
namespace cv {
void compute_xy(int srcw,
int srch,
int dstw,
int dsth,
double scale_x,
double scale_y,
int* xofs,
int* yofs,
int16_t* ialpha,
int16_t* ibeta);
// use bilinear method to resize
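// Implementation notes: compute_xy() precomputes the source indices and the
// interpolation weights as int16 fixed-point values scaled by
// 1 << resize_coef_bits (see below). Each destination row is built from two
// horizontally interpolated source rows (rows0/rows1), which are then blended
// vertically with the ibeta weights in the NEON loop.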
void resize(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
int srcw,
int srch,
int dstw,
int dsth) {
int size = srcw * srch;
if (srcw == dstw && srch == dsth) {
if (srcFormat == NV12 || srcFormat == NV21) {
size = srcw * (floor(1.5 * srch));
} else if (srcFormat == BGR || srcFormat == RGB) {
size = 3 * srcw * srch;
} else if (srcFormat == BGRA || srcFormat == RGBA) {
size = 4 * srcw * srch;
}
memcpy(dst, src, sizeof(uint8_t) * size);
return;
}
  double scale_x = static_cast<double>(srcw) / dstw;
  double scale_y = static_cast<double>(srch) / dsth;
int* buf = new int[dstw * 2 + dsth * 2];
int* xofs = buf;
int* yofs = buf + dstw;
int16_t* ialpha = reinterpret_cast<int16_t*>(buf + dstw + dsth);
int16_t* ibeta = reinterpret_cast<int16_t*>(buf + 2 * dstw + dsth);
compute_xy(
srcw, srch, dstw, dsth, scale_x, scale_y, xofs, yofs, ialpha, ibeta);
int w_out = dstw;
int w_in = srcw;
int num = 1;
int orih = dsth;
if (srcFormat == GRAY) {
num = 1;
} else if (srcFormat == NV12 || srcFormat == NV21) {
num = 1;
int hout = static_cast<int>(0.5 * dsth);
dsth += hout;
} else if (srcFormat == BGR || srcFormat == RGB) {
w_in = srcw * 3;
w_out = dstw * 3;
num = 3;
} else if (srcFormat == BGRA || srcFormat == RGBA) {
w_in = srcw * 4;
w_out = dstw * 4;
num = 4;
}
int* xofs1 = nullptr;
int* yofs1 = nullptr;
int16_t* ialpha1 = nullptr;
if (orih < dsth) { // uv
int tmp = dsth - orih;
int w = dstw / 2;
xofs1 = new int[w];
yofs1 = new int[tmp];
ialpha1 = new int16_t[srcw];
compute_xy(srcw / 2,
srch / 2,
w,
tmp,
scale_x,
scale_y,
xofs1,
yofs1,
ialpha1,
ibeta + orih);
}
int cnt = w_out >> 3;
int remain = w_out % 8;
int32x4_t _v2 = vdupq_n_s32(2);
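  // _v2 holds the +2 rounding bias added before the final >>2 shift in the
  // vertical-blend assembly below; the aarch64 path reads it directly, while
  // the armv7 path rebuilds the same constant in q12.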
#pragma omp parallel for
for (int dy = 0; dy < dsth; dy++) {
int16_t* rowsbuf0 = new int16_t[w_out];
int16_t* rowsbuf1 = new int16_t[w_out];
int sy = yofs[dy];
if (dy >= orih) {
xofs = xofs1;
yofs = yofs1;
ialpha = ialpha1;
}
if (sy < 0) {
      memset(rowsbuf0, 0, sizeof(int16_t) * w_out);
      const uint8_t* S1 = src + w_in * (sy + 1);
const int16_t* ialphap = ialpha;
int16_t* rows1p = rowsbuf1;
for (int dx = 0; dx < dstw; dx++) {
int sx = xofs[dx] * num; // num = 4
int16_t a0 = ialphap[0];
int16_t a1 = ialphap[1];
const uint8_t* S1pl = S1 + sx;
const uint8_t* S1pr = S1 + sx + num;
if (sx < 0) {
S1pl = S1;
}
for (int i = 0; i < num; i++) {
if (sx < 0) {
*rows1p++ = ((*S1pl++) * a1) >> 4;
} else {
*rows1p++ = ((*S1pl++) * a0 + (*S1pr++) * a1) >> 4;
}
}
ialphap += 2;
}
} else {
// hresize two rows
const uint8_t* S0 = src + w_in * (sy);
const uint8_t* S1 = src + w_in * (sy + 1);
const int16_t* ialphap = ialpha;
int16_t* rows0p = rowsbuf0;
int16_t* rows1p = rowsbuf1;
for (int dx = 0; dx < dstw; dx++) {
int sx = xofs[dx] * num; // num = 4
int16_t a0 = ialphap[0];
int16_t a1 = ialphap[1];
const uint8_t* S0pl = S0 + sx;
const uint8_t* S0pr = S0 + sx + num;
const uint8_t* S1pl = S1 + sx;
const uint8_t* S1pr = S1 + sx + num;
if (sx < 0) {
S0pl = S0;
S1pl = S1;
}
for (int i = 0; i < num; i++) {
if (sx < 0) {
*rows0p = ((*S0pl++) * a1) >> 4;
*rows1p = ((*S1pl++) * a1) >> 4;
rows0p++;
rows1p++;
} else {
*rows0p++ = ((*S0pl++) * a0 + (*S0pr++) * a1) >> 4;
*rows1p++ = ((*S1pl++) * a0 + (*S1pr++) * a1) >> 4;
}
}
ialphap += 2;
}
}
int ind = dy * 2;
int16_t b0 = ibeta[ind];
int16_t b1 = ibeta[ind + 1];
int16x8_t _b0 = vdupq_n_s16(b0);
int16x8_t _b1 = vdupq_n_s16(b1);
uint8_t* dp_ptr = dst + dy * w_out;
int16_t* rows0p = rowsbuf0;
int16_t* rows1p = rowsbuf1;
int re_cnt = cnt;
if (re_cnt > 0) {
#ifdef __aarch64__
asm volatile(
"1: \n"
"ld1 {v0.8h}, [%[rows0p]], #16 \n"
"ld1 {v1.8h}, [%[rows1p]], #16 \n"
"orr v6.16b, %w[_v2].16b, %w[_v2].16b \n"
"orr v7.16b, %w[_v2].16b, %w[_v2].16b \n"
"smull v2.4s, v0.4h, %w[_b0].4h \n"
"smull2 v4.4s, v0.8h, %w[_b0].8h \n"
"smull v3.4s, v1.4h, %w[_b1].4h \n"
"smull2 v5.4s, v1.8h, %w[_b1].8h \n"
"ssra v6.4s, v2.4s, #16 \n"
"ssra v7.4s, v4.4s, #16 \n"
"ssra v6.4s, v3.4s, #16 \n"
"ssra v7.4s, v5.4s, #16 \n"
"shrn v0.4h, v6.4s, #2 \n"
"shrn2 v0.8h, v7.4s, #2 \n"
"subs %w[cnt], %w[cnt], #1 \n"
"sqxtun v1.8b, v0.8h \n"
"st1 {v1.8b}, [%[dp]], #8 \n"
"bne 1b \n"
: [rows0p] "+r"(rows0p),
[rows1p] "+r"(rows1p),
[cnt] "+r"(re_cnt),
[dp] "+r"(dp_ptr)
: [_b0] "w"(_b0), [_b1] "w"(_b1), [_v2] "w"(_v2)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
#else
asm volatile(
"mov r4, #2 \n"
"vdup.s32 q12, r4 \n"
"0: \n"
"vld1.s16 {d2-d3}, [%[rows0p]]!\n"
"vld1.s16 {d6-d7}, [%[rows1p]]!\n"
"vorr.s32 q10, q12, q12 \n"
"vorr.s32 q11, q12, q12 \n"
"vmull.s16 q0, d2, %[_b0] \n"
"vmull.s16 q1, d3, %[_b0] \n"
"vmull.s16 q2, d6, %[_b1] \n"
"vmull.s16 q3, d7, %[_b1] \n"
"vsra.s32 q10, q0, #16 \n"
"vsra.s32 q11, q1, #16 \n"
"vsra.s32 q10, q2, #16 \n"
"vsra.s32 q11, q3, #16 \n"
"vshrn.s32 d20, q10, #2 \n"
"vshrn.s32 d21, q11, #2 \n"
"subs %[cnt], #1 \n"
"vqmovun.s16 d20, q10 \n"
"vst1.8 {d20}, [%[dp]]! \n"
"bne 0b \n"
: [rows0p] "+r"(rows0p),
[rows1p] "+r"(rows1p),
[cnt] "+r"(re_cnt),
[dp] "+r"(dp_ptr)
: [_b0] "w"(_b0), [_b1] "w"(_b1)
: "cc",
"memory",
"r4",
"q0",
"q1",
"q2",
"q3",
"q8",
"q9",
"q10",
"q11",
"q12");
#endif // __aarch64__
}
for (int i = 0; i < remain; i++) {
// D[x] = (rows0[x]*b0 + rows1[x]*b1) >>
// INTER_RESIZE_COEF_BITS;
*dp_ptr++ =
(uint8_t)(((int16_t)((b0 * (int16_t)(*rows0p++)) >> 16) +
(int16_t)((b1 * (int16_t)(*rows1p++)) >> 16) + 2) >>
2);
}
    delete[] rowsbuf0;
    delete[] rowsbuf1;
  }
  delete[] buf;
  delete[] xofs1;
  delete[] yofs1;
  delete[] ialpha1;
}
// compute xofs, yofs, alpha, beta
void compute_xy(int srcw,
int srch,
int dstw,
int dsth,
double scale_x,
double scale_y,
int* xofs,
int* yofs,
int16_t* ialpha,
int16_t* ibeta) {
float fy = 0.f;
float fx = 0.f;
int sy = 0;
int sx = 0;
const int resize_coef_bits = 11;
const int resize_coef_scale = 1 << resize_coef_bits;
#define SATURATE_CAST_SHORT(X) \
(int16_t)::std::min( \
::std::max(static_cast<int>(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), \
SHRT_MAX);
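  // For every destination pixel, store the clamped left/top source index and
  // the pair of fixed-point weights for it and its right/bottom neighbour.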
for (int dx = 0; dx < dstw; dx++) {
fx = static_cast<float>((dx + 0.5) * scale_x - 0.5);
sx = floor(fx);
fx -= sx;
if (sx < 0) {
sx = 0;
fx = 0.f;
}
if (sx >= srcw - 1) {
sx = srcw - 2;
fx = 1.f;
}
xofs[dx] = sx;
float a0 = (1.f - fx) * resize_coef_scale;
float a1 = fx * resize_coef_scale;
ialpha[dx * 2] = SATURATE_CAST_SHORT(a0);
ialpha[dx * 2 + 1] = SATURATE_CAST_SHORT(a1);
}
for (int dy = 0; dy < dsth; dy++) {
fy = static_cast<float>((dy + 0.5) * scale_y - 0.5);
sy = floor(fy);
fy -= sy;
if (sy < 0) {
sy = 0;
fy = 0.f;
}
if (sy >= srch - 1) {
sy = srch - 2;
fy = 1.f;
}
yofs[dy] = sy;
float b0 = (1.f - fy) * resize_coef_scale;
float b1 = fy * resize_coef_scale;
ibeta[dy * 2] = SATURATE_CAST_SHORT(b0);
ibeta[dy * 2 + 1] = SATURATE_CAST_SHORT(b1);
}
#undef SATURATE_CAST_SHORT
}
} // namespace cv
} // namespace utils
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// ncnn license
// Tencent is pleased to support the open source community by making ncnn
// available.
//
// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this
// file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software
// distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
#pragma once
#include <math.h>
#include <stdint.h>
#include "lite/utils/cv/paddle_image_preprocess.h"
namespace paddle {
namespace lite {
namespace utils {
namespace cv {
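// Bilinear resize of a GRAY, NV12/NV21, RGB/BGR or RGBA/BGRA image from
// srcw x srch to dstw x dsth; when the sizes already match, the image is
// copied through unchanged.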
void resize(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
int srcw,
int srch,
int dstw,
int dsth);
} // namespace cv
} // namespace utils
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/utils/cv/image_rotate.h"
#include <math.h>
#include <string.h>
namespace paddle {
namespace lite {
namespace utils {
namespace cv {
// gray
void rotate_hwc1_90(
const uint8_t* src, uint8_t* dst, int w_in, int h_in, int w_out, int h_out);
void rotate_hwc1_180(
const uint8_t* src, uint8_t* dst, int w_in, int h_in, int w_out, int h_out);
void rotate_hwc1_270(
const uint8_t* src, uint8_t* dst, int w_in, int h_in, int w_out, int h_out);
// bgr rgb
void rotate_hwc3_90(
const uint8_t* src, uint8_t* dst, int w_in, int h_in, int w_out, int h_out);
void rotate_hwc3_180(
const uint8_t* src, uint8_t* dst, int w_in, int h_in, int w_out, int h_out);
void rotate_hwc3_270(
const uint8_t* src, uint8_t* dst, int w_in, int h_in, int w_out, int h_out);
// rgba bgra
void rotate_hwc4_90(
const uint8_t* src, uint8_t* dst, int w_in, int h_in, int w_out, int h_out);
void rotate_hwc4_180(
const uint8_t* src, uint8_t* dst, int w_in, int h_in, int w_out, int h_out);
void rotate_hwc4_270(
const uint8_t* src, uint8_t* dst, int w_in, int h_in, int w_out, int h_out);
void rotate_hwc1(
const uint8_t* src, uint8_t* dst, int srcw, int srch, float degree) {
if (degree == 90) {
rotate_hwc1_90(src, dst, srcw, srch, srch, srcw);
} else if (degree == 180) {
rotate_hwc1_180(src, dst, srcw, srch, srcw, srch);
} else if (degree == 270) {
rotate_hwc1_270(src, dst, srcw, srch, srch, srcw);
}
}
void rotate_hwc3(
const uint8_t* src, uint8_t* dst, int srcw, int srch, float degree) {
if (degree == 90) {
rotate_hwc3_90(src, dst, srcw, srch, srch, srcw);
} else if (degree == 180) {
rotate_hwc3_180(src, dst, srcw, srch, srcw, srch);
} else if (degree == 270) {
rotate_hwc3_270(src, dst, srcw, srch, srch, srcw);
}
}
void rotate_hwc4(
const uint8_t* src, uint8_t* dst, int srcw, int srch, float degree) {
if (degree == 90) {
rotate_hwc4_90(src, dst, srcw, srch, srch, srcw);
} else if (degree == 180) {
rotate_hwc4_180(src, dst, srcw, srch, srcw, srch);
} else if (degree == 270) {
rotate_hwc4_270(src, dst, srcw, srch, srch, srcw);
}
}
#ifdef __aarch64__
#define INPUT_C1 \
"ld1 {v0.8b}, [%[inptr0]] \n" \
"ld1 {v4.8b}, [%[inptr1]] \n" \
"ld1 {v8.8b}, [%[inptr2]] \n" \
"ld1 {v12.8b}, [%[inptr3]] \n"
#define INPUT_C3 \
"ld3 {v0.8b, v1.8b, v2.8b}, [%[inptr0]] \n" \
"ld3 {v4.8b, v5.8b, v6.8b}, [%[inptr1]] \n" \
"ld3 {v8.8b, v9.8b, v10.8b}, [%[inptr2]] \n" \
"ld3 {v12.8b, v13.8b, v14.8b}, [%[inptr3]] \n"
#define ADD_INPUT \
"add %[inptr0], %[inptr0], %[stride_h] \n" \
"add %[inptr1], %[inptr1], %[stride_h] \n" \
"add %[inptr2], %[inptr2], %[stride_h] \n" \
"add %[inptr3], %[inptr3], %[stride_h] \n"
#define SUB_INPUT \
"sub %[inptr0], %[inptr0], %[stride_h_w] \n" /* 4 - 4*w_in + 8 */ \
"sub %[inptr1], %[inptr1], %[stride_h_w] \n" /* 5 - 4*w_in + 8 */ \
"sub %[inptr2], %[inptr2], %[stride_h_w] \n" /* 6 - 4*w_in + 8 */ \
"sub %[inptr3], %[inptr3], %[stride_h_w] \n" /* 7 - 4*w_in + 8 */
#define TRANS_C1_8 \
"trn1 v1.8b, v0.8b, v4.8b \n" /* b v4=00 10 02 12 04 14 06 16*/ \
"trn1 v5.8b, v8.8b, v12.8b \n" /* v4={20 30 22 32 24 34 26 36 */ \
"trn2 v2.8b, v0.8b, v4.8b \n" /* v5={01 11 03 13 05 15 07 17 */ \
"trn2 v6.8b, v8.8b, v12.8b \n" /* v7={21 31 23 33 25 35 27 37 */
#define TRANS_C1_16 \
"trn1 v9.4h, v1.4h, v5.4h \n" \
"trn1 v13.4h, v2.4h, v6.4h \n" /* v22=01 11 21 31 05 15 25 35 */ \
"trn2 v10.4h, v1.4h, v5.4h \n" /* v21=02 12 22 32 06 16 26 36*/ \
"trn2 v14.4h, v2.4h, v6.4h \n" /* v23=03 13 23 33 07 17 27 37 */
#define TRANS_C1 \
"trn1 v0.4h, v1.4h, v5.4h \n" /* b v4=40 50 60 70 04 14 24 34 */ \
"trn1 v8.4h, v2.4h, v6.4h \n" /* v4=41 11 21 31 05 15 25 35 */ \
"trn2 v4.4h, v1.4h, v5.4h \n" /* v5=42 12 22 32 06 16 26 36*/ \
"trn2 v12.4h, v2.4h, v6.4h \n" /* v7=43 13 23 33 07 17 27 37 */ \
"trn1 v1.2s, v9.2s, v0.2s \n" /* b v1=00 10 20 30 40 50 60 */ \
"trn1 v2.2s, v13.2s, v8.2s \n" /* v2= 01 11 21 31 41 51 61 71 */ \
"trn1 v5.2s, v10.2s, v4.2s \n" /* b v5=02 12 22 32 42 52 62 */ \
"trn1 v6.2s, v14.2s, v12.2s \n" /* v6=03 13 23 33 43 53 63 73*/ \
\
"trn2 v3.2s, v9.2s, v0.2s \n" /* v3=04 14 24 34 44 54 64 74*/ \
"trn2 v7.2s, v13.2s, v8.2s \n" /* v7=05 15 25 35 45 55 65 75*/ \
"trn2 v11.2s, v10.2s, v4.2s \n" /* v11=06 16 26 36 46 56 66 */ \
"trn2 v15.2s, v14.2s, v12.2s \n" /* v15=07 17 27 37 47 57 67 */
#define REVERSE_C1 \
"rev64 v0.8b, v1.8b \n" \
"rev64 v4.8b, v2.8b \n" \
"rev64 v8.8b, v5.8b \n" \
"rev64 v12.8b, v6.8b \n" \
\
"rev64 v1.8b, v3.8b \n" \
"rev64 v5.8b, v7.8b \n" \
"rev64 v9.8b, v11.8b \n" \
"rev64 v13.8b, v15.8b \n"
#define STORE_C1_R \
"st1 {v0.8b}, [%[outptr0]] \n" /* b v1=00 10 20 30 40 50 60*/ \
"st1 {v4.8b}, [%[outptr1]] \n" /* v2=01 11 21 31 41 51 61 71*/ \
"st1 {v8.8b}, [%[outptr2]] \n" /* b v5=02 12 22 32 42 52 62 */ \
"st1 {v12.8b}, [%[outptr3]] \n" /* v6=03 13 23 33 43 53 63 73*/ \
\
"st1 {v1.8b}, [%[outptr4]] \n" /* v3=04 14 24 34 44 54 64 74}*/ \
"st1 {v5.8b}, [%[outptr5]] \n" /* v7=05 15 25 35 45 55 65 75}*/ \
"st1 {v9.8b}, [%[outptr6]] \n" /* v11=06 16 26 36 46 56 66 */ \
"st1 {v13.8b}, [%[outptr7]] \n" /* v15=07 17 27 37 47 57 67 */
#define STORE_C1 \
"st1 {v1.8b}, [%[outptr0]] \n" /* b v1=00 10 20 30 40 50 60 */ \
"st1 {v2.8b}, [%[outptr1]] \n" /* v2=01 11 21 31 41 51 61 71*/ \
"st1 {v5.8b}, [%[outptr2]] \n" /* b v5=02 12 22 32 42 52 62 */ \
"st1 {v6.8b}, [%[outptr3]] \n" /* v6=03 13 23 33 43 53 63 73}*/ \
\
"st1 {v3.8b}, [%[outptr4]] \n" /* v3=04 14 24 34 44 54 64 74*/ \
"st1 {v7.8b}, [%[outptr5]] \n" /* v7=05 15 25 35 45 55 65 75*/ \
"st1 {v11.8b}, [%[outptr6]] \n" /* v11=06 16 26 36 46 56 66 */ \
"st1 {v15.8b}, [%[outptr7]] \n" /* v15=07 17 27 37 47 57 67*/
#define TRANS_C3_8 \
"trn1 v3.8b, v0.8b, v4.8b \n" /* b v4=00 10 02 12 04 14 06 16 */ \
"trn1 v7.8b, v8.8b, v12.8b \n" /* v4=20 30 22 32 24 34 26 36 */ \
"trn2 v11.8b, v0.8b, v4.8b \n" /* v5=01 11 03 13 05 15 07 17 */ \
"trn2 v15.8b, v8.8b, v12.8b \n" /* v7=21 31 23 33 25 35 27 37*/ \
\
"trn1 v16.8b, v1.8b, v5.8b \n" \
"trn1 v18.8b, v9.8b, v13.8b \n" /* v4=20 30 22 32 24 34 26 36 */ \
"trn2 v17.8b, v1.8b, v5.8b \n" /* v5={01 11 03 13 05 15 07 17 */ \
"trn2 v19.8b, v9.8b, v13.8b \n" /* v7=21 31 23 33 25 35 27 37 */ \
\
"trn1 v20.8b, v2.8b, v6.8b \n" \
"trn1 v22.8b, v10.8b, v14.8b \n" \
"trn2 v21.8b, v2.8b, v6.8b \n" /* v5=01 11 03 13 05 15 07 17 */ \
"trn2 v23.8b, v10.8b, v14.8b \n"
#define TRANS_C3_16 \
"trn1 v24.4h, v3.4h, v7.4h \n" \
"trn1 v26.4h, v11.4h, v15.4h \n" /* v4=01 11 21 31 05 15 25 35*/ \
"trn2 v25.4h, v3.4h, v7.4h \n" /* v5=02 12 22 32 06 16 26 36*/ \
"trn2 v27.4h, v11.4h, v15.4h \n" \
\
"trn1 v28.4h, v16.4h, v18.4h \n" /* g v4=00 10 20 30 04 14 24 */ \
"trn1 v30.4h, v17.4h, v19.4h \n" \
"trn2 v29.4h, v16.4h, v18.4h \n" /* v5=02 12 22 32 06 16 26 */ \
"trn2 v31.4h, v17.4h, v19.4h \n" \
\
"trn1 v16.4h, v20.4h, v22.4h \n" /* r v4=00 10 20 30 04 14 24 */ \
"trn1 v18.4h, v21.4h, v23.4h \n" \
"trn2 v17.4h, v20.4h, v22.4h \n" /* v5=02 12 22 32 06 16 26 */ \
"trn2 v19.4h, v21.4h, v23.4h \n"
#define TRANS_C3 \
"trn1 v3.8b, v0.8b, v4.8b \n" /* b v4=40 50 42 52 04 14 06 16 */ \
"trn1 v7.8b, v8.8b, v12.8b \n" /* v4=60 70 62 72 24 34 26 36 */ \
"trn2 v11.8b, v0.8b, v4.8b \n" /* v5=41 51 03 13 05 15 07 17 */ \
"trn2 v15.8b, v8.8b, v12.8b \n" /* v7=61 71 23 33 25 35 27 37 */ \
\
"trn1 v20.8b, v2.8b, v6.8b \n" \
"trn1 v22.8b, v10.8b, v14.8b \n" \
"trn2 v21.8b, v2.8b, v6.8b \n" /* v5=41 51 03 13 05 15 07 17 */ \
"trn2 v23.8b, v10.8b, v14.8b \n" \
\
"trn1 v0.4h, v3.4h, v7.4h \n" /* b v4=40 50 60 70 04 14 24 34 */ \
"trn1 v4.4h, v11.4h, v15.4h \n" /* v4=41 51 61 71 05 15 25 35 */ \
"trn2 v8.4h, v3.4h, v7.4h \n" /* v5=42 52 62 72 06 16 26 36*/ \
"trn2 v12.4h, v11.4h, v15.4h \n" \
\
"trn1 v2.4h, v20.4h, v22.4h \n" /* r v4=40 50 60 70 */ \
"trn1 v6.4h, v21.4h, v23.4h \n" /* v4=41 51 61 71 */ \
"trn2 v10.4h, v20.4h, v22.4h \n" /* v5=42 52 62 72 */ \
"trn2 v14.4h, v21.4h, v23.4h \n" /* v7=43 53 63 73 */ \
\
"trn1 v20.2s, v24.2s, v0.2s \n" \
"trn1 v21.2s, v26.2s, v4.2s \n" /* v4=01 11 21 31 41 51 61 71 */ \
"trn1 v22.2s, v25.2s, v8.2s \n" /* v5=02 12 22 32 42 52 62 72 */ \
"trn1 v23.2s, v27.2s, v12.2s \n" \
\
"trn2 v3.2s, v24.2s, v0.2s \n" \
"trn2 v7.2s, v26.2s, v4.2s \n" /* v4=05 11 21 31 41 51 61 71 */ \
"trn2 v11.2s, v25.2s, v8.2s \n" /* v5=06 12 22 32 42 52 62 72 */ \
"trn2 v15.2s, v27.2s, v12.2s \n" /* v7=07 13 23 33 43 53 63 */ \
\
"trn1 v0.2s, v16.2s, v2.2s \n" /* r v4=00 10 20 30 40 50 60 */ \
"trn1 v4.2s, v18.2s, v6.2s \n" /* v4=01 11 21 31 41 51 61 71 */ \
"trn1 v8.2s, v17.2s, v10.2s \n" /* v5=02 12 22 32 42 52 62 72 */ \
"trn1 v12.2s, v19.2s, v14.2s \n" /* v7=03 13 23 33 43 53 63 */ \
\
"trn2 v24.2s, v16.2s, v2.2s \n" /* r v4=04 10 20 30 40 50 60 */ \
"trn2 v25.2s, v18.2s, v6.2s \n" /* v4=05 11 21 31 41 51 61 71 */ \
"trn2 v26.2s, v17.2s, v10.2s \n" /* v5=06 12 22 32 42 52 62 */ \
"trn2 v27.2s, v19.2s, v14.2s \n" /* v7=07 13 23 33 43 53 63 */ \
\
"trn1 v16.8b, v1.8b, v5.8b \n" /* g v4={00 10 02 12 04 14 06 */ \
"trn1 v18.8b, v9.8b, v13.8b \n" /* v4={20 30 22 32 24 34 26 */ \
"trn2 v17.8b, v1.8b, v5.8b \n" /* v5={01 11 03 13 05 15 07 17 */ \
"trn2 v19.8b, v9.8b, v13.8b \n" /* v7={21 31 23 33 25 35 27 */ \
\
"sub %[inptr0], %[inptr0], %[stride_h_w] \n" /* 4 - 4*w_in + 8 */ \
"sub %[inptr1], %[inptr1], %[stride_h_w] \n" /* 5 - 4*w_in + 8 */ \
"sub %[inptr2], %[inptr2], %[stride_h_w] \n" /* 6 - 4*w_in + 8 */ \
"sub %[inptr3], %[inptr3], %[stride_h_w] \n" /* 7 - 4*w_in + 8 */ \
\
"trn1 v1.4h, v16.4h, v18.4h \n" /* g v4={00 10 20 30 04 14 24*/ \
"trn1 v5.4h, v17.4h, v19.4h \n" /* v4={ 01 11 21 31 05 15 25 */ \
"trn2 v9.4h, v16.4h, v18.4h \n" /* v5={02 12 22 32 06 16 26 36*/ \
"trn2 v13.4h, v17.4h, v19.4h \n" /* v7={03 13 23 33 07 17 27 */ \
\
"trn1 v2.2s, v28.2s, v1.2s \n" /* g v4=00 10 20 30 40 50 60 */ \
"trn1 v6.2s, v30.2s, v5.2s \n" /* v4=01 11 21 31 41 51 61 71 */ \
"trn1 v10.2s, v29.2s, v9.2s \n" /* v5=02 12 22 32 42 52 62 72 */ \
"trn1 v14.2s, v31.2s, v13.2s \n" /* v7=03 13 23 33 43 53 63 */ \
\
"trn2 v16.2s, v28.2s, v1.2s \n" /* g v4=04 10 20 30 40 50 60 */ \
"trn2 v17.2s, v30.2s, v5.2s \n" /* v4=05 11 21 31 41 51 61 71 */ \
"trn2 v18.2s, v29.2s, v9.2s \n" /* v5=06 12 22 32 42 52 62 72 */ \
"trn2 v19.2s, v31.2s, v13.2s \n" /* v7=07 13 23 33 43 53 63 */
#define REVERSE_C3 \
"rev64 v28.8b, v20.8b \n" /* b 00 10 20 30 40 50 60 70*/ \
"rev64 v29.8b, v2.8b \n" /* g 00 10 20 30 40 50 60 70*/ \
"rev64 v30.8b, v0.8b \n" /* r 00 10 20 30 40 50 60 70*/ \
\
"rev64 v0.8b, v21.8b \n" /* b 01 11 21 31 41 51 61 71 */ \
"rev64 v1.8b, v6.8b \n" /* g 01 11 21 31 41 51 61 71 */ \
"rev64 v2.8b, v4.8b \n" /* r 01 11 21 31 41 51 61 71 */ \
\
"rev64 v4.8b, v22.8b \n" /* b 02 12 22 32 42 52 62 72 */ \
"rev64 v5.8b, v10.8b \n" /* g 02 12 22 32 42 52 62 72*/ \
"rev64 v6.8b, v8.8b \n" /* r 02 12 22 32 42 52 62 72 */ \
\
"rev64 v8.8b, v23.8b \n" /* b 03 13 23 33 43 53 63 73 */ \
"rev64 v9.8b, v14.8b \n" /* g 03 13 23 33 43 53 63 73 */ \
"rev64 v10.8b, v12.8b \n" /* r 03 13 23 33 43 53 63 73 */ \
\
"rev64 v12.8b, v3.8b \n" /* b 04 14 20 30 40 50 60 70 */ \
"rev64 v13.8b, v16.8b \n" /* g 04 14 20 30 40 50 60 70 */ \
"rev64 v14.8b, v24.8b \n" /* r 04 14 20 30 40 50 60 70 */ \
\
"rev64 v20.8b, v7.8b \n" /* b 05 15 20 30 40 50 60 70 */ \
"rev64 v21.8b, v17.8b \n" /* g 05 15 20 30 40 50 60 70 */ \
"rev64 v22.8b, v25.8b \n" /* r 05 15 20 30 40 50 60 70 */ \
\
"rev64 v23.8b, v11.8b \n" /* b 06 15 20 30 40 50 60 70 */ \
"rev64 v24.8b, v18.8b \n" /* g 06 15 20 30 40 50 60 70 */ \
"rev64 v25.8b, v26.8b \n" /* r 06 15 20 30 40 50 60 70 */ \
\
"rev64 v16.8b, v15.8b \n" /* b 07 15 20 30 40 50 60 70 */ \
"rev64 v17.8b, v19.8b \n" /* g 07 15 20 30 40 50 60 70 */ \
"rev64 v18.8b, v27.8b \n" /* r 07 15 20 30 40 50 60 70 */
#define MOV_C3 \
"mov v28.8b, v20.8b \n" /* b 00 10 20 30 40 50 60 70*/ \
"mov v29.8b, v2.8b \n" /* g 00 10 20 30 40 50 60 70*/ \
"mov v30.8b, v0.8b \n" /* r 00 10 20 30 40 50 60 70*/ \
\
"mov v0.8b, v21.8b \n" /* b 01 11 21 31 41 51 61 71 */ \
"mov v1.8b, v6.8b \n" /* g 01 11 21 31 41 51 61 71 */ \
"mov v2.8b, v4.8b \n" /* r 01 11 21 31 41 51 61 71 */ \
\
"mov v4.8b, v22.8b \n" /* b 02 12 22 32 42 52 62 72 */ \
"mov v5.8b, v10.8b \n" /* g 02 12 22 32 42 52 62 72*/ \
"mov v6.8b, v8.8b \n" /* r 02 12 22 32 42 52 62 72 */ \
\
"mov v8.8b, v23.8b \n" /* b 03 13 23 33 43 53 63 73 */ \
"mov v9.8b, v14.8b \n" /* g 03 13 23 33 43 53 63 73 */ \
"mov v10.8b, v12.8b \n" /* r 03 13 23 33 43 53 63 73 */ \
\
"mov v12.8b, v3.8b \n" /* b 04 14 20 30 40 50 60 70 */ \
"mov v13.8b, v16.8b \n" /* g 04 14 20 30 40 50 60 70 */ \
"mov v14.8b, v24.8b \n" /* r 04 14 20 30 40 50 60 70 */ \
\
"mov v20.8b, v7.8b \n" /* b 05 15 20 30 40 50 60 70 */ \
"mov v21.8b, v17.8b \n" /* g 05 15 20 30 40 50 60 70 */ \
"mov v22.8b, v25.8b \n" /* r 05 15 20 30 40 50 60 70 */ \
\
"mov v23.8b, v11.8b \n" /* b 06 15 20 30 40 50 60 70 */ \
"mov v24.8b, v18.8b \n" /* g 06 15 20 30 40 50 60 70 */ \
"mov v25.8b, v26.8b \n" /* r 06 15 20 30 40 50 60 70 */ \
\
"mov v16.8b, v15.8b \n" /* b 07 15 20 30 40 50 60 70 */ \
"mov v17.8b, v19.8b \n" /* g 07 15 20 30 40 50 60 70 */ \
"mov v18.8b, v27.8b \n" /* r 07 15 20 30 40 50 60 70 */
#define STORE_C3 \
"st3 {v28.8b, v29.8b, v30.8b}, [%[outptr0]] \n" \
"st3 {v0.8b, v1.8b, v2.8b}, [%[outptr1]] \n" \
"st3 {v4.8b, v5.8b, v6.8b}, [%[outptr2]] \n" \
"st3 {v8.8b, v9.8b, v10.8b}, [%[outptr3]] \n" \
\
"st3 {v12.8b, v13.8b, v14.8b}, [%[outptr4]] \n" \
"st3 {v20.8b, v21.8b, v22.8b}, [%[outptr5]] \n" \
"st3 {v23.8b, v24.8b, v25.8b}, [%[outptr6]] \n" \
"st3 {v16.8b, v17.8b, v18.8b}, [%[outptr7]] \n"
#else
#define INPUT_C1 \
"vld1.8 {d0}, [%[inptr0]] @ zip load r0, d0 =00 01 02 03 04 05 06 07\n" \
"vld1.8 {d4}, [%[inptr1]] @ zip load r1, d2 =10 11 12 13 14 15 16 17\n" \
"vld1.8 {d8}, [%[inptr2]] @ zip load r1, d4 =20 21 22 23 24 25 26 27\n" \
"vld1.8 {d12}, [%[inptr3]] @ zip load r1, d6 = 30 31 32 33 34 35 36 37\n"
#define INPUT_C3 \
"vld3.8 {d0, d1, d2}, [%[inptr0]] @ zip load r0, d0 =00 01 02 03 04 05 " \
"06 07\n" \
"vld3.8 {d4, d5, d6}, [%[inptr1]] @ zip load r1, d2 =10 11 12 13 14 15 " \
"16 17\n" \
"vld3.8 {d8, d9, d10}, [%[inptr2]] @ zip load r1, d4 =20 21 22 23 24 25 " \
"26 27\n" \
"vld3.8 {d12, d13, d14}, [%[inptr3]] @ zip load r1, d6 = 30 31 32 33 34 " \
"35 36 37\n"
#define ADD_INPUT \
"add %[inptr0], %[inptr0], %[stride_h] \n" \
"add %[inptr1], %[inptr1], %[stride_h] \n" \
"add %[inptr2], %[inptr2], %[stride_h] \n" \
"add %[inptr3], %[inptr3], %[stride_h] \n"
#define SUB_INPUT \
"sub %[inptr0], %[inptr0], %[stride_h_w] \n" \
"sub %[inptr1], %[inptr1], %[stride_h_w] \n" \
"sub %[inptr2], %[inptr2], %[stride_h_w] \n" \
"sub %[inptr3], %[inptr3], %[stride_h_w] \n"
#define TRANS_C1 \
"vtrn.8 d0, d4 @ trans data: \n" /* d0=00 10 02 12 04 14 06 16 */ \
"vtrn.8 d8, d12 @ trans data: \n" /* d8=20 30 12 32 24 34 26 36 */ \
\
"vld1.8 {d1}, [%[inptr0]] @ zip load r0, d0 =00 01 02 03 04 05 06 07\n" \
"vld1.8 {d5}, [%[inptr1]] @ zip load r1, d2 =10 11 12 13 14 15 16 17\n" \
"vld1.8 {d9}, [%[inptr2]] @ zip load r1, d4 =20 21 22 23 24 25 26 27\n" \
"vld1.8 {d13}, [%[inptr3]] @ zip load r1, d6 = 30 31 32 33 34 35 36 37\n" \
\
"vtrn.16 d0, d8 @ trans data: \n" /* d0=00 10 20 30 04 14 24 34 */ \
"vtrn.16 d4, d12 @ trans data:\n" /* d4=01 11 21 31 05 15 25 35 */ \
\
"vtrn.8 d1, d5 @ trans data: \n" /* d0=40 50 42 52 04 14 06 16 */ \
"vtrn.8 d9, d13 @ trans data: \n" /* d8=60 70 62 72 24 34 26 36 */ \
\
"sub %[inptr0], %[inptr0], %[stride_h_w] \n" \
"sub %[inptr1], %[inptr1], %[stride_h_w] \n" \
"sub %[inptr2], %[inptr2], %[stride_h_w] \n" \
"sub %[inptr3], %[inptr3], %[stride_h_w] \n" \
\
"vtrn.16 d1, d5 @ trans data: \n" /* d0=40 50 60 70 04 14 24 34 */ \
"vtrn.16 d9, d13 @ trans data:\n" /* d4=41 51 61 71 05 15 25 35 */ \
\
"vtrn.32 d0, d1 @ trans data: \n" \
"vtrn.32 d8, d9 @ trans data: \n" \
"vtrn.32 d4, d5 @ trans data: \n" \
"vtrn.32 d12, d13 @ trans data: \n"
#define REVERSE_C1 \
"vrev64.8 d2, d0 @reverse 7 6 5 4 3 2 1 \n" \
"vrev64.8 d3, d1 @reverse 7 6 5 4 3 2 1 \n" \
"vrev64.8 d10, d8 @reverse 7 6 5 4 3 2 1 \n" \
"vrev64.8 d11, d9 @reverse 7 6 5 4 3 2 1 \n" \
"vrev64.8 d6, d4 @reverse 7 6 5 4 3 2 1 \n" \
"vrev64.8 d7, d5 @reverse 7 6 5 4 3 2 1 \n" \
"vrev64.8 d14, d12 @reverse 7 6 5 4 3 2 1 \n" \
"vrev64.8 d15, d13 @reverse 7 6 5 4 3 2 1 \n"
#define ADD_OUTPUT \
"add %[outptr0], %[outptr0], %[stride_out] \n" \
"add %[outptr2], %[outptr2], %[stride_out] \n" \
"add %[outptr1], %[outptr1], %[stride_out] \n" \
"add %[outptr3], %[outptr3], %[stride_out] \n"
#define SUB_OUTPUT \
"sub %[outptr0], %[outptr0], %[stride_out] \n" \
"sub %[outptr2], %[outptr2], %[stride_out] \n" \
"sub %[outptr1], %[outptr1], %[stride_out] \n" \
"sub %[outptr3], %[outptr3], %[stride_out] \n"
#define STORE_C1_4 \
"vst1.8 {d0}, [%[outptr0]] @ write d0(q0,low),r00,r10 20 30\n" \
"vst1.8 {d8}, [%[outptr2]] @ write d0(q0,low),r00,r10 20 30\n" \
"vst1.8 {d4}, [%[outptr1]] @ write d0(q0,low),r00,r10 20 30\n" \
"vst1.8 {d12}, [%[outptr3]] @ write d0(q0,low),r00,r10 20 30\n"
#define STORE_C1_8 \
"vst1.8 {d1}, [%[outptr0]] @ write d0(q0,low),r00,r10 20 30\n" \
"vst1.8 {d9}, [%[outptr2]] @ write d0(q0,low),r00,r10 20 30\n" \
"vst1.8 {d5}, [%[outptr1]] @ write d0(q0,low),r00,r10 20 30\n" \
"vst1.8 {d13}, [%[outptr3]] @ write d0(q0,low),r00,r10 20 30\n"
#define STORE_C1_R_4 \
"vst1.8 {d2}, [%[outptr0]] @ write d0(q0,low),r00,r10 20 30\n" \
"vst1.8 {d10}, [%[outptr2]] @ write d0(q0,low),r00,r10 20 30\n" \
"vst1.8 {d6}, [%[outptr1]] @ write d0(q0,low),r00,r10 20 30\n" \
"vst1.8 {d14}, [%[outptr3]] @ write d0(q0,low),r00,r10 20 30\n"
#define STORE_C1_R_8 \
"vst1.8 {d3}, [%[outptr0]] @ write d0(q0,low),r00,r10 20 30\n" \
"vst1.8 {d11}, [%[outptr2]] @ write d0(q0,low),r00,r10 20 30\n" \
"vst1.8 {d7}, [%[outptr1]] @ write d0(q0,low),r00,r10 20 30\n" \
"vst1.8 {d15}, [%[outptr3]] @ write d0(q0,low),r00,r10 20 30\n"
#define TRANS_C3 \
"vtrn.8 d0, d4 @ trans data: \n" \
"vtrn.8 d8, d12 @ trans data: \n" \
"vtrn.8 d1, d5 @ trans data: \n" \
"vtrn.8 d9, d13 @ trans data: \n" \
"vtrn.8 d2, d6 @ trans data: \n" \
"vtrn.8 d10, d14 @ trans data: \n" \
\
"vld3.8 {d16, d17, d18}, [%[inptr0]] @ zip load r0, d0 =40 01 02 03 04 " \
"05 06 07\n" \
"vld3.8 {d20, d21, d22}, [%[inptr1]] @ zip load r1, d2 =50 11 12 13 14 " \
"15 16 17\n" \
"vld3.8 {d24, d25, d26}, [%[inptr2]] @ zip load r1, d4 =60 21 22 23 24 " \
"25 26 27\n" \
"vld3.8 {d28, d29, d30}, [%[inptr3]] @ zip load r1, d6 =70 31 32 33 34 " \
"35 36 37\n" \
\
"vtrn.16 d0, d8 @ trans data: \n" \
"vtrn.16 d4, d12 @ trans data:\n" \
"vtrn.16 d1, d9 @ trans data: \n" \
"vtrn.16 d5, d13 @ trans data:\n" \
"vtrn.16 d2, d10 @ trans data: \n" \
"vtrn.16 d6, d14 @ trans data:\n" \
\
"vtrn.8 d16, d20 @ trans data: \n" \
"vtrn.8 d24, d28 @ trans data: \n" \
"vtrn.8 d17, d21 @ trans data: \n" \
"vtrn.8 d25, d29 @ trans data: \n" \
"vtrn.8 d18, d22 @ trans data: \n" \
"vtrn.8 d26, d30 @ trans data: \n" \
\
"sub %[inptr0], %[inptr0], %[stride_h_w] \n" \
"sub %[inptr1], %[inptr1], %[stride_h_w] \n" \
"sub %[inptr2], %[inptr2], %[stride_h_w] \n" \
"sub %[inptr3], %[inptr3], %[stride_h_w] \n" \
\
"vtrn.16 d16, d24 @ trans data: \n" \
"vtrn.16 d20, d28 @ trans data: \n" \
"vtrn.16 d17, d25 @ trans data: \n" \
"vtrn.16 d21, d29 @ trans data: \n" \
"vtrn.16 d18, d26 @ trans data: \n" \
"vtrn.16 d22, d30 @ trans data: \n" \
\
"vtrn.32 d0, d16 @ trans data: \n" \
"vtrn.32 d8, d24 @ trans data: \n" \
"vtrn.32 d4, d20 @ trans data: \n" \
"vtrn.32 d12, d28 @ trans data: \n" \
\
"vtrn.32 d1, d17 @ trans data: \n" \
"vtrn.32 d9, d25 @ trans data: \n" \
"vtrn.32 d5, d21 @ trans data: \n" \
"vtrn.32 d13, d29 @ trans data: \n" \
\
"vtrn.32 d2, d18 @ trans data: \n" \
"vtrn.32 d10, d26 @ trans data: \n" \
"vtrn.32 d6, d22 @ trans data: \n" \
"vtrn.32 d14, d30 @ trans data: \n"
#define STORE_C3_4 \
"vst3.8 {d0, d1, d2}, [%[outptr0]] \n" \
"vst3.8 {d4, d5, d6}, [%[outptr1]] \n" \
"vst3.8 {d8, d9, d10}, [%[outptr2]] \n" \
"vst3.8 {d12, d13, d14}, [%[outptr3]] \n"
#define STORE_C3_8 \
"vst3.8 {d16, d17, d18}, [%[outptr0]] \n" \
"vst3.8 {d20, d21, d22}, [%[outptr1]] \n" \
"vst3.8 {d24, d25, d26}, [%[outptr2]] \n" \
"vst3.8 {d28, d29, d30}, [%[outptr3]] \n"
#define REVERSE_C3 \
"vrev64.8 d3, d0 \n" /* b 00*/ \
"vrev64.8 d7, d4 \n" /* b 01*/ \
"vrev64.8 d15, d5 \n" /* g 01*/ \
"vrev64.8 d11, d8 \n" /* b 02*/ \
"vrev64.8 d4, d1 \n" /* g 00*/ \
"vrev64.8 d5, d2 \n" /* r 00*/ \
\
"vrev64.8 d0, d12 \n" /* b 03*/ \
"vrev64.8 d1, d13 \n" /* g 03*/ \
"vrev64.8 d2, d14 \n" /* r 03*/ \
\
"vrev64.8 d12, d9 \n" /* g 02*/ \
"vrev64.8 d13, d10 \n" /* r 02*/ \
\
"vmov d8, d15 \n" /* g 01*/ \
"vrev64.8 d9, d6 \n" /* r 01*/ \
\
"vrev64.8 d14, d16 \n" /* b 04*/ \
"vrev64.8 d15, d17 \n" /* g 04*/ \
"vrev64.8 d16, d18 \n" /* r 04*/ \
\
"vrev64.8 d17, d20 \n" /* b 05*/ \
"vrev64.8 d18, d21 \n" /* g 05*/ \
"vrev64.8 d19, d22 \n" /* r 05*/ \
\
"vrev64.8 d20, d24 \n" /* b 06*/ \
"vrev64.8 d21, d25 \n" /* g 06*/ \
"vrev64.8 d22, d26 \n" /* r 06*/ \
\
"vrev64.8 d24, d28 \n" /* b 07*/ \
"vrev64.8 d25, d29 \n" /* g 07*/ \
"vrev64.8 d26, d30 \n" /* r 07*/
#define STORE_C3_R_4 \
"vst3.8 {d3, d4, d5}, [%[outptr0]] \n" \
"vst3.8 {d0, d1, d2}, [%[outptr3]] \n" \
"vst3.8 {d11, d12, d13}, [%[outptr2]] \n" \
"vst3.8 {d7, d8, d9}, [%[outptr1]] \n"
#define STORE_C3_R_8 \
"vst3.8 {d14, d15, d16}, [%[outptr0]] \n" \
"vst3.8 {d17, d18, d19}, [%[outptr1]] \n" \
"vst3.8 {d20, d21, d22}, [%[outptr2]] \n" \
"vst3.8 {d24, d25, d26}, [%[outptr3]] \n"
#endif
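// Helper-macro summary (ARMv7 variants shown above; the aarch64 variants
// follow the same pattern): INPUT_*/ADD_INPUT load 8-byte groups from four
// input rows, TRANS_* perform an in-register 8x8 byte transpose via
// vtrn.8/16/32, REVERSE_* byte-reverse each lane for the 90-degree case,
// STORE_* write the transposed rows out, and ADD_OUTPUT/SUB_OUTPUT step the
// four output pointers by stride_out.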
/*
1 2 3
4 5 6
7 8 9
rotate:
7 4 1
8 5 2
9 6 3
*/
// transpose
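// For the 90-degree clockwise rotation pictured above, the intended mapping
// is dst(c, h_in - 1 - r) = src(r, c) with w_out == h_in, i.e. a transpose
// followed by reversing each output row. A minimal scalar sketch of that
// mapping (illustrative only; the kernels below work on 8x8 blocks with
// vtrn/vrev NEON instructions):
//   for (int r = 0; r < h_in; r++)
//     for (int c = 0; c < w_in; c++)
//       dst[c * w_out + (h_in - 1 - r)] = src[r * w_in + c];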
void rotate_hwc1_90(const uint8_t* src,
uint8_t* dst,
int w_in,
int h_in,
int w_out,
int h_out) {
uint8_t zerobuff[8] = {0, 0, 0, 0, 0, 0, 0, 0};
// block 4*8. -- 8*4
int i = 0;
int stride_h = 4 * w_in;
int stride_h_w = 4 * w_in - 8;
int stride_out = 4 * w_out;
#pragma omp parallel for
for (i = 0; i < h_in - 7; i += 8) {
const uint8_t* inptr0 = src + i * w_in;
const uint8_t* inptr1 = inptr0 + w_in;
const uint8_t* inptr2 = inptr1 + w_in;
const uint8_t* inptr3 = inptr2 + w_in;
int j = 0;
for (; j < w_in - 7; j += 8) {
uint8_t* outptr0 = dst + j * w_out + i;
uint8_t* outptr1 = outptr0 + w_out;
uint8_t* outptr2 = outptr1 + w_out;
uint8_t* outptr3 = outptr2 + w_out;
uint8_t* outptr4 = outptr3 + w_out;
uint8_t* outptr5 = outptr4 + w_out;
uint8_t* outptr6 = outptr5 + w_out;
uint8_t* outptr7 = outptr6 + w_out;
#ifdef __aarch64__
asm volatile(INPUT_C1 ADD_INPUT TRANS_C1_8 INPUT_C1 TRANS_C1_16 TRANS_C1_8
SUB_INPUT TRANS_C1 REVERSE_C1 STORE_C1_R
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[outptr4] "+r"(outptr4),
[outptr5] "+r"(outptr5),
[outptr6] "+r"(outptr6),
[outptr7] "+r"(outptr7)
: [stride_h] "r"(stride_h), [stride_h_w] "r"(stride_h_w)
: "v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15");
#else
asm volatile(INPUT_C1 ADD_INPUT TRANS_C1 REVERSE_C1 STORE_C1_R_4
ADD_OUTPUT STORE_C1_R_8
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3)
: [stride_h] "r"(stride_h),
[stride_h_w] "r"(stride_h_w),
[stride_out] "r"(stride_out)
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
#endif
}
const uint8_t* inptr4 = inptr3 + w_in;
const uint8_t* inptr5 = inptr4 + w_in;
const uint8_t* inptr6 = inptr5 + w_in;
const uint8_t* inptr7 = inptr6 + w_in;
for (; j < w_in; j++) {
uint8_t* outptr = dst + j * w_out + i;
*outptr++ = *inptr0++;
*outptr++ = *inptr1++;
*outptr++ = *inptr2++;
*outptr++ = *inptr3++;
*outptr++ = *inptr4++;
*outptr++ = *inptr5++;
*outptr++ = *inptr6++;
*outptr++ = *inptr7++;
}
}
for (; i < h_in; i++) {
const uint8_t* inptr0 = src + i * w_in;
for (int j = 0; j < w_in; j++) {
uint8_t* outptr0 = dst + j * w_out + i;
*outptr0 = *inptr0++;
}
}
}
/*
1 2 3 4
4 5 6 7
7 8 9 10
rotate:
10 9 8 7
7 6 5 4
4 3 2 1
*/
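// 180-degree rotation maps src(r, c) to dst(h_in - 1 - r, w_in - 1 - c).
// A minimal scalar sketch of the mapping implemented below with
// vld1/(v)rev64/vst1 on 8-pixel groups (w_out == w_in):
//   for (int r = 0; r < h_in; r++)
//     for (int c = 0; c < w_in; c++)
//       dst[(h_in - 1 - r) * w_out + (w_in - 1 - c)] = src[r * w_in + c];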
void rotate_hwc1_180(const uint8_t* src,
uint8_t* dst,
int w_in,
int h_in,
int w_out,
int h_out) {
uint8_t zerobuff[10000];
memset(zerobuff, 0, w_in * sizeof(uint8_t));
int stride_w = 8;
#pragma omp parallel for
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * w_in;
const uint8_t* inptr1 = inptr0 + w_in;
const uint8_t* inptr2 = inptr1 + w_in;
const uint8_t* inptr3 = inptr2 + w_in;
uint8_t* outptr0 = dst + (h_in - i) * w_out - stride_w; // last
uint8_t* outptr1 = outptr0 + w_out;
uint8_t* outptr2 = outptr1 + w_out;
uint8_t* outptr3 = outptr2 + w_out;
if (i + 3 >= h_in) {
uint8_t* ptr = zerobuff + w_in - stride_w;
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
outptr0 = ptr;
case 2:
inptr1 = zerobuff;
outptr1 = ptr;
case 1:
inptr2 = zerobuff;
outptr2 = ptr;
case 0:
inptr3 = zerobuff;
outptr3 = ptr;
default:
break;
}
}
int j = 0;
for (; j < w_in - 7; j += 8) {
#ifdef __aarch64__
asm volatile(
"ld1 {v0.8b}, [%[inptr0]], #8 \n" // v0={00,01,02, 03, 04, 05,
// 06, 07}"
"ld1 {v1.8b}, [%[inptr1]], #8 \n" // v0={10,11,12, 13, 14, 15,
// 16, 17}"
"ld1 {v2.8b}, [%[inptr2]], #8 \n" // v0={20,21,22, 23, 24, 25,
// 26, 27}"
"ld1 {v3.8b}, [%[inptr3]], #8 \n" // v0={30,31,32, 33, 34, 35,
// 36, 37}"
"rev64 v4.8b, v0.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00
"rev64 v5.8b, v1.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00
"rev64 v6.8b, v2.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00
"rev64 v7.8b, v3.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00
"st1 {v4.8b}, [%[outptr0]] \n" // 00 10 20 30 04 14 24 34
"st1 {v5.8b}, [%[outptr1]] \n" // 02 12 22 32
"st1 {v6.8b}, [%[outptr2]] \n" // 01 11 21 31
"st1 {v7.8b}, [%[outptr3]] \n" // 03 13 23 33
"sub %[outptr0], %[outptr0], %[stride_w] \n" //@ ptr - stride_w
"sub %[outptr1], %[outptr1], %[stride_w] \n"
"sub %[outptr2], %[outptr2], %[stride_w] \n"
"sub %[outptr3], %[outptr3], %[stride_w] \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[stride_w] "+r"(stride_w)
:
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
#else
asm volatile(
"vld1.8 {d0}, [%[inptr0]]! @ zip load r0, d0 =00 01 02 03 04 05 "
"06 07\n"
"vld1.8 {d4}, [%[inptr1]]! @ zip load r1, d2 =10 11 12 13 14 15 "
"16 17\n"
"vld1.8 {d8}, [%[inptr2]]! @ zip load r1, d4 =20 21 22 23 24 25 "
"26 27\n"
"vld1.8 {d12}, [%[inptr3]]! @ zip load r1, d6 = 30 31 32 33 34 35 "
"36 37\n"
"vrev64.8 d1, d0 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d5, d4 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d9, d8 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d13, d12 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vst1.32 {d1}, [%[outptr0]] @ write d0(q0,low),r00,r10 20 30\n"
"vst1.32 {d5}, [%[outptr1]] @ write d4(q0,low),r01,r11 21 31\n"
"vst1.32 {d9}, [%[outptr2]] @ write d4(q0,low),r01,r11 21 31\n"
"vst1.32 {d13}, [%[outptr3]] @ write d4(q0,low),r01,r11 21 31\n"
"sub %[outptr0], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr1], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr2], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr3], %[stride_w] @ ptr - stride_w \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[stride_w] "+r"(stride_w)
:
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
#endif
}
outptr3 += stride_w - 1;
outptr2 += stride_w - 1;
outptr1 += stride_w - 1;
outptr0 += stride_w - 1;
for (; j < w_in; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2-- = *inptr2++;
case 1:
*outptr1-- = *inptr1++;
case 2:
*outptr0-- = *inptr0++;
case 3:
default:
break;
}
} else {
*outptr3-- = *inptr3++;
*outptr2-- = *inptr2++;
*outptr1-- = *inptr1++;
*outptr0-- = *inptr0++;
}
}
}
}
/*
1 2 3
4 5 6
7 8 9
rotate:
3 6 9
2 5 8
1 4 7
*/
// dst = (h_out - 1) * w_out
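// 270-degree rotation maps src(r, c) to dst(w_in - 1 - c, r), with
// h_out == w_in and w_out == h_in. A minimal scalar sketch equivalent to the
// blocked NEON kernel below:
//   for (int r = 0; r < h_in; r++)
//     for (int c = 0; c < w_in; c++)
//       dst[(w_in - 1 - c) * w_out + r] = src[r * w_in + c];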
void rotate_hwc1_270(const uint8_t* src,
uint8_t* dst,
int w_in,
int h_in,
int w_out,
int h_out) {
int stride_h = 4 * w_in;
int stride_h_w = 4 * w_in - 8;
int hout = h_out - 1;
int stride_out = 4 * w_out;
int i = 0;
// block 8*8. -- 8*8
#pragma omp parallel for
for (i = 0; i < h_in - 7; i += 8) {
const uint8_t* inptr0 = src + i * w_in;
const uint8_t* inptr1 = inptr0 + w_in;
const uint8_t* inptr2 = inptr1 + w_in;
const uint8_t* inptr3 = inptr2 + w_in;
int j = 0;
for (; j < w_in - 7; j += 8) {
uint8_t* outptr0 = dst + (hout - j) * w_out + i;
uint8_t* outptr1 = outptr0 - w_out;
uint8_t* outptr2 = outptr1 - w_out;
uint8_t* outptr3 = outptr2 - w_out;
uint8_t* outptr4 = outptr3 - w_out;
uint8_t* outptr5 = outptr4 - w_out;
uint8_t* outptr6 = outptr5 - w_out;
uint8_t* outptr7 = outptr6 - w_out;
#ifdef __aarch64__
asm volatile(INPUT_C1 ADD_INPUT TRANS_C1_8 INPUT_C1 TRANS_C1_16 TRANS_C1_8
SUB_INPUT TRANS_C1 STORE_C1
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[outptr4] "+r"(outptr4),
[outptr5] "+r"(outptr5),
[outptr6] "+r"(outptr6),
[outptr7] "+r"(outptr7)
: [stride_h] "r"(stride_h), [stride_h_w] "r"(stride_h_w)
: "v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15");
#else
asm volatile(INPUT_C1 ADD_INPUT TRANS_C1 STORE_C1_4 ADD_OUTPUT STORE_C1_8
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3)
: [stride_h] "r"(stride_h),
[stride_h_w] "r"(stride_h_w),
[stride_out] "r"(stride_out)
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
#endif
}
const uint8_t* inptr4 = inptr3 + w_in;
const uint8_t* inptr5 = inptr4 + w_in;
const uint8_t* inptr6 = inptr5 + w_in;
const uint8_t* inptr7 = inptr6 + w_in;
for (; j < w_in; j++) {
uint8_t* outptr = dst + (hout - j) * w_out + i;
*outptr++ = *inptr0++;
*outptr++ = *inptr1++;
*outptr++ = *inptr2++;
*outptr++ = *inptr3++;
*outptr++ = *inptr4++;
*outptr++ = *inptr5++;
*outptr++ = *inptr6++;
*outptr++ = *inptr7++;
}
}
for (; i < h_in; i++) {
const uint8_t* inptr0 = src + i * w_in;
for (int j = 0; j < w_in; j++) {
uint8_t* outptr0 = dst + (hout - j) * w_out + i;
*outptr0 = *inptr0++;
}
}
}
void rotate_hwc3_90(const uint8_t* src,
uint8_t* dst,
int w_in,
int h_in,
int w_out,
int h_out) {
int win = w_in * 3;
int wout = w_out * 3;
int stride_h = 4 * win;
int stride_h_w = 4 * win - 24;
int stride_out = 4 * wout;
int ww = w_out - 8;
uint8_t zerobuff[8] = {0, 0, 0, 0, 0, 0, 0, 0};
// block 4*8. -- 8*4
int i = 0;
#pragma omp parallel for
for (i = 0; i < h_in - 7; i += 8) {
const uint8_t* inptr0 = src + i * win;
const uint8_t* inptr1 = inptr0 + win;
const uint8_t* inptr2 = inptr1 + win;
const uint8_t* inptr3 = inptr2 + win;
int j = 0;
for (; j < w_in - 7; j += 8) {
uint8_t* outptr0 = dst + j * wout + (ww - i) * 3;
uint8_t* outptr1 = outptr0 + wout;
uint8_t* outptr2 = outptr1 + wout;
uint8_t* outptr3 = outptr2 + wout;
uint8_t* outptr4 = outptr3 + wout;
uint8_t* outptr5 = outptr4 + wout;
uint8_t* outptr6 = outptr5 + wout;
uint8_t* outptr7 = outptr6 + wout;
#ifdef __aarch64__
asm volatile(INPUT_C3 ADD_INPUT TRANS_C3_8 INPUT_C3 TRANS_C3_16 TRANS_C3
REVERSE_C3 STORE_C3
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[outptr4] "+r"(outptr4),
[outptr5] "+r"(outptr5),
[outptr6] "+r"(outptr6),
[outptr7] "+r"(outptr7),
[stride_h] "+r"(stride_h),
[stride_h_w] "+r"(stride_h_w)
:
: "v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31");
#else
asm volatile(INPUT_C3 ADD_INPUT TRANS_C3 REVERSE_C3 STORE_C3_R_4
ADD_OUTPUT STORE_C3_R_8
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3)
: [stride_h] "r"(stride_h),
[stride_h_w] "r"(stride_h_w),
[stride_out] "r"(stride_out)
: "q0",
"q1",
"q2",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12",
"q13",
"q14");
#endif
}
const uint8_t* inptr4 = inptr3 + win;
const uint8_t* inptr5 = inptr4 + win;
const uint8_t* inptr6 = inptr5 + win;
const uint8_t* inptr7 = inptr6 + win;
for (; j < w_in; j++) {
int tmpx = (ww - i) * 3;
uint8_t* outptr = dst + j * wout + tmpx;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
}
}
// remain
ww = w_out - 1;
for (; i < h_in; i++) {
const uint8_t* inptr0 = src + i * win;
for (int j = 0; j < w_in; j++) {
uint8_t* outptr0 = dst + j * wout + (ww - i) * 3;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
}
}
}
void rotate_hwc3_180(const uint8_t* src,
uint8_t* dst,
int w_in,
int h_in,
int w_out,
int h_out) {
int win = w_in * 3;
uint8_t zerobuff[30000];
memset(zerobuff, 0, win * sizeof(uint8_t));
int stride_w = 24;
#pragma omp parallel for
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * win;
const uint8_t* inptr1 = inptr0 + win;
const uint8_t* inptr2 = inptr1 + win;
const uint8_t* inptr3 = inptr2 + win;
uint8_t* outptr0 = dst + (h_in - i) * win - stride_w; // last col
uint8_t* outptr1 = outptr0 - win;
uint8_t* outptr2 = outptr1 - win;
uint8_t* outptr3 = outptr2 - win;
if (i + 3 >= h_in) {
uint8_t* ptr = zerobuff + win - stride_w;
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
outptr0 = ptr;
case 2:
inptr1 = zerobuff;
outptr1 = ptr;
case 1:
inptr2 = zerobuff;
outptr2 = ptr;
case 0:
inptr3 = zerobuff;
outptr3 = ptr;
default:
break;
}
}
int j = 0;
for (; j < w_in - 7; j += 8) {
#ifdef __aarch64__
asm volatile(
"ld3 {v0.8b, v1.8b, v2.8b}, [%[inptr0]], #24 \n" // v0={00,01,02,
// 03, 04, 05,
// 06, 07}"
"ld3 {v3.8b, v4.8b, v5.8b}, [%[inptr1]], #24 \n" // v0={10,11,12,
// 13, 14, 15,
// 16, 17}"
"ld3 {v6.8b, v7.8b, v8.8b}, [%[inptr2]], #24 \n" // v0={20,21,22,
// 23, 24, 25,
// 26, 27}"
"ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]], #24 \n" // v0={30,31,32,
// 33, 34, 35,
// 36, 37}"
"rev64 v12.8b, v0.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 b
"rev64 v13.8b, v1.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 g
"rev64 v14.8b, v2.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 r
"rev64 v15.8b, v3.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v16.8b, v4.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v17.8b, v5.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v18.8b, v6.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v19.8b, v7.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v20.8b, v8.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v21.8b, v9.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v22.8b, v10.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v23.8b, v11.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"st3 {v12.8b, v13.8b, v14.8b}, [%[outptr0]] \n" // 00 10
// 20 30
// 04 14
// 24 34
"st3 {v15.8b, v16.8b, v17.8b}, [%[outptr1]] \n" // 02 12
// 22 32
"st3 {v18.8b, v19.8b, v20.8b}, [%[outptr2]] \n" // 01 11
// 21 31
"st3 {v21.8b, v22.8b, v23.8b}, [%[outptr3]] \n" // 03 13
// 23 33
"sub %[outptr0], %[outptr0], %[stride_w] \n" //@ ptr - stride_w
"sub %[outptr1], %[outptr1], %[stride_w] \n"
"sub %[outptr2], %[outptr2], %[stride_w] \n"
"sub %[outptr3], %[outptr3], %[stride_w] \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[stride_w] "+r"(stride_w)
:
: "v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23");
#else
asm volatile(
"vld3.8 {d0, d1, d2}, [%[inptr0]]! @ zip load r0, d0 =00 01 02 03 "
"\n"
"vld3.8 {d3, d4, d5}, [%[inptr1]]! @ zip load r1, d2 =10 11 12 13 "
"\n"
"vld3.8 {d6, d7, d8}, [%[inptr2]]! @ zip load r1, d4 =20 21 22 23 "
"\n"
"vld3.8 {d9, d10, d11}, [%[inptr3]]! @ zip load r1, d6 = 30 31 32 "
"\n"
"vrev64.8 d12, d0 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d13, d1 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d14, d2 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d15, d3 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d16, d4 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d17, d5 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d18, d6 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d19, d7 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d20, d8 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d21, d9 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d22, d10 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d23, d11 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vst3.8 {d12, d13, d14}, [%[outptr0]] @ write \n"
"vst3.8 {d15, d16, d17}, [%[outptr1]] @ write \n"
"vst3.8 {d18, d19, d20}, [%[outptr2]] @ write \n"
"vst3.8 {d21, d22, d23}, [%[outptr3]] @ write \n"
"sub %[outptr0], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr1], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr2], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr3], %[stride_w] @ ptr - stride_w \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[stride_w] "+r"(stride_w)
:
: "q0",
"q1",
"q2",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12");
#endif
}
outptr3 += stride_w - 3;
outptr2 += stride_w - 3;
outptr1 += stride_w - 3;
outptr0 += stride_w - 3;
for (; j < w_in; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 6;
case 1:
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 6;
case 2:
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 6;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
outptr3 -= 6;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 6;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 6;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 6;
}
}
}
}
void rotate_hwc3_270(const uint8_t* src,
uint8_t* dst,
int w_in,
int h_in,
int w_out,
int h_out) {
int win = w_in * 3;
int wout = w_out * 3;
int stride_h = 4 * win;
int stride_h_w = 4 * win - 24;
int stride_out = 4 * wout;
int hout = h_out - 1;
// block 8*8. -- 8*8
int i = 0;
#pragma omp parallel for
for (i = 0; i < h_in - 7; i += 8) {
const uint8_t* inptr0 = src + i * win;
const uint8_t* inptr1 = inptr0 + win;
const uint8_t* inptr2 = inptr1 + win;
const uint8_t* inptr3 = inptr2 + win;
int j = 0;
for (; j < w_in - 7; j += 8) {
uint8_t* outptr0 = dst + (hout - j) * wout + i * 3;
uint8_t* outptr1 = outptr0 - wout;
uint8_t* outptr2 = outptr1 - wout;
uint8_t* outptr3 = outptr2 - wout;
uint8_t* outptr4 = outptr3 - wout;
uint8_t* outptr5 = outptr4 - wout;
uint8_t* outptr6 = outptr5 - wout;
uint8_t* outptr7 = outptr6 - wout;
#ifdef __aarch64__
asm volatile(INPUT_C3 ADD_INPUT TRANS_C3_8 INPUT_C3 TRANS_C3_16 TRANS_C3
MOV_C3 STORE_C3
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[outptr4] "+r"(outptr4),
[outptr5] "+r"(outptr5),
[outptr6] "+r"(outptr6),
[outptr7] "+r"(outptr7),
[stride_h] "+r"(stride_h),
[stride_h_w] "+r"(stride_h_w)
:
: "v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31");
#else
asm volatile(INPUT_C3 ADD_INPUT TRANS_C3 STORE_C3_4 SUB_OUTPUT STORE_C3_8
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3)
: [stride_h] "r"(stride_h),
[stride_out] "r"(stride_out),
[stride_h_w] "r"(stride_h_w)
: "q0",
"q1",
"q2",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12",
"q13",
"q14");
#endif
}
const uint8_t* inptr4 = inptr3 + win;
const uint8_t* inptr5 = inptr4 + win;
const uint8_t* inptr6 = inptr5 + win;
const uint8_t* inptr7 = inptr6 + win;
for (; j < w_in; j++) {
int tmpx = i * 3;
uint8_t* outptr = dst + (hout - j) * wout + tmpx;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
}
}
for (; i < h_in; i++) {
const uint8_t* inptr0 = src + i * win;
for (int j = 0; j < w_in; j++) {
uint8_t* outptr0 = dst + (hout - j) * wout + i * 3;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
}
}
}
void rotate_hwc4_90(const uint8_t* src,
uint8_t* dst,
int w_in,
int h_in,
int w_out,
int h_out) {
int win = w_in * 4;
int wout = w_out * 4;
int hremain = h_in % 8;
int stride_h = 4 * win;
int stride_h_w = 4 * win - 32;
int ww = w_out - 8;
// block 8*8. -- 8*8
int i = 0;
#pragma omp parallel for
for (i = 0; i < h_in - 7; i += 8) {
const uint8_t* inptr0 = src + i * win;
const uint8_t* inptr1 = inptr0 + win;
const uint8_t* inptr2 = inptr1 + win;
const uint8_t* inptr3 = inptr2 + win;
const uint8_t* inptr4 = inptr3 + win;
const uint8_t* inptr5 = inptr4 + win;
const uint8_t* inptr6 = inptr5 + win;
const uint8_t* inptr7 = inptr6 + win;
int j = 0;
for (; j < w_in; j++) {
int tmpx = (ww - i) * 4;
uint8_t* outptr = dst + j * wout + tmpx;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
}
}
ww = w_out - 1;
for (; i < h_in; i++) {
const uint8_t* inptr0 = src + i * win;
for (int j = 0; j < w_in; j++) {
uint8_t* outptr0 = dst + j * wout + (ww - i) * 4;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
}
}
}
void rotate_hwc4_180(const uint8_t* src,
uint8_t* dst,
int w_in,
int h_in,
int w_out,
int h_out) {
int win = w_in * 4;
uint8_t zerobuff[40000];
memset(zerobuff, 0, win * sizeof(uint8_t));
int stride_w = 32;
#pragma omp parallel for
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * win;
const uint8_t* inptr1 = inptr0 + win;
const uint8_t* inptr2 = inptr1 + win;
const uint8_t* inptr3 = inptr2 + win;
uint8_t* outptr0 = dst + (h_in - i) * win - stride_w; // last col
uint8_t* outptr1 = outptr0 - win;
uint8_t* outptr2 = outptr1 - win;
uint8_t* outptr3 = outptr2 - win;
if (i + 3 >= h_in) {
uint8_t* ptr = zerobuff + win - stride_w;
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
outptr0 = ptr;
case 2:
inptr1 = zerobuff;
outptr1 = ptr;
case 1:
inptr2 = zerobuff;
outptr2 = ptr;
case 0:
inptr3 = zerobuff;
outptr3 = ptr;
default:
break;
}
}
int j = 0;
for (; j < w_in - 7; j += 8) {
#ifdef __aarch64__
asm volatile(
"ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[inptr0]], #32 \n" // v0={00,01,02,
// 03, 04, 05,
// 06, 07}"
"ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [%[inptr1]], #32 \n" // v0={10,11,12,
// 13, 14, 15,
// 16, 17}"
"ld4 {v8.8b, v9.8b, v10.8b, v11.8b}, [%[inptr2]], #32 \n" // v0={20,21,22,
// 23, 24, 25,
// 26, 27}"
"ld4 {v12.8b, v13.8b, v14.8b, v15.8b}, [%[inptr3]], #32 \n" // v0={30,31,32,
// 33, 34, 35,
// 36, 37}"
"rev64 v16.8b, v0.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 b
"rev64 v17.8b, v1.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 g
"rev64 v18.8b, v2.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 r
"rev64 v19.8b, v3.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 a
"rev64 v20.8b, v4.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v21.8b, v5.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v22.8b, v6.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v23.8b, v7.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v24.8b, v8.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v25.8b, v9.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v26.8b, v10.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v27.8b, v11.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v28.8b, v12.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v29.8b, v13.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v30.8b, v14.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v31.8b, v15.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[outptr0]] \n" // 00 10
"st4 {v20.8b, v21.8b, v22.8b, v23.8b}, [%[outptr1]] \n" // 02 12
"st4 {v24.8b, v25.8b, v26.8b, v27.8b}, [%[outptr2]] \n" // 01 11
"st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [%[outptr3]] \n" // 03 13
"sub %[outptr0], %[outptr0], %[stride_w] \n" //@ ptr - stride_w
"sub %[outptr1], %[outptr1], %[stride_w] \n"
"sub %[outptr2], %[outptr2], %[stride_w] \n"
"sub %[outptr3], %[outptr3], %[stride_w] \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[stride_w] "+r"(stride_w)
:
: "v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31");
#else
asm volatile(
"vld4.8 {d0, d1, d2, d3}, [%[inptr0]]! @ zip load r0, d0 =00 01 "
"02 03 "
"04 05 06 07\n"
"vld4.8 {d4, d5, d6, d7}, [%[inptr1]]! @ zip load r1, d2 =10 11 "
"12 13 "
"14 15 16 17\n"
"vld4.8 {d8, d9, d10, d11}, [%[inptr2]]! @ zip load r1, d4 =20 21 "
"22 23 "
"24 25 26 27\n"
"vld4.8 {d12, d13, d14, d15}, [%[inptr3]]! @ zip load r1, d6 = 30 "
"31 32 "
"33 34 35 36 37\n"
"vrev64.8 d16, d0 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d17, d1 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d18, d2 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d19, d3 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d20, d4 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d21, d5 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d22, d6 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d23, d7 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d24, d8 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d25, d9 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d26, d10 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d27, d11 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d28, d12 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d29, d13 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d30, d14 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d31, d15 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vst4.8 {d16, d17, d18, d19}, [%[outptr0]] @ write \n"
"vst4.8 {d20, d21, d22, d23}, [%[outptr1]] @ write \n"
"vst4.8 {d24, d25, d26, d27}, [%[outptr2]] @ write \n"
"vst4.8 {d28, d29, d30, d31}, [%[outptr3]] @ write \n"
"sub %[outptr0], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr1], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr2], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr3], %[stride_w] @ ptr - stride_w \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[stride_w] "+r"(stride_w)
:
: "q0",
"q1",
"q2",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12",
"q13",
"q14",
"q15");
#endif
}
outptr3 += stride_w - 4;
outptr2 += stride_w - 4;
outptr1 += stride_w - 4;
outptr0 += stride_w - 4;
for (; j < w_in; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 8;
case 1:
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 8;
case 2:
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 8;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
outptr3 -= 8;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 8;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 8;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 8;
}
}
}
}
void rotate_hwc4_270(const uint8_t* src,
uint8_t* dst,
int w_in,
int h_in,
int w_out,
int h_out) {
int win = w_in * 4;
int wout = w_out * 4;
int hremain = h_in % 8;
int stride_h = 4 * win;
int stride_h_w = 4 * win - 32;
int hout = h_out - 1;
// block 8*8. -- 8*8
int i = 0;
#pragma omp parallel for
for (i = 0; i < h_in - 7; i += 8) {
const uint8_t* inptr0 = src + i * win;
const uint8_t* inptr1 = inptr0 + win;
const uint8_t* inptr2 = inptr1 + win;
const uint8_t* inptr3 = inptr2 + win;
const uint8_t* inptr4 = inptr3 + win;
const uint8_t* inptr5 = inptr4 + win;
const uint8_t* inptr6 = inptr5 + win;
const uint8_t* inptr7 = inptr6 + win;
int j = 0;
for (; j < w_in; j++) {
int tmpx = i * 4;
uint8_t* outptr = dst + (hout - j) * wout + tmpx;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
}
}
for (; i < h_in; i++) {
const uint8_t* inptr0 = src + i * win;
for (int j = 0; j < w_in; j++) {
uint8_t* outptr0 = dst + (hout - j) * wout + i * 4;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
}
}
}
} // namespace cv
} // namespace utils
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
#include <vector>
namespace paddle {
namespace lite {
namespace utils {
namespace cv {
void rotate_hwc1(
const uint8_t* src, uint8_t* dst, int srcw, int srch, float degree);
void rotate_hwc3(
const uint8_t* src, uint8_t* dst, int srcw, int srch, float degree);
void rotate_hwc4(
const uint8_t* src, uint8_t* dst, int srcw, int srch, float degree);
} // namespace cv
} // namespace utils
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/utils/cv/paddle_image_preprocess.h"
#include <math.h>
#include <algorithm>
#include <climits>
#include "lite/utils/cv/image2tensor.h"
#include "lite/utils/cv/image_convert.h"
#include "lite/utils/cv/image_flip.h"
#include "lite/utils/cv/image_resize.h"
#include "lite/utils/cv/image_rotate.h"
namespace paddle {
namespace lite {
namespace utils {
namespace cv {
#define PI 3.14159265f
#define Degrees2Radians(degrees) ((degrees) * (PI / 180))
#define Radians2Degrees(radians) ((radians) * (180 / PI))
#define ScalarNearlyZero (1.0f / (1 << 12))
// init
ImagePreprocess::ImagePreprocess(ImageFormat srcFormat,
ImageFormat dstFormat,
TransParam param) {
this->srcFormat_ = srcFormat;
this->dstFormat_ = dstFormat;
this->transParam_ = param;
}
void ImagePreprocess::imageCovert(const uint8_t* src, uint8_t* dst) {
ImageConvert img_convert;
img_convert.choose(src,
dst,
this->srcFormat_,
this->dstFormat_,
this->transParam_.iw,
this->transParam_.ih);
}
void ImagePreprocess::imageCovert(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
ImageFormat dstFormat) {
ImageConvert img_convert;
img_convert.choose(src,
dst,
srcFormat,
dstFormat,
this->transParam_.iw,
this->transParam_.ih);
}
void ImagePreprocess::imageResize(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
int srcw,
int srch,
int dstw,
int dsth) {
resize(src, dst, srcFormat, srcw, srch, dstw, dsth);
/*
int size = srcw * srch;
if (srcw == dstw && srch == dsth) {
if (srcFormat == NV12 || srcFormat == NV21) {
size = srcw * (floor(1.5 * srch));
} else if (srcFormat == BGR || srcFormat == RGB) {
size = 3 * srcw * srch;
} else if (srcFormat == BGRA || srcFormat == RGBA) {
size = 4 * srcw * srch;
}
memcpy(dst, src, sizeof(uint8_t) * size);
return;
}
double scale_x = static_cast<double>(srcw / dstw);
double scale_y = static_cast<double>(srch / dsth);
int* buf = new int[dstw * 2 + dsth * 2];
int* xofs = buf;
int* yofs = buf + dstw;
int16_t* ialpha = reinterpret_cast<int16_t*>(buf + dstw + dsth);
int16_t* ibeta = reinterpret_cast<int16_t*>(buf + 2 * dstw + dsth);
compute_xy(
srcw, srch, dstw, dsth, scale_x, scale_y, xofs, yofs, ialpha, ibeta);
int w_out = dstw;
int w_in = srcw;
int num = 1;
int orih = dsth;
if (srcFormat == GRAY) {
num = 1;
} else if (srcFormat == NV12 || srcFormat == NV21) {
num = 1;
int hout = static_cast<int>(0.5 * dsth);
dsth += hout;
} else if (srcFormat == BGR || srcFormat == RGB) {
w_in = srcw * 3;
w_out = dstw * 3;
num = 3;
} else if (srcFormat == BGRA || srcFormat == RGBA) {
w_in = srcw * 4;
w_out = dstw * 4;
num = 4;
}
int* xofs1 = nullptr;
int* yofs1 = nullptr;
int16_t* ialpha1 = nullptr;
if (orih < dsth) { // uv
int tmp = dsth - orih;
int w = dstw / 2;
xofs1 = new int[w];
yofs1 = new int[tmp];
ialpha1 = new int16_t[srcw];
compute_xy(srcw / 2,
srch / 2,
w,
tmp,
scale_x,
scale_y,
xofs1,
yofs1,
ialpha1,
ibeta + orih);
}
int cnt = w_out >> 3;
int remain = w_out % 8;
int32x4_t _v2 = vdupq_n_s32(2);
#pragma omp parallel for
for (int dy = 0; dy < dsth; dy++) {
int16_t* rowsbuf0 = new int16_t[w_out];
int16_t* rowsbuf1 = new int16_t[w_out];
int sy = yofs[dy];
if (dy >= orih) {
xofs = xofs1;
yofs = yofs1;
ialpha = ialpha1;
}
if (sy < 0) {
memset(rowsbuf0, 0, sizeof(uint16_t) * w_out);
const uint8_t* S1 = src + srcw * (sy + 1);
const int16_t* ialphap = ialpha;
int16_t* rows1p = rowsbuf1;
for (int dx = 0; dx < dstw; dx++) {
int sx = xofs[dx] * num; // num = 4
int16_t a0 = ialphap[0];
int16_t a1 = ialphap[1];
const uint8_t* S1pl = S1 + sx;
const uint8_t* S1pr = S1 + sx + num;
if (sx < 0) {
S1pl = S1;
}
for (int i = 0; i < num; i++) {
if (sx < 0) {
*rows1p++ = ((*S1pl++) * a1) >> 4;
} else {
*rows1p++ = ((*S1pl++) * a0 + (*S1pr++) * a1) >> 4;
}
}
ialphap += 2;
}
} else {
// hresize two rows
const uint8_t* S0 = src + w_in * (sy);
const uint8_t* S1 = src + w_in * (sy + 1);
const int16_t* ialphap = ialpha;
int16_t* rows0p = rowsbuf0;
int16_t* rows1p = rowsbuf1;
for (int dx = 0; dx < dstw; dx++) {
int sx = xofs[dx] * num; // num = 4
int16_t a0 = ialphap[0];
int16_t a1 = ialphap[1];
const uint8_t* S0pl = S0 + sx;
const uint8_t* S0pr = S0 + sx + num;
const uint8_t* S1pl = S1 + sx;
const uint8_t* S1pr = S1 + sx + num;
if (sx < 0) {
S0pl = S0;
S1pl = S1;
}
for (int i = 0; i < num; i++) {
if (sx < 0) {
*rows0p = ((*S0pl++) * a1) >> 4;
*rows1p = ((*S1pl++) * a1) >> 4;
rows0p++;
rows1p++;
} else {
*rows0p++ = ((*S0pl++) * a0 + (*S0pr++) * a1) >> 4;
*rows1p++ = ((*S1pl++) * a0 + (*S1pr++) * a1) >> 4;
}
}
ialphap += 2;
}
}
int ind = dy * 2;
int16_t b0 = ibeta[ind];
int16_t b1 = ibeta[ind + 1];
int16x8_t _b0 = vdupq_n_s16(b0);
int16x8_t _b1 = vdupq_n_s16(b1);
uint8_t* dp_ptr = dst + dy * w_out;
int16_t* rows0p = rowsbuf0;
int16_t* rows1p = rowsbuf1;
int re_cnt = cnt;
if (re_cnt > 0) {
#ifdef __aarch64__
asm volatile(
"1: \n"
"ld1 {v0.8h}, [%[rows0p]], #16 \n"
"ld1 {v1.8h}, [%[rows1p]], #16 \n"
"orr v6.16b, %w[_v2].16b, %w[_v2].16b \n"
"orr v7.16b, %w[_v2].16b, %w[_v2].16b \n"
"smull v2.4s, v0.4h, %w[_b0].4h \n"
"smull2 v4.4s, v0.8h, %w[_b0].8h \n"
"smull v3.4s, v1.4h, %w[_b1].4h \n"
"smull2 v5.4s, v1.8h, %w[_b1].8h \n"
"ssra v6.4s, v2.4s, #16 \n"
"ssra v7.4s, v4.4s, #16 \n"
"ssra v6.4s, v3.4s, #16 \n"
"ssra v7.4s, v5.4s, #16 \n"
"shrn v0.4h, v6.4s, #2 \n"
"shrn2 v0.8h, v7.4s, #2 \n"
"subs %w[cnt], %w[cnt], #1 \n"
"sqxtun v1.8b, v0.8h \n"
"st1 {v1.8b}, [%[dp]], #8 \n"
"bne 1b \n"
: [rows0p] "+r"(rows0p),
[rows1p] "+r"(rows1p),
[cnt] "+r"(re_cnt),
[dp] "+r"(dp_ptr)
: [_b0] "w"(_b0), [_b1] "w"(_b1), [_v2] "w"(_v2)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
#else
asm volatile(
"mov r4, #2 \n"
"vdup.s32 q12, r4 \n"
"0: \n"
"vld1.s16 {d2-d3}, [%[rows0p]]!\n"
"vld1.s16 {d6-d7}, [%[rows1p]]!\n"
"vorr.s32 q10, q12, q12 \n"
"vorr.s32 q11, q12, q12 \n"
"vmull.s16 q0, d2, %[_b0] \n"
"vmull.s16 q1, d3, %[_b0] \n"
"vmull.s16 q2, d6, %[_b1] \n"
"vmull.s16 q3, d7, %[_b1] \n"
"vsra.s32 q10, q0, #16 \n"
"vsra.s32 q11, q1, #16 \n"
"vsra.s32 q10, q2, #16 \n"
"vsra.s32 q11, q3, #16 \n"
"vshrn.s32 d20, q10, #2 \n"
"vshrn.s32 d21, q11, #2 \n"
"subs %[cnt], #1 \n"
"vqmovun.s16 d20, q10 \n"
"vst1.8 {d20}, [%[dp]]! \n"
"bne 0b \n"
: [rows0p] "+r"(rows0p),
[rows1p] "+r"(rows1p),
[cnt] "+r"(re_cnt),
[dp] "+r"(dp_ptr)
: [_b0] "w"(_b0), [_b1] "w"(_b1)
: "cc",
"memory",
"r4",
"q0",
"q1",
"q2",
"q3",
"q8",
"q9",
"q10",
"q11",
"q12");
#endif // __aarch64__
}
for (int i = 0; i < remain; i++) {
// D[x] = (rows0[x]*b0 + rows1[x]*b1) >>
// INTER_RESIZE_COEF_BITS;
*dp_ptr++ =
(uint8_t)(((int16_t)((b0 * (int16_t)(*rows0p++)) >> 16) +
(int16_t)((b1 * (int16_t)(*rows1p++)) >> 16) + 2) >>
2);
}
}
delete[] buf;
*/
}
void ImagePreprocess::imageResize(const uint8_t* src, uint8_t* dst) {
int srcw = this->transParam_.iw;
int srch = this->transParam_.ih;
int dstw = this->transParam_.ow;
int dsth = this->transParam_.oh;
auto srcFormat = this->dstFormat_;
resize(src, dst, srcFormat, srcw, srch, dstw, dsth);
}
void ImagePreprocess::imageRotate(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
int srcw,
int srch,
float degree) {
if (degree != 90 && degree != 180 && degree != 270) {
printf("this degree: %f not support \n", degree);
}
if (srcFormat == GRAY) {
rotate_hwc1(src, dst, srcw, srch, degree);
} else if (srcFormat == BGR || srcFormat == RGB) {
rotate_hwc3(src, dst, srcw, srch, degree);
} else if (srcFormat == BGRA || srcFormat == RGBA) {
rotate_hwc4(src, dst, srcw, srch, degree);
} else {
printf("this srcFormat: %d does not support! \n", srcFormat);
return;
}
}
void ImagePreprocess::imageRotate(const uint8_t* src, uint8_t* dst) {
auto srcw = this->transParam_.ow;
auto srch = this->transParam_.oh;
auto srcFormat = this->dstFormat_;
auto degree = this->transParam_.rotate_param;
if (degree != 90 && degree != 180 && degree != 270) {
printf("this degree: %f not support \n", degree);
}
ImagePreprocess::imageRotate(src, dst, srcFormat, srcw, srch, degree);
}
void ImagePreprocess::imageFlip(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
int srcw,
int srch,
FlipParam flip_param) {
if (srcFormat == GRAY) {
flip_hwc1(src, dst, srcw, srch, flip_param);
} else if (srcFormat == BGR || srcFormat == RGB) {
flip_hwc3(src, dst, srcw, srch, flip_param);
} else if (srcFormat == BGRA || srcFormat == RGBA) {
flip_hwc4(src, dst, srcw, srch, flip_param);
} else {
printf("this srcFormat: %d does not support! \n", srcFormat);
return;
}
}
void ImagePreprocess::imageFlip(const uint8_t* src, uint8_t* dst) {
auto srcw = this->transParam_.ow;
auto srch = this->transParam_.oh;
auto srcFormat = this->dstFormat_;
auto flip_param = this->transParam_.flip_param;
ImagePreprocess::imageFlip(src, dst, srcFormat, srcw, srch, flip_param);
}
void ImagePreprocess::image2Tensor(const uint8_t* src,
Tensor* dstTensor,
ImageFormat srcFormat,
int srcw,
int srch,
LayoutType layout,
float* means,
float* scales) {
Image2Tensor img2tensor;
img2tensor.choose(
src, dstTensor, srcFormat, layout, srcw, srch, means, scales);
}
void ImagePreprocess::image2Tensor(const uint8_t* src,
Tensor* dstTensor,
LayoutType layout,
float* means,
float* scales) {
Image2Tensor img2tensor;
img2tensor.choose(src,
dstTensor,
this->dstFormat_,
layout,
this->transParam_.ow,
this->transParam_.oh,
means,
scales);
}
} // namespace cv
} // namespace utils
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
#include <stdio.h>
#include <vector>
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_place.h"
namespace paddle {
namespace lite {
namespace utils {
namespace cv {
typedef paddle::lite_api::Tensor Tensor;
typedef paddle::lite_api::DataLayoutType LayoutType;
// color enum
enum ImageFormat {
RGBA = 0,
BGRA,
RGB,
BGR,
GRAY,
NV21 = 11,
NV12,
};
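// Note: NV12 and NV21 are semi-planar YUV420 formats: a full-resolution Y
// plane followed by a half-resolution interleaved UV (NV12) or VU (NV21)
// plane, so a w x h image occupies w * h * 3 / 2 bytes.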
// flip enum
enum FlipParam {
X = 0, // flip along the X axis
Y, // flip along the Y axis
  XY      // flip along both the X and Y axes
};
// transform param
typedef struct {
int ih; // input height
int iw; // input width
  int oh;               // output height
int ow; // output width
FlipParam flip_param; // flip, support x, y, xy
float rotate_param; // rotate, support 90, 180, 270
} TransParam;
class ImagePreprocess {
public:
/*
* init
 * param srcFormat: input image color format
 * param dstFormat: output image color format
 * param param: transform parameters, e.g. input/output size, flip and rotate
*/
ImagePreprocess(ImageFormat srcFormat,
ImageFormat dstFormat,
TransParam param);
/*
 * image color conversion
 * supports NV12/NV21 to BGR(RGB), NV12/NV21 to BGRA(RGBA),
 * and conversion between BGR(RGB) and BGRA(RGBA),
 * between BGR(RGB) and RGB(BGR),
 * between BGR(RGB) and RGBA(BGRA),
 * and between BGR(RGB) and GRAY
* param src: input image data
* param dst: output image data
*/
void imageCovert(const uint8_t* src, uint8_t* dst);
/*
 * image color conversion
 * supports NV12/NV21 to BGR(RGB), NV12/NV21 to BGRA(RGBA),
 * and conversion between BGR(RGB) and BGRA(RGBA),
 * between BGR(RGB) and RGB(BGR),
 * between BGR(RGB) and RGBA(BGRA),
 * and between BGR(RGB) and GRAY
 * param src: input image data
 * param dst: output image data
 * param srcFormat: input image format, supports GRAY, NV12(NV21),
 * BGR(RGB) and BGRA(RGBA)
 * param dstFormat: output image format, supports GRAY, BGR(RGB) and
 * BGRA(RGBA)
*/
void imageCovert(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
ImageFormat dstFormat);
/*
 * image resize using the bilinear method
 * supported image formats: 1-channel (e.g. GRAY), 2-channel (e.g. NV12,
 * NV21), 3-channel (e.g. BGR) and 4-channel (e.g. BGRA)
* param src: input image data
* param dst: output image data
*/
void imageResize(const uint8_t* src, uint8_t* dst);
/*
 * image resize using the bilinear method
 * supported image formats: 1-channel (e.g. GRAY), 2-channel (e.g. NV12,
 * NV21), 3-channel (e.g. BGR) and 4-channel (e.g. BGRA)
 * param src: input image data
 * param dst: output image data
 * param srcFormat: input image format, supports GRAY, NV12(NV21), BGR(RGB)
 * and BGRA(RGBA)
* param srcw: input image width
* param srch: input image height
* param dstw: output image width
* param dsth: output image height
*/
void imageResize(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
int srcw,
int srch,
int dstw,
int dsth);
/*
 * image rotation
 * supports 90, 180 and 270 degree rotation
 * supported color formats: 1-channel, 3-channel and 4-channel images
* param src: input image data
* param dst: output image data
*/
void imageRotate(const uint8_t* src, uint8_t* dst);
/*
 * image rotation
 * supports 90, 180 and 270 degree rotation
 * supported color formats: 1-channel, 3-channel and 4-channel images
 * param src: input image data
 * param dst: output image data
 * param srcFormat: input image format, supports GRAY, BGR(RGB) and BGRA(RGBA)
* param srcw: input image width
* param srch: input image height
* param degree: Rotate degree, support 90, 180 and 270
*/
void imageRotate(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
int srcw,
int srch,
float degree);
/*
 * image flip
 * supports flipping along the X axis, the Y axis, and both axes (XY)
 * supported color formats: 1-channel, 3-channel and 4-channel images
* param src: input image data
* param dst: output image data
*/
void imageFlip(const uint8_t* src, uint8_t* dst);
/*
 * image flip
 * supports flipping along the X axis, the Y axis, and both axes (XY)
 * supported color formats: 1-channel, 3-channel and 4-channel images
 * param src: input image data
 * param dst: output image data
 * param srcFormat: input image format, supports GRAY, BGR(RGB) and BGRA(RGBA)
 * param srcw: input image width
 * param srch: input image height
 * param flip_param: flip parameter, supports X, Y and XY
*/
void imageFlip(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
int srcw,
int srch,
FlipParam flip_param);
/*
 * convert image data to tensor data
 * supported image formats are BGR(RGB) and BGRA(RGBA); supported data
 * layouts are NHWC and NCHW
 * param src: input image data
 * param dstTensor: output tensor data
 * param layout: output tensor layout, supports NHWC and NCHW
* param means: means of image
* param scales: scales of image
*/
void image2Tensor(const uint8_t* src,
Tensor* dstTensor,
LayoutType layout,
float* means,
float* scales);
/*
 * convert image data to tensor data
 * supported image formats are BGR(RGB) and BGRA(RGBA); supported data
 * layouts are NHWC and NCHW
 * param src: input image data
 * param dstTensor: output tensor data
 * param srcFormat: input image format, supports BGR(RGB) and BGRA(RGBA)
 * param srcw: input image width
 * param srch: input image height
 * param layout: output tensor layout, supports NHWC and NCHW
* param means: means of image
* param scales: scales of image
*/
void image2Tensor(const uint8_t* src,
Tensor* dstTensor,
ImageFormat srcFormat,
int srcw,
int srch,
LayoutType layout,
float* means,
float* scales);
private:
ImageFormat srcFormat_;
ImageFormat dstFormat_;
TransParam transParam_;
};
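/*
 * Usage sketch (illustrative only): buffer names, sizes, the example
 * mean/scale values and the concrete LayoutType value are assumptions of
 * this sketch, not part of the API.
 *
 *   TransParam tparam;
 *   tparam.iw = 640;  tparam.ih = 480;      // input size
 *   tparam.ow = 224;  tparam.oh = 224;      // output size
 *   tparam.flip_param = X;                  // flip along the X axis
 *   tparam.rotate_param = 90;               // rotate 90 degrees
 *   ImagePreprocess preprocess(NV12, BGR, tparam);
 *
 *   // nv12_buf holds iw * ih * 3 / 2 bytes; the other buffers are
 *   // caller-allocated to match the corresponding output sizes.
 *   preprocess.imageCovert(nv12_buf, bgr_buf);         // NV12 -> BGR
 *   preprocess.imageResize(bgr_buf, resized_buf);      // iw x ih -> ow x oh
 *   preprocess.imageRotate(resized_buf, rotated_buf);  // by rotate_param
 *   preprocess.imageFlip(rotated_buf, flipped_buf);    // by flip_param
 *   float means[3] = {127.5f, 127.5f, 127.5f};
 *   float scales[3] = {1.f / 127.5f, 1.f / 127.5f, 1.f / 127.5f};
 *   preprocess.image2Tensor(flipped_buf, &tensor, layout, means, scales);
 *   // layout is a paddle::lite_api::DataLayoutType value (e.g. NCHW).
 */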
} // namespace cv
} // namespace utils
} // namespace lite
} // namespace paddle