diff --git a/CMakeLists.txt b/CMakeLists.txt index 1ec5352fa4009144b9f572ecbe061aba11e884d3..77a94bea1efcdafaa67b4c078bfb0a756f7b1cec 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -72,6 +72,9 @@ lite_option(LITE_ON_MODEL_OPTIMIZE_TOOL "Build the model optimize tool" OFF) # publish options lite_option(LITE_BUILD_EXTRA "Enable extra algorithm support in Lite, both kernels and operators" OFF) lite_option(LITE_BUILD_TAILOR "Enable tailoring library according to model" OFF) +# cv build options +lite_option(LITE_WITH_CV "Enable build cv image in lite" OFF IF NOT LITE_WITH_ARM) + # TODO(Superjomn) Remove WITH_ANAKIN option if not needed latter. if(ANDROID OR IOS OR ARMLINUX) @@ -181,7 +184,7 @@ include(external/xxhash) # download install xxhash needed for x86 jit include(cudnn) include(configure) # add paddle env configuration -if(LITE_WITH_CUDA) +if(LITE_WITH_CUDA) include(cuda) endif() diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 5dbb7f3fca4a2ecdab943cd49f34ee97f9bac9b0..bc055d3186c6bfd77ff6a5e9f979af5082fa34e3 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -117,8 +117,12 @@ endif() if (LITE_WITH_ARM) add_definitions("-DLITE_WITH_ARM") + if (LITE_WITH_CV) + add_definitions("-DLITE_WITH_CV") + endif() endif() + if (WITH_ARM_DOTPROD) add_definitions("-DWITH_ARM_DOTPROD") endif() diff --git a/cmake/lite.cmake b/cmake/lite.cmake index 9b6fab3f6261ff13361bda35cfa9cd681075c77d..98dbc9ab7e27d4239676a686d85216595b5888e1 100644 --- a/cmake/lite.cmake +++ b/cmake/lite.cmake @@ -43,6 +43,11 @@ function (lite_deps TARGET) foreach(var ${lite_deps_ARM_DEPS}) set(deps ${deps} ${var}) endforeach(var) + if(LITE_WITH_CV) + foreach(var ${lite_cv_deps}) + set(deps ${deps} ${var}) + endforeach(var) + endif() endif() if(LITE_WITH_PROFILE) @@ -341,7 +346,7 @@ function(add_kernel TARGET device level) file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") endforeach() nv_library(${TARGET} SRCS ${args_SRCS} DEPS ${args_DEPS}) - return() + return() endif() # the source list will collect for paddle_use_kernel.h code generation. 
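Because configure.cmake only emits -DLITE_WITH_CV when LITE_WITH_ARM is set (and the lite_option above forces the flag OFF otherwise), code that relies on the new CV utilities should be guarded by the LITE_WITH_CV macro. A minimal sketch of such a guard; the function name is illustrative and not part of this patch:

#include <cstdio>

// LITE_WITH_CV comes from cmake/configure.cmake when the option is ON (ARM builds only).
void report_cv_support() {
#ifdef LITE_WITH_CV
  std::printf("CV image preprocessing is compiled in.\n");
#else
  std::printf("Rebuild with -DLITE_WITH_CV=ON (ARM targets only) to enable it.\n");
#endif
}

int main() {
  report_cv_support();
  return 0;
}
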
diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt index 71914a8a347c7f2105ba1194638405935c497a41..dc38718c40580185a9263a098c8eabb2d2310ac4 100644 --- a/lite/CMakeLists.txt +++ b/lite/CMakeLists.txt @@ -9,6 +9,7 @@ message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}") message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}") message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}") message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}") +message(STATUS "LITE_WITH_CV:\t${LITE_WITH_CV}") set(LITE_MODEL_DIR "${THIRD_PARTY_PATH}/install") set(LITE_ON_MOBILE ${LITE_WITH_LIGHT_WEIGHT_FRAMEWORK}) @@ -129,6 +130,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) #COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/model_optimize_tool" "${INFER_LITE_PUBLISH_ROOT}/bin" COMMAND cp "${CMAKE_BINARY_DIR}/lite/gen_code/paddle_code_generator" "${INFER_LITE_PUBLISH_ROOT}/bin" COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/test_model_bin" "${INFER_LITE_PUBLISH_ROOT}/bin" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/utils/cv/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" ) if(NOT IOS) #add_dependencies(publish_inference_cxx_lib model_optimize_tool) @@ -136,10 +138,10 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) add_dependencies(publish_inference_cxx_lib bundle_full_api) add_dependencies(publish_inference_cxx_lib bundle_light_api) add_dependencies(publish_inference_cxx_lib test_model_bin) - if (ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux") + if (ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux") add_dependencies(publish_inference_cxx_lib paddle_full_api_shared) add_dependencies(publish_inference paddle_light_api_shared) - add_custom_command(TARGET publish_inference_cxx_lib + add_custom_command(TARGET publish_inference_cxx_lib COMMAND cp ${CMAKE_BINARY_DIR}/lite/api/*.so ${INFER_LITE_PUBLISH_ROOT}/cxx/lib) endif() add_dependencies(publish_inference publish_inference_cxx_lib) @@ -155,6 +157,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/include" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/include" COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/lib" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/utils/cv/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" ) add_dependencies(tiny_publish_lib bundle_light_api) add_dependencies(publish_inference tiny_publish_lib) @@ -166,6 +169,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/libpaddle_light_api_shared.so" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/utils/cv/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" ) add_dependencies(tiny_publish_cxx_lib paddle_light_api_shared) add_dependencies(publish_inference tiny_publish_cxx_lib) diff --git a/lite/tests/CMakeLists.txt b/lite/tests/CMakeLists.txt index 11fa2f0cb6d26a2c2739cc2e90aadf61b58001d2..0416c33a81b524b4dba1c1b406d91204cca6946d 100644 --- a/lite/tests/CMakeLists.txt +++ b/lite/tests/CMakeLists.txt @@ -1,2 +1,3 @@ add_subdirectory(kernels) add_subdirectory(math) +add_subdirectory(cv) diff --git a/lite/tests/cv/CMakeLists.txt b/lite/tests/cv/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..05fcc06b10ae5dc6b009ae087ce4e18f8d82e475 --- /dev/null +++ 
b/lite/tests/cv/CMakeLists.txt @@ -0,0 +1,3 @@ +if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND LITE_WITH_ARM) + lite_cc_test(image_convert_test SRCS image_convert_test.cc DEPS paddle_cv_arm paddle_api_light ${lite_cv_deps} ${arm_kernels} ${lite_ops} ${host_kernels}) +endif() diff --git a/lite/tests/cv/cv_basic.h b/lite/tests/cv/cv_basic.h new file mode 100644 index 0000000000000000000000000000000000000000..728d3167144bc6e03683b77803fb4887967eb524 --- /dev/null +++ b/lite/tests/cv/cv_basic.h @@ -0,0 +1,832 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "lite/utils/cv/paddle_image_preprocess.h" + +typedef paddle::lite::utils::cv::ImageFormat ImageFormat; +typedef paddle::lite::utils::cv::FlipParam FlipParam; +typedef paddle::lite::Tensor Tensor; +typedef paddle::lite_api::DataLayoutType LayoutType; + +void nv2bgr(const uint8_t* in_data, + uint8_t* out_data, + int srcw, + int srch, + int v_num, + int u_num) { + int size = srch * srcw; + const uint8_t* y_ptr = in_data; + const uint8_t* uv_ptr = in_data + size; + for (int i = 0; i < srch; i++) { + int j = 0; + const uint8_t* ptr_y1 = y_ptr + i * srcw; + const uint8_t* ptr_vu = uv_ptr + (i / 2) * srcw; + uint8_t* ptr_bgr1 = out_data + i * 3 * srcw; + for (; j < srcw; j += 2) { + uint8_t _y0 = ptr_y1[0]; + uint8_t _y1 = ptr_y1[1]; + uint8_t _v = ptr_vu[v_num]; + uint8_t _u = ptr_vu[u_num]; + + int ra = floor((179 * (_v - 128)) >> 7); + int ga = floor((44 * (_u - 128) + 91 * (_v - 128)) >> 7); + int ba = floor((227 * (_u - 128)) >> 7); + + int r = _y0 + ra; + int g = _y0 - ga; + int b = _y0 + ba; + + int r1 = _y1 + ra; + int g1 = _y1 - ga; + int b1 = _y1 + ba; + + r = r < 0 ? 0 : (r > 255) ? 255 : r; + g = g < 0 ? 0 : (g > 255) ? 255 : g; + b = b < 0 ? 0 : (b > 255) ? 255 : b; + + r1 = r1 < 0 ? 0 : (r1 > 255) ? 255 : r1; + g1 = g1 < 0 ? 0 : (g1 > 255) ? 255 : g1; + b1 = b1 < 0 ? 0 : (b1 > 255) ? 255 : b1; + + *ptr_bgr1++ = b; + *ptr_bgr1++ = g; + *ptr_bgr1++ = r; + + *ptr_bgr1++ = b1; + *ptr_bgr1++ = g1; + *ptr_bgr1++ = r1; + + ptr_y1 += 2; + ptr_vu += 2; + } + if (j < srcw) { + uint8_t _y = ptr_y1[0]; + uint8_t _v = ptr_vu[v_num]; + uint8_t _u = ptr_vu[u_num]; + + int r = _y + ((179 * (_v - 128)) >> 7); + int g = _y - ((44 * (_u - 128) - 91 * (_v - 128)) >> 7); + int b = _y + ((227 * (_u - 128)) >> 7); + + r = r < 0 ? 0 : (r > 255) ? 255 : r; + g = g < 0 ? 0 : (g > 255) ? 255 : g; + b = b < 0 ? 0 : (b > 255) ? 
255 : b; + + ptr_bgr1[0] = b; + ptr_bgr1[1] = g; + ptr_bgr1[2] = r; + } + } +} +void nv2bgra(const uint8_t* in_data, + uint8_t* out_data, + int srcw, + int srch, + int v_num, + int u_num) { + int size = srch * srcw; + const uint8_t* y_ptr = in_data; + const uint8_t* uv_ptr = in_data + size; + for (int i = 0; i < srch; i++) { + int j = 0; + const uint8_t* ptr_y1 = y_ptr + i * srcw; + const uint8_t* ptr_vu = uv_ptr + (i / 2) * srcw; + uint8_t* ptr_bgr1 = out_data + i * 4 * srcw; + for (; j < srcw; j += 2) { + uint8_t _y0 = ptr_y1[0]; + uint8_t _y1 = ptr_y1[1]; + uint8_t _v = ptr_vu[v_num]; + uint8_t _u = ptr_vu[u_num]; + + int ra = floor((179 * (_v - 128)) >> 7); + int ga = floor((44 * (_u - 128) + 91 * (_v - 128)) >> 7); + int ba = floor((227 * (_u - 128)) >> 7); + + int r = _y0 + ra; + int g = _y0 - ga; + int b = _y0 + ba; + + int r1 = _y1 + ra; + int g1 = _y1 - ga; + int b1 = _y1 + ba; + + r = r < 0 ? 0 : (r > 255) ? 255 : r; + g = g < 0 ? 0 : (g > 255) ? 255 : g; + b = b < 0 ? 0 : (b > 255) ? 255 : b; + + r1 = r1 < 0 ? 0 : (r1 > 255) ? 255 : r1; + g1 = g1 < 0 ? 0 : (g1 > 255) ? 255 : g1; + b1 = b1 < 0 ? 0 : (b1 > 255) ? 255 : b1; + + *ptr_bgr1++ = b; + *ptr_bgr1++ = g; + *ptr_bgr1++ = r; + *ptr_bgr1++ = 255; + + *ptr_bgr1++ = b1; + *ptr_bgr1++ = g1; + *ptr_bgr1++ = r1; + *ptr_bgr1++ = 255; + + ptr_y1 += 2; + ptr_vu += 2; + } + if (j < srcw) { + uint8_t _y = ptr_y1[0]; + uint8_t _v = ptr_vu[v_num]; + uint8_t _u = ptr_vu[u_num]; + + int r = _y + ((179 * (_v - 128)) >> 7); + int g = _y - ((44 * (_u - 128) - 91 * (_v - 128)) >> 7); + int b = _y + ((227 * (_u - 128)) >> 7); + + r = r < 0 ? 0 : (r > 255) ? 255 : r; + g = g < 0 ? 0 : (g > 255) ? 255 : g; + b = b < 0 ? 0 : (b > 255) ? 255 : b; + + ptr_bgr1[0] = b; + ptr_bgr1[1] = g; + ptr_bgr1[2] = r; + ptr_bgr1[3] = 255; + } + } +} + +void nv12_bgr_basic(const uint8_t* in_data, + uint8_t* out_data, + int srcw, + int srch) { + nv2bgr(in_data, out_data, srcw, srch, 1, 0); +} + +void nv21_bgr_basic(const uint8_t* in_data, + uint8_t* out_data, + int srcw, + int srch) { + nv2bgr(in_data, out_data, srcw, srch, 0, 1); +} +void nv12_bgra_basic(const uint8_t* in_data, + uint8_t* out_data, + int srcw, + int srch) { + nv2bgra(in_data, out_data, srcw, srch, 1, 0); +} + +void nv21_bgra_basic(const uint8_t* in_data, + uint8_t* out_data, + int srcw, + int srch) { + nv2bgra(in_data, out_data, srcw, srch, 0, 1); +} + +/* +Using CV_BGR2GRAY, the formula is Gray = 0.1140*B + 0.5870*G + 0.2989*R +Using CV_RGB2GRAY, the formula is Gray = 0.1140*R + 0.5870*G + 0.2989*B +The weights are pre-scaled by 128: +b = 0.114 * 128 = 14.592, rounded to 15 +g = 0.587 * 128 = 75.136, rounded to 75 +r = 0.2989 * 128 = 38.2592, rounded to 38 +Gray = (15*B + 75*G + 38*R) / 128 +bgr2gray, rgb2gray +*/ +void bgr_gray_basic(const uint8_t* in_data, + uint8_t* out_data, + int srcw, + int srch) { + for (int i = 0; i < srch; i++) { + const uint8_t* din_ptr = in_data + i * 3 * srcw; + uint8_t* dout_ptr = out_data + i * srcw; + for (int j = 0; j < srcw; j++) { + int sum = din_ptr[0] * 15 + din_ptr[1] * 75 + din_ptr[2] * 38; + sum = sum >> 7; + *dout_ptr++ = sum; + din_ptr += 3; + } + } +} + +void gray_bgr_basic(const uint8_t* src, uint8_t* dst, int srcw, int srch) { + for (int i = 0; i < srch; i++) { + for (int j = 0; j < srcw; j++) { + *dst++ = *src; + *dst++ = *src; + *dst++ = *src; + src++; + } + } +} +// bgr2bgra, rgb2rgba +void hwc3_to_hwc4_basic(const uint8_t* src, uint8_t* dst, int srcw, int srch) { + for (int i = 0; i < srch; i++) { + for (int j = 0; j < srcw; j++) { + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = 255; + } + } +} +// bgra2bgr, rgba2rgb +void
hwc4_to_hwc3_basic(const uint8_t* src, uint8_t* dst, int srcw, int srch) { + for (int i = 0; i < srch; i++) { + for (int j = 0; j < srcw; j++) { + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + src++; + } + } +} +// bgr2rgb, rgb2bgr +void hwc3_trans_basic(const uint8_t* src, uint8_t* dst, int srcw, int srch) { + for (int i = 0; i < srch; i++) { + for (int j = 0; j < srcw; j++) { + *dst++ = src[2]; // r + *dst++ = src[1]; // g + *dst++ = src[0]; // b + src += 3; + } + } +} +// bgra2rgba, rgba2bgra +void hwc4_trans_basic(const uint8_t* src, uint8_t* dst, int srcw, int srch) { + for (int i = 0; i < srch; i++) { + for (int j = 0; j < srcw; j++) { + *dst++ = src[2]; // r + *dst++ = src[1]; // g + *dst++ = src[0]; // b + *dst++ = src[3]; // a + src += 4; + } + } +} +// bgra2rgb, rgba2bgr +void hwc4_trans_hwc3_basic(const uint8_t* src, + uint8_t* dst, + int srcw, + int srch) { + for (int i = 0; i < srch; i++) { + for (int j = 0; j < srcw; j++) { + *dst++ = src[2]; // r + *dst++ = src[1]; // g + *dst++ = src[0]; // b + // *dst++ = src[4];//a + src += 4; + } + } +} +// bgr2rgba, rgb2bga +void hwc3_trans_hwc4_basic(const uint8_t* src, + uint8_t* dst, + int srcw, + int srch) { + for (int i = 0; i < srch; i++) { + for (int j = 0; j < srcw; j++) { + *dst++ = src[2]; // r + *dst++ = src[1]; // g + *dst++ = src[0]; // b + *dst++ = 255; // a + src += 3; + } + } +} +void image_convert_basic(const uint8_t* in_data, + uint8_t* out_data, + ImageFormat srcFormat, + ImageFormat dstFormat, + int srcw, + int srch, + int out_size) { + if (srcFormat == dstFormat) { + // copy + memcpy(out_data, in_data, sizeof(uint8_t) * out_size); + return; + } else { + if (srcFormat == ImageFormat::NV12 && + (dstFormat == ImageFormat::BGR || dstFormat == ImageFormat::RGB)) { + nv12_bgr_basic(in_data, out_data, srcw, srch); + } else if (srcFormat == ImageFormat::NV21 && + (dstFormat == ImageFormat::BGR || + dstFormat == ImageFormat::RGB)) { + nv21_bgr_basic(in_data, out_data, srcw, srch); + } else if (srcFormat == ImageFormat::NV12 && + (dstFormat == ImageFormat::BGRA || + dstFormat == ImageFormat::RGBA)) { + nv12_bgra_basic(in_data, out_data, srcw, srch); + } else if (srcFormat == ImageFormat::NV21 && + (dstFormat == ImageFormat::BGRA || + dstFormat == ImageFormat::RGBA)) { + nv21_bgra_basic(in_data, out_data, srcw, srch); + } else if ((srcFormat == ImageFormat::RGB && + dstFormat == ImageFormat::GRAY) || + (srcFormat == ImageFormat::BGR && + dstFormat == ImageFormat::GRAY)) { + bgr_gray_basic(in_data, out_data, srcw, srch); + } else if ((srcFormat == ImageFormat::GRAY && + dstFormat == ImageFormat::RGB) || + (srcFormat == ImageFormat::GRAY && + dstFormat == ImageFormat::BGR)) { + gray_bgr_basic(in_data, out_data, srcw, srch); + } else if ((srcFormat == ImageFormat::RGBA && + dstFormat == ImageFormat::RGB) || + (srcFormat == ImageFormat::BGRA && + dstFormat == ImageFormat::BGR)) { + hwc4_to_hwc3_basic(in_data, out_data, srcw, srch); + } else if ((srcFormat == ImageFormat::RGB && + dstFormat == ImageFormat::RGBA) || + (srcFormat == ImageFormat::BGR && + dstFormat == ImageFormat::BGRA)) { + hwc3_to_hwc4_basic(in_data, out_data, srcw, srch); + } else if ((srcFormat == ImageFormat::RGB && + dstFormat == ImageFormat::BGR) || + (srcFormat == ImageFormat::BGR && + dstFormat == ImageFormat::RGB)) { + hwc3_trans_basic(in_data, out_data, srcw, srch); + } else if ((srcFormat == ImageFormat::RGBA && + dstFormat == ImageFormat::BGRA) || + (srcFormat == ImageFormat::BGRA && + dstFormat == ImageFormat::RGBA)) { + 
hwc4_trans_basic(in_data, out_data, srcw, srch); + } else if ((srcFormat == ImageFormat::RGBA && + dstFormat == ImageFormat::BGR) || + (srcFormat == ImageFormat::BGRA && + dstFormat == ImageFormat::RGB)) { + hwc4_trans_hwc3_basic(in_data, out_data, srcw, srch); + } else if ((srcFormat == ImageFormat::RGB && + dstFormat == ImageFormat::BGRA) || + (srcFormat == ImageFormat::BGR && + dstFormat == ImageFormat::RGBA)) { + hwc3_trans_hwc4_basic(in_data, out_data, srcw, srch); + } else { + printf("srcFormat: %d, dstFormat: %d does not support! \n", + srcFormat, + dstFormat); + } + // for (int i = 0; i < out_size; i++){ + // printf("%d ", *out_data++); + // if ((i+1) % 10 == 0){ + // printf("\n"); + // } + // } + } +} + +void compute_xy(int srcw, + int srch, + int dstw, + int dsth, + double scale_x, + double scale_y, + int* xofs, + int* yofs, + float* ialpha, + float* ibeta) { + float fy = 0.f; + float fx = 0.f; + int sy = 0; + int sx = 0; + const int resize_coef_bits = 11; + const int resize_coef_scale = 1 << resize_coef_bits; + for (int dx = 0; dx < dstw; dx++) { + fx = static_cast((dx + 0.5) * scale_x - 0.5); + sx = floor(fx); + fx -= sx; + + if (sx < 0) { + sx = 0; + fx = 0.f; + } + if (sx >= srcw - 1) { + sx = srcw - 2; + fx = 1.f; + } + xofs[dx] = sx; + + float a0 = (1.f - fx); + float a1 = fx; + + ialpha[dx * 2] = a0; + ialpha[dx * 2 + 1] = a1; + } + for (int dy = 0; dy < dsth; dy++) { + fy = static_cast((dy + 0.5) * scale_y - 0.5); + sy = floor(fy); + fy -= sy; + + if (sy < 0) { + sy = 0; + fy = 0.f; + } + if (sy >= srch - 1) { + sy = srch - 2; + fy = 1.f; + } + + yofs[dy] = sy; + + float b0 = (1.f - fy); + float b1 = fy; + + ibeta[dy * 2] = b0; + ibeta[dy * 2 + 1] = b1; + } +} +void image_resize_basic(const uint8_t* in_data, + uint8_t* out_data, + ImageFormat srcFormat, + int srcw, + int srch, + int dstw, + int dsth) { + int size = srcw * srch; + if (srcw == dstw && srch == dsth) { + if (srcFormat == ImageFormat::NV12 || srcFormat == ImageFormat::NV21) { + size = srcw * (ceil(1.5 * srch)); + } else if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) { + size = 3 * srcw * srch; + } else if (srcFormat == ImageFormat::BGRA || + srcFormat == ImageFormat::RGBA) { + size = 4 * srcw * srch; + } + memcpy(out_data, in_data, sizeof(uint8_t) * size); + return; + } + double scale_x = static_cast(srcw / dstw); + double scale_y = static_cast(srch / dsth); + + int* buf = new int[dstw + dsth]; + + int* xofs = buf; + int* yofs = buf + dstw; + float* ialpha = new float[dstw * 2]; + float* ibeta = new float[dsth * 2]; + + int w_in = srcw; + int w_out = dstw; + int num = 1; + int orih = dsth; + compute_xy( + srcw, srch, dstw, dsth, scale_x, scale_y, xofs, yofs, ialpha, ibeta); + + if (srcFormat == ImageFormat::GRAY) { + num = 1; + } else if (srcFormat == ImageFormat::NV12 || srcFormat == ImageFormat::NV21) { + int hout = static_cast(0.5 * dsth); + // uv todo + w_out = dstw; + num = 1; + dsth += hout; + } else if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) { + w_in = srcw * 3; + w_out = dstw * 3; + num = 3; + } else if (srcFormat == ImageFormat::BGRA || srcFormat == ImageFormat::RGBA) { + w_in = srcw * 4; + w_out = dstw * 4; + num = 4; + } + float* ialpha1 = nullptr; + int* xofs1 = nullptr; + int* yofs1 = nullptr; + if (orih < dsth) { + int tmp = dsth - orih; + float* ialpha1 = new float[dstw]; + int* xofs1 = new int[srcw]; + int* yofs1 = new int[tmp]; + compute_xy(srcw / 2, + srch / 2, + dstw / 2, + tmp, + scale_x, + scale_y, + xofs1, + yofs1, + ialpha1, + ibeta + dsth); 
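// A worked example of the offsets and weights produced by compute_xy above:
// with srcw = 4 and dstw = 2, scale_x = 2.0. For dx = 0, fx = (0 + 0.5) * 2.0 - 0.5 = 0.5,
// so sx = 0 and a0 = a1 = 0.5, i.e. output pixel 0 averages source pixels 0 and 1;
// for dx = 1, fx = 2.5, so sx = 2 and fx becomes 0.5, averaging source pixels 2 and 3.
// The clamps keep sx within [0, srcw - 2], so border pixels fall back to the last
// valid pair; the ibeta weights play the same role along y.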
+ } +#pragma omp parallel for + for (int dy = 0; dy < dsth; dy++) { + uint8_t* out_ptr = out_data + dy * w_out; + int y_in_start = yofs[dy]; + int y_in_end = y_in_start + 1; + int y_flag = 0; // only one line + if (y_in_start < 0) { + y_flag = 1; + } + float b0 = ibeta[dy * 2]; + float b1 = ibeta[dy * 2 + 1]; + if (dy >= orih) { + num = 2; // uv + ialpha = ialpha1; + xofs = xofs1; + yofs = yofs1; + } + for (int dx = 0; dx < w_out; dx += num) { + int tmp = dx / num; + int x_in_start = xofs[tmp] * num; // 0 + int x_in_end = x_in_start + num; // 2 + int x_flag = 0; + if (x_in_start < 0) { + x_flag = 1; + x_in_end = 0; + } + // printf("x_in: %d, y_in: %d \n", x_in_start, y_in_start); + float a0 = ialpha[tmp * 2]; + float a1 = ialpha[tmp * 2 + 1]; + int tl_index = y_in_start * w_in + x_in_start; // 0 + int tr_index = y_in_start * w_in + x_in_end; // 2 + int bl_index = y_in_end * w_in + x_in_start; + int br_index = y_in_end * w_in + x_in_end; + int ind = dx; + for (int i = 0; i < num; i++) { + int tl = in_data[tl_index]; + int tr = in_data[tr_index]; + int bl = in_data[bl_index]; + int br = in_data[br_index]; + if (y_flag == 1) { + tl = 0; + tr = 0; + } + if (x_flag == 1) { + tl = 0; + bl = 0; + } + tl_index++; + tr_index++; + bl_index++; + br_index++; + float outval = (tl * a0 + tr * a1) * b0 + (bl * a0 + br * a1) * b1; + // printf("tl: %d, tr: %d, bl: %d, br: %d \n", tl, tr, bl, br); + // printf("br_index: %d, a0: %f, b1: %f, out: %f \n", br_index, a0, b1, + // outval); + out_ptr[ind++] = ceil(outval); + } + } + } +} + +void rotate90_basic(const uint8_t* in_data, + int h_in, + int w_in, + uint8_t* out_data, + int h_out, + int w_out, + int num) { + int win = w_in * num; + int wout = w_out * num; + for (int x = 0; x < h_in; x++) { + for (int y = 0; y < w_in; y++) { + int tmpy = y * num; + int tmpx = (w_out - 1 - x) * num; // x + for (int i = 0; i < num; i++) { + out_data[y * wout + tmpx] = in_data[x * win + tmpy]; + tmpx++; + tmpy++; + } + } + } +} + +void rotate180_basic(const uint8_t* in_data, + int h_in, + int w_in, + uint8_t* out_data, + int h_out, + int w_out, + int num) { + int win = w_in * num; + int h = h_in - 1; + int w = win - 1; + for (int x = 0; x < h_in; x++) { + for (int y = 0; y < w_in; y++) { + int tmpy = y * num; + int tmp = tmpy + (num - 1); + for (int i = 0; i < num; i++) { + out_data[(h - x) * win + w - tmp] = in_data[x * win + tmpy]; + tmpy++; + tmp--; + } + } + } +} +void rotate270_basic(const uint8_t* in_data, + int h_in, + int w_in, + uint8_t* out_data, + int h_out, + int w_out, + int num) { + int win = w_in * num; + int wout = w_out * num; + int h = h_out - 1; + for (int x = 0; x < h_in; x++) { + for (int y = 0; y < w_in; y++) { + int tmpy = y * num; + int tmpx = x * num; + for (int i = 0; i < num; i++) { + out_data[(h - y) * wout + tmpx] = + in_data[x * win + tmpy]; // (y,x) = in(x,y) + tmpx++; + tmpy++; + } + } + } +} + +void image_rotate_basic(const uint8_t* in_data, + uint8_t* out_data, + ImageFormat srcFormat, + int srcw, + int srch, + float rotate) { + int num = 1; + if (srcFormat == ImageFormat::GRAY) { + num = 1; + } else if (srcFormat == ImageFormat::NV12 || srcFormat == ImageFormat::NV21) { + num = 1; // todo + return; + } else if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) { + num = 3; + } else if (srcFormat == ImageFormat::BGRA || srcFormat == ImageFormat::RGBA) { + num = 4; + } + if (rotate == 90) { + rotate90_basic(in_data, srch, srcw, out_data, srcw, srch, num); + } else if (rotate == 180) { + rotate180_basic(in_data, srch, srcw, 
out_data, srch, srcw, num); + } else if (rotate == 270) { + rotate270_basic(in_data, srch, srcw, out_data, srcw, srch, num); + } +} + +void flipx_basic( + const uint8_t* in_data, int h_in, int w_in, uint8_t* out_data, int num) { + int h = h_in - 1; + int w = w_in * num; + for (int x = 0; x < h_in; x++) { + for (int y = 0; y < w_in; y++) { + int tmpy = y * num; + for (int i = 0; i < num; i++) { + out_data[(h - x) * w + tmpy] = + in_data[x * w + tmpy]; // (y,x) = in(x,y) + tmpy++; + } + } + } +} + +void flipy_basic( + const uint8_t* in_data, int h_in, int w_in, uint8_t* out_data, int num) { + int w = w_in * num - 1; + for (int x = 0; x < h_in; x++) { + for (int y = 0; y < w_in; y++) { + int tmpy = y * num; + int tmp = tmpy + (num - 1); + for (int i = 0; i < num; i++) { + out_data[x * w_in * num + w - tmp] = + in_data[x * w_in * num + tmpy]; // (y,x) = in(x,y) + tmpy++; + tmp--; + } + } + } +} + +void flipxy_basic( + const uint8_t* in_data, int h_in, int w_in, uint8_t* out_data, int num) { + int win = w_in * num; + int h = h_in - 1; + int w = win - 1; + for (int x = 0; x < h_in; x++) { + for (int y = 0; y < w_in; y++) { + int tmpy = y * num; + int tmp = tmpy + (num - 1); + for (int i = 0; i < num; i++) { + out_data[(h - x) * win + w - tmp] = + in_data[x * win + tmpy]; // (h-y,w-x) = in(x,y) + tmpy++; + tmp--; + } + } + } +} + +void image_flip_basic(const uint8_t* in_data, + uint8_t* out_data, + ImageFormat srcFormat, + int srcw, + int srch, + FlipParam flip) { + int num = 1; + if (srcFormat == ImageFormat::GRAY) { + num = 1; + } else if (srcFormat == ImageFormat::NV12 || srcFormat == ImageFormat::NV21) { + num = 1; // todo + return; + } else if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) { + num = 3; + } else if (srcFormat == ImageFormat::BGRA || srcFormat == ImageFormat::RGBA) { + num = 4; + } + // printf("image_flip_basic: %d \n", flip); + if (flip == FlipParam::X) { + flipx_basic(in_data, srch, srcw, out_data, num); + } else if (flip == FlipParam::Y) { + flipy_basic(in_data, srch, srcw, out_data, num); + } else if (flip == FlipParam::XY) { + flipxy_basic(in_data, srch, srcw, out_data, num); + } +} + +void bgr_to_tensor_chw_basic(const uint8_t* bgr, + float* output, + int width, + int height, + float* means, + float* scales, + int num) { + int size = width * height; + float r_means = means[0]; + float g_means = means[1]; + float b_means = means[2]; + float r_scales = scales[0]; + float g_scales = scales[1]; + float b_scales = scales[2]; + + for (int h = 0; h < height; h++) { + const uint8_t* ptr_bgr = bgr + h * width * num; + float* ptr_b = output + h * width; + float* ptr_g = ptr_b + size; + float* ptr_r = ptr_g + size; + for (int i = 0; i < width; i++) { + *ptr_b++ = (ptr_bgr[0] - b_means) * b_scales; + *ptr_g++ = (ptr_bgr[1] - g_means) * g_scales; + *ptr_r++ = (ptr_bgr[2] - r_means) * r_scales; + ptr_bgr += num; + } + } +} + +void bgr_to_tensor_hwc_basic(const uint8_t* bgr, + float* output, + int width, + int height, + float* means, + float* scales, + int num) { + int size = width * height; + float r_means = means[0]; + float g_means = means[1]; + float b_means = means[2]; + float r_scales = scales[0]; + float g_scales = scales[1]; + float b_scales = scales[2]; + + for (int h = 0; h < height; h++) { + const uint8_t* ptr_bgr = bgr + h * width * num; + float* out_bgr = output + h * width * num; + for (int i = 0; i < width; i++) { + *out_bgr++ = (ptr_bgr[0] - b_means) * b_scales; + *out_bgr++ = (ptr_bgr[1] - g_means) * g_scales; + *out_bgr++ = (ptr_bgr[2] - r_means) * 
r_scales; + ptr_bgr += num; + } + } +} + +void image_to_tensor_basic(const uint8_t* in_data, + Tensor* dst, + ImageFormat srcFormat, + LayoutType layout, + int srcw, + int srch, + float* means, + float* scales) { + float* output = dst->mutable_data<float>(); + if (layout == LayoutType::kNCHW && + (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB)) { + bgr_to_tensor_chw_basic(in_data, output, srcw, srch, means, scales, 3); + } else if (layout == LayoutType::kNHWC && + (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB)) { + bgr_to_tensor_hwc_basic(in_data, output, srcw, srch, means, scales, 3); + } else if (layout == LayoutType::kNCHW && (srcFormat == ImageFormat::BGRA || + srcFormat == ImageFormat::RGBA)) { + bgr_to_tensor_chw_basic(in_data, output, srcw, srch, means, scales, 4); + } else if (layout == LayoutType::kNHWC && (srcFormat == ImageFormat::BGRA || + srcFormat == ImageFormat::RGBA)) { + bgr_to_tensor_hwc_basic(in_data, output, srcw, srch, means, scales, 4); + } +} diff --git a/lite/tests/cv/image_convert_test.cc b/lite/tests/cv/image_convert_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..7c0f867fae4bca1957ba1610db5f40b8c8dbabdf --- /dev/null +++ b/lite/tests/cv/image_convert_test.cc @@ -0,0 +1,723 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
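The constants 179, 44, 91, and 227 used with ">> 7" in nv2bgr/nv2bgra above are the usual YUV-to-RGB coefficients pre-scaled by 128 (1.402, 0.344, 0.714, 1.772). A small standalone sketch, assuming those standard coefficients, compares the floating-point and fixed-point forms for one pixel; it is illustrative only and not part of the patch:

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  uint8_t y = 120, u = 140, v = 150;  // one sample YUV pixel
  // Floating-point reference: R = Y + 1.402*(V-128), etc.
  float rf = y + 1.402f * (v - 128);
  float gf = y - 0.344f * (u - 128) - 0.714f * (v - 128);
  float bf = y + 1.772f * (u - 128);
  // Fixed-point form used by nv2bgr: coefficients scaled by 128, ">> 7" rescales.
  int ri = std::min(255, std::max(0, y + ((179 * (v - 128)) >> 7)));
  int gi = std::min(255, std::max(0, y - ((44 * (u - 128) + 91 * (v - 128)) >> 7)));
  int bi = std::min(255, std::max(0, y + ((227 * (u - 128)) >> 7)));
  std::printf("float: R=%.1f G=%.1f B=%.1f\n", rf, gf, bf);
  std::printf("fixed: R=%d G=%d B=%d\n", ri, gi, bi);  // R=150 G=101 B=141
  return 0;
}
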
+ +#include +#include +#include +#include +#include "lite/core/context.h" +#include "lite/tests/cv/cv_basic.h" +#include "lite/tests/utils/timer.h" +#include "lite/utils/cv/paddle_image_preprocess.h" + +DEFINE_int32(cluster, 3, "cluster id"); +DEFINE_int32(threads, 1, "threads num"); +DEFINE_int32(warmup, 0, "warmup times"); +DEFINE_int32(repeats, 1, "repeats times"); +DEFINE_bool(basic_test, false, "do all tests"); +DEFINE_bool(check_result, true, "check the result"); + +DEFINE_int32(srcFormat, 0, "input image format"); +DEFINE_int32(dstFormat, 1, "output image format"); +DEFINE_int32(srch, 1920, "input height"); +DEFINE_int32(srcw, 1080, "input width"); +DEFINE_int32(dsth, 960, "output height"); +DEFINE_int32(dstw, 540, "output width"); +DEFINE_int32(angle, 90, "rotate angel"); +DEFINE_int32(flip_num, 0, "flip x"); +DEFINE_int32(layout, 0, "layout nchw"); + +typedef paddle::lite::utils::cv::ImageFormat ImageFormat; +typedef paddle::lite::utils::cv::FlipParam FlipParam; +typedef paddle::lite_api::DataLayoutType LayoutType; +typedef paddle::lite::utils::cv::TransParam TransParam; +typedef paddle::lite::utils::cv::ImagePreprocess ImagePreprocess; +typedef paddle::lite_api::Tensor Tensor_api; +typedef paddle::lite::Tensor Tensor; + +using paddle::lite::Timer; + +void fill_tensor_host_rand(uint8_t* dio, int64_t size) { + uint seed = 256; + for (int64_t i = 0; i < size; ++i) { + dio[i] = rand_r(&seed) % 256; // -128; + } +} + +void print_int8(uint8_t* ptr, int size, int width) { + for (int i = 0; i < size; i++) { + printf("%d ", *ptr++); + if ((i + 1) % width == 0) { + printf("\n"); + } + } + printf("\n"); +} + +void print_int(int* ptr, int size, int width) { + int j = 0; + for (int i = 0; i < size; i++) { + printf("%d ", *ptr++); + if ((i + 1) % width == 0) { + printf("\n"); + } + } + printf("\n"); +} + +void print_ff(const float* ptr, int size, int width) { + int j = 0; + for (int i = 0; i < size; i++) { + printf("%f ", *ptr++); + if ((i + 1) % width == 0) { + printf("\n"); + } + } + printf("\n"); +} + +#ifdef LITE_WITH_ARM +void test_img(const std::vector& cluster_id, + const std::vector& thread_num, + int srcw, + int srch, + int dstw, + int dsth, + ImageFormat srcFormat, + ImageFormat dstFormat, + float rotate, + FlipParam flip, + LayoutType layout, + int test_iter = 1) { +#ifdef LITE_WITH_ARM + paddle::lite::DeviceInfo::Init(); +#endif + for (auto& cls : cluster_id) { + for (auto& th : thread_num) { + std::unique_ptr ctx1( + new paddle::lite::KernelContext); + auto& ctx = ctx1->As(); + ctx.SetRunMode(static_cast(cls), th); + LOG(INFO) << "cluster: " << cls << ", threads: " << th; + + LOG(INFO) << " input tensor size, num= " << 1 << ", channel= " << 1 + << ", height= " << srch << ", width= " << srcw + << ", srcFormat= " << (ImageFormat)srcFormat; + // RGBA = 0, BGRA, RGB, BGR, GRAY, NV21 = 11, NV12, + if (srcFormat == ImageFormat::NV21) { + LOG(INFO) << "srcFormat: NV21"; + } + if (srcFormat == ImageFormat::NV12) { + LOG(INFO) << "srcFormat: NV12"; + } + if (srcFormat == ImageFormat::GRAY) { + LOG(INFO) << "srcFormat: GRAY"; + } + if (srcFormat == ImageFormat::BGRA) { + LOG(INFO) << "srcFormat: BGRA"; + } + if (srcFormat == ImageFormat::BGR) { + LOG(INFO) << "srcFormat: BGR"; + } + if (srcFormat == ImageFormat::RGBA) { + LOG(INFO) << "srcFormat: RGBA"; + } + if (srcFormat == ImageFormat::RGB) { + LOG(INFO) << "srcFormat: RGB"; + } + LOG(INFO) << " output tensor size, num=" << 1 << ", channel=" << 1 + << ", height=" << dsth << ", width=" << dstw + << ", dstFormat= " << 
(ImageFormat)dstFormat; + + if (dstFormat == ImageFormat::NV21) { + LOG(INFO) << "dstFormat: NV21"; + } + if (dstFormat == ImageFormat::NV12) { + LOG(INFO) << "dstFormat: NV12"; + } + if (dstFormat == ImageFormat::GRAY) { + LOG(INFO) << "dstFormat: GRAY"; + } + if (dstFormat == ImageFormat::BGRA) { + LOG(INFO) << "dstFormat: BGRA"; + } + if (dstFormat == ImageFormat::BGR) { + LOG(INFO) << "dstFormat: BGR"; + } + if (dstFormat == ImageFormat::RGBA) { + LOG(INFO) << "dstFormat: RGBA"; + } + if (dstFormat == ImageFormat::RGB) { + LOG(INFO) << "dstFormat: RGB"; + } + + LOG(INFO) << "Rotate = " << rotate << ", Flip = " << flip + << ", Layout = " << static_cast(layout); + + int size = 3 * srch * srcw; + if (srcFormat == ImageFormat::NV12 || srcFormat == ImageFormat::NV21) { + size = ceil(1.5 * srch) * srcw; + } else if (srcFormat == ImageFormat::BGRA || + srcFormat == ImageFormat::RGBA) { + size = 4 * srch * srcw; + } else if (srcFormat == ImageFormat::GRAY) { + size = srch * srcw; + } + uint8_t* src = new uint8_t[size]; + fill_tensor_host_rand(src, size); + + int out_size = srch * srcw; + int resize = dstw * dsth; + if (dstFormat == ImageFormat::NV12 || dstFormat == ImageFormat::NV21) { + out_size = ceil(1.5 * srch) * srcw; + resize = ceil(1.5 * dsth) * dstw; + } else if (dstFormat == ImageFormat::BGR || + dstFormat == ImageFormat::RGB) { + out_size = 3 * srch * srcw; + resize = 3 * dsth * dstw; + } else if (dstFormat == ImageFormat::BGRA || + dstFormat == ImageFormat::RGBA) { + out_size = 4 * srch * srcw; + resize = 4 * dsth * dstw; + } else if (dstFormat == ImageFormat::GRAY) { + out_size = srch * srcw; + resize = dsth * dstw; + } + // out + uint8_t* basic_dst = new uint8_t[out_size]; + uint8_t* lite_dst = new uint8_t[out_size]; + + // resize + uint8_t* resize_basic = new uint8_t[resize]; + uint8_t* resize_tmp = new uint8_t[resize]; + + uint8_t* tv_out_ratote_basic = new uint8_t[resize]; + uint8_t* tv_out_ratote = new uint8_t[resize]; + + uint8_t* tv_out_flip_basic = new uint8_t[resize]; + uint8_t* tv_out_flip = new uint8_t[resize]; + + std::vector shape_out = {1, 3, dsth, dstw}; + + Tensor tensor; + Tensor tensor_basic; + tensor.Resize(shape_out); + tensor_basic.Resize(shape_out); + tensor.set_precision(PRECISION(kFloat)); + tensor_basic.set_precision(PRECISION(kFloat)); + + float means[3] = {127.5f, 127.5f, 127.5f}; + float scales[3] = {1 / 127.5f, 1 / 127.5f, 1 / 127.5f}; + + if (FLAGS_check_result) { + LOG(INFO) << "image convert basic compute"; + image_convert_basic(src, + basic_dst, + (ImageFormat)srcFormat, + (ImageFormat)dstFormat, + srcw, + srch, + out_size); + + LOG(INFO) << "image resize basic compute"; + image_resize_basic(basic_dst, + resize_basic, + (ImageFormat)dstFormat, + srcw, + srch, + dstw, + dsth); + + LOG(INFO) << "image rotate basic compute"; + image_rotate_basic(resize_basic, + tv_out_ratote_basic, + (ImageFormat)dstFormat, + dstw, + dsth, + rotate); + + LOG(INFO) << "image flip basic compute"; + image_flip_basic(resize_basic, + tv_out_flip_basic, + (ImageFormat)dstFormat, + dstw, + dsth, + flip); + + LOG(INFO) << "image to tensor basic compute"; + image_to_tensor_basic(resize_basic, + &tensor_basic, + (ImageFormat)dstFormat, + layout, + dstw, + dsth, + means, + scales); + } + + Timer t1; + + LOG(INFO) << "saber cv compute"; + double to = 0; + double min_time = 100000; + TransParam tparam; + tparam.ih = srch; + tparam.iw = srcw; + tparam.oh = dsth; + tparam.ow = dstw; + tparam.flip_param = flip; + tparam.rotate_param = rotate; + + Tensor_api dst_tensor(&tensor); + 
dst_tensor.Resize(shape_out); + + ImagePreprocess image_preprocess(srcFormat, dstFormat, tparam); + + for (int i = 0; i < test_iter; ++i) { + t1.clear(); + t1.start(); + + LOG(INFO) << "image convert saber compute"; + // Option 1: image_preprocess.imageCovert(src, lite_dst); + image_preprocess.imageCovert( + src, lite_dst, (ImageFormat)srcFormat, (ImageFormat)dstFormat); + + LOG(INFO) << "image resize saber compute"; + // Option 1: image_preprocess.imageResize(lite_dst, resize_tmp); + image_preprocess.imageResize(lite_dst, + resize_tmp, + (ImageFormat)dstFormat, + srcw, + srch, + dstw, + dsth); + + LOG(INFO) << "image rotate saber compute"; + // Option 1: image_preprocess.imageRotate(resize_tmp, tv_out_ratote); + image_preprocess.imageRotate(resize_tmp, + tv_out_ratote, + (ImageFormat)dstFormat, + dstw, + dsth, + rotate); + + LOG(INFO) << "image flip saber compute"; + // Option 1: image_preprocess.imageFlip(resize_tmp, tv_out_flip); + image_preprocess.imageFlip( + resize_tmp, tv_out_flip, (ImageFormat)dstFormat, dstw, dsth, flip); + + LOG(INFO) << "image to tensor compute"; + // Option 1: image_preprocess.image2Tensor( + // resize_tmp, &dst_tensor, layout, means, scales); + image_preprocess.image2Tensor(resize_tmp, + &dst_tensor, + (ImageFormat)dstFormat, + dstw, + dsth, + layout, + means, + scales); + + t1.end(); + double tdiff = t1.get_average_ms(); + to += tdiff; + if (tdiff < min_time) { + min_time = tdiff; + } + } + LOG(INFO) << "image trans total time : " << to + << ", avg time : " << to / test_iter; + + double max_ratio = 0; + double max_diff = 0; + const double eps = 1e-6f; + if (FLAGS_check_result) { + LOG(INFO) << "diff, image convert size: " << out_size; + uint8_t* diff_v = new uint8_t[out_size]; + for (int i = 0; i < out_size; i++) { + uint8_t a = lite_dst[i]; + uint8_t b = basic_dst[i]; + int diff1 = a - b; + int diff = diff1 > 0 ? diff1 : -diff1; + diff_v[i] = diff; + if (max_diff < diff) { + max_diff = diff; + max_ratio = 2.0 * max_diff / (a + b + eps); + } + } + if (std::abs(max_ratio) >= 1e-5f) { + int width = size / srch; + printf("din: \n"); + print_int8(src, size, width); + width = out_size / srch; + printf("saber result: \n"); + print_int8(lite_dst, out_size, width); + printf("basic result: \n"); + print_int8(basic_dst, out_size, width); + printf("diff result: \n"); + print_int8(diff_v, out_size, width); + } + delete[] diff_v; + LOG(INFO) << "compare result, max diff: " << max_diff + << ", max ratio: " << max_ratio; + bool rst = std::abs(max_ratio) < 1e-5f; + CHECK_EQ(rst, true) << "compute result error"; + } + LOG(INFO) << "image convert end"; + if (FLAGS_check_result) { + max_ratio = 0; + max_diff = 0; + // const double eps = 1e-6f; + int* diff_v = new int[resize]; + LOG(INFO) << "diff, image resize size: " << resize; + for (int i = 0; i < resize; i++) { + uint8_t a = resize_tmp[i]; + uint8_t b = resize_basic[i]; + int diff1 = a - b; + int diff = 0; // basic resize and saber resize can differ by {-1, 1} + // because of float -> int rounding + if (diff1 < -1 || diff1 > 1) diff = diff1 < 0 ?
-diff1 : diff1; + diff_v[i] = diff; + if (max_diff < diff) { + max_diff = diff; + max_ratio = 2.0 * max_diff / (a + b + eps); + } + } + if (std::abs(max_ratio) >= 1e-5f) { + int width = out_size / srch; + printf("din: \n"); + print_int8(lite_dst, out_size, width); + width = resize / dsth; + printf("saber result: \n"); + print_int8(resize_tmp, resize, width); + printf("basic result: \n"); + print_int8(resize_basic, resize, width); + printf("diff result: \n"); + print_int(diff_v, resize, width); + } + delete[] diff_v; + // printf("\n"); + LOG(INFO) << "compare result, max diff: " << max_diff + << ", max ratio: " << max_ratio; + bool rst = std::abs(max_ratio) < 1e-5f; + CHECK_EQ(rst, true) << "compute result error"; + } + delete[] lite_dst; + delete[] basic_dst; + LOG(INFO) << "image resize end"; + + if (FLAGS_check_result) { + max_ratio = 0; + max_diff = 0; + int* diff_v = new int[resize]; + LOG(INFO) << "diff, image rotate size: " << resize; + for (int i = 0; i < resize; i++) { + int a = tv_out_ratote[i]; + int b = tv_out_ratote_basic[i]; + int diff1 = a - b; + int diff = 0; + if (diff1 < -1 || diff1 > 1) diff = diff1 < 0 ? -diff1 : diff1; + diff_v[i] = diff; + if (max_diff < diff) { + max_diff = diff; + max_ratio = 2.0 * max_diff / (a + b + eps); + } + } + if (std::abs(max_ratio) >= 1e-5f) { + int width = resize / dsth; + printf("din: \n"); + print_int8(resize_tmp, resize, width); + printf("saber result: \n"); + print_int8(tv_out_ratote, resize, width); + printf("basic result: \n"); + print_int8(tv_out_ratote_basic, resize, width); + printf("diff result: \n"); + print_int(diff_v, resize, width); + } + delete[] diff_v; + LOG(INFO) << "compare result, max diff: " << max_diff + << ", max ratio: " << max_ratio; + bool rst = std::abs(max_ratio) < 1e-5f; + CHECK_EQ(rst, true) << "compute result error"; + } + delete[] tv_out_ratote; + delete[] tv_out_ratote_basic; + LOG(INFO) << "image rotate end"; + + if (FLAGS_check_result) { + max_ratio = 0; + max_diff = 0; + int* diff_v = new int[resize]; + LOG(INFO) << "diff, image flip size: " << resize; + for (int i = 0; i < resize; i++) { + int a = tv_out_flip[i]; + int b = tv_out_flip_basic[i]; + int diff1 = a - b; + int diff = 0; + if (diff1 < -1 || diff1 > 1) diff = diff1 < 0 ? -diff1 : diff1; + diff_v[i] = diff; + if (max_diff < diff) { + max_diff = diff; + max_ratio = 2.0 * max_diff / (a + b + eps); + } + } + if (std::abs(max_ratio) >= 1e-5f) { + int width = resize / dsth; + printf("din: \n"); + print_int8(resize_tmp, resize, width); + printf("saber result: \n"); + print_int8(tv_out_flip, resize, width); + printf("basic result: \n"); + print_int8(tv_out_flip_basic, resize, width); + printf("diff result: \n"); + print_int(diff_v, resize, width); + } + delete[] diff_v; + LOG(INFO) << "compare result, max diff: " << max_diff + << ", max ratio: " << max_ratio; + bool rst = std::abs(max_ratio) < 1e-5f; + CHECK_EQ(rst, true) << "compute result error"; + } + delete[] tv_out_flip; + delete[] tv_out_flip_basic; + delete[] resize_tmp; + delete[] resize_basic; + LOG(INFO) << "image flip end"; + + if (FLAGS_check_result) { + max_ratio = 0; + max_diff = 0; + LOG(INFO) << "diff, iamge to tensor size: " << tensor.numel(); + const float* ptr_a = tensor.data(); + const float* ptr_b = tensor_basic.data(); + int ss = tensor.numel(); + float* diff_v = new float[ss]; + for (int i = 0; i < ss; i++) { + int a = ptr_a[i]; + int b = ptr_b[i]; + int diff1 = a - b; + int diff = 0; + if (diff1 < -1 || diff1 > 1) diff = diff1 < 0 ? 
-diff1 : diff1; + diff_v[i] = diff; + if (max_diff < diff) { + max_diff = diff; + max_ratio = 2.0 * max_diff / (a + b + eps); + } + } + if (std::abs(max_ratio) >= 1e-5f) { + int width = resize / srch; + printf("din: \n"); + print_int8(resize_tmp, resize, width); + printf("saber result: \n"); + print_ff(ptr_a, resize, width); + printf("basic result: \n"); + print_ff(ptr_b, resize, width); + printf("diff result: \n"); + print_ff(diff_v, resize, width); + } + LOG(INFO) << "compare result, max diff: " << max_diff + << ", max ratio: " << max_ratio; + bool rst = std::abs(max_ratio) < 1e-5f; + CHECK_EQ(rst, true) << "compute result error"; + LOG(INFO) << "iamge to tensor end"; + } + } + } +} + +#if 1 +TEST(TestImageConvertRand, test_func_image_convert_preprocess) { + if (FLAGS_basic_test) { + for (auto w : {1, 4, 8, 16, 112, 224, 1092}) { + for (auto h : {1, 4, 16, 112, 224}) { + for (auto ww : {66}) { + for (auto hh : {12}) { + for (auto rotate : {180}) { + for (auto flip : {0}) { + for (auto srcFormat : {0, 1, 2, 3, 4, 11, 12}) { + for (auto dstFormat : {0, 1, 2, 3}) { + for (auto layout : {1}) { + if ((dstFormat == ImageFormat::GRAY && + (srcFormat == ImageFormat::RGBA || + srcFormat == ImageFormat::BGRA)) || + (srcFormat == ImageFormat::GRAY && + (dstFormat == ImageFormat::RGBA || + dstFormat == ImageFormat::BGRA)) || + (srcFormat == ImageFormat::NV12 || + srcFormat == ImageFormat::NV21) && + (dstFormat == ImageFormat::GRAY || + dstFormat == ImageFormat::RGBA || + dstFormat == ImageFormat::BGRA)) { + continue; + } + if (srcFormat == ImageFormat::NV12 || + srcFormat == ImageFormat::NV21) { + if (w % 2) { // is not ou shu, two line y == one line + // uv + continue; + } + } + test_img({FLAGS_cluster}, + {1}, + w, + h, + ww, + hh, + (ImageFormat)srcFormat, + (ImageFormat)dstFormat, + rotate, + (FlipParam)flip, + (LayoutType)layout); + } + } + } + } + } + } + } + } + } + } +} +#endif +#if 1 +TEST(TestImageConvertRand, test_func_image_resize_preprocess) { + if (FLAGS_basic_test) { + for (auto w : {1, 4, 8, 16, 112, 224, 1092}) { + for (auto h : {1, 4, 16, 112, 224}) { + for (auto ww : {1, 2, 8, 32, 112}) { + for (auto hh : {1, 2, 8, 112}) { + for (auto rotate : {180}) { + for (auto flip : {0}) { + for (auto srcFormat : {0, 1, 2, 3, 4, 11, 12}) { + for (auto dstFormat : {0, 1, 2, 3}) { + for (auto layout : {1}) { + if (dstFormat == ImageFormat::NV12 || + dstFormat == ImageFormat::NV21 || + (dstFormat == ImageFormat::GRAY && + (srcFormat == ImageFormat::RGBA || + srcFormat == ImageFormat::BGRA)) || + (srcFormat == ImageFormat::GRAY && + (dstFormat == ImageFormat::RGBA || + dstFormat == ImageFormat::BGRA)) || + (srcFormat == ImageFormat::NV12 || + srcFormat == ImageFormat::NV21) && + (dstFormat == ImageFormat::GRAY || + dstFormat == ImageFormat::RGBA || + dstFormat == ImageFormat::BGRA)) { + continue; + } + if (srcFormat == ImageFormat::NV12 || + srcFormat == ImageFormat::NV21) { + if (w % 2) { // is not ou shu, two line y == one line + // uv + continue; + } + } + test_img({FLAGS_cluster}, + {1, 2, 4}, + w, + h, + ww, + hh, + (ImageFormat)srcFormat, + (ImageFormat)dstFormat, + rotate, + (FlipParam)flip, + (LayoutType)layout); + } + } + } + } + } + } + } + } + } + } +} +#endif +#if 1 +TEST(TestImageConvertRand, test_func_image_trans_preprocess) { + if (FLAGS_basic_test) { + for (auto w : {1, 8, 16, 112, 224, 1092}) { + for (auto h : {1, 16, 112, 224}) { + for (auto ww : {32, 112}) { + for (auto hh : {112}) { + for (auto rotate : {90, 180, 270}) { + for (auto flip : {0, 1, 2}) { + for (auto srcFormat : 
{11}) { + for (auto dstFormat : {3}) { + for (auto layout : {1, 3}) { + if (dstFormat == ImageFormat::NV12 || + dstFormat == ImageFormat::NV21 || + (dstFormat == ImageFormat::GRAY && + (srcFormat == ImageFormat::RGBA || + srcFormat == ImageFormat::BGRA)) || + (srcFormat == ImageFormat::GRAY && + (dstFormat == ImageFormat::RGBA || + dstFormat == ImageFormat::BGRA)) || + (srcFormat == ImageFormat::NV12 || + srcFormat == ImageFormat::NV21) && + (dstFormat == ImageFormat::GRAY || + dstFormat == ImageFormat::RGBA || + dstFormat == ImageFormat::BGRA)) { + continue; + } + if (srcFormat == ImageFormat::NV12 || + srcFormat == ImageFormat::NV21) { + if (w % 2) { // is not ou shu, two line y == one line + // uv + continue; + } + } + test_img({FLAGS_cluster}, + {1, 2, 4}, + w, + h, + ww, + hh, + (ImageFormat)srcFormat, + (ImageFormat)dstFormat, + rotate, + (FlipParam)flip, + (LayoutType)layout); + } + } + } + } + } + } + } + } + } + } +} +#endif +#if 1 +TEST(TestImageConvertCustom, test_func_image_preprocess_custom) { + test_img({FLAGS_cluster}, + {1, 2, 4}, + FLAGS_srcw, + FLAGS_srch, + FLAGS_dstw, + FLAGS_dsth, + (ImageFormat)FLAGS_srcFormat, + (ImageFormat)FLAGS_dstFormat, + FLAGS_angle, + (FlipParam)FLAGS_flip_num, + (LayoutType)FLAGS_layout); +} +#endif +#endif diff --git a/lite/tools/build.sh b/lite/tools/build.sh index d1f47c149ec7d6c30767d6db19e371e5e32b865d..4873e70773f31425d628ee2bbdd36f2cb2f921f1 100755 --- a/lite/tools/build.sh +++ b/lite/tools/build.sh @@ -19,6 +19,7 @@ BUILD_PYTHON=OFF BUILD_DIR=$(pwd) OPTMODEL_DIR="" BUILD_TAILOR=OFF +BUILD_CV=OFF readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz @@ -96,6 +97,7 @@ function make_tiny_publish_so { -DLITE_ON_TINY_PUBLISH=ON \ -DANDROID_STL_TYPE=$android_stl \ -DLITE_BUILD_EXTRA=$BUILD_EXTRA \ + -DLITE_WITH_CV=$BUILD_CV \ -DLITE_BUILD_TAILOR=$BUILD_TAILOR \ -DLITE_OPTMODEL_DIR=$OPTMODEL_DIR \ -DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang} @@ -122,7 +124,7 @@ function make_full_publish_so { fi mkdir -p $build_directory cd $build_directory - + if [ ${os} == "armlinux" ]; then BUILD_JAVA=OFF fi @@ -137,6 +139,7 @@ function make_full_publish_so { -DLITE_SHUTDOWN_LOG=ON \ -DANDROID_STL_TYPE=$android_stl \ -DLITE_BUILD_EXTRA=$BUILD_EXTRA \ + -DLITE_WITH_CV=$BUILD_CV \ -DLITE_BUILD_TAILOR=$BUILD_TAILOR \ -DLITE_OPTMODEL_DIR=$OPTMODEL_DIR \ -DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang} @@ -166,6 +169,7 @@ function make_all_tests { ${CMAKE_COMMON_OPTIONS} \ -DWITH_TESTING=ON \ -DLITE_BUILD_EXTRA=$BUILD_EXTRA \ + -DLITE_WITH_CV=$BUILD_CV \ -DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang} make lite_compile_deps -j$NUM_PROC @@ -201,6 +205,7 @@ function make_ios { -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \ -DARM_TARGET_ARCH_ABI=$abi \ -DLITE_BUILD_EXTRA=$BUILD_EXTRA \ + -DLITE_WITH_CV=$BUILD_CV \ -DARM_TARGET_OS=$os make -j4 publish_inference @@ -362,11 +367,11 @@ function main { shift ;; tiny_publish) - make_tiny_publish_so $ARM_OS $ARM_ABI $ARM_LANG $ANDROID_STL + make_tiny_publish_so $ARM_OS $ARM_ABI $ARM_LANG $ANDROID_STL shift ;; full_publish) - make_full_publish_so $ARM_OS $ARM_ABI $ARM_LANG $ANDROID_STL + make_full_publish_so $ARM_OS $ARM_ABI $ARM_LANG $ANDROID_STL shift ;; test) @@ -382,7 +387,7 @@ function main { shift ;; cuda) - make_cuda + make_cuda shift ;; x86) diff --git a/lite/utils/CMakeLists.txt b/lite/utils/CMakeLists.txt index 
6337085d829b115dc6d2553473ddcef8ac5115f8..ea7bfc97a5a35d7e178aa21b4d55605a617eb0d3 100644 --- a/lite/utils/CMakeLists.txt +++ b/lite/utils/CMakeLists.txt @@ -24,3 +24,5 @@ if(LITE_ON_TINY_PUBLISH OR LITE_ON_MODEL_OPTIMIZE_TOOL) else() lite_cc_library(utils SRCS string.cc DEPS ${utils_DEPS} any) endif() + +add_subdirectory(cv) diff --git a/lite/utils/cv/CMakeLists.txt b/lite/utils/cv/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..01f5341c972342afa13fabaf5183a7d5d8543c7f --- /dev/null +++ b/lite/utils/cv/CMakeLists.txt @@ -0,0 +1,11 @@ +if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND LITE_WITH_ARM) + set(lite_cv_deps) + lite_cc_library(paddle_cv_arm SRCS + image_convert.cc + paddle_image_preprocess.cc + image2tensor.cc + image_flip.cc + image_rotate.cc + image_resize.cc + DEPS ${lite_cv_deps} paddle_api_light) +endif() diff --git a/lite/utils/cv/image2tensor.cc b/lite/utils/cv/image2tensor.cc new file mode 100644 index 0000000000000000000000000000000000000000..b51a82da1d0e9dc1750670ef55690e9a34a659fc --- /dev/null +++ b/lite/utils/cv/image2tensor.cc @@ -0,0 +1,597 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/utils/cv/image2tensor.h" +#include +namespace paddle { +namespace lite { +namespace utils { +namespace cv { +void bgr_to_tensor_chw(const uint8_t* src, + float* output, + int width, + int height, + float* means, + float* scales); + +void bgra_to_tensor_chw(const uint8_t* src, + float* output, + int width, + int height, + float* means, + float* scales); + +void bgr_to_tensor_hwc(const uint8_t* src, + float* output, + int width, + int height, + float* means, + float* scales); + +void bgra_to_tensor_hwc(const uint8_t* src, + float* output, + int width, + int height, + float* means, + float* scales); + +/* + * change image data to tensor data + * support image format is BGR(RGB) and BGRA(RGBA), Data layout is NHWC and + * NCHW + * param src: input image data + * param dstTensor: output tensor data + * param srcFormat: input image format, support BGR(GRB) and BGRA(RGBA) + * param srcw: input image width + * param srch: input image height + * param layout: output tensor layout,support NHWC and NCHW + * param means: means of image + * param scales: scales of image +*/ +void Image2Tensor::choose(const uint8_t* src, + Tensor* dst, + ImageFormat srcFormat, + LayoutType layout, + int srcw, + int srch, + float* means, + float* scales) { + float* output = dst->mutable_data(); + if (layout == LayoutType::kNCHW && (srcFormat == BGR || srcFormat == RGB)) { + impl_ = bgr_to_tensor_chw; + } else if (layout == LayoutType::kNHWC && + (srcFormat == BGR || srcFormat == RGB)) { + impl_ = bgr_to_tensor_hwc; + } else if (layout == LayoutType::kNCHW && + (srcFormat == BGRA || srcFormat == RGBA)) { + impl_ = bgra_to_tensor_chw; + } else if (layout == LayoutType::kNHWC && + (srcFormat == BGRA || srcFormat == RGBA)) { + impl_ = bgra_to_tensor_hwc; + } else { + 
printf("this layout: %d or image format: %d not support \n", + static_cast(layout), + srcFormat); + return; + } + impl_(src, output, srcw, srch, means, scales); +} +void bgr_to_tensor_chw(const uint8_t* src, + float* output, + int width, + int height, + float* means, + float* scales) { + int size = width * height; + float b_means = means[0]; + float g_means = means[1]; + float r_means = means[2]; + float b_scales = scales[0]; + float g_scales = scales[1]; + float r_scales = scales[2]; + + float* ptr_b = output; + float* ptr_g = ptr_b + size; + float* ptr_r = ptr_g + size; + + int dim8 = width >> 3; + int remain = width % 8; + + float32x4_t vbmean = vdupq_n_f32(b_means); + float32x4_t vgmean = vdupq_n_f32(g_means); + float32x4_t vrmean = vdupq_n_f32(r_means); + float32x4_t vbscale = vdupq_n_f32(b_scales); + float32x4_t vgscale = vdupq_n_f32(g_scales); + float32x4_t vrscale = vdupq_n_f32(r_scales); +#pragma omp parallel for + for (int i = 0; i < height; i += 1) { + const uint8_t* din_ptr = src + i * 3 * width; + float* ptr_b_h = ptr_b + i * width; + float* ptr_g_h = ptr_g + i * width; + float* ptr_r_h = ptr_r + i * width; + int cnt = dim8; + if (cnt > 0) { +#ifdef __aarch64__ + asm volatile( + "prfm pldl1keep, [%[inptr0]] \n" + "prfm pldl1keep, [%[inptr0], #64] \n" + "prfm pldl1keep, [%[inptr0], #128] \n" + "prfm pldl1keep, [%[inptr0], #192] \n" + "1: \n" + "ld3 {v0.8b, v1.8b, v2.8b}, [%[inptr0]], #24 \n" // d8 = y0y3y6y9.. + // d9 = y1y4y7..." + // 8->16 + "ushll v3.8h, v0.8b, #0 \n" + "ushll v4.8h, v1.8b, #0 \n" + "ushll v5.8h, v2.8b, #0 \n" + // 16->32 + "ushll v6.4s, v3.4h, #0 \n" + "ushll2 v7.4s, v3.8h, #0 \n" + "ushll v8.4s, v4.4h, #0 \n" + "ushll2 v9.4s, v4.8h, #0 \n" + "ushll v10.4s, v5.4h, #0 \n" + "ushll2 v11.4s, v5.8h, #0 \n" + // int32->fp32 + "ucvtf v12.4s, v6.4s \n" + "ucvtf v13.4s, v7.4s \n" + "ucvtf v14.4s, v8.4s \n" + "ucvtf v15.4s, v9.4s \n" + "ucvtf v16.4s, v10.4s \n" + "ucvtf v17.4s, v11.4s \n" + // sub -mean + "fsub v12.4s, v12.4s, %w[vbmean].4s \n" + "fsub v13.4s, v13.4s, %w[vbmean].4s \n" + "fsub v14.4s, v14.4s, %w[vgmean].4s \n" + "fsub v15.4s, v15.4s, %w[vgmean].4s \n" + "fsub v16.4s, v16.4s, %w[vrmean].4s \n" + "fsub v17.4s, v17.4s, %w[vrmean].4s \n" + // mul * scale + "fmul v6.4s, v12.4s, %w[vbscale].4s \n" + "fmul v7.4s, v13.4s, %w[vbscale].4s \n" + "fmul v8.4s, v14.4s, %w[vgscale].4s \n" + "fmul v9.4s, v15.4s, %w[vgscale].4s \n" + "fmul v10.4s, v16.4s, %w[vrscale].4s \n" + "fmul v11.4s, v17.4s, %w[vrscale].4s \n" + // store + "st1 {v6.4s}, [%[outr0]], #16 \n" + "st1 {v8.4s}, [%[outr1]], #16 \n" + "st1 {v10.4s}, [%[outr2]], #16 \n" + "subs %w[cnt], %w[cnt], #1 \n" + "st1 {v7.4s}, [%[outr0]], #16 \n" + "st1 {v9.4s}, [%[outr1]], #16 \n" + "st1 {v11.4s}, [%[outr2]], #16 \n" + "bne 1b \n" + : [inptr0] "+r"(din_ptr), + [outr0] "+r"(ptr_b_h), + [outr1] "+r"(ptr_g_h), + [outr2] "+r"(ptr_r_h), + [cnt] "+r"(cnt) + : [vbmean] "w"(vbmean), + [vgmean] "w"(vgmean), + [vrmean] "w"(vrmean), + [vbscale] "w"(vbscale), + [vgscale] "w"(vgscale), + [vrscale] "w"(vrscale) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20"); +#else + asm volatile( + "pld [%[inptr0]] @ preload a, 64byte\n" + "pld [%[inptr0], #64] @ preload a, 64byte\n" + "pld [%[inptr0], #128] @ preload a, 64byte\n" + "pld [%[inptr0], #192] @ preload a, 64byte\n" + "1: \n" + "vld3.8 {d12, d13, d14}, [%[inptr0]]! 
\n" + // 8->16 + "vmovl.u8 q8, d12 \n" + "vmovl.u8 q9, d13 \n" + "vmovl.u8 q10, d14 \n" + // 16->32 + "vmovl.u16 q11, d16 \n" + "vmovl.u16 q12, d17 \n" + "vmovl.u16 q13, d18 \n" + "vmovl.u16 q14, d19 \n" + "vmovl.u16 q15, d20 \n" + "vmovl.u16 q6, d21 \n" + // int32->fp32 + "vcvt.f32.u32 q7, q11 \n" + "vcvt.f32.u32 q8, q12 \n" + "vcvt.f32.u32 q9, q13 \n" + "vcvt.f32.u32 q10, q14 \n" + "vcvt.f32.u32 q11, q15 \n" + "vcvt.f32.u32 q12, q6 \n" + // sub -mean + "vsub.f32 q7, q7, %q[vbmean] \n" + "vsub.f32 q8, q8, %q[vbmean] \n" + "vsub.f32 q9, q9, %q[vgmean] \n" + "vsub.f32 q10, q10, %q[vgmean] \n" + "vsub.f32 q11, q11, %q[vrmean] \n" + "vsub.f32 q12, q12, %q[vrmean] \n" + // mul *scale + "vmul.f32 q13, q7, %q[vbscale] \n" + "vmul.f32 q14, q8, %q[vbscale] \n" + "vmul.f32 q15, q9, %q[vgscale] \n" + "vmul.f32 q6, q10, %q[vgscale] \n" + "vmul.f32 q7, q11, %q[vrscale] \n" + "vmul.f32 q8, q12, %q[vrscale] \n" + // store + "vst1.32 {d26 - d27}, [%[outr0]]! \n" + "vst1.32 {d30 - d31}, [%[outr1]]! \n" + "vst1.32 {d14 - d15}, [%[outr2]]! \n" + "subs %[cnt], #1 \n" + "vst1.32 {d28 - d29}, [%[outr0]]! \n" + "vst1.32 {d12 - d13}, [%[outr1]]! \n" + "vst1.32 {d16 - d17}, [%[outr2]]! \n" + "bne 1b" + : [inptr0] "+r"(din_ptr), + [outr0] "+r"(ptr_b_h), + [outr1] "+r"(ptr_g_h), + [outr2] "+r"(ptr_r_h), + [cnt] "+r"(cnt) + : [vbmean] "w"(vbmean), + [vgmean] "w"(vgmean), + [vrmean] "w"(vrmean), + [vbscale] "w"(vbscale), + [vgscale] "w"(vgscale), + [vrscale] "w"(vrscale) + : "cc", + "memory", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); +#endif + } + for (int j = 0; j < remain; j++) { + *ptr_b_h++ = (*din_ptr - b_means) * b_scales; + din_ptr++; + *ptr_g_h++ = (*din_ptr - g_means) * g_scales; + din_ptr++; + *ptr_r_h++ = (*din_ptr - r_means) * r_scales; + din_ptr++; + } + } +} + +void bgra_to_tensor_chw(const uint8_t* src, + float* output, + int width, + int height, + float* means, + float* scales) { + int size = width * height; + float b_means = means[0]; + float g_means = means[1]; + float r_means = means[2]; + float b_scales = scales[0]; + float g_scales = scales[1]; + float r_scales = scales[2]; + + float* ptr_b = output; + float* ptr_g = ptr_b + size; + float* ptr_r = ptr_g + size; + + int dim8 = width >> 3; + int remain = width % 8; + + float32x4_t vbmean = vdupq_n_f32(b_means); + float32x4_t vgmean = vdupq_n_f32(g_means); + float32x4_t vrmean = vdupq_n_f32(r_means); + float32x4_t vbscale = vdupq_n_f32(b_scales); + float32x4_t vgscale = vdupq_n_f32(g_scales); + float32x4_t vrscale = vdupq_n_f32(r_scales); +#pragma omp parallel for + for (int i = 0; i < height; i += 1) { + const uint8_t* din_ptr = src + i * 4 * width; + float* ptr_b_h = ptr_b + i * width; + float* ptr_g_h = ptr_g + i * width; + float* ptr_r_h = ptr_r + i * width; + + for (int j = 0; j < dim8; j++) { + uint8x8x4_t v_bgr = vld4_u8(din_ptr); + + uint16x8_t vb_16 = vmovl_u8(v_bgr.val[0]); + uint16x8_t vg_16 = vmovl_u8(v_bgr.val[1]); + uint16x8_t vr_16 = vmovl_u8(v_bgr.val[2]); + + uint32x4_t vb_low_32 = vmovl_u16(vget_low_u16(vb_16)); + uint32x4_t vg_low_32 = vmovl_u16(vget_low_u16(vg_16)); + uint32x4_t vr_low_32 = vmovl_u16(vget_low_u16(vr_16)); + + uint32x4_t vb_high_32 = vmovl_u16(vget_high_u16(vb_16)); + uint32x4_t vg_high_32 = vmovl_u16(vget_high_u16(vg_16)); + uint32x4_t vr_high_32 = vmovl_u16(vget_high_u16(vr_16)); + + float32x4_t vb_low_f32 = vcvtq_f32_u32(vb_low_32); + float32x4_t vr_low_f32 = vcvtq_f32_u32(vr_low_32); + float32x4_t vg_low_f32 = vcvtq_f32_u32(vg_low_32); + + float32x4_t vb_high_f32 = 
vcvtq_f32_u32(vb_high_32); + float32x4_t vg_high_f32 = vcvtq_f32_u32(vg_high_32); + float32x4_t vr_high_f32 = vcvtq_f32_u32(vr_high_32); + + vb_low_f32 = vsubq_f32(vb_low_f32, vbmean); + vg_low_f32 = vsubq_f32(vg_low_f32, vgmean); + vr_low_f32 = vsubq_f32(vr_low_f32, vrmean); + + vb_high_f32 = vsubq_f32(vb_high_f32, vbmean); + vg_high_f32 = vsubq_f32(vg_high_f32, vgmean); + vr_high_f32 = vsubq_f32(vr_high_f32, vrmean); + + vb_low_f32 = vmulq_f32(vb_low_f32, vbscale); + vg_low_f32 = vmulq_f32(vg_low_f32, vgscale); + vr_low_f32 = vmulq_f32(vr_low_f32, vrscale); + + vb_high_f32 = vmulq_f32(vb_high_f32, vbscale); + vg_high_f32 = vmulq_f32(vg_high_f32, vgscale); + vr_high_f32 = vmulq_f32(vr_high_f32, vrscale); + + vst1q_f32(ptr_b_h, vb_low_f32); + vst1q_f32(ptr_g_h, vg_low_f32); + vst1q_f32(ptr_r_h, vr_low_f32); + + din_ptr += 32; + + vst1q_f32(ptr_b_h + 4, vb_high_f32); + vst1q_f32(ptr_g_h + 4, vg_high_f32); + vst1q_f32(ptr_r_h + 4, vr_high_f32); + + ptr_b_h += 8; + ptr_g_h += 8; + ptr_r_h += 8; + } + + for (int j = 0; j < remain; j++) { + *ptr_b_h++ = (*din_ptr - b_means) * b_scales; + din_ptr++; + *ptr_g_h++ = (*din_ptr - g_means) * g_scales; + din_ptr++; + *ptr_r_h++ = (*din_ptr - r_means) * r_scales; + din_ptr++; + din_ptr++; // a + } + } +} +void bgr_to_tensor_hwc(const uint8_t* src, + float* output, + int width, + int height, + float* means, + float* scales) { + int size = width * height; + float b_means = means[0]; + float g_means = means[1]; + float r_means = means[2]; + float b_scales = scales[0]; + float g_scales = scales[1]; + float r_scales = scales[2]; + + float* dout = output; + + int dim8 = width >> 3; + int remain = width % 8; + + float32x4_t vbmean = vdupq_n_f32(b_means); + float32x4_t vgmean = vdupq_n_f32(g_means); + float32x4_t vrmean = vdupq_n_f32(r_means); + float32x4_t vbscale = vdupq_n_f32(b_scales); + float32x4_t vgscale = vdupq_n_f32(g_scales); + float32x4_t vrscale = vdupq_n_f32(r_scales); +#pragma omp parallel for + for (int i = 0; i < height; i += 1) { + const uint8_t* din_ptr = src + i * 3 * width; + float* dout_ptr = dout + i * 3 * width; + + for (int j = 0; j < dim8; j++) { + uint8x8x3_t v_bgr = vld3_u8(din_ptr); + + uint16x8_t vb_16 = vmovl_u8(v_bgr.val[0]); + uint16x8_t vg_16 = vmovl_u8(v_bgr.val[1]); + uint16x8_t vr_16 = vmovl_u8(v_bgr.val[2]); + + uint32x4_t vb_low_32 = vmovl_u16(vget_low_u16(vb_16)); + uint32x4_t vg_low_32 = vmovl_u16(vget_low_u16(vg_16)); + uint32x4_t vr_low_32 = vmovl_u16(vget_low_u16(vr_16)); + + uint32x4_t vb_high_32 = vmovl_u16(vget_high_u16(vb_16)); + uint32x4_t vg_high_32 = vmovl_u16(vget_high_u16(vg_16)); + uint32x4_t vr_high_32 = vmovl_u16(vget_high_u16(vr_16)); + + float32x4_t vb_low_f32 = vcvtq_f32_u32(vb_low_32); + float32x4_t vr_low_f32 = vcvtq_f32_u32(vr_low_32); + float32x4_t vg_low_f32 = vcvtq_f32_u32(vg_low_32); + + float32x4_t vb_high_f32 = vcvtq_f32_u32(vb_high_32); + float32x4_t vg_high_f32 = vcvtq_f32_u32(vg_high_32); + float32x4_t vr_high_f32 = vcvtq_f32_u32(vr_high_32); + + vb_low_f32 = vsubq_f32(vb_low_f32, vbmean); + vg_low_f32 = vsubq_f32(vg_low_f32, vgmean); + vr_low_f32 = vsubq_f32(vr_low_f32, vrmean); + + vb_high_f32 = vsubq_f32(vb_high_f32, vbmean); + vg_high_f32 = vsubq_f32(vg_high_f32, vgmean); + vr_high_f32 = vsubq_f32(vr_high_f32, vrmean); + + vb_low_f32 = vmulq_f32(vb_low_f32, vbscale); + vg_low_f32 = vmulq_f32(vg_low_f32, vgscale); + vr_low_f32 = vmulq_f32(vr_low_f32, vrscale); + + vb_high_f32 = vmulq_f32(vb_high_f32, vbscale); + vg_high_f32 = vmulq_f32(vg_high_f32, vgscale); + vr_high_f32 = 
vmulq_f32(vr_high_f32, vrscale); + + float32x4x3_t val; + val.val[0] = vb_low_f32; + val.val[1] = vg_low_f32; + val.val[2] = vr_low_f32; + + vst3q_f32(dout_ptr, val); + + din_ptr += 24; + dout_ptr += 12; + + val.val[0] = vb_high_f32; + val.val[1] = vg_high_f32; + val.val[2] = vr_high_f32; + + vst3q_f32(dout_ptr, val); + + dout_ptr += 12; + } + + for (int j = 0; j < remain; j++) { + *dout_ptr++ = (*din_ptr - b_means) * b_scales; + din_ptr++; + *dout_ptr++ = (*din_ptr - g_means) * g_scales; + din_ptr++; + *dout_ptr++ = (*din_ptr - r_means) * r_scales; + din_ptr++; + } + } +} + +void bgra_to_tensor_hwc(const uint8_t* src, + float* output, + int width, + int height, + float* means, + float* scales) { + int size = width * height; + float b_means = means[0]; + float g_means = means[1]; + float r_means = means[2]; + float b_scales = scales[0]; + float g_scales = scales[1]; + float r_scales = scales[2]; + + float* dout = output; + + int dim8 = width >> 3; + int remain = width % 8; + + float32x4_t vbmean = vdupq_n_f32(b_means); + float32x4_t vgmean = vdupq_n_f32(g_means); + float32x4_t vrmean = vdupq_n_f32(r_means); + float32x4_t vbscale = vdupq_n_f32(b_scales); + float32x4_t vgscale = vdupq_n_f32(g_scales); + float32x4_t vrscale = vdupq_n_f32(r_scales); +#pragma omp parallel for + for (int i = 0; i < height; i += 1) { + const uint8_t* din_ptr = src + i * 4 * width; + float* dout_ptr = dout + i * 3 * width; + + for (int j = 0; j < dim8; j++) { + uint8x8x4_t v_bgr = vld4_u8(din_ptr); + + uint16x8_t vb_16 = vmovl_u8(v_bgr.val[0]); + uint16x8_t vg_16 = vmovl_u8(v_bgr.val[1]); + uint16x8_t vr_16 = vmovl_u8(v_bgr.val[2]); + // uint16x8_t va_16 = vmovl_u8(v_bgr.val[3]); + + uint32x4_t vb_low_32 = vmovl_u16(vget_low_u16(vb_16)); + uint32x4_t vg_low_32 = vmovl_u16(vget_low_u16(vg_16)); + uint32x4_t vr_low_32 = vmovl_u16(vget_low_u16(vr_16)); + + uint32x4_t vb_high_32 = vmovl_u16(vget_high_u16(vb_16)); + uint32x4_t vg_high_32 = vmovl_u16(vget_high_u16(vg_16)); + uint32x4_t vr_high_32 = vmovl_u16(vget_high_u16(vr_16)); + + float32x4_t vb_low_f32 = vcvtq_f32_u32(vb_low_32); + float32x4_t vr_low_f32 = vcvtq_f32_u32(vr_low_32); + float32x4_t vg_low_f32 = vcvtq_f32_u32(vg_low_32); + + float32x4_t vb_high_f32 = vcvtq_f32_u32(vb_high_32); + float32x4_t vg_high_f32 = vcvtq_f32_u32(vg_high_32); + float32x4_t vr_high_f32 = vcvtq_f32_u32(vr_high_32); + + vb_low_f32 = vsubq_f32(vb_low_f32, vbmean); + vg_low_f32 = vsubq_f32(vg_low_f32, vgmean); + vr_low_f32 = vsubq_f32(vr_low_f32, vrmean); + + vb_high_f32 = vsubq_f32(vb_high_f32, vbmean); + vg_high_f32 = vsubq_f32(vg_high_f32, vgmean); + vr_high_f32 = vsubq_f32(vr_high_f32, vrmean); + + vb_low_f32 = vmulq_f32(vb_low_f32, vbscale); + vg_low_f32 = vmulq_f32(vg_low_f32, vgscale); + vr_low_f32 = vmulq_f32(vr_low_f32, vrscale); + + vb_high_f32 = vmulq_f32(vb_high_f32, vbscale); + vg_high_f32 = vmulq_f32(vg_high_f32, vgscale); + vr_high_f32 = vmulq_f32(vr_high_f32, vrscale); + + float32x4x3_t val; + val.val[0] = vb_low_f32; + val.val[1] = vg_low_f32; + val.val[2] = vr_low_f32; + // val.val[3] = num_a; + + vst3q_f32(dout_ptr, val); + + din_ptr += 32; + dout_ptr += 12; + + val.val[0] = vb_high_f32; + val.val[1] = vg_high_f32; + val.val[2] = vr_high_f32; + + vst3q_f32(dout_ptr, val); + + dout_ptr += 12; + } + + for (int j = 0; j < remain; j++) { + *dout_ptr++ = (*din_ptr - b_means) * b_scales; + din_ptr++; + *dout_ptr++ = (*din_ptr - g_means) * g_scales; + din_ptr++; + *dout_ptr++ = (*din_ptr - r_means) * r_scales; + din_ptr++; + din_ptr++; // a + // *dout_ptr++ = 255; + } + } 
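  // Note (added summary, written as a comment because it sits inside the code):
  // every bgr/bgra *_to_tensor_chw/_hwc routine in this file applies the same
  // per-channel normalization, out = (pixel - mean) * scale. The _chw variants
  // de-interleave into three planes (B plane, then G, then R), the _hwc variants
  // keep the interleaved layout, and the bgra_* variants skip the alpha byte of
  // each source pixel. A scalar reference for one pixel, matching the remainder
  // loops, would be:
  //   out_b = (src[0] - means[0]) * scales[0];
  //   out_g = (src[1] - means[1]) * scales[1];
  //   out_r = (src[2] - means[2]) * scales[2];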
+} +} // namespace cv +} // namespace utils +} // namespace lite +} // namespace paddle diff --git a/lite/utils/cv/image2tensor.h b/lite/utils/cv/image2tensor.h new file mode 100644 index 0000000000000000000000000000000000000000..70dc4cdf69041d54c423c0c46dcad7fb7e164a92 --- /dev/null +++ b/lite/utils/cv/image2tensor.h @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/utils/cv/paddle_image_preprocess.h" +namespace paddle { +namespace lite { +namespace utils { +namespace cv { +typedef void (*tensor_func)(const uint8_t* src, + float* dst, + int srcw, + int srch, + float* means, + float* scales); +class Image2Tensor { + public: + void choose(const uint8_t* src, + Tensor* dst, + ImageFormat srcFormat, + LayoutType layout, + int srcw, + int srch, + float* means, + float* scales); + + private: + tensor_func impl_{nullptr}; +}; +} // namespace cv +} // namespace utils +} // namespace lite +} // namespace paddle diff --git a/lite/utils/cv/image_convert.cc b/lite/utils/cv/image_convert.cc new file mode 100644 index 0000000000000000000000000000000000000000..24b6db70dd4f4fb1ad8e8c915444684d4db07cfd --- /dev/null +++ b/lite/utils/cv/image_convert.cc @@ -0,0 +1,1166 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
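A minimal usage sketch for the conversion entry point defined in this file, assuming only the API visible in this patch (ImageConvert::choose, the unscoped ImageFormat enumerators, and the buffer-size rules used by the converters). The helper name, the std::vector buffer, and the trailing note about Image2Tensor are illustrative and not part of the patch:

#include <cstdint>
#include <vector>
#include "lite/utils/cv/image_convert.h"

// Sketch: convert one NV21 camera frame (srcw * ceil(1.5 * srch) bytes) to
// interleaved BGR (3 * w * h bytes) with the new ImageConvert dispatcher.
void convert_nv21_to_bgr_example(const uint8_t* nv21, int w, int h) {
  using namespace paddle::lite::utils::cv;
  std::vector<uint8_t> bgr(3 * w * h);  // BGRA/RGBA output would need 4 * w * h
  ImageConvert converter;
  converter.choose(nv21, bgr.data(), NV21, BGR, w, h);
  // For model input, Image2Tensor::choose (declared in image2tensor.h above)
  // then applies the per-channel (x - mean) * scale normalization; its Tensor
  // and LayoutType arguments come from paddle_image_preprocess.h and are not
  // shown here.
}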
+ +#include "lite/utils/cv/image_convert.h" +#include +#include +#include +namespace paddle { +namespace lite { +namespace utils { +namespace cv { +void nv_to_bgr( + const uint8_t* src, uint8_t* dst, int srcw, int srch, int x_num, int y_num); + +void nv_to_bgra( + const uint8_t* src, uint8_t* dst, int srcw, int srch, int x_num, int y_num); + +void nv21_to_bgr(const uint8_t* src, uint8_t* dst, int srcw, int srch); +void nv21_to_bgra(const uint8_t* src, uint8_t* dst, int srcw, int srch); +void nv12_to_bgr(const uint8_t* src, uint8_t* dst, int srcw, int srch); +void nv12_to_bgra(const uint8_t* src, uint8_t* dst, int srcw, int srch); +// bgr rgb to gray +void hwc3_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch); +// gray to bgr rgb +void hwc1_to_hwc3(const uint8_t* src, uint8_t* dst, int srcw, int srch); +// bgr to bgra or rgb to rgba +void hwc3_to_hwc4(const uint8_t* src, uint8_t* dst, int srcw, int srch); +// bgra to bgr or rgba to rgb +void hwc4_to_hwc3(const uint8_t* src, uint8_t* dst, int srcw, int srch); +// bgr to rgb or rgb to bgr +void hwc3_trans(const uint8_t* src, uint8_t* dst, int srcw, int srch); +// bgra to rgba or rgba to bgra +void hwc4_trans(const uint8_t* src, uint8_t* dst, int srcw, int srch); +// bgra to rgb or rgba to bgr +void hwc4_trans_hwc3(const uint8_t* src, uint8_t* dst, int srcw, int srch); +// bgr to rgba or rgb to bgra +void hwc3_trans_hwc4(const uint8_t* src, uint8_t* dst, int srcw, int srch); + +/* + * image color convert + * support NV12/NV21_to_BGR(RGB), NV12/NV21_to_BGRA(RGBA), + * BGR(RGB)and BGRA(RGBA) transform, + * BGR(RGB)and RGB(BGR) transform, + * BGR(RGB)and RGBA(BGRA) transform, + * BGR(RGB)and GRAY transform, + * param src: input image data + * param dst: output image data + * param srcFormat: input image image format support: GRAY, NV12(NV21), + * BGR(RGB) and BGRA(RGBA) + * param dstFormat: output image image format, support GRAY, BGR(RGB) and + * BGRA(RGBA) +*/ +void ImageConvert::choose(const uint8_t* src, + uint8_t* dst, + ImageFormat srcFormat, + ImageFormat dstFormat, + int srcw, + int srch) { + if (srcFormat == dstFormat) { + // copy + int size = srcw * srch; + if (srcFormat == NV12 || srcFormat == NV21) { + size = srcw * (ceil(1.5 * srch)); + } else if (srcFormat == BGR || srcFormat == RGB) { + size = 3 * srcw * srch; + } else if (srcFormat == BGRA || srcFormat == RGBA) { + size = 4 * srcw * srch; + } + memcpy(dst, src, sizeof(uint8_t) * size); + return; + } else { + if (srcFormat == NV12 && (dstFormat == BGR || dstFormat == RGB)) { + impl_ = nv12_to_bgr; + } else if (srcFormat == NV21 && (dstFormat == BGR || dstFormat == RGB)) { + impl_ = nv21_to_bgr; + } else if (srcFormat == NV12 && (dstFormat == BGRA || dstFormat == RGBA)) { + impl_ = nv12_to_bgra; + } else if (srcFormat == NV21 && (dstFormat == BGRA || dstFormat == RGBA)) { + impl_ = nv21_to_bgra; + } else if ((srcFormat == RGBA && dstFormat == RGB) || + (srcFormat == BGRA && dstFormat == BGR)) { + impl_ = hwc4_to_hwc3; + } else if ((srcFormat == RGB && dstFormat == RGBA) || + (srcFormat == BGR && dstFormat == BGRA)) { + impl_ = hwc3_to_hwc4; + } else if ((srcFormat == RGB && dstFormat == BGR) || + (srcFormat == BGR && dstFormat == RGB)) { + impl_ = hwc3_trans; + } else if ((srcFormat == RGBA && dstFormat == BGRA) || + (srcFormat == BGRA && dstFormat == RGBA)) { + impl_ = hwc4_trans; + } else if ((srcFormat == RGB && dstFormat == GRAY) || + (srcFormat == BGR && dstFormat == GRAY)) { + impl_ = hwc3_to_hwc1; + } else if ((srcFormat == GRAY && dstFormat == RGB) || + 
(srcFormat == GRAY && dstFormat == BGR)) { + impl_ = hwc1_to_hwc3; + } else if ((srcFormat == RGBA && dstFormat == BGR) || + (srcFormat == BGRA && dstFormat == RGB)) { + impl_ = hwc4_trans_hwc3; + } else if ((srcFormat == RGB && dstFormat == BGRA) || + (srcFormat == BGR && dstFormat == RGBA)) { + impl_ = hwc3_trans_hwc4; + } else { + printf("srcFormat: %d, dstFormat: %d does not support! \n", + srcFormat, + dstFormat); + } + } + impl_(src, dst, srcw, srch); +} +/* +nv21(yvu) to BGR: stroe hwc dsth * dstw = srch * (srcw) +y_w = srcw, y_h = srch uv_w = srcw uv_h = 1/2 * srch +R = Y + 1.402*(V-128); +G = Y - 0.34414*(U-128) - 0.71414*(V-128); +B = Y + 1.772*(U-128); +浮点乘法用 7位精度处理(即a*b = ((a << 7)*b )>>7) +ra = 1.402 *128 = 179.456 = 179 +ga = 0.34414 * 64 = 44.3721 = 44 +gb = 0.71414 * 64 = 91.40992 = 91 +ba = 1.772 * 62 = 226.816 = 227 +nv12bgr, nv21tobgr +*/ +void nv_to_bgr(const uint8_t* src, + uint8_t* dst, + int srcw, + int srch, + int x_num, + int y_num) { + // nv21 x = 0, y = 1 + // nv12 x = 1, y = 0 + int y_h = srch; + int wout = srcw * 3; + const uint8_t* y = src; + const uint8_t* vu = src + y_h * srcw; + + int16x8_t bias = vdupq_n_s16(128); + int16x8_t ga = vdupq_n_s16(44); + int16x8_t ra = vdupq_n_s16(179); + int16x8_t ba = vdupq_n_s16(227); + int16x8_t gb = vdupq_n_s16(91); + int16x8_t zero = vdupq_n_s16(0); + int16x8_t max = vdupq_n_s16(255); + + uint8_t* zerobuf = new uint8_t[srcw]; + uint8_t* writebuf = new uint8_t[wout]; + memset(zerobuf, 0, sizeof(uint8_t) * srcw); + + int i = 0; +#pragma omp parallel for + for (i = 0; i < y_h; i += 2) { + const uint8_t* ptr_y1 = y + i * srcw; + const uint8_t* ptr_y2 = ptr_y1 + srcw; + const uint8_t* ptr_vu = vu + (i / 2) * srcw; + uint8_t* ptr_bgr1 = dst + i * wout; + uint8_t* ptr_bgr2 = ptr_bgr1 + wout; + if (i + 2 > y_h) { + ptr_y2 = zerobuf; + ptr_bgr2 = writebuf; + } + int j = 0; + for (; j < srcw - 15; j += 16) { + uint8x8x2_t y1 = vld2_u8(ptr_y1); // d8 = y0y2y4y6...y14 d9 = + // y1y3y5...y15 + uint8x8x2_t vu = + vld2_u8(ptr_vu); // d0 = v0v1v2v3v4v5...v7 d1 = u0u1u2...u7 + + uint8x8x2_t y2 = vld2_u8(ptr_y2); + + uint16x8_t v = vmovl_u8(vu.val[x_num]); + uint16x8_t u = vmovl_u8(vu.val[y_num]); + int16x8_t v_s = vreinterpretq_s16_u16(v); + int16x8_t u_s = vreinterpretq_s16_u16(u); + int16x8_t v_bias = vsubq_s16(v_s, bias); + int16x8_t u_bias = vsubq_s16(u_s, bias); + + // G = Y - 0.34414*(U-128) - 0.71414*(V-128); + int16x8_t g0 = vmulq_s16(ga, u_bias); + // R = Y + 1.402*(V-128); + int16x8_t r0 = vmulq_s16(ra, v_bias); + // B = Y + 1.772*(U-128); + int16x8_t b0 = vmulq_s16(ba, u_bias); + + g0 = vmlaq_s16(g0, gb, v_bias); + + int16x8_t y1_0_8 = vreinterpretq_s16_u16(vmovl_u8(y1.val[0])); + int16x8_t y1_1_8 = vreinterpretq_s16_u16(vmovl_u8(y1.val[1])); + + int16x8_t y2_0_8 = vreinterpretq_s16_u16(vmovl_u8(y2.val[0])); + int16x8_t y2_1_8 = vreinterpretq_s16_u16(vmovl_u8(y2.val[1])); + + int16x8_t r0_bias = vshrq_n_s16(r0, 7); // r0 / 128 + int16x8_t b0_bias = vshrq_n_s16(b0, 7); + int16x8_t g0_bias = vshrq_n_s16(g0, 7); + + int16x8_t r0_1 = vaddq_s16(y1_0_8, r0_bias); + int16x8_t b0_1 = vaddq_s16(y1_0_8, b0_bias); + int16x8_t g0_1 = vsubq_s16(y1_0_8, g0_bias); // g0_1 = y1_0_8 - g0_1 + + int16x8_t r0_2 = vaddq_s16(y1_1_8, r0_bias); + int16x8_t b0_2 = vaddq_s16(y1_1_8, b0_bias); + int16x8_t g0_2 = vsubq_s16(y1_1_8, g0_bias); + + r0_1 = vmaxq_s16(r0_1, zero); + b0_1 = vmaxq_s16(b0_1, zero); + g0_1 = vmaxq_s16(g0_1, zero); + + r0_2 = vmaxq_s16(r0_2, zero); + b0_2 = vmaxq_s16(b0_2, zero); + g0_2 = vmaxq_s16(g0_2, zero); + + r0_1 = 
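      // Clarifying gloss (English rendering of the note above): the floating
      // point multiplies are done in 7-bit fixed point, i.e. a * b is computed
      // as ((a * 128) * b) >> 7. The integer coefficients all come from scaling
      // by 128:
      //   ra = round(1.402   * 128) = 179
      //   ga = round(0.34414 * 128) =  44
      //   gb = round(0.71414 * 128) =  91
      //   ba = round(1.772   * 128) = 227
      // (the "* 64" / "* 62" factors written in the note appear to be typos;
      // the products shown there correspond to * 128).
      // Quick scalar check: for U = V = 128 the chroma terms vanish, so
      // B = G = R = Y and a grey input pixel stays grey.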
vminq_s16(r0_1, max); + b0_1 = vminq_s16(b0_1, max); + g0_1 = vminq_s16(g0_1, max); + + r0_2 = vminq_s16(r0_2, max); + b0_2 = vminq_s16(b0_2, max); + g0_2 = vminq_s16(g0_2, max); + + uint8x8_t r00 = vreinterpret_u8_s8(vmovn_s16(r0_1)); + uint8x8_t b00 = vreinterpret_u8_s8(vmovn_s16(b0_1)); + uint8x8_t g00 = vreinterpret_u8_s8(vmovn_s16(g0_1)); + + uint8x8_t r01 = vreinterpret_u8_s8(vmovn_s16(r0_2)); + uint8x8_t b01 = vreinterpret_u8_s8(vmovn_s16(b0_2)); + uint8x8_t g01 = vreinterpret_u8_s8(vmovn_s16(g0_2)); + + int16x8_t r1_1 = vaddq_s16(y2_0_8, r0_bias); + int16x8_t b1_1 = vaddq_s16(y2_0_8, b0_bias); + int16x8_t g1_1 = vsubq_s16(y2_0_8, g0_bias); // g0_1 = y1_0_8 - g0_1 + + int16x8_t r1_2 = vaddq_s16(y2_1_8, r0_bias); + int16x8_t b1_2 = vaddq_s16(y2_1_8, b0_bias); + int16x8_t g1_2 = vsubq_s16(y2_1_8, g0_bias); + + uint8x8x2_t r00_0 = vtrn_u8(r00, r01); // 014589 236710 + uint8x8x2_t b00_0 = vtrn_u8(b00, b01); + uint8x8x2_t g00_0 = vtrn_u8(g00, g01); + + r1_1 = vmaxq_s16(r1_1, zero); + b1_1 = vmaxq_s16(b1_1, zero); + g1_1 = vmaxq_s16(g1_1, zero); + + r1_2 = vmaxq_s16(r1_2, zero); + b1_2 = vmaxq_s16(b1_2, zero); + g1_2 = vmaxq_s16(g1_2, zero); + + uint16x4_t r0_16 = vreinterpret_u16_u8(r00_0.val[0]); + uint16x4_t r1_16 = vreinterpret_u16_u8(r00_0.val[1]); + + uint16x4_t b0_16 = vreinterpret_u16_u8(b00_0.val[0]); + uint16x4_t b1_16 = vreinterpret_u16_u8(b00_0.val[1]); + + uint16x4_t g0_16 = vreinterpret_u16_u8(g00_0.val[0]); + uint16x4_t g1_16 = vreinterpret_u16_u8(g00_0.val[1]); + + uint16x4x2_t r00_1 = vtrn_u16(r0_16, r1_16); // 012389 456710 + uint16x4x2_t b00_1 = vtrn_u16(b0_16, b1_16); + uint16x4x2_t g00_1 = vtrn_u16(g0_16, g1_16); + + r1_1 = vminq_s16(r1_1, max); + b1_1 = vminq_s16(b1_1, max); + g1_1 = vminq_s16(g1_1, max); + + r1_2 = vminq_s16(r1_2, max); + b1_2 = vminq_s16(b1_2, max); + g1_2 = vminq_s16(g1_2, max); + + uint32x2_t r0_32 = vreinterpret_u32_u16(r00_1.val[0]); + uint32x2_t r1_32 = vreinterpret_u32_u16(r00_1.val[1]); + + uint32x2_t b0_32 = vreinterpret_u32_u16(b00_1.val[0]); + uint32x2_t b1_32 = vreinterpret_u32_u16(b00_1.val[1]); + + uint32x2_t g0_32 = vreinterpret_u32_u16(g00_1.val[0]); + uint32x2_t g1_32 = vreinterpret_u32_u16(g00_1.val[1]); + + uint32x2x2_t r00_2 = vtrn_u32(r0_32, r1_32); // 01234567 8910 + uint32x2x2_t b00_2 = vtrn_u32(b0_32, b1_32); + uint32x2x2_t g00_2 = vtrn_u32(g0_32, g1_32); + + r00 = vreinterpret_u8_s8(vmovn_s16(r1_1)); + b00 = vreinterpret_u8_s8(vmovn_s16(b1_1)); + g00 = vreinterpret_u8_s8(vmovn_s16(g1_1)); + + r01 = vreinterpret_u8_s8(vmovn_s16(r1_2)); + b01 = vreinterpret_u8_s8(vmovn_s16(b1_2)); + g01 = vreinterpret_u8_s8(vmovn_s16(g1_2)); + + uint8x8_t r0_8 = vreinterpret_u8_u32(r00_2.val[0]); + uint8x8_t b0_8 = vreinterpret_u8_u32(b00_2.val[0]); + uint8x8_t g0_8 = vreinterpret_u8_u32(g00_2.val[0]); + + uint8x8_t r1_8 = vreinterpret_u8_u32(r00_2.val[1]); + uint8x8_t b1_8 = vreinterpret_u8_u32(b00_2.val[1]); + uint8x8_t g1_8 = vreinterpret_u8_u32(g00_2.val[1]); + + uint8x8x3_t v_bgr; + v_bgr.val[0] = b0_8; + v_bgr.val[1] = g0_8; + v_bgr.val[2] = r0_8; + + r00_0 = vtrn_u8(r00, r01); // 014589 236710 + b00_0 = vtrn_u8(b00, b01); + g00_0 = vtrn_u8(g00, g01); + + vst3_u8(ptr_bgr1, v_bgr); + + r0_16 = vreinterpret_u16_u8(r00_0.val[0]); + r1_16 = vreinterpret_u16_u8(r00_0.val[1]); + + b0_16 = vreinterpret_u16_u8(b00_0.val[0]); + b1_16 = vreinterpret_u16_u8(b00_0.val[1]); + + g0_16 = vreinterpret_u16_u8(g00_0.val[0]); + g1_16 = vreinterpret_u16_u8(g00_0.val[1]); + + ptr_bgr1 += 24; + uint8x8x3_t v_bgr1; + v_bgr1.val[0] = b1_8; + v_bgr1.val[1] = 
g1_8; + v_bgr1.val[2] = r1_8; + + r00_1 = vtrn_u16(r0_16, r1_16); // 012389 456710 + b00_1 = vtrn_u16(b0_16, b1_16); + g00_1 = vtrn_u16(g0_16, g1_16); + + vst3_u8(ptr_bgr1, v_bgr1); + + r0_32 = vreinterpret_u32_u16(r00_1.val[0]); + r1_32 = vreinterpret_u32_u16(r00_1.val[1]); + + b0_32 = vreinterpret_u32_u16(b00_1.val[0]); + b1_32 = vreinterpret_u32_u16(b00_1.val[1]); + + g0_32 = vreinterpret_u32_u16(g00_1.val[0]); + g1_32 = vreinterpret_u32_u16(g00_1.val[1]); + + ptr_bgr1 += 24; + + r00_2 = vtrn_u32(r0_32, r1_32); // 01234567 8910 + b00_2 = vtrn_u32(b0_32, b1_32); + g00_2 = vtrn_u32(g0_32, g1_32); + + ptr_vu += 16; + ptr_y1 += 16; + ptr_y2 += 16; + + r0_8 = vreinterpret_u8_u32(r00_2.val[0]); + b0_8 = vreinterpret_u8_u32(b00_2.val[0]); + g0_8 = vreinterpret_u8_u32(g00_2.val[0]); + + r1_8 = vreinterpret_u8_u32(r00_2.val[1]); + b1_8 = vreinterpret_u8_u32(b00_2.val[1]); + g1_8 = vreinterpret_u8_u32(g00_2.val[1]); + + v_bgr.val[0] = b0_8; + v_bgr.val[1] = g0_8; + v_bgr.val[2] = r0_8; + + v_bgr1.val[0] = b1_8; + v_bgr1.val[1] = g1_8; + v_bgr1.val[2] = r1_8; + + vst3_u8(ptr_bgr2, v_bgr); + vst3_u8(ptr_bgr2 + 24, v_bgr1); + + ptr_bgr2 += 48; + } + // two data + for (; j < srcw; j += 2) { + uint8_t _y0 = ptr_y1[0]; + uint8_t _y1 = ptr_y1[1]; + uint8_t _v = ptr_vu[x_num]; + uint8_t _u = ptr_vu[y_num]; + uint8_t _y0_1 = ptr_y2[0]; + uint8_t _y1_1 = ptr_y2[1]; + + int ra = floor((179 * (_v - 128)) >> 7); + int ga = floor((44 * (_u - 128) + 91 * (_v - 128)) >> 7); + int ba = floor((227 * (_u - 128)) >> 7); + + int r = _y0 + ra; + int g = _y0 - ga; + int b = _y0 + ba; + + int r1 = _y1 + ra; + int g1 = _y1 - ga; + int b1 = _y1 + ba; + + r = r < 0 ? 0 : (r > 255) ? 255 : r; + g = g < 0 ? 0 : (g > 255) ? 255 : g; + b = b < 0 ? 0 : (b > 255) ? 255 : b; + + r1 = r1 < 0 ? 0 : (r1 > 255) ? 255 : r1; + g1 = g1 < 0 ? 0 : (g1 > 255) ? 255 : g1; + b1 = b1 < 0 ? 0 : (b1 > 255) ? 255 : b1; + + *ptr_bgr1++ = b; + *ptr_bgr1++ = g; + *ptr_bgr1++ = r; + + int r2 = _y0_1 + ra; + int g2 = _y0_1 - ga; + int b2 = _y0_1 + ba; + + int r3 = _y1_1 + ra; + int g3 = _y1_1 - ga; + int b3 = _y1_1 + ba; + + r2 = r2 < 0 ? 0 : (r2 > 255) ? 255 : r2; + g2 = g2 < 0 ? 0 : (g2 > 255) ? 255 : g2; + b2 = b2 < 0 ? 0 : (b2 > 255) ? 255 : b2; + + r3 = r3 < 0 ? 0 : (r3 > 255) ? 255 : r3; + g3 = g3 < 0 ? 0 : (g3 > 255) ? 255 : g3; + b3 = b3 < 0 ? 0 : (b3 > 255) ? 
255 : b3; + + *ptr_bgr1++ = b1; + *ptr_bgr1++ = g1; + *ptr_bgr1++ = r1; + + *ptr_bgr2++ = b2; + *ptr_bgr2++ = g2; + *ptr_bgr2++ = r2; + + ptr_y1 += 2; + ptr_y2 += 2; + ptr_vu += 2; + + *ptr_bgr2++ = b3; + *ptr_bgr2++ = g3; + *ptr_bgr2++ = r3; + } + } + delete[] zerobuf; + delete[] writebuf; +} +// nv12bgra, nv21tobgra +void nv_to_bgra(const uint8_t* src, + uint8_t* dst, + int srcw, + int srch, + int x_num, + int y_num) { + // nv21 x = 0, y = 1 + // nv12 x = 1, y = 0 + int y_h = srch; + int vu_h = 1 / 2 * srch; + const uint8_t* y = src; + const uint8_t* vu = src + y_h * srcw; + int wout = srcw * 4; + + uint8_t* zerobuf = new uint8_t[srcw]; + uint8_t* writebuf = new uint8_t[wout]; + memset(zerobuf, 0, sizeof(uint8_t) * srcw); + + int16x8_t bias = vdupq_n_s16(128); + int16x8_t ga = vdupq_n_s16(44); + int16x8_t ra = vdupq_n_s16(179); + int16x8_t ba = vdupq_n_s16(227); + int16x8_t gb = vdupq_n_s16(91); + int16x8_t zero = vdupq_n_s16(0); + int16x8_t max = vdupq_n_s16(255); + uint8x8_t a_8 = vdup_n_u8(255); +#pragma omp parallel for + for (int i = 0; i < y_h; i += 2) { + const uint8_t* ptr_y1 = y + i * srcw; + const uint8_t* ptr_y2 = ptr_y1 + srcw; + const uint8_t* ptr_vu = vu + (i / 2) * srcw; + uint8_t* ptr_bgr1 = dst + i * wout; + uint8_t* ptr_bgr2 = ptr_bgr1 + wout; + if (i + 2 > y_h) { + ptr_y2 = zerobuf; + ptr_bgr2 = writebuf; + } + int j = 0; + for (; j < srcw - 15; j += 16) { + uint8x8x2_t y1 = vld2_u8(ptr_y1); // d8 = y0y2y4y6...y14 d9 = + // y1y3y5...y15 + uint8x8x2_t vu = + vld2_u8(ptr_vu); // d0 = v0v1v2v3v4v5...v7 d1 = u0u1u2...u7 + + uint8x8x2_t y2 = vld2_u8(ptr_y2); + + uint16x8_t v = vmovl_u8(vu.val[x_num]); + uint16x8_t u = vmovl_u8(vu.val[y_num]); + int16x8_t v_s = vreinterpretq_s16_u16(v); + int16x8_t u_s = vreinterpretq_s16_u16(u); + int16x8_t v_bias = vsubq_s16(v_s, bias); + int16x8_t u_bias = vsubq_s16(u_s, bias); + + // G = Y - 0.34414*(U-128) - 0.71414*(V-128); + int16x8_t g0 = vmulq_s16(ga, u_bias); + // R = Y + 1.402*(V-128); + int16x8_t r0 = vmulq_s16(ra, v_bias); + // B = Y + 1.772*(U-128); + int16x8_t b0 = vmulq_s16(ba, u_bias); + + g0 = vmlaq_s16(g0, gb, v_bias); + + int16x8_t y1_0_8 = vreinterpretq_s16_u16(vmovl_u8(y1.val[0])); + int16x8_t y1_1_8 = vreinterpretq_s16_u16(vmovl_u8(y1.val[1])); + + int16x8_t y2_0_8 = vreinterpretq_s16_u16(vmovl_u8(y2.val[0])); + int16x8_t y2_1_8 = vreinterpretq_s16_u16(vmovl_u8(y2.val[1])); + + int16x8_t r0_bias = vshrq_n_s16(r0, 7); // r0 / 128 + int16x8_t b0_bias = vshrq_n_s16(b0, 7); + int16x8_t g0_bias = vshrq_n_s16(g0, 7); + + int16x8_t r0_1 = vaddq_s16(y1_0_8, r0_bias); + int16x8_t b0_1 = vaddq_s16(y1_0_8, b0_bias); + int16x8_t g0_1 = vsubq_s16(y1_0_8, g0_bias); // g0_1 = y1_0_8 - g0_1 + + int16x8_t r0_2 = vaddq_s16(y1_1_8, r0_bias); + int16x8_t b0_2 = vaddq_s16(y1_1_8, b0_bias); + int16x8_t g0_2 = vsubq_s16(y1_1_8, g0_bias); + + r0_1 = vmaxq_s16(r0_1, zero); + b0_1 = vmaxq_s16(b0_1, zero); + g0_1 = vmaxq_s16(g0_1, zero); + + r0_2 = vmaxq_s16(r0_2, zero); + b0_2 = vmaxq_s16(b0_2, zero); + g0_2 = vmaxq_s16(g0_2, zero); + + r0_1 = vminq_s16(r0_1, max); + b0_1 = vminq_s16(b0_1, max); + g0_1 = vminq_s16(g0_1, max); + + r0_2 = vminq_s16(r0_2, max); + b0_2 = vminq_s16(b0_2, max); + g0_2 = vminq_s16(g0_2, max); + + uint8x8_t r00 = vreinterpret_u8_s8(vmovn_s16(r0_1)); + uint8x8_t b00 = vreinterpret_u8_s8(vmovn_s16(b0_1)); + uint8x8_t g00 = vreinterpret_u8_s8(vmovn_s16(g0_1)); + + uint8x8_t r01 = vreinterpret_u8_s8(vmovn_s16(r0_2)); + uint8x8_t b01 = vreinterpret_u8_s8(vmovn_s16(b0_2)); + uint8x8_t g01 = 
vreinterpret_u8_s8(vmovn_s16(g0_2)); + + int16x8_t r1_1 = vaddq_s16(y2_0_8, r0_bias); + int16x8_t b1_1 = vaddq_s16(y2_0_8, b0_bias); + int16x8_t g1_1 = vsubq_s16(y2_0_8, g0_bias); // g0_1 = y1_0_8 - g0_1 + + int16x8_t r1_2 = vaddq_s16(y2_1_8, r0_bias); + int16x8_t b1_2 = vaddq_s16(y2_1_8, b0_bias); + int16x8_t g1_2 = vsubq_s16(y2_1_8, g0_bias); + + uint8x8x2_t r00_0 = vtrn_u8(r00, r01); // 014589 236710 + uint8x8x2_t b00_0 = vtrn_u8(b00, b01); + uint8x8x2_t g00_0 = vtrn_u8(g00, g01); + + r1_1 = vmaxq_s16(r1_1, zero); + b1_1 = vmaxq_s16(b1_1, zero); + g1_1 = vmaxq_s16(g1_1, zero); + + r1_2 = vmaxq_s16(r1_2, zero); + b1_2 = vmaxq_s16(b1_2, zero); + g1_2 = vmaxq_s16(g1_2, zero); + + uint16x4_t r0_16 = vreinterpret_u16_u8(r00_0.val[0]); + uint16x4_t r1_16 = vreinterpret_u16_u8(r00_0.val[1]); + + uint16x4_t b0_16 = vreinterpret_u16_u8(b00_0.val[0]); + uint16x4_t b1_16 = vreinterpret_u16_u8(b00_0.val[1]); + + uint16x4_t g0_16 = vreinterpret_u16_u8(g00_0.val[0]); + uint16x4_t g1_16 = vreinterpret_u16_u8(g00_0.val[1]); + + uint16x4x2_t r00_1 = vtrn_u16(r0_16, r1_16); // 012389 456710 + uint16x4x2_t b00_1 = vtrn_u16(b0_16, b1_16); + uint16x4x2_t g00_1 = vtrn_u16(g0_16, g1_16); + + r1_1 = vminq_s16(r1_1, max); + b1_1 = vminq_s16(b1_1, max); + g1_1 = vminq_s16(g1_1, max); + + r1_2 = vminq_s16(r1_2, max); + b1_2 = vminq_s16(b1_2, max); + g1_2 = vminq_s16(g1_2, max); + + uint32x2_t r0_32 = vreinterpret_u32_u16(r00_1.val[0]); + uint32x2_t r1_32 = vreinterpret_u32_u16(r00_1.val[1]); + + uint32x2_t b0_32 = vreinterpret_u32_u16(b00_1.val[0]); + uint32x2_t b1_32 = vreinterpret_u32_u16(b00_1.val[1]); + + uint32x2_t g0_32 = vreinterpret_u32_u16(g00_1.val[0]); + uint32x2_t g1_32 = vreinterpret_u32_u16(g00_1.val[1]); + + uint32x2x2_t r00_2 = vtrn_u32(r0_32, r1_32); // 01234567 8910 + uint32x2x2_t b00_2 = vtrn_u32(b0_32, b1_32); + uint32x2x2_t g00_2 = vtrn_u32(g0_32, g1_32); + + r00 = vreinterpret_u8_s8(vmovn_s16(r1_1)); + b00 = vreinterpret_u8_s8(vmovn_s16(b1_1)); + g00 = vreinterpret_u8_s8(vmovn_s16(g1_1)); + + r01 = vreinterpret_u8_s8(vmovn_s16(r1_2)); + b01 = vreinterpret_u8_s8(vmovn_s16(b1_2)); + g01 = vreinterpret_u8_s8(vmovn_s16(g1_2)); + + uint8x8_t r0_8 = vreinterpret_u8_u32(r00_2.val[0]); + uint8x8_t b0_8 = vreinterpret_u8_u32(b00_2.val[0]); + uint8x8_t g0_8 = vreinterpret_u8_u32(g00_2.val[0]); + + uint8x8_t r1_8 = vreinterpret_u8_u32(r00_2.val[1]); + uint8x8_t b1_8 = vreinterpret_u8_u32(b00_2.val[1]); + uint8x8_t g1_8 = vreinterpret_u8_u32(g00_2.val[1]); + + uint8x8x4_t v_bgr; + v_bgr.val[0] = b0_8; + v_bgr.val[1] = g0_8; + v_bgr.val[2] = r0_8; + v_bgr.val[3] = a_8; + + r00_0 = vtrn_u8(r00, r01); // 014589 236710 + b00_0 = vtrn_u8(b00, b01); + g00_0 = vtrn_u8(g00, g01); + + // ptr_bgr3 += 8; + // ptr_bgr1 += 8; + // ptr_bgr2 += 8; + // vst3_u8(ptr_bgr1, v_bgr); + vst4_u8(ptr_bgr1, v_bgr); + + r0_16 = vreinterpret_u16_u8(r00_0.val[0]); + r1_16 = vreinterpret_u16_u8(r00_0.val[1]); + + b0_16 = vreinterpret_u16_u8(b00_0.val[0]); + b1_16 = vreinterpret_u16_u8(b00_0.val[1]); + + g0_16 = vreinterpret_u16_u8(g00_0.val[0]); + g1_16 = vreinterpret_u16_u8(g00_0.val[1]); + + ptr_bgr1 += 32; + // uint8x8x3_t v_bgr1; + uint8x8x4_t v_bgr1; + v_bgr1.val[0] = b1_8; + v_bgr1.val[1] = g1_8; + v_bgr1.val[2] = r1_8; + v_bgr1.val[3] = a_8; + + r00_1 = vtrn_u16(r0_16, r1_16); // 012389 456710 + b00_1 = vtrn_u16(b0_16, b1_16); + g00_1 = vtrn_u16(g0_16, g1_16); + + // vst3_u8(ptr_bgr1, v_bgr1); + vst4_u8(ptr_bgr1, v_bgr1); + + r0_32 = vreinterpret_u32_u16(r00_1.val[0]); + r1_32 = vreinterpret_u32_u16(r00_1.val[1]); + + b0_32 
= vreinterpret_u32_u16(b00_1.val[0]); + b1_32 = vreinterpret_u32_u16(b00_1.val[1]); + + g0_32 = vreinterpret_u32_u16(g00_1.val[0]); + g1_32 = vreinterpret_u32_u16(g00_1.val[1]); + + // ptr_bgr1 += 24; + ptr_bgr1 += 32; + + r00_2 = vtrn_u32(r0_32, r1_32); // 01234567 8910 + b00_2 = vtrn_u32(b0_32, b1_32); + g00_2 = vtrn_u32(g0_32, g1_32); + + ptr_vu += 16; + ptr_y1 += 16; + ptr_y2 += 16; + + r0_8 = vreinterpret_u8_u32(r00_2.val[0]); + b0_8 = vreinterpret_u8_u32(b00_2.val[0]); + g0_8 = vreinterpret_u8_u32(g00_2.val[0]); + + r1_8 = vreinterpret_u8_u32(r00_2.val[1]); + b1_8 = vreinterpret_u8_u32(b00_2.val[1]); + g1_8 = vreinterpret_u8_u32(g00_2.val[1]); + + v_bgr.val[0] = b0_8; + v_bgr.val[1] = g0_8; + v_bgr.val[2] = r0_8; + + v_bgr1.val[0] = b1_8; + v_bgr1.val[1] = g1_8; + v_bgr1.val[2] = r1_8; + + // vst3_u8(ptr_bgr2, v_bgr); + // vst3_u8(ptr_bgr2 + 24, v_bgr1); + vst4_u8(ptr_bgr2, v_bgr); + vst4_u8(ptr_bgr2 + 32, v_bgr1); + + ptr_bgr2 += 64; + } + // two data + for (; j < srcw; j += 2) { + uint8_t _y0 = ptr_y1[0]; + uint8_t _y1 = ptr_y1[1]; + uint8_t _v = ptr_vu[x_num]; + uint8_t _u = ptr_vu[y_num]; + uint8_t _y0_1 = ptr_y2[0]; + uint8_t _y1_1 = ptr_y2[1]; + + int ra = floor((179 * (_v - 128)) >> 7); + int ga = floor((44 * (_u - 128) + 91 * (_v - 128)) >> 7); + int ba = floor((227 * (_u - 128)) >> 7); + + int r = _y0 + ra; + int g = _y0 - ga; + int b = _y0 + ba; + + int r1 = _y1 + ra; + int g1 = _y1 - ga; + int b1 = _y1 + ba; + + r = r < 0 ? 0 : (r > 255) ? 255 : r; + g = g < 0 ? 0 : (g > 255) ? 255 : g; + b = b < 0 ? 0 : (b > 255) ? 255 : b; + + r1 = r1 < 0 ? 0 : (r1 > 255) ? 255 : r1; + g1 = g1 < 0 ? 0 : (g1 > 255) ? 255 : g1; + b1 = b1 < 0 ? 0 : (b1 > 255) ? 255 : b1; + + // *ptr_bgr1++ = b; + // *ptr_bgr2++ = g; + // *ptr_bgr3++ = r; + *ptr_bgr1++ = b; + *ptr_bgr1++ = g; + *ptr_bgr1++ = r; + *ptr_bgr1++ = 255; + + int r2 = _y0_1 + ra; + int g2 = _y0_1 - ga; + int b2 = _y0_1 + ba; + + int r3 = _y1_1 + ra; + int g3 = _y1_1 - ga; + int b3 = _y1_1 + ba; + + r2 = r2 < 0 ? 0 : (r2 > 255) ? 255 : r2; + g2 = g2 < 0 ? 0 : (g2 > 255) ? 255 : g2; + b2 = b2 < 0 ? 0 : (b2 > 255) ? 255 : b2; + + r3 = r3 < 0 ? 0 : (r3 > 255) ? 255 : r3; + g3 = g3 < 0 ? 0 : (g3 > 255) ? 255 : g3; + b3 = b3 < 0 ? 0 : (b3 > 255) ? 
255 : b3; + + *ptr_bgr1++ = b1; + *ptr_bgr1++ = g1; + *ptr_bgr1++ = r1; + *ptr_bgr1++ = 255; + + *ptr_bgr2++ = b2; + *ptr_bgr2++ = g2; + *ptr_bgr2++ = r2; + *ptr_bgr2++ = 255; + + ptr_y1 += 2; + ptr_y2 += 2; + ptr_vu += 2; + + *ptr_bgr2++ = b3; + *ptr_bgr2++ = g3; + *ptr_bgr2++ = r3; + *ptr_bgr2++ = 255; + } + } + delete[] zerobuf; + delete[] writebuf; +} +void nv21_to_bgr(const uint8_t* src, uint8_t* dst, int srcw, int srch) { + nv_to_bgr(src, dst, srcw, srch, 0, 1); +} + +// nv12(yuv) to BGR:store hwc dsth * dstw = srch * srcw y_w = srcw, y_h = srch +// uv_w = srcw uv_h = 1/2 * srch +void nv12_to_bgr(const uint8_t* src, uint8_t* dst, int srcw, int srch) { + // exchange vu forward + nv_to_bgr(src, dst, srcw, srch, 1, 0); +} +// nv21(yvu) to BGRA: stroe hwc dsth * dstw = srch * (srcw) y_w = srcw, y_h = +// srch uv_w = srcw uv_h = 1/2 * srch +void nv21_to_bgra(const uint8_t* src, uint8_t* dst, int srcw, int srch) { + nv_to_bgra(src, dst, srcw, srch, 0, 1); +} +// nv12(yuv) to BGRA:store hwc dsth * dstw = srch * srcw y_w = srcw, y_h = srch +// uv_w = srcw uv_h = 1/2 * srch +void nv12_to_bgra(const uint8_t* src, uint8_t* dst, int srcw, int srch) { + nv_to_bgra(src, dst, srcw, srch, 1, 0); +} +/* +采用CV_BGR2GRAY,转换公式Gray = 0.1140*B + 0.5870*G + 0.2989*R +采用CV_RGB2GRAY,转换公式Gray = 0.1140*R + 0.5870*G + 0.2989*B +b = 0.114 *128 = 14.529 = 15 +g = 0.587 * 128 = 75.136 = 75 +r = 0.2989 * 127 = 38.2592 = 38 +Gray = (15*B + 75*G + 38*R)/128 +bgr2gray, rgb2gray +*/ +void hwc3_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) { + uint8_t b = 15; + uint8_t g = 75; + uint8_t r = 38; + + uint8x8_t vb = vdup_n_u8(b); + uint8x8_t vg = vdup_n_u8(g); + uint8x8_t vr = vdup_n_u8(r); +#ifdef __aarch64__ +#else + uint8_t vb_array[8] = {b, b, b, b, b, b, b, b}; + uint8_t vg_array[8] = {g, g, g, g, g, g, g, g}; + uint8_t vr_array[8] = {r, r, r, r, r, r, r, r}; +#endif + int cnt_pro = srcw >> 3; + int remain_pro = srcw % 8; + int win = srcw * 3; + int i = 0; +#pragma omp parallel for + for (i = 0; i < srch - 3; i += 4) { + int j = 0; + const uint8_t* inptr0 = src + i * win; + const uint8_t* inptr1 = inptr0 + win; + const uint8_t* inptr2 = inptr1 + win; + const uint8_t* inptr3 = inptr2 + win; + uint8_t* outr0 = dst + i * srcw; + uint8_t* outr1 = outr0 + srcw; + uint8_t* outr2 = outr1 + srcw; + uint8_t* outr3 = outr2 + srcw; + + int cnt = cnt_pro; + if (cnt > 0) { +#ifdef __aarch64__ + asm volatile( + "prfm pldl1keep, [%[inptr0]] \n" + "prfm pldl1keep, [%[inptr0], #128] \n" + "prfm pldl1keep, [%[inptr1]] \n" + "prfm pldl1keep, [%[inptr1], #128] \n" + "prfm pldl1keep, [%[inptr2]] \n" + "prfm pldl1keep, [%[inptr2], #128] \n" + "prfm pldl1keep, [%[inptr3]] \n" + "prfm pldl1keep, [%[inptr3], #128] \n" + "1: \n" + "ld3 {v0.8b - v2.8b}, [%[inptr0]], #24 \n" // d8 = y0y3y6y9.. d9 = + // y1y4y7... + "ld3 {v3.8b - v5.8b}, [%[inptr1]], #24 \n" // d8 = y0y3y6y9.. d9 = + // y1y4y7... + "ld3 {v6.8b - v8.8b}, [%[inptr2]], #24 \n" // d8 = y0y3y6y9.. d9 = + // y1y4y7... + "ld3 {v9.8b - v11.8b}, [%[inptr3]], #24 \n" // d8 = y0y3y6y9.. d9 = + // y1y4y7... 
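          // Clarifying gloss (English rendering of the note above): with
          // CV_BGR2GRAY the formula is
          //   Gray = 0.1140 * B + 0.5870 * G + 0.2989 * R
          // (for CV_RGB2GRAY the B and R weights swap). The 7-bit fixed-point
          // weights are b = 15, g = 75, r = 38, chosen so that 15 + 75 + 38 =
          // 128, giving
          //   Gray = (15 * B + 75 * G + 38 * R) >> 7
          // which is what the umull/umlal, widening-add and shrn sequence below
          // computes for eight pixels of each of the four rows per iteration.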
+ // mul b + "umull v12.8h, v0.8b, %w[vb].8b \n" // v0 * vb + "umull v13.8h, v3.8b, %w[vb].8b \n" // v0 * vb + "umull v14.8h, v6.8b, %w[vb].8b \n" // v0 * vb + "umull v15.8h, v9.8b, %w[vb].8b \n" // v0 * vb + // mul g + "umull v16.8h, v1.8b, %w[vg].8b \n" // v0 * vb + "umull v17.8h, v4.8b, %w[vg].8b \n" // v0 * vb + "umull v18.8h, v7.8b, %w[vg].8b \n" // v0 * vb + "umull v19.8h, v10.8b, %w[vg].8b \n" // v0 * vb + // mul r + "umlal v12.8h, v2.8b, %w[vr].8b \n" // v0 * vb + "umlal v13.8h, v5.8b, %w[vr].8b \n" // v0 * vb + "umlal v14.8h, v8.8b, %w[vr].8b \n" // v0 * vb + "umlal v15.8h, v11.8b, %w[vr].8b \n" // v0 * vb + // 16->32 + "uaddl v0.4s, v16.4h, v12.4h \n" + "uaddl2 v1.4s, v16.8h, v12.8h \n" + "uaddl v2.4s, v17.4h, v13.4h \n" + "uaddl2 v3.4s, v17.8h, v13.8h \n" + "uaddl v4.4s, v18.4h, v14.4h \n" + "uaddl2 v5.4s, v18.8h, v14.8h \n" + "uaddl v6.4s, v19.4h, v15.4h \n" + "uaddl2 v7.4s, v19.8h, v15.8h \n" + // 32->16 v0 >> 7 + "shrn v12.4h, v0.4s, #7 \n" + "shrn2 v12.8h, v1.4s, #7 \n" + "shrn v13.4h, v2.4s, #7 \n" + "shrn2 v13.8h, v3.4s, #7 \n" + "shrn v14.4h, v4.4s, #7 \n" + "shrn2 v14.8h, v5.4s, #7 \n" + "shrn v15.4h, v6.4s, #7 \n" + "shrn2 v15.8h, v7.4s, #7 \n" + // 16->8 + "xtn v0.8b, v12.8h \n" + "xtn v1.8b, v13.8h \n" + "xtn v2.8b, v14.8h \n" + "xtn v3.8b, v15.8h \n" + "subs %w[cnt], %w[cnt], #1 \n" + "st1 {v0.8b}, [%[outr0]], #8 \n" + "st1 {v1.8b}, [%[outr1]], #8 \n" + "st1 {v2.8b}, [%[outr2]], #8 \n" + "st1 {v3.8b}, [%[outr3]], #8 \n" + "bne 1b \n" + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outr0] "+r"(outr0), + [outr1] "+r"(outr1), + [outr2] "+r"(outr2), + [outr3] "+r"(outr3), + [cnt] "+r"(cnt) + : [vb] "w"(vb), [vg] "w"(vg), [vr] "w"(vr) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20"); +#else + asm volatile( + "pld [%[inptr0]] @ preload a, 64byte\n" + "pld [%[inptr0], #128] @ preload a, 64byte\n" + "pld [%[inptr1]] @ preload a, 64byte\n" + "pld [%[inptr1], #128] @ preload a, 64byte\n" + "pld [%[inptr2]] @ preload a, 64byte\n" + "pld [%[inptr2], #128] @ preload a, 64byte\n" + "pld [%[inptr3]] @ preload a, 64byte\n" + "pld [%[inptr3], #128] @ preload a, 64byte\n" + "vld1.8 d0, [%[vb]] \n" + "vld1.8 d1, [%[vg]] \n" + "vld1.8 d2, [%[vr]] \n" + "1: \n" + "vld3.8 {d3, d4, d5}, [%[inptr0]]! \n" + "vld3.8 {d6, d7, d8}, [%[inptr1]]! \n" + "vld3.8 {d9, d10, d11}, [%[inptr2]]! \n" + "vld3.8 {d12, d13, d14}, [%[inptr3]]! 
\n" + // vb + "vmull.u8 q8, d3, d0 \n" + "vmull.u8 q9, d6, d0 \n" + "vmull.u8 q10, d9, d0 \n" + "vmull.u8 q11, d12, d0 \n" + // vg + "vmull.u8 q12, d4, d1 \n" + "vmull.u8 q13, d7, d1 \n" + "vmull.u8 q14, d10, d1 \n" + "vmull.u8 q15, d13, d1 \n" + // vr + "vmlal.u8 q8, d5, d2 \n" + "vmlal.u8 q9, d6, d2 \n" + "vmlal.u8 q10, d11, d2 \n" + "vmlal.u8 q11, d14, d2 \n" + // 16->32 + "vaddl.u16 q2, d24, d16 \n" + "vaddl.u16 q3, d25, d17 \n" + "vaddl.u16 q4, d26, d18 \n" + "vaddl.u16 q5, d27, d19 \n" + "vaddl.u16 q6, d28, d20 \n" + "vaddl.u16 q7, d29, d21 \n" + "vaddl.u16 q8, d30, d22 \n" + "vaddl.u16 q9, d31, d23 \n" + // 32->16 q2 >> 7 + "vshrn.u32 d20, q2, #7 \n" + "vshrn.u32 d21, q3, #7 \n" + "vshrn.u32 d22, q4, #7 \n" + "vshrn.u32 d23, q5, #7 \n" + "vshrn.u32 d24, q6, #7 \n" + "vshrn.u32 d25, q7, #7 \n" + "vshrn.u32 d26, q8, #7 \n" + "vshrn.u32 d27, q8, #7 \n" + // 16->8 + "vmovn.u16 d4, q10 \n" + "vmovn.u16 d5, q11 \n" + "vmovn.u16 d6, q12 \n" + "vmovn.u16 d7, q13 \n" + "subs %[cnt], #1 \n" + // store + "vst1.8 d4, [%[outr0]]! \n" + "vst1.8 d5, [%[outr1]]! \n" + "vst1.8 d6, [%[outr2]]! \n" + "vst1.8 d7, [%[outr3]]! \n" + "bne 1b \n" + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outr0] "+r"(outr0), + [outr1] "+r"(outr1), + [outr2] "+r"(outr2), + [outr3] "+r"(outr3), + [cnt] "+r"(cnt) + : [vb] "r"(vb_array), [vg] "r"(vg_array), [vr] "r"(vr_array) + : "cc", + "memory", + "q0", + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); +#endif + } + for (; j < remain_pro; j++) { + *outr0++ = (inptr0[0] * b + inptr0[1] * g + inptr0[2] * r) >> 7; + *outr1++ = (inptr1[0] * b + inptr1[1] * g + inptr1[2] * r) >> 7; + *outr2++ = (inptr2[0] * b + inptr2[1] * g + inptr2[2] * r) >> 7; + *outr3++ = (inptr3[0] * b + inptr3[1] * g + inptr3[2] * r) >> 7; + inptr0 += 3; + inptr1 += 3; + inptr2 += 3; + inptr3 += 3; + } + } + for (; i < srch; i++) { + int j = 0; + const uint8_t* inptr0 = src + i * win; + uint8_t* outr0 = dst + i * srcw; + for (j = 0; j < cnt_pro; j++) { + uint8x8x3_t y0 = vld3_u8(inptr0); // d8 = y0y3y6y9.. 
d9 = y1y4y7...y + uint16x8_t val0 = vmull_u8(y0.val[0], vb); + + uint16x8_t val0_1 = vmull_u8(y0.val[1], vg); + + val0 = vmlal_u8(val0, y0.val[2], vr); + + uint32x4_t v0_sum0 = vaddl_u16(vget_low_u16(val0_1), vget_low_u16(val0)); + uint32x4_t v0_sum1 = + vaddl_u16(vget_high_u16(val0_1), vget_high_u16(val0)); + + uint16x4_t v0_sum0_16 = vshrn_n_u32(v0_sum0, 7); + uint16x4_t v0_sum1_16 = vshrn_n_u32(v0_sum1, 7); + + uint16x8_t v0_sum = vcombine_u16(v0_sum0_16, v0_sum1_16); + + uint8x8_t vout0 = vmovn_u16(v0_sum); + + inptr0 += 24; + vst1_u8(outr0, vout0); + outr0 += 8; + } + for (; j < srcw; j++) { + *outr0++ = (inptr0[0] * b + inptr0[1] * g + inptr0[2] * r) >> 7; + inptr0 += 3; + } + } +} +/* +采用CV_GRAY2BGR,转换公式B = G = R = Gray +采用CV_GRAY2RGB,转换公式R = G = B = Gray +gray2bgr, gray2rgb +*/ +void hwc1_to_hwc3(const uint8_t* src, uint8_t* dst, int srcw, int srch) { + for (int i = 0; i < srch; i++) { + for (int j = 0; j < srcw; j++) { + *dst++ = *src; + *dst++ = *src; + *dst++ = *src; + src++; + } + } +} +// bgr2bgra, rgb2rgba +void hwc3_to_hwc4(const uint8_t* src, uint8_t* dst, int srcw, int srch) { + for (int i = 0; i < srch; i++) { + for (int j = 0; j < srcw; j++) { + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = 255; + } + } +} +// bgra2bgr, rgba2rgb +void hwc4_to_hwc3(const uint8_t* src, uint8_t* dst, int srcw, int srch) { + for (int i = 0; i < srch; i++) { + for (int j = 0; j < srcw; j++) { + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + src++; + } + } +} +// bgr2rgb, rgb2bgr +void hwc3_trans(const uint8_t* src, uint8_t* dst, int srcw, int srch) { + for (int i = 0; i < srch; i++) { + for (int j = 0; j < srcw; j++) { + *dst++ = src[2]; // r + *dst++ = src[1]; // g + *dst++ = src[0]; // b + src += 3; + } + } +} +// bgra2rgba, rgba2bgra +void hwc4_trans(const uint8_t* src, uint8_t* dst, int srcw, int srch) { + for (int i = 0; i < srch; i++) { + for (int j = 0; j < srcw; j++) { + *dst++ = src[2]; // r + *dst++ = src[1]; // g + *dst++ = src[0]; // b + *dst++ = src[3]; // a + src += 4; + } + } +} +// bgra2rgb, rgba2bgr +void hwc4_trans_hwc3(const uint8_t* src, uint8_t* dst, int srcw, int srch) { + for (int i = 0; i < srch; i++) { + for (int j = 0; j < srcw; j++) { + *dst++ = src[2]; // r + *dst++ = src[1]; // g + *dst++ = src[0]; // b + // *dst++ = src[4];//a + src += 4; + } + } +} +// bgr2rgba, rgb2bga +void hwc3_trans_hwc4(const uint8_t* src, uint8_t* dst, int srcw, int srch) { + for (int i = 0; i < srch; i++) { + for (int j = 0; j < srcw; j++) { + *dst++ = src[2]; // r + *dst++ = src[1]; // g + *dst++ = src[0]; // b + *dst++ = 255; // a + src += 3; + } + } +} +} // namespace cv +} // namespace utils +} // namespace lite +} // namespace paddle diff --git a/lite/utils/cv/image_convert.h b/lite/utils/cv/image_convert.h new file mode 100644 index 0000000000000000000000000000000000000000..a10f869564878feaff196e61636b3789d68c43b0 --- /dev/null +++ b/lite/utils/cv/image_convert.h @@ -0,0 +1,43 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "lite/utils/cv/paddle_image_preprocess.h" +namespace paddle { +namespace lite { +namespace utils { +namespace cv { +typedef void (*convert_func)(const uint8_t* src, + uint8_t* dst, + int srcw, + int srch); +class ImageConvert { + public: + void choose(const uint8_t* src, + uint8_t* dst, + ImageFormat srcFormat, + ImageFormat dstFormat, + int srcw, + int srch); + + private: + convert_func impl_{nullptr}; +}; +} // namespace cv +} // namespace utils +} // namespace lite +} // namespace paddle diff --git a/lite/utils/cv/image_flip.cc b/lite/utils/cv/image_flip.cc new file mode 100644 index 0000000000000000000000000000000000000000..fd84691a2d1d244350f40238bc137d5d159ba62b --- /dev/null +++ b/lite/utils/cv/image_flip.cc @@ -0,0 +1,1942 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/utils/cv/image_flip.h" +#include +#include +namespace paddle { +namespace lite { +namespace utils { +namespace cv { +// gray +void flip_hwc1_x(const uint8_t* src, uint8_t* dst, int w_in, int h_in); +void flip_hwc1_y(const uint8_t* src, uint8_t* dst, int w_in, int h_in); +void flip_hwc1_xy(const uint8_t* src, uint8_t* dst, int w_in, int h_in); +// rgb bgr +void flip_hwc3_x(const uint8_t* src, uint8_t* dst, int w_in, int h_in); +void flip_hwc3_y(const uint8_t* src, uint8_t* dst, int w_in, int h_in); +void flip_hwc3_xy(const uint8_t* src, uint8_t* dst, int w_in, int h_in); +// rgba bgra +void flip_hwc4_x(const uint8_t* src, uint8_t* dst, int w_in, int h_in); +void flip_hwc4_y(const uint8_t* src, uint8_t* dst, int w_in, int h_in); +void flip_hwc4_xy(const uint8_t* src, uint8_t* dst, int w_in, int h_in); + +void flip_hwc1(const uint8_t* src, + uint8_t* dst, + int srcw, + int srch, + FlipParam flip_param) { + if (flip_param == X) { + flip_hwc1_x(src, dst, srcw, srch); + } else if (flip_param == Y) { + flip_hwc1_y(src, dst, srcw, srch); + } else if (flip_param == XY) { + flip_hwc1_xy(src, dst, srcw, srch); + } +} + +void flip_hwc3(const uint8_t* src, + uint8_t* dst, + int srcw, + int srch, + FlipParam flip_param) { + if (flip_param == X) { + flip_hwc3_x(src, dst, srcw, srch); + } else if (flip_param == Y) { + flip_hwc3_y(src, dst, srcw, srch); + } else if (flip_param == XY) { + flip_hwc3_xy(src, dst, srcw, srch); + } +} + +void flip_hwc4(const uint8_t* src, + uint8_t* dst, + int srcw, + int srch, + FlipParam flip_param) { + if (flip_param == X) { + flip_hwc4_x(src, dst, srcw, srch); + } else if (flip_param == Y) { + flip_hwc4_y(src, dst, srcw, srch); + } else if (flip_param == XY) { + flip_hwc4_xy(src, dst, srcw, srch); + } +} +/* +1 2 3 +4 5 6 +7 8 9 +rotate: +7 8 9 +4 5 6 +1 2 3 +*/ +void flip_hwc1_x(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { + int h = h_in - 1; + uint8_t zerobuff[8] = {0, 0, 0, 0, 0, 0, 0, 0}; +#pragma omp parallel for + for (int i = 0; i < 
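  // Descriptive note (added): in index form the three flip modes dispatched
  // above are
  //   X  (rows reversed)            : dst[h_in - 1 - i][j]            = src[i][j]
  //   Y  (columns reversed)         : dst[i][w_in - 1 - j]            = src[i][j]
  //   XY (rows and columns reversed): dst[h_in - 1 - i][w_in - 1 - j] = src[i][j]
  // The NEON bodies below handle four rows and eight pixels per iteration and
  // redirect any missing rows at the bottom edge to the zerobuff scratch rows.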
h_in; i += 4) { + const uint8_t* inptr0 = src + i * w_in; + const uint8_t* inptr1 = inptr0 + w_in; + const uint8_t* inptr2 = inptr1 + w_in; + const uint8_t* inptr3 = inptr2 + w_in; + + uint8_t* outptr0 = dst + (h - i) * w_in; // last + uint8_t* outptr1 = outptr0 - w_in; + uint8_t* outptr2 = outptr1 - w_in; + uint8_t* outptr3 = outptr2 - w_in; + if (i + 3 >= h_in) { + switch ((i + 3) - h_in) { + case 3: + inptr0 = zerobuff; + outptr0 = zerobuff; + case 2: + inptr1 = zerobuff; + outptr1 = zerobuff; + case 1: + inptr2 = zerobuff; + outptr2 = zerobuff; + case 0: + inptr3 = zerobuff; + outptr3 = zerobuff; + default: + break; + } + } + int j = 0; + for (; j < w_in - 7; j += 8) { +#ifdef __aarch64__ + asm volatile( + "ld1 {v0.8b}, [%[inptr0]], #8 \n" // v0={00,01,02, 03, 04, 05, + // 06, 07}" + "ld1 {v1.8b}, [%[inptr1]], #8 \n" // v0={10,11,12, 13, 14, 15, + // 16, 17}" + "ld1 {v2.8b}, [%[inptr2]], #8 \n" // v0={20,21,22, 23, 24, 25, + // 26, 27}" + "ld1 {v3.8b}, [%[inptr3]], #8 \n" // v0={30,31,32, 33, 34, 35, + // 36, 37}" + + "st1 {v0.8b}, [%[outptr0]], #8 \n" // 00 10 20 30 04 14 + // 24 34 + "st1 {v1.8b}, [%[outptr1]], #8 \n" // 02 12 22 32 + "st1 {v2.8b}, [%[outptr2]], #8 \n" // 01 11 21 31 + "st1 {v3.8b}, [%[outptr3]], #8 \n" // 03 13 23 33 + + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), + [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3) + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +#else + asm volatile( + "vld1.8 {d0}, [%[inptr0]]! @ zip load r0, d0 =00 01 02 03 04 05 " + "06 07\n" + "vld1.8 {d4}, [%[inptr1]]! @ zip load r1, d2 =10 11 12 13 14 15 " + "16 17\n" + "vld1.8 {d8}, [%[inptr2]]! @ zip load r1, d4 =20 21 22 23 24 25 " + "26 27\n" + "vld1.8 {d12}, [%[inptr3]]! @ zip load r1, d6 = 30 31 32 33 34 35 " + "36 37\n" + + "vst1.32 {d0}, [%[outptr0]]! @ write d0(q0,low),r00,r10 20 30\n" + "vst1.32 {d4}, [%[outptr1]]! @ write d4(q0,low),r01,r11 21 31\n" + "vst1.32 {d8}, [%[outptr2]]! @ write d4(q0,low),r01,r11 21 31\n" + "vst1.32 {d12}, [%[outptr3]]! 
@ write d4(q0,low),r01,r11 21 " + "31\n" + + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), + [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3) + : + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); +#endif + } + for (; j < w_in; j++) { + if (i + 3 >= h_in) { + switch ((i + 3) - h_in) { + case 0: + *outptr2++ = *inptr2++; + case 1: + *outptr1++ = *inptr1++; + case 2: + *outptr0++ = *inptr0++; + case 3: + // inptr3 = zerobuff; + default: + break; + } + } else { + *outptr3++ = *inptr3++; + *outptr2++ = *inptr2++; + *outptr1++ = *inptr1++; + *outptr0++ = *inptr0++; + } + } + } +} + +/* +1 2 3 +4 5 6 +7 8 9 +flip: +3 2 1 +6 5 4 +9 8 7 +*/ +void flip_hwc1_y(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { + int64_t stride_w = 8; + uint8_t zerobuff[8] = {0, 0, 0, 0, 0, 0, 0, 0}; +#pragma omp parallel for + for (int i = 0; i < h_in; i += 4) { + const uint8_t* inptr0 = src + i * w_in; + const uint8_t* inptr1 = inptr0 + w_in; + const uint8_t* inptr2 = inptr1 + w_in; + const uint8_t* inptr3 = inptr2 + w_in; + + uint8_t* outptr0 = dst + (i + 1) * w_in - stride_w; // last col + uint8_t* outptr1 = outptr0 + w_in; + uint8_t* outptr2 = outptr1 + w_in; + uint8_t* outptr3 = outptr2 + w_in; + if (i + 3 >= h_in) { + switch ((i + 3) - h_in) { + case 3: + inptr0 = zerobuff; + outptr0 = zerobuff; + case 2: + inptr1 = zerobuff; + outptr1 = zerobuff; + case 1: + inptr2 = zerobuff; + outptr2 = zerobuff; + case 0: + inptr3 = zerobuff; + outptr3 = zerobuff; + default: + break; + } + } + int j = 0; + for (; j < w_in - 7; j += 8) { +#ifdef __aarch64__ + asm volatile( + "ld1 {v0.8b}, [%[inptr0]], #8 \n" // v0={00,01,02, 03, 04, 05, + // 06, 07}" + "ld1 {v1.8b}, [%[inptr1]], #8 \n" // v0={10,11,12, 13, 14, 15, + // 16, 17}" + "ld1 {v2.8b}, [%[inptr2]], #8 \n" // v0={20,21,22, 23, 24, 25, + // 26, 27}" + "ld1 {v3.8b}, [%[inptr3]], #8 \n" // v0={30,31,32, 33, 34, 35, + // 36, 37}" + + "rev64 v4.8b, v0.8b \n" //@ reverse 07 06 05 04 03 02 + // 01 00 + "rev64 v5.8b, v1.8b \n" //@ reverse 07 06 05 04 03 02 + // 01 00 + "rev64 v6.8b, v2.8b \n" //@ reverse 07 06 05 04 03 02 + // 01 00 + "rev64 v7.8b, v3.8b \n" //@ reverse 07 06 05 04 03 02 + // 01 00 + + "st1 {v4.8b}, [%[outptr0]] \n" // 00 10 20 30 04 14 24 34 + "st1 {v5.8b}, [%[outptr1]] \n" // 02 12 22 32 + "st1 {v6.8b}, [%[outptr2]] \n" // 01 11 21 31 + "st1 {v7.8b}, [%[outptr3]] \n" // 03 13 23 33 + + "sub %[outptr0], %[outptr0], %[stride_w] \n" //@ ptr - stride_w + "sub %[outptr1], %[outptr1], %[stride_w] \n" + "sub %[outptr2], %[outptr2], %[stride_w] \n" + "sub %[outptr3], %[outptr3], %[stride_w] \n" + + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), + [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3), + [stride_w] "+r"(stride_w) + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +#else + asm volatile( + "vld1.8 {d0}, [%[inptr0]]! @ zip load r0, d0 =00 01 02 03 04 05 " + "06 07\n" + "vld1.8 {d4}, [%[inptr1]]! @ zip load r1, d2 =10 11 12 13 14 15 " + "16 17\n" + "vld1.8 {d8}, [%[inptr2]]! @ zip load r1, d4 =20 21 22 23 24 25 " + "26 27\n" + "vld1.8 {d12}, [%[inptr3]]! 
@ zip load r1, d6 = 30 31 32 33 34 35 " + "36 37\n" + + "vrev64.8 d1, d0 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d5, d4 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d9, d8 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d13, d12 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + + "vst1.32 {d1}, [%[outptr0]] @ write d0(q0,low),r00,r10 20 30\n" + "vst1.32 {d5}, [%[outptr1]] @ write d4(q0,low),r01,r11 21 31\n" + "vst1.32 {d9}, [%[outptr2]] @ write d4(q0,low),r01,r11 21 31\n" + "vst1.32 {d13}, [%[outptr3]] @ write d4(q0,low),r01,r11 21 31\n" + + "sub %[outptr0], %[stride_w] @ ptr - stride_w \n" + "sub %[outptr1], %[stride_w] @ ptr - stride_w \n" + "sub %[outptr2], %[stride_w] @ ptr - stride_w \n" + "sub %[outptr3], %[stride_w] @ ptr - stride_w \n" + + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), + [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3), + [stride_w] "+r"(stride_w) + : + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); +#endif + } + outptr3 += stride_w - 1; + outptr2 += stride_w - 1; + outptr1 += stride_w - 1; + outptr0 += stride_w - 1; + for (; j < w_in; j++) { + if (i + 3 >= h_in) { + switch ((i + 3) - h_in) { + case 0: + *outptr2-- = *inptr2++; + case 1: + *outptr1-- = *inptr1++; + // inptr1 = zerobuff; + case 2: + *outptr0-- = *inptr0++; + case 3: + // inptr3 = zerobuff; + default: + break; + } + } else { + *outptr3-- = *inptr3++; + *outptr2-- = *inptr2++; + *outptr1-- = *inptr1++; + *outptr0-- = *inptr0++; + } + } + } +} + +/* +1 2 3 +4 5 6 +7 8 9 +flip: +9 8 7 +6 5 4 +3 2 1 +*/ +void flip_hwc1_xy(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { + int64_t stride_w = 8; + uint8_t zerobuff[8] = {0, 0, 0, 0, 0, 0, 0, 0}; +#pragma omp parallel for + for (int i = 0; i < h_in; i += 4) { + const uint8_t* inptr0 = src + i * w_in; + const uint8_t* inptr1 = inptr0 + w_in; + const uint8_t* inptr2 = inptr1 + w_in; + const uint8_t* inptr3 = inptr2 + w_in; + + uint8_t* outptr0 = dst + (h_in - i) * w_in - stride_w; // last col + uint8_t* outptr1 = outptr0 - w_in; + uint8_t* outptr2 = outptr1 - w_in; + uint8_t* outptr3 = outptr2 - w_in; + if (i + 3 >= h_in) { + switch ((i + 3) - h_in) { + case 3: + inptr0 = zerobuff; + outptr0 = zerobuff; + case 2: + inptr1 = zerobuff; + outptr1 = zerobuff; + case 1: + inptr2 = zerobuff; + outptr2 = zerobuff; + case 0: + inptr3 = zerobuff; + outptr3 = zerobuff; + default: + break; + } + } + int j = 0; + for (; j < w_in - 7; j += 8) { +#ifdef __aarch64__ + asm volatile( + "ld1 {v0.8b}, [%[inptr0]], #8 \n" // v0={00,01,02, 03, 04, 05, + // 06, 07}" + "ld1 {v1.8b}, [%[inptr1]], #8 \n" // v0={10,11,12, 13, 14, 15, + // 16, 17}" + "ld1 {v2.8b}, [%[inptr2]], #8 \n" // v0={20,21,22, 23, 24, 25, + // 26, 27}" + "ld1 {v3.8b}, [%[inptr3]], #8 \n" // v0={30,31,32, 33, 34, 35, + // 36, 37}" + + "rev64 v4.8b, v0.8b \n" //@ reverse 07 06 05 04 03 02 + // 01 00 + "rev64 v5.8b, v1.8b \n" //@ reverse 07 06 05 04 03 02 + // 01 00 + "rev64 v6.8b, v2.8b \n" //@ reverse 07 06 05 04 03 02 + // 01 00 + "rev64 v7.8b, v3.8b \n" //@ reverse 07 06 05 04 03 02 + // 01 00 + + "st1 {v4.8b}, [%[outptr0]] \n" // 00 10 20 30 04 14 24 34 + "st1 {v5.8b}, [%[outptr1]] \n" // 02 12 22 32 + "st1 {v6.8b}, [%[outptr2]] \n" // 01 11 21 31 + "st1 {v7.8b}, [%[outptr3]] \n" // 03 13 23 33 + + "sub %[outptr0], %[outptr0], %[stride_w] \n" //@ ptr - stride_w + "sub %[outptr1], %[outptr1], %[stride_w] \n" + "sub %[outptr2], %[outptr2], %[stride_w] \n" + "sub %[outptr3], 
%[outptr3], %[stride_w] \n" + + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), + [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3), + [stride_w] "+r"(stride_w) + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +#else + asm volatile( + "vld1.8 {d0}, [%[inptr0]]! @ zip load r0, d0 =00 01 02 03 04 05 " + "06 07\n" + "vld1.8 {d4}, [%[inptr1]]! @ zip load r1, d2 =10 11 12 13 14 15 " + "16 17\n" + "vld1.8 {d8}, [%[inptr2]]! @ zip load r1, d4 =20 21 22 23 24 25 " + "26 27\n" + "vld1.8 {d12}, [%[inptr3]]! @ zip load r1, d6 = 30 31 32 33 34 35 " + "36 37\n" + + "vrev64.8 d1, d0 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d5, d4 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d9, d8 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d13, d12 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + + "vst1.32 {d1}, [%[outptr0]] @ write d0(q0,low),r00,r10 20 30\n" + "vst1.32 {d5}, [%[outptr1]] @ write d4(q0,low),r01,r11 21 31\n" + "vst1.32 {d9}, [%[outptr2]] @ write d4(q0,low),r01,r11 21 31\n" + "vst1.32 {d13}, [%[outptr3]] @ write d4(q0,low),r01,r11 21 31\n" + + "sub %[outptr0], %[stride_w] @ ptr - stride_w \n" + "sub %[outptr1], %[stride_w] @ ptr - stride_w \n" + "sub %[outptr2], %[stride_w] @ ptr - stride_w \n" + "sub %[outptr3], %[stride_w] @ ptr - stride_w \n" + + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), + [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3), + [stride_w] "+r"(stride_w) + : + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); +#endif + } + outptr3 += stride_w - 1; + outptr2 += stride_w - 1; + outptr1 += stride_w - 1; + outptr0 += stride_w - 1; + for (; j < w_in; j++) { + if (i + 3 >= h_in) { + switch ((i + 3) - h_in) { + case 0: + *outptr2-- = *inptr2++; + case 1: + *outptr1-- = *inptr1++; + // inptr1 = zerobuff; + case 2: + *outptr0-- = *inptr0++; + case 3: + // inptr3 = zerobuff; + default: + break; + } + } else { + *outptr3-- = *inptr3++; + *outptr2-- = *inptr2++; + *outptr1-- = *inptr1++; + *outptr0-- = *inptr0++; + } + } + } +} + +void flip_hwc3_x(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { + int h = h_in - 1; + int win = w_in * 3; + uint8_t zerobuff[30000]; + memset(zerobuff, 0, win * sizeof(uint8_t)); + uint8_t zerobuff2[30000]; + memset(zerobuff2, 0, win * sizeof(uint8_t)); +#pragma omp parallel for + for (int i = 0; i < h_in; i += 4) { + const uint8_t* inptr0 = src + i * win; + const uint8_t* inptr1 = inptr0 + win; + const uint8_t* inptr2 = inptr1 + win; + const uint8_t* inptr3 = inptr2 + win; + + uint8_t* outptr0 = dst + (h - i) * win; // last + uint8_t* outptr1 = outptr0 - win; + uint8_t* outptr2 = outptr1 - win; + uint8_t* outptr3 = outptr2 - win; + if (i + 3 >= h_in) { + switch ((i + 3) - h_in) { + case 3: + inptr0 = zerobuff; + outptr0 = zerobuff2; + case 2: + inptr1 = zerobuff; + outptr1 = zerobuff2; + case 1: + inptr2 = zerobuff; + outptr2 = zerobuff2; + case 0: + inptr3 = zerobuff; + outptr3 = zerobuff2; + default: + break; + } + } + int j = 0; + for (; j < w_in - 7; j += 8) { +#ifdef __aarch64__ + asm volatile( + "ld3 {v0.8b, v1.8b, v2.8b}, [%[inptr0]], #24 \n" // v0={00,01,02, + // 03, 04, 05, + // 06, 07}" + "ld3 {v3.8b, v4.8b, v5.8b}, [%[inptr1]], #24 \n" // v0={10,11,12, + // 13, 14, 15, + // 16, 17}" + "ld3 {v6.8b, v7.8b, v8.8b}, [%[inptr2]], #24 \n" // v0={20,21,22, + // 23, 24, 25, + // 26, 27}" + "ld3 {v9.8b, v10.8b, v11.8b}, 
[%[inptr3]], #24 \n" // v0={30,31,32, + // 33, 34, 35, + // 36, 37}" + + "st3 {v0.8b, v1.8b, v2.8b}, [%[outptr0]], #24 \n" // 00 + // 10 + // 20 + // 30 + // 04 + // 14 + // 24 + // 34 + "st3 {v3.8b, v4.8b, v5.8b}, [%[outptr1]], #24 \n" // 02 + // 12 + // 22 + // 32 + "st3 {v6.8b, v7.8b, v8.8b}, [%[outptr2]], #24 \n" // 01 + // 11 + // 21 + // 31 + "st3 {v9.8b, v10.8b, v11.8b}, [%[outptr3]], #24 \n" // 03 13 23 33 + + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), + [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3) + : + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11"); +#else + asm volatile( + "vld3.8 {d0, d1, d2}, [%[inptr0]]! @ zip load r0, d0 =00 01 02 03 " + "04 05 06 07\n" + "vld3.8 {d3, d4, d5}, [%[inptr1]]! @ zip load r1, d2 =10 11 12 13 " + "14 15 16 17\n" + "vld3.8 {d6, d7, d8}, [%[inptr2]]! @ zip load r1, d4 =20 21 22 23 " + "24 25 26 27\n" + "vld3.8 {d9, d10, d11}, [%[inptr3]]! @ zip load r1, d6 = 30 31 32 " + "33 34 35 36 37\n" + + "vst3.8 {d0, d1, d2}, [%[outptr0]]! @ write d0(q0,low),r00,r10 " + "20 30\n" + "vst3.8 {d3, d4, d5}, [%[outptr1]]! @ write d4(q0,low),r01,r11 " + "21 31\n" + "vst3.8 {d6, d7, d8}, [%[outptr2]]! @ write d4(q0,low),r01,r11 " + "21 31\n" + "vst3.8 {d9, d10, d11}, [%[outptr3]]! @ write " + "d4(q0,low),r01,r11 21 31\n" + + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), + [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3) + : + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); +#endif + } + for (; j < w_in; j++) { + if (i + 3 >= h_in) { + switch ((i + 3) - h_in) { + case 0: + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + case 1: + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + case 2: + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + case 3: + // inptr3 = zerobuff; + default: + break; + } + } else { + *outptr3++ = *inptr3++; + *outptr3++ = *inptr3++; + *outptr3++ = *inptr3++; + + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + } + } + } +} + +void flip_hwc3_y(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { + int win = w_in * 3; + uint8_t zerobuff[30000]; + memset(zerobuff, 0, win * sizeof(uint8_t)); + uint8_t zerobuff2[30000]; + memset(zerobuff2, 0, win * sizeof(uint8_t)); + int64_t stride_w = 24; +#pragma omp parallel for + for (int i = 0; i < h_in; i += 4) { + const uint8_t* inptr0 = src + i * win; + const uint8_t* inptr1 = inptr0 + win; + const uint8_t* inptr2 = inptr1 + win; + const uint8_t* inptr3 = inptr2 + win; + + uint8_t* outptr0 = dst + (i + 1) * win - stride_w; // last col + uint8_t* outptr1 = outptr0 + win; + uint8_t* outptr2 = outptr1 + win; + uint8_t* outptr3 = outptr2 + win; + int j = 0; + if (i + 3 >= h_in) { + switch ((i + 3) - h_in) { + case 3: + inptr0 = zerobuff; + outptr0 = zerobuff2; + case 2: + inptr1 = zerobuff; + outptr1 = zerobuff2; + case 1: + inptr2 = zerobuff; + outptr2 = zerobuff2; + case 0: + inptr3 = zerobuff; + outptr3 = zerobuff2; + default: + break; + } + } + for (; j < w_in - 7; j += 8) { +#ifdef __aarch64__ + asm volatile( + "ld3 {v0.8b, v1.8b, v2.8b}, [%[inptr0]], #24 \n" 
// v0={00,01,02, + // 03, 04, 05, + // 06, 07}" + "ld3 {v3.8b, v4.8b, v5.8b}, [%[inptr1]], #24 \n" // v0={10,11,12, + // 13, 14, 15, + // 16, 17}" + "ld3 {v6.8b, v7.8b, v8.8b}, [%[inptr2]], #24 \n" // v0={20,21,22, + // 23, 24, 25, + // 26, 27}" + "ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]], #24 \n" // v0={30,31,32, + // 33, 34, 35, + // 36, 37}" + + "rev64 v12.8b, v0.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 b + "rev64 v13.8b, v1.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 g + "rev64 v14.8b, v2.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 r + + "rev64 v15.8b, v3.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v16.8b, v4.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v17.8b, v5.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + + "rev64 v18.8b, v6.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v19.8b, v7.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v20.8b, v8.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + + "rev64 v21.8b, v9.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v22.8b, v10.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v23.8b, v11.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + + "st3 {v12.8b, v13.8b, v14.8b}, [%[outptr0]] \n" // 00 10 + // 20 30 + // 04 14 + // 24 34 + "st3 {v15.8b, v16.8b, v17.8b}, [%[outptr1]] \n" // 02 12 + // 22 32 + "st3 {v18.8b, v19.8b, v20.8b}, [%[outptr2]] \n" // 01 11 + // 21 31 + "st3 {v21.8b, v22.8b, v23.8b}, [%[outptr3]] \n" // 03 13 + // 23 33 + + "sub %[outptr0], %[outptr0], %[stride_w] \n" //@ ptr - stride_w + "sub %[outptr1], %[outptr1], %[stride_w] \n" + "sub %[outptr2], %[outptr2], %[stride_w] \n" + "sub %[outptr3], %[outptr3], %[stride_w] \n" + + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), + [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3), + [stride_w] "+r"(stride_w) + : + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23"); +#else + asm volatile( + "vld3.8 {d0, d1, d2}, [%[inptr0]]! @ zip load r0, d0 =00 01 02 03 " + "04 05 06 07\n" + "vld3.8 {d3, d4, d5}, [%[inptr1]]! @ zip load r1, d2 =10 11 12 13 " + "14 15 16 17\n" + "vld3.8 {d6, d7, d8}, [%[inptr2]]! @ zip load r1, d4 =20 21 22 23 " + "24 25 26 27\n" + "vld3.8 {d9, d10, d11}, [%[inptr3]]! 
@ zip load r1, d6 = 30 31 32 " + "33 34 35 36 37\n" + + "vrev64.8 d12, d0 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d13, d1 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d14, d2 @ reverse 07 06 05 04 03 02 01 00 \n" + + "vrev64.8 d15, d3 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d16, d4 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d17, d5 @ reverse 07 06 05 04 03 02 01 00 \n" + + "vrev64.8 d18, d6 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d19, d7 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d20, d8 @ reverse 07 06 05 04 03 02 01 00 \n" + + "vrev64.8 d21, d9 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d22, d10 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + "vrev64.8 d23, d11 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + + "vst3.8 {d12, d13, d14}, [%[outptr0]] @ write " + "d0(q0,low),r00,r10 20 30\n" + "vst3.8 {d15, d16, d17}, [%[outptr1]] @ write " + "d4(q0,low),r01,r11 21 31\n" + "vst3.8 {d18, d19, d20}, [%[outptr2]] @ write " + "d4(q0,low),r01,r11 21 31\n" + "vst3.8 {d21, d22, d23}, [%[outptr3]] @ write " + "d4(q0,low),r01,r11 21 31\n" + + "sub %[outptr0], %[stride_w] @ ptr - stride_w \n" + "sub %[outptr1], %[stride_w] @ ptr - stride_w \n" + "sub %[outptr2], %[stride_w] @ ptr - stride_w \n" + "sub %[outptr3], %[stride_w] @ ptr - stride_w \n" + + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), + [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3), + [stride_w] "+r"(stride_w) + : + : "q0", + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12"); +#endif + } + outptr3 += stride_w - 3; + outptr2 += stride_w - 3; + outptr1 += stride_w - 3; + outptr0 += stride_w - 3; + for (; j < w_in; j++) { + if (i + 3 >= h_in) { + switch ((i + 3) - h_in) { + case 0: + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + outptr2 -= 6; + case 1: + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + outptr1 -= 6; + case 2: + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + outptr0 -= 6; + case 3: + // inptr3 = zerobuff; + default: + break; + } + } else { + *outptr3++ = *inptr3++; + *outptr3++ = *inptr3++; + *outptr3++ = *inptr3++; + outptr3 -= 6; + + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + outptr2 -= 6; + + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + outptr1 -= 6; + + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + outptr0 -= 6; + } + } + } +} + +void flip_hwc3_xy(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { + int64_t stride_w = 24; + int win = w_in * 3; + uint8_t zerobuff[30000]; + memset(zerobuff, 0, win * sizeof(uint8_t)); + uint8_t zerobuff2[30000]; + memset(zerobuff2, 0, win * sizeof(uint8_t)); +#pragma omp parallel for + for (int i = 0; i < h_in; i += 4) { + const uint8_t* inptr0 = src + i * win; + const uint8_t* inptr1 = inptr0 + win; + const uint8_t* inptr2 = inptr1 + win; + const uint8_t* inptr3 = inptr2 + win; + + uint8_t* outptr0 = dst + (h_in - i) * win - stride_w; // last col + uint8_t* outptr1 = outptr0 - win; + uint8_t* outptr2 = outptr1 - win; + uint8_t* outptr3 = outptr2 - win; + if (i + 3 >= h_in) { + switch ((i + 3) - h_in) { + case 3: + inptr0 = zerobuff; + outptr0 = zerobuff2; + case 2: + inptr1 = zerobuff; + outptr1 = zerobuff2; + case 1: + inptr2 = zerobuff; + outptr2 = zerobuff2; + case 0: + inptr3 = zerobuff; + 
outptr3 = zerobuff2; + default: + break; + } + } + int j = 0; + for (; j < w_in - 7; j += 8) { +#ifdef __aarch64__ + asm volatile( + "ld3 {v0.8b, v1.8b, v2.8b}, [%[inptr0]], #24 \n" // v0={00,01,02, + // 03, 04, 05, + // 06, 07}" + "ld3 {v3.8b, v4.8b, v5.8b}, [%[inptr1]], #24 \n" // v0={10,11,12, + // 13, 14, 15, + // 16, 17}" + "ld3 {v6.8b, v7.8b, v8.8b}, [%[inptr2]], #24 \n" // v0={20,21,22, + // 23, 24, 25, + // 26, 27}" + "ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]], #24 \n" // v0={30,31,32, + // 33, 34, 35, + // 36, 37}" + + "rev64 v12.8b, v0.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 b + "rev64 v13.8b, v1.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 g + "rev64 v14.8b, v2.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 r + + "rev64 v15.8b, v3.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v16.8b, v4.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v17.8b, v5.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + + "rev64 v18.8b, v6.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v19.8b, v7.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v20.8b, v8.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + + "rev64 v21.8b, v9.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v22.8b, v10.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v23.8b, v11.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + + "st3 {v12.8b, v13.8b, v14.8b}, [%[outptr0]] \n" // 00 10 + // 20 30 + // 04 14 + // 24 34 + "st3 {v15.8b, v16.8b, v17.8b}, [%[outptr1]] \n" // 02 12 + // 22 32 + "st3 {v18.8b, v19.8b, v20.8b}, [%[outptr2]] \n" // 01 11 + // 21 31 + "st3 {v21.8b, v22.8b, v23.8b}, [%[outptr3]] \n" // 03 13 + // 23 33 + + "sub %[outptr0], %[outptr0], %[stride_w] \n" //@ ptr - stride_w + "sub %[outptr1], %[outptr1], %[stride_w] \n" + "sub %[outptr2], %[outptr2], %[stride_w] \n" + "sub %[outptr3], %[outptr3], %[stride_w] \n" + + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), + [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3), + [stride_w] "+r"(stride_w) + : + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23"); +#else + asm volatile( + "vld3.8 {d0, d1, d2}, [%[inptr0]]! @ zip load r0, d0 =00 01 02 03 " + "04 05 06 07\n" + "vld3.8 {d3, d4, d5}, [%[inptr1]]! @ zip load r1, d2 =10 11 12 13 " + "14 15 16 17\n" + "vld3.8 {d6, d7, d8}, [%[inptr2]]! @ zip load r1, d4 =20 21 22 23 " + "24 25 26 27\n" + "vld3.8 {d9, d10, d11}, [%[inptr3]]! 
@ zip load r1, d6 = 30 31 32 " + "33 34 35 36 37\n" + + "vrev64.8 d12, d0 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d13, d1 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d14, d2 @ reverse 07 06 05 04 03 02 01 00 \n" + + "vrev64.8 d15, d3 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d16, d4 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d17, d5 @ reverse 07 06 05 04 03 02 01 00 \n" + + "vrev64.8 d18, d6 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d19, d7 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d20, d8 @ reverse 07 06 05 04 03 02 01 00 \n" + + "vrev64.8 d21, d9 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d22, d10 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + "vrev64.8 d23, d11 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + + "vst3.8 {d12, d13, d14}, [%[outptr0]] @ write " + "d0(q0,low),r00,r10 20 30\n" + "vst3.8 {d15, d16, d17}, [%[outptr1]] @ write " + "d4(q0,low),r01,r11 21 31\n" + "vst3.8 {d18, d19, d20}, [%[outptr2]] @ write " + "d4(q0,low),r01,r11 21 31\n" + "vst3.8 {d21, d22, d23}, [%[outptr3]] @ write " + "d4(q0,low),r01,r11 21 31\n" + + "sub %[outptr0], %[stride_w] @ ptr - stride_w \n" + "sub %[outptr1], %[stride_w] @ ptr - stride_w \n" + "sub %[outptr2], %[stride_w] @ ptr - stride_w \n" + "sub %[outptr3], %[stride_w] @ ptr - stride_w \n" + + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), + [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3), + [stride_w] "+r"(stride_w) + : + : "q0", + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12"); +#endif + } + outptr3 += stride_w - 3; + outptr2 += stride_w - 3; + outptr1 += stride_w - 3; + outptr0 += stride_w - 3; + for (; j < w_in; j++) { + if (i + 3 >= h_in) { + switch ((i + 3) - h_in) { + case 0: + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + outptr2 -= 6; + case 1: + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + outptr1 -= 6; + case 2: + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + outptr0 -= 6; + case 3: + // inptr3 = zerobuff; + default: + break; + } + } else { + *outptr3++ = *inptr3++; + *outptr3++ = *inptr3++; + *outptr3++ = *inptr3++; + outptr3 -= 6; + + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + outptr2 -= 6; + + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + outptr1 -= 6; + + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + outptr0 -= 6; + } + } + } +} + +void flip_hwc4_x(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { + int h = h_in - 1; + int win = w_in * 4; + uint8_t zerobuff[40000]; + memset(zerobuff, 0, win * sizeof(uint8_t)); + uint8_t zerobuff2[40000]; + memset(zerobuff2, 0, win * sizeof(uint8_t)); +#pragma omp parallel for + for (int i = 0; i < h_in; i += 4) { + const uint8_t* inptr0 = src + i * win; + const uint8_t* inptr1 = inptr0 + win; + const uint8_t* inptr2 = inptr1 + win; + const uint8_t* inptr3 = inptr2 + win; + + uint8_t* outptr0 = dst + (h - i) * win; // last + uint8_t* outptr1 = outptr0 - win; + uint8_t* outptr2 = outptr1 - win; + uint8_t* outptr3 = outptr2 - win; + if (i + 3 >= h_in) { + switch ((i + 3) - h_in) { + case 3: + inptr0 = zerobuff; + outptr0 = zerobuff2; + case 2: + inptr1 = zerobuff; + outptr1 = zerobuff2; + case 1: + inptr2 = zerobuff; + outptr2 = zerobuff2; + case 0: + inptr3 = zerobuff; + outptr3 = zerobuff2; + 
default: + break; + } + } + int j = 0; + for (; j < w_in - 7; j += 8) { +#ifdef __aarch64__ + asm volatile( + "ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[inptr0]], #32 \n" // v0={00,01,02, + // 03, + // 04, + // 05, + // 06, + // 07}" + "ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [%[inptr1]], #32 \n" // v0={10,11,12, + // 13, + // 14, + // 15, + // 16, + // 17}" + "ld4 {v8.8b, v9.8b, v10.8b, v11.8b}, [%[inptr2]], #32 \n" // v0={20,21,22, + // 23, + // 24, + // 25, + // 26, + // 27}" + "ld4 {v12.8b, v13.8b, v14.8b, v15.8b}, [%[inptr3]], #32 \n" // v0={30,31,32, + // 33, + // 34, + // 35, + // 36, + // 37}" + + "st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[outptr0]], #32 \n" // 00 10 20 + // 30 04 14 + // 24 34 + "st4 {v4.8b, v5.8b, v6.8b, v7.8b}, [%[outptr1]], #32 \n" // 02 12 22 32 + "st4 {v8.8b, v9.8b, v10.8b, v11.8b}, [%[outptr2]], #32 \n" // 01 11 21 31 + "st4 {v12.8b, v13.8b, v14.8b, v15.8b}, [%[outptr3]], #32 " + " \n" // 03 13 23 33 + + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), + [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3) + : + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); +#else + asm volatile( + "vld4.8 {d0, d1, d2, d3}, [%[inptr0]]! @ zip load r0, d0 =00 01 " + "02 03 04 05 06 07\n" + "vld4.8 {d4, d5, d6, d7}, [%[inptr1]]! @ zip load r1, d2 =10 11 " + "12 13 14 15 16 17\n" + "vld4.8 {d8, d9, d10, d11}, [%[inptr2]]! @ zip load r1, d4 =20 21 " + "22 23 24 25 26 27\n" + "vld4.8 {d12, d13, d14, d15}, [%[inptr3]]! @ zip load r1, d6 = 30 " + "31 32 33 34 35 36 37\n" + + "vst4.8 {d0, d1, d2, d3}, [%[outptr0]]! @ write " + "d0(q0,low),r00,r10 20 30\n" + "vst4.8 {d4, d5, d6, d7}, [%[outptr1]]! @ write " + "d4(q0,low),r01,r11 21 31\n" + "vst4.8 {d8, d9, d10, d11}, [%[outptr2]]! @ write " + "d4(q0,low),r01,r11 21 31\n" + "vst4.8 {d12, d13, d14, d15}, [%[outptr3]]! 
@ write " + "d4(q0,low),r01,r11 21 31\n" + + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), + [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3) + : + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); +#endif + } + for (; j < w_in; j++) { + if (i + 3 >= h_in) { + switch ((i + 3) - h_in) { + case 0: + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + case 1: + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + case 2: + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + case 3: + // inptr3 = zerobuff; + default: + break; + } + } else { + *outptr3++ = *inptr3++; + *outptr3++ = *inptr3++; + *outptr3++ = *inptr3++; + *outptr3++ = *inptr3++; + + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + } + } + } +} + +void flip_hwc4_y(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { + int win = w_in * 4; + uint8_t zerobuff[40000]; + memset(zerobuff, 0, win * sizeof(uint8_t)); + uint8_t zerobuff2[40000]; + memset(zerobuff2, 0, win * sizeof(uint8_t)); + int64_t stride_w = 32; +#pragma omp parallel for + for (int i = 0; i < h_in; i += 4) { + const uint8_t* inptr0 = src + i * win; + const uint8_t* inptr1 = inptr0 + win; + const uint8_t* inptr2 = inptr1 + win; + const uint8_t* inptr3 = inptr2 + win; + + uint8_t* outptr0 = dst + (i + 1) * win - stride_w; // last col + uint8_t* outptr1 = outptr0 + win; + uint8_t* outptr2 = outptr1 + win; + uint8_t* outptr3 = outptr2 + win; + if (i + 3 >= h_in) { + switch ((i + 3) - h_in) { + case 3: + inptr0 = zerobuff; + outptr0 = zerobuff2; + case 2: + inptr1 = zerobuff; + outptr1 = zerobuff2; + case 1: + inptr2 = zerobuff; + outptr2 = zerobuff2; + case 0: + inptr3 = zerobuff; + outptr3 = zerobuff2; + default: + break; + } + } + int j = 0; + for (; j < w_in - 7; j += 8) { +#ifdef __aarch64__ + asm volatile( + "ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[inptr0]], #32 \n" // v0={00,01,02, + // 03, + // 04, + // 05, + // 06, + // 07}" + "ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [%[inptr1]], #32 \n" // v0={10,11,12, + // 13, + // 14, + // 15, + // 16, + // 17}" + "ld4 {v8.8b, v9.8b, v10.8b, v11.8b}, [%[inptr2]], #32 \n" // v0={20,21,22, + // 23, + // 24, + // 25, + // 26, + // 27}" + "ld4 {v12.8b, v13.8b, v14.8b, v15.8b}, [%[inptr3]], #32 \n" // v0={30,31,32, + // 33, + // 34, + // 35, + // 36, + // 37}" + + "rev64 v16.8b, v0.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 b + "rev64 v17.8b, v1.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 g + "rev64 v18.8b, v2.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 r + "rev64 v19.8b, v3.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + + "rev64 v20.8b, v4.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v21.8b, v5.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v22.8b, v6.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v23.8b, v7.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + + "rev64 v0.8b, v8.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v1.8b, v9.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v2.8b, v10.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 
v3.8b, v11.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + + "rev64 v4.8b, v12.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v5.8b, v13.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v6.8b, v14.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v7.8b, v15.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + + "st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[outptr0]] \n" // 00 10 20 30 04 14 24 34 + "st4 {v20.8b, v21.8b, v22.8b, v23.8b}, [%[outptr1]] \n" // 02 12 22 32 + "st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[outptr2]] \n" // 01 + // 11 + // 21 + // 31 + "st4 {v4.8b, v5.8b, v6.8b, v7.8b}, [%[outptr3]] \n" // 03 13 23 33 + + "sub %[outptr0], %[outptr0], %[stride_w] \n" //@ ptr - + // stride_w + "sub %[outptr1], %[outptr1], %[stride_w] \n" + "sub %[outptr2], %[outptr2], %[stride_w] \n" + "sub %[outptr3], %[outptr3], %[stride_w] \n" + + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), + [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3), + [stride_w] "+r"(stride_w) + : + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23"); +#else + asm volatile( + "vld4.8 {d0, d1, d2, d3}, [%[inptr0]]! @ zip load r0, d0 =00 01 " + "02 03 04 05 06 07\n" + "vld4.8 {d4, d5, d6, d7}, [%[inptr1]]! @ zip load r1, d2 =10 11 " + "12 13 14 15 16 17\n" + "vld4.8 {d8, d9, d10, d11}, [%[inptr2]]! @ zip load r1, d4 =20 " + "21 22 23 24 25 26 27\n" + "vld4.8 {d12, d13, d14, d15}, [%[inptr3]]! @ zip load r1, d6 = " + "30 31 32 33 34 35 36 37\n" + + "vrev64.8 d16, d0 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + "vrev64.8 d17, d1 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + "vrev64.8 d18, d2 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + "vrev64.8 d19, d3 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + + "vrev64.8 d20, d4 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + "vrev64.8 d21, d5 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + "vrev64.8 d22, d6 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + "vrev64.8 d23, d7 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + + "vrev64.8 d0, d8 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + "vrev64.8 d1, d9 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + "vrev64.8 d2, d10 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + "vrev64.8 d3, d11 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + + "vrev64.8 d4, d12 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + "vrev64.8 d5, d13 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + "vrev64.8 d6, d14 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + "vrev64.8 d7, d15 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + + "vst4.8 {d16, d17, d18, d19}, [%[outptr0]] @ write " + "d0(q0,low),r00,r10 20 30\n" + "vst4.8 {d20, d21, d22, d23}, [%[outptr1]] @ write " + "d4(q0,low),r01,r11 21 31\n" + "vst4.8 {d0, d1, d2, d3}, [%[outptr2]] @ write " + "d4(q0,low),r01,r11 21 31\n" + "vst4.8 {d4, d5, d6, d7}, [%[outptr3]] @ write " + "d4(q0,low),r01,r11 21 31\n" + + "sub %[outptr0], %[stride_w] @ ptr - stride_w \n" + "sub %[outptr1], %[stride_w] @ ptr - stride_w \n" + "sub %[outptr2], %[stride_w] @ ptr - stride_w \n" + "sub %[outptr3], %[stride_w] @ ptr - stride_w \n" + + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), + [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3), + [stride_w] "+r"(stride_w) + : + : "q0", + "q1", + "q2", + "q3", + 
"q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12"); +#endif + } + outptr3 += stride_w - 4; + outptr2 += stride_w - 4; + outptr1 += stride_w - 4; + outptr0 += stride_w - 4; + for (; j < w_in; j++) { + if (i + 3 >= h_in) { + switch ((i + 3) - h_in) { + case 0: + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + outptr2 -= 8; + case 1: + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + outptr1 -= 8; + case 2: + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + outptr0 -= 8; + case 3: + // inptr3 = zerobuff; + default: + break; + } + } else { + *outptr3++ = *inptr3++; + *outptr3++ = *inptr3++; + *outptr3++ = *inptr3++; + *outptr3++ = *inptr3++; + outptr3 -= 8; + + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + outptr2 -= 8; + + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + outptr1 -= 8; + + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + outptr0 -= 8; + } + } + } +} + +void flip_hwc4_xy(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { + int64_t stride_w = 32; + int win = w_in * 4; + uint8_t zerobuff[40000]; + memset(zerobuff, 0, win * sizeof(uint8_t)); + uint8_t zerobuff2[40000]; + memset(zerobuff2, 0, win * sizeof(uint8_t)); +#pragma omp parallel for + for (int i = 0; i < h_in; i += 4) { + const uint8_t* inptr0 = src + i * win; + const uint8_t* inptr1 = inptr0 + win; + const uint8_t* inptr2 = inptr1 + win; + const uint8_t* inptr3 = inptr2 + win; + + uint8_t* outptr0 = dst + (h_in - i) * win - stride_w; // last col + uint8_t* outptr1 = outptr0 - win; + uint8_t* outptr2 = outptr1 - win; + uint8_t* outptr3 = outptr2 - win; + if (i + 3 >= h_in) { + switch ((i + 3) - h_in) { + case 3: + inptr0 = zerobuff; + outptr0 = zerobuff2; + case 2: + inptr1 = zerobuff; + outptr1 = zerobuff2; + case 1: + inptr2 = zerobuff; + outptr2 = zerobuff2; + case 0: + inptr3 = zerobuff; + outptr3 = zerobuff2; + default: + break; + } + } + int j = 0; + for (; j < w_in - 7; j += 8) { +#ifdef __aarch64__ + asm volatile( + "ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[inptr0]], #32 \n" // v0={00,01,02, + // 03, + // 04, + // 05, + // 06, + // 07}" + "ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [%[inptr1]], #32 \n" // v0={10,11,12, + // 13, + // 14, + // 15, + // 16, + // 17}" + "ld4 {v8.8b, v9.8b, v10.8b, v11.8b}, [%[inptr2]], #32 \n" // v0={20,21,22, + // 23, + // 24, + // 25, + // 26, + // 27}" + "ld4 {v12.8b, v13.8b, v14.8b, v15.8b}, [%[inptr3]], #32 \n" // v0={30,31,32, + // 33, + // 34, + // 35, + // 36, + // 37}" + + "rev64 v16.8b, v0.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 b + "rev64 v17.8b, v1.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 g + "rev64 v18.8b, v2.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 r + "rev64 v19.8b, v3.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + + "rev64 v20.8b, v4.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v21.8b, v5.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v22.8b, v6.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v23.8b, v7.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + + "rev64 v0.8b, v8.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v1.8b, v9.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v2.8b, v10.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v3.8b, v11.8b 
\n" //@ reverse 07 06 05 04 03 + // 02 01 00 + + "rev64 v4.8b, v12.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v5.8b, v13.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v6.8b, v14.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v7.8b, v15.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + + "st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[outptr0]] \n" // 00 10 20 30 04 14 24 34 + "st4 {v20.8b, v21.8b, v22.8b, v23.8b}, [%[outptr1]] \n" // 02 12 22 32 + "st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[outptr2]] \n" // 01 + // 11 + // 21 + // 31 + "st4 {v4.8b, v5.8b, v6.8b, v7.8b}, [%[outptr3]] \n" // 03 + // 13 + // 23 + // 33 + + "sub %[outptr0], %[outptr0], %[stride_w] \n" //@ ptr - + // stride_w + "sub %[outptr1], %[outptr1], %[stride_w] \n" + "sub %[outptr2], %[outptr2], %[stride_w] \n" + "sub %[outptr3], %[outptr3], %[stride_w] \n" + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), + [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3), + [stride_w] "+r"(stride_w) + : + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23"); +#else + asm volatile( + "vld4.8 {d0, d1, d2, d3}, [%[inptr0]]! @ zip load r0, d0 =00 01 " + "02 03 04 05 06 07\n" + "vld4.8 {d4, d5, d6, d7}, [%[inptr1]]! @ zip load r1, d2 =10 11 " + "12 13 14 15 16 17\n" + "vld4.8 {d8, d9, d10, d11}, [%[inptr2]]! @ zip load r1, d4 =20 " + "21 22 23 24 25 26 27\n" + "vld4.8 {d12, d13, d14, d15}, [%[inptr3]]! @ zip load r1, d6 = " + "30 31 32 33 34 35 36 37\n" + + "vrev64.8 d16, d0 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + "vrev64.8 d17, d1 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + "vrev64.8 d18, d2 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + "vrev64.8 d19, d3 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + + "vrev64.8 d20, d4 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + "vrev64.8 d21, d5 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + "vrev64.8 d22, d6 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + "vrev64.8 d23, d7 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + + "vrev64.8 d0, d8 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + "vrev64.8 d1, d9 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + "vrev64.8 d2, d10 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + "vrev64.8 d3, d11 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + + "vrev64.8 d4, d12 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + "vrev64.8 d5, d13 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + "vrev64.8 d6, d14 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + "vrev64.8 d7, d15 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + + "vst4.8 {d16, d17, d18, d19}, [%[outptr0]] @ write " + "d0(q0,low),r00,r10 20 30\n" + "vst4.8 {d20, d21, d22, d23}, [%[outptr1]] @ write " + "d4(q0,low),r01,r11 21 31\n" + "vst4.8 {d0, d1, d2, d3}, [%[outptr2]] @ write " + "d4(q0,low),r01,r11 21 31\n" + "vst4.8 {d4, d5, d6, d7}, [%[outptr3]] @ write " + "d4(q0,low),r01,r11 21 31\n" + + "sub %[outptr0], %[stride_w] @ ptr - stride_w \n" + "sub %[outptr1], %[stride_w] @ ptr - stride_w \n" + "sub %[outptr2], %[stride_w] @ ptr - stride_w \n" + "sub %[outptr3], %[stride_w] @ ptr - stride_w \n" + + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), + [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3), + [stride_w] "+r"(stride_w) + : + : "q0", + "q1", + "q2", + "q3", + 
"q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12"); +#endif + } + outptr3 += stride_w - 4; + outptr2 += stride_w - 4; + outptr1 += stride_w - 4; + outptr0 += stride_w - 4; + for (; j < w_in; j++) { + if (i + 3 >= h_in) { + switch ((i + 3) - h_in) { + case 0: + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + outptr2 -= 8; + case 1: + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + outptr1 -= 8; + case 2: + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + outptr0 -= 8; + case 3: + // inptr3 = zerobuff; + default: + break; + } + } else { + *outptr3++ = *inptr3++; + *outptr3++ = *inptr3++; + *outptr3++ = *inptr3++; + *outptr3++ = *inptr3++; + outptr3 -= 8; + + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + outptr2 -= 8; + + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + outptr1 -= 8; + + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + outptr0 -= 8; + } + } + } +} +} // namespace cv +} // namespace utils +} // namespace lite +} // namespace paddle diff --git a/lite/utils/cv/image_flip.h b/lite/utils/cv/image_flip.h new file mode 100644 index 0000000000000000000000000000000000000000..5e513324a179423ec1d008d6e6cd33d29a79c095 --- /dev/null +++ b/lite/utils/cv/image_flip.h @@ -0,0 +1,33 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "lite/utils/cv/paddle_image_preprocess.h" +namespace paddle { +namespace lite { +namespace utils { +namespace cv { +void flip_hwc1( + const uint8_t* src, uint8_t* dst, int srcw, int srch, FlipParam flip_param); +void flip_hwc3( + const uint8_t* src, uint8_t* dst, int srcw, int srch, FlipParam flip_param); +void flip_hwc4( + const uint8_t* src, uint8_t* dst, int srcw, int srch, FlipParam flip_param); +} // namespace cv +} // namespace utils +} // namespace lite +} // namespace paddle diff --git a/lite/utils/cv/image_resize.cc b/lite/utils/cv/image_resize.cc new file mode 100644 index 0000000000000000000000000000000000000000..8b0b8aa17d3ced769c7ff606e9ba5fe78208b3d7 --- /dev/null +++ b/lite/utils/cv/image_resize.cc @@ -0,0 +1,364 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// +// ncnn license +// Tencent is pleased to support the open source community by making ncnn +// available. +// +// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this +// file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "lite/utils/cv/image_resize.h" +#include +#include +#include +namespace paddle { +namespace lite { +namespace utils { +namespace cv { +void compute_xy(int srcw, + int srch, + int dstw, + int dsth, + double scale_x, + double scale_y, + int* xofs, + int* yofs, + int16_t* ialpha, + int16_t* ibeta); +// use bilinear method to resize +void resize(const uint8_t* src, + uint8_t* dst, + ImageFormat srcFormat, + int srcw, + int srch, + int dstw, + int dsth) { + int size = srcw * srch; + if (srcw == dstw && srch == dsth) { + if (srcFormat == NV12 || srcFormat == NV21) { + size = srcw * (floor(1.5 * srch)); + } else if (srcFormat == BGR || srcFormat == RGB) { + size = 3 * srcw * srch; + } else if (srcFormat == BGRA || srcFormat == RGBA) { + size = 4 * srcw * srch; + } + memcpy(dst, src, sizeof(uint8_t) * size); + return; + } + double scale_x = static_cast(srcw / dstw); + double scale_y = static_cast(srch / dsth); + + int* buf = new int[dstw * 2 + dsth * 2]; + + int* xofs = buf; + int* yofs = buf + dstw; + int16_t* ialpha = reinterpret_cast(buf + dstw + dsth); + int16_t* ibeta = reinterpret_cast(buf + 2 * dstw + dsth); + + compute_xy( + srcw, srch, dstw, dsth, scale_x, scale_y, xofs, yofs, ialpha, ibeta); + + int w_out = dstw; + int w_in = srcw; + int num = 1; + int orih = dsth; + if (srcFormat == GRAY) { + num = 1; + } else if (srcFormat == NV12 || srcFormat == NV21) { + num = 1; + int hout = static_cast(0.5 * dsth); + dsth += hout; + } else if (srcFormat == BGR || srcFormat == RGB) { + w_in = srcw * 3; + w_out = dstw * 3; + num = 3; + + } else if (srcFormat == BGRA || srcFormat == RGBA) { + w_in = srcw * 4; + w_out = dstw * 4; + num = 4; + } + + int* xofs1 = nullptr; + int* yofs1 = nullptr; + int16_t* ialpha1 = nullptr; + if (orih < dsth) { // uv + int tmp = dsth - orih; + int w = dstw / 2; + xofs1 = new int[w]; + yofs1 = new int[tmp]; + ialpha1 = new int16_t[srcw]; + compute_xy(srcw / 2, + srch / 2, + w, + tmp, + scale_x, + scale_y, + xofs1, + yofs1, + ialpha1, + ibeta + orih); + } + int cnt = w_out >> 3; + int remain = w_out % 8; + int32x4_t _v2 = vdupq_n_s32(2); +#pragma omp parallel for + for (int dy = 0; dy < dsth; dy++) { + int16_t* rowsbuf0 = new int16_t[w_out]; + int16_t* rowsbuf1 = new int16_t[w_out]; + int sy = yofs[dy]; + if (dy >= orih) { + xofs = xofs1; + yofs = yofs1; + ialpha = ialpha1; + } + if (sy < 0) { + memset(rowsbuf0, 0, sizeof(uint16_t) * w_out); + const uint8_t* S1 = src + srcw * (sy + 1); + const int16_t* ialphap = ialpha; + int16_t* rows1p = rowsbuf1; + for (int dx = 0; dx < dstw; dx++) { + int sx = xofs[dx] * num; // num = 4 + int16_t a0 = ialphap[0]; + int16_t a1 = ialphap[1]; + + const uint8_t* S1pl = S1 + sx; + 
const uint8_t* S1pr = S1 + sx + num; + if (sx < 0) { + S1pl = S1; + } + for (int i = 0; i < num; i++) { + if (sx < 0) { + *rows1p++ = ((*S1pl++) * a1) >> 4; + } else { + *rows1p++ = ((*S1pl++) * a0 + (*S1pr++) * a1) >> 4; + } + } + ialphap += 2; + } + } else { + // hresize two rows + const uint8_t* S0 = src + w_in * (sy); + const uint8_t* S1 = src + w_in * (sy + 1); + const int16_t* ialphap = ialpha; + int16_t* rows0p = rowsbuf0; + int16_t* rows1p = rowsbuf1; + for (int dx = 0; dx < dstw; dx++) { + int sx = xofs[dx] * num; // num = 4 + int16_t a0 = ialphap[0]; + int16_t a1 = ialphap[1]; + + const uint8_t* S0pl = S0 + sx; + const uint8_t* S0pr = S0 + sx + num; + const uint8_t* S1pl = S1 + sx; + const uint8_t* S1pr = S1 + sx + num; + if (sx < 0) { + S0pl = S0; + S1pl = S1; + } + for (int i = 0; i < num; i++) { + if (sx < 0) { + *rows0p = ((*S0pl++) * a1) >> 4; + *rows1p = ((*S1pl++) * a1) >> 4; + rows0p++; + rows1p++; + } else { + *rows0p++ = ((*S0pl++) * a0 + (*S0pr++) * a1) >> 4; + *rows1p++ = ((*S1pl++) * a0 + (*S1pr++) * a1) >> 4; + } + } + ialphap += 2; + } + } + int ind = dy * 2; + int16_t b0 = ibeta[ind]; + int16_t b1 = ibeta[ind + 1]; + int16x8_t _b0 = vdupq_n_s16(b0); + int16x8_t _b1 = vdupq_n_s16(b1); + uint8_t* dp_ptr = dst + dy * w_out; + int16_t* rows0p = rowsbuf0; + int16_t* rows1p = rowsbuf1; + int re_cnt = cnt; + if (re_cnt > 0) { +#ifdef __aarch64__ + asm volatile( + "1: \n" + "ld1 {v0.8h}, [%[rows0p]], #16 \n" + "ld1 {v1.8h}, [%[rows1p]], #16 \n" + "orr v6.16b, %w[_v2].16b, %w[_v2].16b \n" + "orr v7.16b, %w[_v2].16b, %w[_v2].16b \n" + "smull v2.4s, v0.4h, %w[_b0].4h \n" + "smull2 v4.4s, v0.8h, %w[_b0].8h \n" + "smull v3.4s, v1.4h, %w[_b1].4h \n" + "smull2 v5.4s, v1.8h, %w[_b1].8h \n" + + "ssra v6.4s, v2.4s, #16 \n" + "ssra v7.4s, v4.4s, #16 \n" + "ssra v6.4s, v3.4s, #16 \n" + "ssra v7.4s, v5.4s, #16 \n" + + "shrn v0.4h, v6.4s, #2 \n" + "shrn2 v0.8h, v7.4s, #2 \n" + "subs %w[cnt], %w[cnt], #1 \n" + "sqxtun v1.8b, v0.8h \n" + "st1 {v1.8b}, [%[dp]], #8 \n" + "bne 1b \n" + : [rows0p] "+r"(rows0p), + [rows1p] "+r"(rows1p), + [cnt] "+r"(re_cnt), + [dp] "+r"(dp_ptr) + : [_b0] "w"(_b0), [_b1] "w"(_b1), [_v2] "w"(_v2) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +#else + asm volatile( + "mov r4, #2 \n" + "vdup.s32 q12, r4 \n" + "0: \n" + "vld1.s16 {d2-d3}, [%[rows0p]]!\n" + "vld1.s16 {d6-d7}, [%[rows1p]]!\n" + "vorr.s32 q10, q12, q12 \n" + "vorr.s32 q11, q12, q12 \n" + + "vmull.s16 q0, d2, %[_b0] \n" + "vmull.s16 q1, d3, %[_b0] \n" + "vmull.s16 q2, d6, %[_b1] \n" + "vmull.s16 q3, d7, %[_b1] \n" + + "vsra.s32 q10, q0, #16 \n" + "vsra.s32 q11, q1, #16 \n" + "vsra.s32 q10, q2, #16 \n" + "vsra.s32 q11, q3, #16 \n" + + "vshrn.s32 d20, q10, #2 \n" + "vshrn.s32 d21, q11, #2 \n" + "subs %[cnt], #1 \n" + "vqmovun.s16 d20, q10 \n" + "vst1.8 {d20}, [%[dp]]! 
\n" + "bne 0b \n" + : [rows0p] "+r"(rows0p), + [rows1p] "+r"(rows1p), + [cnt] "+r"(re_cnt), + [dp] "+r"(dp_ptr) + : [_b0] "w"(_b0), [_b1] "w"(_b1) + : "cc", + "memory", + "r4", + "q0", + "q1", + "q2", + "q3", + "q8", + "q9", + "q10", + "q11", + "q12"); + +#endif // __aarch64__ + } + for (int i = 0; i < remain; i++) { + // D[x] = (rows0[x]*b0 + rows1[x]*b1) >> + // INTER_RESIZE_COEF_BITS; + *dp_ptr++ = + (uint8_t)(((int16_t)((b0 * (int16_t)(*rows0p++)) >> 16) + + (int16_t)((b1 * (int16_t)(*rows1p++)) >> 16) + 2) >> + 2); + } + } + delete[] buf; +} +// compute xofs, yofs, alpha, beta +void compute_xy(int srcw, + int srch, + int dstw, + int dsth, + double scale_x, + double scale_y, + int* xofs, + int* yofs, + int16_t* ialpha, + int16_t* ibeta) { + float fy = 0.f; + float fx = 0.f; + int sy = 0; + int sx = 0; + const int resize_coef_bits = 11; + const int resize_coef_scale = 1 << resize_coef_bits; +#define SATURATE_CAST_SHORT(X) \ + (int16_t)::std::min( \ + ::std::max(static_cast(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), \ + SHRT_MAX); + + for (int dx = 0; dx < dstw; dx++) { + fx = static_cast((dx + 0.5) * scale_x - 0.5); + sx = floor(fx); + fx -= sx; + + if (sx < 0) { + sx = 0; + fx = 0.f; + } + if (sx >= srcw - 1) { + sx = srcw - 2; + fx = 1.f; + } + + xofs[dx] = sx; + + float a0 = (1.f - fx) * resize_coef_scale; + float a1 = fx * resize_coef_scale; + + ialpha[dx * 2] = SATURATE_CAST_SHORT(a0); + ialpha[dx * 2 + 1] = SATURATE_CAST_SHORT(a1); + } + for (int dy = 0; dy < dsth; dy++) { + fy = static_cast((dy + 0.5) * scale_y - 0.5); + sy = floor(fy); + fy -= sy; + + if (sy < 0) { + sy = 0; + fy = 0.f; + } + if (sy >= srch - 1) { + sy = srch - 2; + fy = 1.f; + } + + yofs[dy] = sy; + + float b0 = (1.f - fy) * resize_coef_scale; + float b1 = fy * resize_coef_scale; + + ibeta[dy * 2] = SATURATE_CAST_SHORT(b0); + ibeta[dy * 2 + 1] = SATURATE_CAST_SHORT(b1); + } +#undef SATURATE_CAST_SHORT +} + +} // namespace cv +} // namespace utils +} // namespace lite +} // namespace paddle diff --git a/lite/utils/cv/image_resize.h b/lite/utils/cv/image_resize.h new file mode 100644 index 0000000000000000000000000000000000000000..e2e399f542c3b00eaf6a3b09f6315b38518f409f --- /dev/null +++ b/lite/utils/cv/image_resize.h @@ -0,0 +1,53 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// ncnn license +// Tencent is pleased to support the open source community by making ncnn +// available. +// +// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this +// file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#pragma once + +#include +#include +#include "lite/utils/cv/paddle_image_preprocess.h" +namespace paddle { +namespace lite { +namespace utils { +namespace cv { +void resize(const uint8_t* src, + uint8_t* dst, + ImageFormat srcFormat, + int srcw, + int srch, + int dstw, + int dsth); + +} // namespace cv +} // namespace utils +} // namespace lite +} // namespace paddle diff --git a/lite/utils/cv/image_rotate.cc b/lite/utils/cv/image_rotate.cc new file mode 100644 index 0000000000000000000000000000000000000000..04ba84076685f89c376203d69ea631afe03671ec --- /dev/null +++ b/lite/utils/cv/image_rotate.cc @@ -0,0 +1,1972 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/utils/cv/image_rotate.h" +#include +#include +namespace paddle { +namespace lite { +namespace utils { +namespace cv { +// gray +void rotate_hwc1_90( + const uint8_t* src, uint8_t* dst, int w_in, int h_in, int w_out, int h_out); +void rotate_hwc1_180( + const uint8_t* src, uint8_t* dst, int w_in, int h_in, int w_out, int h_out); +void rotate_hwc1_270( + const uint8_t* src, uint8_t* dst, int w_in, int h_in, int w_out, int h_out); + +// bgr rgb +void rotate_hwc3_90( + const uint8_t* src, uint8_t* dst, int w_in, int h_in, int w_out, int h_out); +void rotate_hwc3_180( + const uint8_t* src, uint8_t* dst, int w_in, int h_in, int w_out, int h_out); +void rotate_hwc3_270( + const uint8_t* src, uint8_t* dst, int w_in, int h_in, int w_out, int h_out); +// rgba bgra +void rotate_hwc4_90( + const uint8_t* src, uint8_t* dst, int w_in, int h_in, int w_out, int h_out); +void rotate_hwc4_180( + const uint8_t* src, uint8_t* dst, int w_in, int h_in, int w_out, int h_out); +void rotate_hwc4_270( + const uint8_t* src, uint8_t* dst, int w_in, int h_in, int w_out, int h_out); + +void rotate_hwc1( + const uint8_t* src, uint8_t* dst, int srcw, int srch, float degree) { + if (degree == 90) { + rotate_hwc1_90(src, dst, srcw, srch, srch, srcw); + } else if (degree == 180) { + rotate_hwc1_180(src, dst, srcw, srch, srcw, srch); + } else if (degree == 270) { + rotate_hwc1_270(src, dst, srcw, srch, srch, srcw); + } +} + +void rotate_hwc3( + const uint8_t* src, uint8_t* dst, int srcw, int srch, float degree) { + if (degree == 90) { + rotate_hwc3_90(src, dst, srcw, srch, srch, srcw); + } else if (degree == 180) { + rotate_hwc3_180(src, dst, srcw, srch, srcw, srch); + } else if (degree == 270) { + rotate_hwc3_270(src, dst, srcw, srch, srch, srcw); + } +} + +void rotate_hwc4( + const uint8_t* src, uint8_t* dst, int srcw, int srch, float degree) { + if (degree == 90) { + rotate_hwc4_90(src, dst, srcw, srch, srch, srcw); + } else if (degree == 180) { + rotate_hwc4_180(src, dst, srcw, srch, srcw, srch); + } else if (degree == 270) { + rotate_hwc4_270(src, dst, srcw, srch, srch, srcw); + } +} +#ifdef __aarch64__ + +#define INPUT_C1 \ + "ld1 {v0.8b}, [%[inptr0]] \n" \ + "ld1 
{v4.8b}, [%[inptr1]] \n" \ + "ld1 {v8.8b}, [%[inptr2]] \n" \ + "ld1 {v12.8b}, [%[inptr3]] \n" + +#define INPUT_C3 \ + "ld3 {v0.8b, v1.8b, v2.8b}, [%[inptr0]] \n" \ + "ld3 {v4.8b, v5.8b, v6.8b}, [%[inptr1]] \n" \ + "ld3 {v8.8b, v9.8b, v10.8b}, [%[inptr2]] \n" \ + "ld3 {v12.8b, v13.8b, v14.8b}, [%[inptr3]] \n" + +#define ADD_INPUT \ + "add %[inptr0], %[inptr0], %[stride_h] \n" \ + "add %[inptr1], %[inptr1], %[stride_h] \n" \ + "add %[inptr2], %[inptr2], %[stride_h] \n" \ + "add %[inptr3], %[inptr3], %[stride_h] \n" + +#define SUB_INPUT \ + "sub %[inptr0], %[inptr0], %[stride_h_w] \n" /* 4 - 4*w_in + 8 */ \ + "sub %[inptr1], %[inptr1], %[stride_h_w] \n" /* 5 - 4*w_in + 8 */ \ + "sub %[inptr2], %[inptr2], %[stride_h_w] \n" /* 6 - 4*w_in + 8 */ \ + "sub %[inptr3], %[inptr3], %[stride_h_w] \n" /* 7 - 4*w_in + 8 */ + +#define TRANS_C1_8 \ + "trn1 v1.8b, v0.8b, v4.8b \n" /* b v4=00 10 02 12 04 14 06 16*/ \ + "trn1 v5.8b, v8.8b, v12.8b \n" /* v4={20 30 22 32 24 34 26 36 */ \ + "trn2 v2.8b, v0.8b, v4.8b \n" /* v5={01 11 03 13 05 15 07 17 */ \ + "trn2 v6.8b, v8.8b, v12.8b \n" /* v7={21 31 23 33 25 35 27 37 */ + +#define TRANS_C1_16 \ + "trn1 v9.4h, v1.4h, v5.4h \n" \ + "trn1 v13.4h, v2.4h, v6.4h \n" /* v22=01 11 21 31 05 15 25 35 */ \ + "trn2 v10.4h, v1.4h, v5.4h \n" /* v21=02 12 22 32 06 16 26 36*/ \ + "trn2 v14.4h, v2.4h, v6.4h \n" /* v23=03 13 23 33 07 17 27 37 */ + +#define TRANS_C1 \ + "trn1 v0.4h, v1.4h, v5.4h \n" /* b v4=40 50 60 70 04 14 24 34 */ \ + "trn1 v8.4h, v2.4h, v6.4h \n" /* v4=41 11 21 31 05 15 25 35 */ \ + "trn2 v4.4h, v1.4h, v5.4h \n" /* v5=42 12 22 32 06 16 26 36*/ \ + "trn2 v12.4h, v2.4h, v6.4h \n" /* v7=43 13 23 33 07 17 27 37 */ \ + "trn1 v1.2s, v9.2s, v0.2s \n" /* b v1=00 10 20 30 40 50 60 */ \ + "trn1 v2.2s, v13.2s, v8.2s \n" /* v2= 01 11 21 31 41 51 61 71 */ \ + "trn1 v5.2s, v10.2s, v4.2s \n" /* b v5=02 12 22 32 42 52 62 */ \ + "trn1 v6.2s, v14.2s, v12.2s \n" /* v6=03 13 23 33 43 53 63 73*/ \ + \ + "trn2 v3.2s, v9.2s, v0.2s \n" /* v3=04 14 24 34 44 54 64 74*/ \ + "trn2 v7.2s, v13.2s, v8.2s \n" /* v7=05 15 25 35 45 55 65 75*/ \ + "trn2 v11.2s, v10.2s, v4.2s \n" /* v11=06 16 26 36 46 56 66 */ \ + "trn2 v15.2s, v14.2s, v12.2s \n" /* v15=07 17 27 37 47 57 67 */ + +#define REVERSE_C1 \ + "rev64 v0.8b, v1.8b \n" \ + "rev64 v4.8b, v2.8b \n" \ + "rev64 v8.8b, v5.8b \n" \ + "rev64 v12.8b, v6.8b \n" \ + \ + "rev64 v1.8b, v3.8b \n" \ + "rev64 v5.8b, v7.8b \n" \ + "rev64 v9.8b, v11.8b \n" \ + "rev64 v13.8b, v15.8b \n" + +#define STORE_C1_R \ + "st1 {v0.8b}, [%[outptr0]] \n" /* b v1=00 10 20 30 40 50 60*/ \ + "st1 {v4.8b}, [%[outptr1]] \n" /* v2=01 11 21 31 41 51 61 71*/ \ + "st1 {v8.8b}, [%[outptr2]] \n" /* b v5=02 12 22 32 42 52 62 */ \ + "st1 {v12.8b}, [%[outptr3]] \n" /* v6=03 13 23 33 43 53 63 73*/ \ + \ + "st1 {v1.8b}, [%[outptr4]] \n" /* v3=04 14 24 34 44 54 64 74}*/ \ + "st1 {v5.8b}, [%[outptr5]] \n" /* v7=05 15 25 35 45 55 65 75}*/ \ + "st1 {v9.8b}, [%[outptr6]] \n" /* v11=06 16 26 36 46 56 66 */ \ + "st1 {v13.8b}, [%[outptr7]] \n" /* v15=07 17 27 37 47 57 67 */ + +#define STORE_C1 \ + "st1 {v1.8b}, [%[outptr0]] \n" /* b v1=00 10 20 30 40 50 60 */ \ + "st1 {v2.8b}, [%[outptr1]] \n" /* v2=01 11 21 31 41 51 61 71*/ \ + "st1 {v5.8b}, [%[outptr2]] \n" /* b v5=02 12 22 32 42 52 62 */ \ + "st1 {v6.8b}, [%[outptr3]] \n" /* v6=03 13 23 33 43 53 63 73}*/ \ + \ + "st1 {v3.8b}, [%[outptr4]] \n" /* v3=04 14 24 34 44 54 64 74*/ \ + "st1 {v7.8b}, [%[outptr5]] \n" /* v7=05 15 25 35 45 55 65 75*/ \ + "st1 {v11.8b}, [%[outptr6]] \n" /* v11=06 16 26 36 46 56 66 */ \ + "st1 {v15.8b}, [%[outptr7]] 
\n" /* v15=07 17 27 37 47 57 67*/ + +#define TRANS_C3_8 \ + "trn1 v3.8b, v0.8b, v4.8b \n" /* b v4=00 10 02 12 04 14 06 16 */ \ + "trn1 v7.8b, v8.8b, v12.8b \n" /* v4=20 30 22 32 24 34 26 36 */ \ + "trn2 v11.8b, v0.8b, v4.8b \n" /* v5=01 11 03 13 05 15 07 17 */ \ + "trn2 v15.8b, v8.8b, v12.8b \n" /* v7=21 31 23 33 25 35 27 37*/ \ + \ + "trn1 v16.8b, v1.8b, v5.8b \n" \ + "trn1 v18.8b, v9.8b, v13.8b \n" /* v4=20 30 22 32 24 34 26 36 */ \ + "trn2 v17.8b, v1.8b, v5.8b \n" /* v5={01 11 03 13 05 15 07 17 */ \ + "trn2 v19.8b, v9.8b, v13.8b \n" /* v7=21 31 23 33 25 35 27 37 */ \ + \ + "trn1 v20.8b, v2.8b, v6.8b \n" \ + "trn1 v22.8b, v10.8b, v14.8b \n" \ + "trn2 v21.8b, v2.8b, v6.8b \n" /* v5=01 11 03 13 05 15 07 17 */ \ + "trn2 v23.8b, v10.8b, v14.8b \n" + +#define TRANS_C3_16 \ + "trn1 v24.4h, v3.4h, v7.4h \n" \ + "trn1 v26.4h, v11.4h, v15.4h \n" /* v4=01 11 21 31 05 15 25 35*/ \ + "trn2 v25.4h, v3.4h, v7.4h \n" /* v5=02 12 22 32 06 16 26 36*/ \ + "trn2 v27.4h, v11.4h, v15.4h \n" \ + \ + "trn1 v28.4h, v16.4h, v18.4h \n" /* g v4=00 10 20 30 04 14 24 */ \ + "trn1 v30.4h, v17.4h, v19.4h \n" \ + "trn2 v29.4h, v16.4h, v18.4h \n" /* v5=02 12 22 32 06 16 26 */ \ + "trn2 v31.4h, v17.4h, v19.4h \n" \ + \ + "trn1 v16.4h, v20.4h, v22.4h \n" /* r v4=00 10 20 30 04 14 24 */ \ + "trn1 v18.4h, v21.4h, v23.4h \n" \ + "trn2 v17.4h, v20.4h, v22.4h \n" /* v5=02 12 22 32 06 16 26 */ \ + "trn2 v19.4h, v21.4h, v23.4h \n" + +#define TRANS_C3 \ + "trn1 v3.8b, v0.8b, v4.8b \n" /* b v4=40 50 42 52 04 14 06 16 */ \ + "trn1 v7.8b, v8.8b, v12.8b \n" /* v4=60 70 62 72 24 34 26 36 */ \ + "trn2 v11.8b, v0.8b, v4.8b \n" /* v5=41 51 03 13 05 15 07 17 */ \ + "trn2 v15.8b, v8.8b, v12.8b \n" /* v7=61 71 23 33 25 35 27 37 */ \ + \ + "trn1 v20.8b, v2.8b, v6.8b \n" \ + "trn1 v22.8b, v10.8b, v14.8b \n" \ + "trn2 v21.8b, v2.8b, v6.8b \n" /* v5=41 51 03 13 05 15 07 17 */ \ + "trn2 v23.8b, v10.8b, v14.8b \n" \ + \ + "trn1 v0.4h, v3.4h, v7.4h \n" /* b v4=40 50 60 70 04 14 24 34 */ \ + "trn1 v4.4h, v11.4h, v15.4h \n" /* v4=41 51 61 71 05 15 25 35 */ \ + "trn2 v8.4h, v3.4h, v7.4h \n" /* v5=42 52 62 72 06 16 26 36*/ \ + "trn2 v12.4h, v11.4h, v15.4h \n" \ + \ + "trn1 v2.4h, v20.4h, v22.4h \n" /* r v4=40 50 60 70 */ \ + "trn1 v6.4h, v21.4h, v23.4h \n" /* v4=41 51 61 71 */ \ + "trn2 v10.4h, v20.4h, v22.4h \n" /* v5=42 52 62 72 */ \ + "trn2 v14.4h, v21.4h, v23.4h \n" /* v7=43 53 63 73 */ \ + \ + "trn1 v20.2s, v24.2s, v0.2s \n" \ + "trn1 v21.2s, v26.2s, v4.2s \n" /* v4=01 11 21 31 41 51 61 71 */ \ + "trn1 v22.2s, v25.2s, v8.2s \n" /* v5=02 12 22 32 42 52 62 72 */ \ + "trn1 v23.2s, v27.2s, v12.2s \n" \ + \ + "trn2 v3.2s, v24.2s, v0.2s \n" \ + "trn2 v7.2s, v26.2s, v4.2s \n" /* v4=05 11 21 31 41 51 61 71 */ \ + "trn2 v11.2s, v25.2s, v8.2s \n" /* v5=06 12 22 32 42 52 62 72 */ \ + "trn2 v15.2s, v27.2s, v12.2s \n" /* v7=07 13 23 33 43 53 63 */ \ + \ + "trn1 v0.2s, v16.2s, v2.2s \n" /* r v4=00 10 20 30 40 50 60 */ \ + "trn1 v4.2s, v18.2s, v6.2s \n" /* v4=01 11 21 31 41 51 61 71 */ \ + "trn1 v8.2s, v17.2s, v10.2s \n" /* v5=02 12 22 32 42 52 62 72 */ \ + "trn1 v12.2s, v19.2s, v14.2s \n" /* v7=03 13 23 33 43 53 63 */ \ + \ + "trn2 v24.2s, v16.2s, v2.2s \n" /* r v4=04 10 20 30 40 50 60 */ \ + "trn2 v25.2s, v18.2s, v6.2s \n" /* v4=05 11 21 31 41 51 61 71 */ \ + "trn2 v26.2s, v17.2s, v10.2s \n" /* v5=06 12 22 32 42 52 62 */ \ + "trn2 v27.2s, v19.2s, v14.2s \n" /* v7=07 13 23 33 43 53 63 */ \ + \ + "trn1 v16.8b, v1.8b, v5.8b \n" /* g v4={00 10 02 12 04 14 06 */ \ + "trn1 v18.8b, v9.8b, v13.8b \n" /* v4={20 30 22 32 24 34 26 */ \ + "trn2 v17.8b, v1.8b, v5.8b \n" 
/* v5={01 11 03 13 05 15 07 17 */ \ + "trn2 v19.8b, v9.8b, v13.8b \n" /* v7={21 31 23 33 25 35 27 */ \ + \ + "sub %[inptr0], %[inptr0], %[stride_h_w] \n" /* 4 - 4*w_in + 8 */ \ + "sub %[inptr1], %[inptr1], %[stride_h_w] \n" /* 5 - 4*w_in + 8 */ \ + "sub %[inptr2], %[inptr2], %[stride_h_w] \n" /* 6 - 4*w_in + 8 */ \ + "sub %[inptr3], %[inptr3], %[stride_h_w] \n" /* 7 - 4*w_in + 8 */ \ + \ + "trn1 v1.4h, v16.4h, v18.4h \n" /* g v4={00 10 20 30 04 14 24*/ \ + "trn1 v5.4h, v17.4h, v19.4h \n" /* v4={ 01 11 21 31 05 15 25 */ \ + "trn2 v9.4h, v16.4h, v18.4h \n" /* v5={02 12 22 32 06 16 26 36*/ \ + "trn2 v13.4h, v17.4h, v19.4h \n" /* v7={03 13 23 33 07 17 27 */ \ + \ + "trn1 v2.2s, v28.2s, v1.2s \n" /* g v4=00 10 20 30 40 50 60 */ \ + "trn1 v6.2s, v30.2s, v5.2s \n" /* v4=01 11 21 31 41 51 61 71 */ \ + "trn1 v10.2s, v29.2s, v9.2s \n" /* v5=02 12 22 32 42 52 62 72 */ \ + "trn1 v14.2s, v31.2s, v13.2s \n" /* v7=03 13 23 33 43 53 63 */ \ + \ + "trn2 v16.2s, v28.2s, v1.2s \n" /* g v4=04 10 20 30 40 50 60 */ \ + "trn2 v17.2s, v30.2s, v5.2s \n" /* v4=05 11 21 31 41 51 61 71 */ \ + "trn2 v18.2s, v29.2s, v9.2s \n" /* v5=06 12 22 32 42 52 62 72 */ \ + "trn2 v19.2s, v31.2s, v13.2s \n" /* v7=07 13 23 33 43 53 63 */ + +#define REVERSE_C3 \ + "rev64 v28.8b, v20.8b \n" /* b 00 10 20 30 40 50 60 70*/ \ + "rev64 v29.8b, v2.8b \n" /* g 00 10 20 30 40 50 60 70*/ \ + "rev64 v30.8b, v0.8b \n" /* r 00 10 20 30 40 50 60 70*/ \ + \ + "rev64 v0.8b, v21.8b \n" /* b 01 11 21 31 41 51 61 71 */ \ + "rev64 v1.8b, v6.8b \n" /* g 01 11 21 31 41 51 61 71 */ \ + "rev64 v2.8b, v4.8b \n" /* r 01 11 21 31 41 51 61 71 */ \ + \ + "rev64 v4.8b, v22.8b \n" /* b 02 12 22 32 42 52 62 72 */ \ + "rev64 v5.8b, v10.8b \n" /* g 02 12 22 32 42 52 62 72*/ \ + "rev64 v6.8b, v8.8b \n" /* r 02 12 22 32 42 52 62 72 */ \ + \ + "rev64 v8.8b, v23.8b \n" /* b 03 13 23 33 43 53 63 73 */ \ + "rev64 v9.8b, v14.8b \n" /* g 03 13 23 33 43 53 63 73 */ \ + "rev64 v10.8b, v12.8b \n" /* r 03 13 23 33 43 53 63 73 */ \ + \ + "rev64 v12.8b, v3.8b \n" /* b 04 14 20 30 40 50 60 70 */ \ + "rev64 v13.8b, v16.8b \n" /* g 04 14 20 30 40 50 60 70 */ \ + "rev64 v14.8b, v24.8b \n" /* r 04 14 20 30 40 50 60 70 */ \ + \ + "rev64 v20.8b, v7.8b \n" /* b 05 15 20 30 40 50 60 70 */ \ + "rev64 v21.8b, v17.8b \n" /* g 05 15 20 30 40 50 60 70 */ \ + "rev64 v22.8b, v25.8b \n" /* r 05 15 20 30 40 50 60 70 */ \ + \ + "rev64 v23.8b, v11.8b \n" /* b 06 15 20 30 40 50 60 70 */ \ + "rev64 v24.8b, v18.8b \n" /* g 06 15 20 30 40 50 60 70 */ \ + "rev64 v25.8b, v26.8b \n" /* r 06 15 20 30 40 50 60 70 */ \ + \ + "rev64 v16.8b, v15.8b \n" /* b 07 15 20 30 40 50 60 70 */ \ + "rev64 v17.8b, v19.8b \n" /* g 07 15 20 30 40 50 60 70 */ \ + "rev64 v18.8b, v27.8b \n" /* r 07 15 20 30 40 50 60 70 */ + +#define MOV_C3 \ + "mov v28.8b, v20.8b \n" /* b 00 10 20 30 40 50 60 70*/ \ + "mov v29.8b, v2.8b \n" /* g 00 10 20 30 40 50 60 70*/ \ + "mov v30.8b, v0.8b \n" /* r 00 10 20 30 40 50 60 70*/ \ + \ + "mov v0.8b, v21.8b \n" /* b 01 11 21 31 41 51 61 71 */ \ + "mov v1.8b, v6.8b \n" /* g 01 11 21 31 41 51 61 71 */ \ + "mov v2.8b, v4.8b \n" /* r 01 11 21 31 41 51 61 71 */ \ + \ + "mov v4.8b, v22.8b \n" /* b 02 12 22 32 42 52 62 72 */ \ + "mov v5.8b, v10.8b \n" /* g 02 12 22 32 42 52 62 72*/ \ + "mov v6.8b, v8.8b \n" /* r 02 12 22 32 42 52 62 72 */ \ + \ + "mov v8.8b, v23.8b \n" /* b 03 13 23 33 43 53 63 73 */ \ + "mov v9.8b, v14.8b \n" /* g 03 13 23 33 43 53 63 73 */ \ + "mov v10.8b, v12.8b \n" /* r 03 13 23 33 43 53 63 73 */ \ + \ + "mov v12.8b, v3.8b \n" /* b 04 14 20 30 40 50 60 70 */ \ + "mov v13.8b, v16.8b 
\n" /* g 04 14 20 30 40 50 60 70 */ \ + "mov v14.8b, v24.8b \n" /* r 04 14 20 30 40 50 60 70 */ \ + \ + "mov v20.8b, v7.8b \n" /* b 05 15 20 30 40 50 60 70 */ \ + "mov v21.8b, v17.8b \n" /* g 05 15 20 30 40 50 60 70 */ \ + "mov v22.8b, v25.8b \n" /* r 05 15 20 30 40 50 60 70 */ \ + \ + "mov v23.8b, v11.8b \n" /* b 06 15 20 30 40 50 60 70 */ \ + "mov v24.8b, v18.8b \n" /* g 06 15 20 30 40 50 60 70 */ \ + "mov v25.8b, v26.8b \n" /* r 06 15 20 30 40 50 60 70 */ \ + \ + "mov v16.8b, v15.8b \n" /* b 07 15 20 30 40 50 60 70 */ \ + "mov v17.8b, v19.8b \n" /* g 07 15 20 30 40 50 60 70 */ \ + "mov v18.8b, v27.8b \n" /* r 07 15 20 30 40 50 60 70 */ + +#define STORE_C3 \ + "st3 {v28.8b, v29.8b, v30.8b}, [%[outptr0]] \n" \ + "st3 {v0.8b, v1.8b, v2.8b}, [%[outptr1]] \n" \ + "st3 {v4.8b, v5.8b, v6.8b}, [%[outptr2]] \n" \ + "st3 {v8.8b, v9.8b, v10.8b}, [%[outptr3]] \n" \ + \ + "st3 {v12.8b, v13.8b, v14.8b}, [%[outptr4]] \n" \ + "st3 {v20.8b, v21.8b, v22.8b}, [%[outptr5]] \n" \ + "st3 {v23.8b, v24.8b, v25.8b}, [%[outptr6]] \n" \ + "st3 {v16.8b, v17.8b, v18.8b}, [%[outptr7]] \n" + +#else + +#define INPUT_C1 \ + "vld1.8 {d0}, [%[inptr0]] @ zip load r0, d0 =00 01 02 03 04 05 06 07\n" \ + "vld1.8 {d4}, [%[inptr1]] @ zip load r1, d2 =10 11 12 13 14 15 16 17\n" \ + "vld1.8 {d8}, [%[inptr2]] @ zip load r1, d4 =20 21 22 23 24 25 26 27\n" \ + "vld1.8 {d12}, [%[inptr3]] @ zip load r1, d6 = 30 31 32 33 34 35 36 37\n" + +#define INPUT_C3 \ + "vld3.8 {d0, d1, d2}, [%[inptr0]] @ zip load r0, d0 =00 01 02 03 04 05 " \ + "06 07\n" \ + "vld3.8 {d4, d5, d6}, [%[inptr1]] @ zip load r1, d2 =10 11 12 13 14 15 " \ + "16 17\n" \ + "vld3.8 {d8, d9, d10}, [%[inptr2]] @ zip load r1, d4 =20 21 22 23 24 25 " \ + "26 27\n" \ + "vld3.8 {d12, d13, d14}, [%[inptr3]] @ zip load r1, d6 = 30 31 32 33 34 " \ + "35 36 37\n" + +#define ADD_INPUT \ + "add %[inptr0], %[inptr0], %[stride_h] \n" \ + "add %[inptr1], %[inptr1], %[stride_h] \n" \ + "add %[inptr2], %[inptr2], %[stride_h] \n" \ + "add %[inptr3], %[inptr3], %[stride_h] \n" + +#define SUB_INPUT \ + "sub %[inptr0], %[inptr0], %[stride_h_w] \n" \ + "sub %[inptr1], %[inptr1], %[stride_h_w] \n" \ + "sub %[inptr2], %[inptr2], %[stride_h_w] \n" \ + "sub %[inptr3], %[inptr3], %[stride_h_w] \n" + +#define TRANS_C1 \ + "vtrn.8 d0, d4 @ trans data: \n" /* d0=00 10 02 12 04 14 06 16 */ \ + "vtrn.8 d8, d12 @ trans data: \n" /* d8=20 30 12 32 24 34 26 36 */ \ + \ + "vld1.8 {d1}, [%[inptr0]] @ zip load r0, d0 =00 01 02 03 04 05 06 07\n" \ + "vld1.8 {d5}, [%[inptr1]] @ zip load r1, d2 =10 11 12 13 14 15 16 17\n" \ + "vld1.8 {d9}, [%[inptr2]] @ zip load r1, d4 =20 21 22 23 24 25 26 27\n" \ + "vld1.8 {d13}, [%[inptr3]] @ zip load r1, d6 = 30 31 32 33 34 35 36 37\n" \ + \ + "vtrn.16 d0, d8 @ trans data: \n" /* d0=00 10 20 30 04 14 24 34 */ \ + "vtrn.16 d4, d12 @ trans data:\n" /* d4=01 11 21 31 05 15 25 35 */ \ + \ + "vtrn.8 d1, d5 @ trans data: \n" /* d0=40 50 42 52 04 14 06 16 */ \ + "vtrn.8 d9, d13 @ trans data: \n" /* d8=60 70 62 72 24 34 26 36 */ \ + \ + "sub %[inptr0], %[inptr0], %[stride_h_w] \n" \ + "sub %[inptr1], %[inptr1], %[stride_h_w] \n" \ + "sub %[inptr2], %[inptr2], %[stride_h_w] \n" \ + "sub %[inptr3], %[inptr3], %[stride_h_w] \n" \ + \ + "vtrn.16 d1, d5 @ trans data: \n" /* d0=40 50 60 70 04 14 24 34 */ \ + "vtrn.16 d9, d13 @ trans data:\n" /* d4=41 51 61 71 05 15 25 35 */ \ + \ + "vtrn.32 d0, d1 @ trans data: \n" \ + "vtrn.32 d8, d9 @ trans data: \n" \ + "vtrn.32 d4, d5 @ trans data: \n" \ + "vtrn.32 d12, d13 @ trans data: \n" + +#define REVERSE_C1 \ + "vrev64.8 d2, d0 @reverse 7 6 5 4 
3 2 1 \n" \ + "vrev64.8 d3, d1 @reverse 7 6 5 4 3 2 1 \n" \ + "vrev64.8 d10, d8 @reverse 7 6 5 4 3 2 1 \n" \ + "vrev64.8 d11, d9 @reverse 7 6 5 4 3 2 1 \n" \ + "vrev64.8 d6, d4 @reverse 7 6 5 4 3 2 1 \n" \ + "vrev64.8 d7, d5 @reverse 7 6 5 4 3 2 1 \n" \ + "vrev64.8 d14, d12 @reverse 7 6 5 4 3 2 1 \n" \ + "vrev64.8 d15, d13 @reverse 7 6 5 4 3 2 1 \n" + +#define ADD_OUTPUT \ + "add %[outptr0], %[outptr0], %[stride_out] \n" \ + "add %[outptr2], %[outptr2], %[stride_out] \n" \ + "add %[outptr1], %[outptr1], %[stride_out] \n" \ + "add %[outptr3], %[outptr3], %[stride_out] \n" + +#define SUB_OUTPUT \ + "sub %[outptr0], %[outptr0], %[stride_out] \n" \ + "sub %[outptr2], %[outptr2], %[stride_out] \n" \ + "sub %[outptr1], %[outptr1], %[stride_out] \n" \ + "sub %[outptr3], %[outptr3], %[stride_out] \n" + +#define STORE_C1_4 \ + "vst1.8 {d0}, [%[outptr0]] @ write d0(q0,low),r00,r10 20 30\n" \ + "vst1.8 {d8}, [%[outptr2]] @ write d0(q0,low),r00,r10 20 30\n" \ + "vst1.8 {d4}, [%[outptr1]] @ write d0(q0,low),r00,r10 20 30\n" \ + "vst1.8 {d12}, [%[outptr3]] @ write d0(q0,low),r00,r10 20 30\n" + +#define STORE_C1_8 \ + "vst1.8 {d1}, [%[outptr0]] @ write d0(q0,low),r00,r10 20 30\n" \ + "vst1.8 {d9}, [%[outptr2]] @ write d0(q0,low),r00,r10 20 30\n" \ + "vst1.8 {d5}, [%[outptr1]] @ write d0(q0,low),r00,r10 20 30\n" \ + "vst1.8 {d13}, [%[outptr3]] @ write d0(q0,low),r00,r10 20 30\n" + +#define STORE_C1_R_4 \ + "vst1.8 {d2}, [%[outptr0]] @ write d0(q0,low),r00,r10 20 30\n" \ + "vst1.8 {d10}, [%[outptr2]] @ write d0(q0,low),r00,r10 20 30\n" \ + "vst1.8 {d6}, [%[outptr1]] @ write d0(q0,low),r00,r10 20 30\n" \ + "vst1.8 {d14}, [%[outptr3]] @ write d0(q0,low),r00,r10 20 30\n" + +#define STORE_C1_R_8 \ + "vst1.8 {d3}, [%[outptr0]] @ write d0(q0,low),r00,r10 20 30\n" \ + "vst1.8 {d11}, [%[outptr2]] @ write d0(q0,low),r00,r10 20 30\n" \ + "vst1.8 {d7}, [%[outptr1]] @ write d0(q0,low),r00,r10 20 30\n" \ + "vst1.8 {d15}, [%[outptr3]] @ write d0(q0,low),r00,r10 20 30\n" + +#define TRANS_C3 \ + "vtrn.8 d0, d4 @ trans data: \n" \ + "vtrn.8 d8, d12 @ trans data: \n" \ + "vtrn.8 d1, d5 @ trans data: \n" \ + "vtrn.8 d9, d13 @ trans data: \n" \ + "vtrn.8 d2, d6 @ trans data: \n" \ + "vtrn.8 d10, d14 @ trans data: \n" \ + \ + "vld3.8 {d16, d17, d18}, [%[inptr0]] @ zip load r0, d0 =40 01 02 03 04 " \ + "05 06 07\n" \ + "vld3.8 {d20, d21, d22}, [%[inptr1]] @ zip load r1, d2 =50 11 12 13 14 " \ + "15 16 17\n" \ + "vld3.8 {d24, d25, d26}, [%[inptr2]] @ zip load r1, d4 =60 21 22 23 24 " \ + "25 26 27\n" \ + "vld3.8 {d28, d29, d30}, [%[inptr3]] @ zip load r1, d6 =70 31 32 33 34 " \ + "35 36 37\n" \ + \ + "vtrn.16 d0, d8 @ trans data: \n" \ + "vtrn.16 d4, d12 @ trans data:\n" \ + "vtrn.16 d1, d9 @ trans data: \n" \ + "vtrn.16 d5, d13 @ trans data:\n" \ + "vtrn.16 d2, d10 @ trans data: \n" \ + "vtrn.16 d6, d14 @ trans data:\n" \ + \ + "vtrn.8 d16, d20 @ trans data: \n" \ + "vtrn.8 d24, d28 @ trans data: \n" \ + "vtrn.8 d17, d21 @ trans data: \n" \ + "vtrn.8 d25, d29 @ trans data: \n" \ + "vtrn.8 d18, d22 @ trans data: \n" \ + "vtrn.8 d26, d30 @ trans data: \n" \ + \ + "sub %[inptr0], %[inptr0], %[stride_h_w] \n" \ + "sub %[inptr1], %[inptr1], %[stride_h_w] \n" \ + "sub %[inptr2], %[inptr2], %[stride_h_w] \n" \ + "sub %[inptr3], %[inptr3], %[stride_h_w] \n" \ + \ + "vtrn.16 d16, d24 @ trans data: \n" \ + "vtrn.16 d20, d28 @ trans data: \n" \ + "vtrn.16 d17, d25 @ trans data: \n" \ + "vtrn.16 d21, d29 @ trans data: \n" \ + "vtrn.16 d18, d26 @ trans data: \n" \ + "vtrn.16 d22, d30 @ trans data: \n" \ + \ + "vtrn.32 d0, d16 @ trans data: 
\n" \ + "vtrn.32 d8, d24 @ trans data: \n" \ + "vtrn.32 d4, d20 @ trans data: \n" \ + "vtrn.32 d12, d28 @ trans data: \n" \ + \ + "vtrn.32 d1, d17 @ trans data: \n" \ + "vtrn.32 d9, d25 @ trans data: \n" \ + "vtrn.32 d5, d21 @ trans data: \n" \ + "vtrn.32 d13, d29 @ trans data: \n" \ + \ + "vtrn.32 d2, d18 @ trans data: \n" \ + "vtrn.32 d10, d26 @ trans data: \n" \ + "vtrn.32 d6, d22 @ trans data: \n" \ + "vtrn.32 d14, d30 @ trans data: \n" + +#define STORE_C3_4 \ + "vst3.8 {d0, d1, d2}, [%[outptr0]] \n" \ + "vst3.8 {d4, d5, d6}, [%[outptr1]] \n" \ + "vst3.8 {d8, d9, d10}, [%[outptr2]] \n" \ + "vst3.8 {d12, d13, d14}, [%[outptr3]] \n" + +#define STORE_C3_8 \ + "vst3.8 {d16, d17, d18}, [%[outptr0]] \n" \ + "vst3.8 {d20, d21, d22}, [%[outptr1]] \n" \ + "vst3.8 {d24, d25, d26}, [%[outptr2]] \n" \ + "vst3.8 {d28, d29, d30}, [%[outptr3]] \n" + +#define REVERSE_C3 \ + "vrev64.8 d3, d0 \n" /* b 00*/ \ + "vrev64.8 d7, d4 \n" /* b 01*/ \ + "vrev64.8 d15, d5 \n" /* g 01*/ \ + "vrev64.8 d11, d8 \n" /* b 02*/ \ + "vrev64.8 d4, d1 \n" /* g 00*/ \ + "vrev64.8 d5, d2 \n" /* r 00*/ \ + \ + "vrev64.8 d0, d12 \n" /* b 03*/ \ + "vrev64.8 d1, d13 \n" /* g 03*/ \ + "vrev64.8 d2, d14 \n" /* r 03*/ \ + \ + "vrev64.8 d12, d9 \n" /* g 02*/ \ + "vrev64.8 d13, d10 \n" /* r 02*/ \ + \ + "vmov d8, d15 \n" /* g 01*/ \ + "vrev64.8 d9, d6 \n" /* r 01*/ \ + \ + "vrev64.8 d14, d16 \n" /* b 04*/ \ + "vrev64.8 d15, d17 \n" /* g 04*/ \ + "vrev64.8 d16, d18 \n" /* r 04*/ \ + \ + "vrev64.8 d17, d20 \n" /* b 05*/ \ + "vrev64.8 d18, d21 \n" /* g 05*/ \ + "vrev64.8 d19, d22 \n" /* r 05*/ \ + \ + "vrev64.8 d20, d24 \n" /* b 06*/ \ + "vrev64.8 d21, d25 \n" /* g 06*/ \ + "vrev64.8 d22, d26 \n" /* r 06*/ \ + \ + "vrev64.8 d24, d28 \n" /* b 07*/ \ + "vrev64.8 d25, d29 \n" /* g 07*/ \ + "vrev64.8 d26, d30 \n" /* r 07*/ + +#define STORE_C3_R_4 \ + "vst3.8 {d3, d4, d5}, [%[outptr0]] \n" \ + "vst3.8 {d0, d1, d2}, [%[outptr3]] \n" \ + "vst3.8 {d11, d12, d13}, [%[outptr2]] \n" \ + "vst3.8 {d7, d8, d9}, [%[outptr1]] \n" + +#define STORE_C3_R_8 \ + "vst3.8 {d14, d15, d16}, [%[outptr0]] \n" \ + "vst3.8 {d17, d18, d19}, [%[outptr1]] \n" \ + "vst3.8 {d20, d21, d22}, [%[outptr2]] \n" \ + "vst3.8 {d24, d25, d26}, [%[outptr3]] \n" + +#endif +/* +1 2 3 +4 5 6 +7 8 9 +rotate: +7 4 1 +8 5 2 +9 6 3 +*/ +// transpose +void rotate_hwc1_90(const uint8_t* src, + uint8_t* dst, + int w_in, + int h_in, + int w_out, + int h_out) { + uint8_t zerobuff[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + // block 4*8. 
-- 8*4 + int i = 0; + int stride_h = 4 * w_in; + int stride_h_w = 4 * w_in - 8; + int stride_out = 4 * w_out; +#pragma omp parallel for + for (i = 0; i < h_in - 7; i += 8) { + const uint8_t* inptr0 = src + i * w_in; + const uint8_t* inptr1 = inptr0 + w_in; + const uint8_t* inptr2 = inptr1 + w_in; + const uint8_t* inptr3 = inptr2 + w_in; + int j = 0; + for (; j < w_in - 7; j += 8) { + uint8_t* outptr0 = dst + j * w_out + i; + uint8_t* outptr1 = outptr0 + w_out; + uint8_t* outptr2 = outptr1 + w_out; + uint8_t* outptr3 = outptr2 + w_out; + uint8_t* outptr4 = outptr3 + w_out; + uint8_t* outptr5 = outptr4 + w_out; + uint8_t* outptr6 = outptr5 + w_out; + uint8_t* outptr7 = outptr6 + w_out; +#ifdef __aarch64__ + asm volatile(INPUT_C1 ADD_INPUT TRANS_C1_8 INPUT_C1 TRANS_C1_16 TRANS_C1_8 + SUB_INPUT TRANS_C1 REVERSE_C1 STORE_C1_R + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), + [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3), + [outptr4] "+r"(outptr4), + [outptr5] "+r"(outptr5), + [outptr6] "+r"(outptr6), + [outptr7] "+r"(outptr7) + : [stride_h] "r"(stride_h), [stride_h_w] "r"(stride_h_w) + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); +#else + asm volatile(INPUT_C1 ADD_INPUT TRANS_C1 REVERSE_C1 STORE_C1_R_4 + ADD_OUTPUT STORE_C1_R_8 + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), + [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3) + : [stride_h] "r"(stride_h), + [stride_h_w] "r"(stride_h_w), + [stride_out] "r"(stride_out) + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); +#endif + } + const uint8_t* inptr4 = inptr3 + w_in; + const uint8_t* inptr5 = inptr4 + w_in; + const uint8_t* inptr6 = inptr5 + w_in; + const uint8_t* inptr7 = inptr6 + w_in; + for (; j < w_in; j++) { + uint8_t* outptr = dst + j * w_out + i; + *outptr++ = *inptr0++; + *outptr++ = *inptr1++; + *outptr++ = *inptr2++; + *outptr++ = *inptr3++; + *outptr++ = *inptr4++; + *outptr++ = *inptr5++; + *outptr++ = *inptr6++; + *outptr++ = *inptr7++; + } + } + for (; i < h_in; i++) { + const uint8_t* inptr0 = src + i * w_in; + for (int j = 0; j < w_in; j++) { + uint8_t* outptr0 = dst + j * w_out + i; + *outptr0 = *inptr0++; + } + } +} +/* +1 2 3 4 +4 5 6 7 +7 8 9 10 +rotate: +10 9 8 7 +7 6 5 4 +4 3 2 1 +*/ +void rotate_hwc1_180(const uint8_t* src, + uint8_t* dst, + int w_in, + int h_in, + int w_out, + int h_out) { + uint8_t zerobuff[10000]; + memset(zerobuff, 0, w_in * sizeof(uint8_t)); + int stride_w = 8; +#pragma omp parallel for + for (int i = 0; i < h_in; i += 4) { + const uint8_t* inptr0 = src + i * w_in; + const uint8_t* inptr1 = inptr0 + w_in; + const uint8_t* inptr2 = inptr1 + w_in; + const uint8_t* inptr3 = inptr2 + w_in; + + uint8_t* outptr0 = dst + (h_in - i) * w_out - stride_w; // last + uint8_t* outptr1 = outptr0 + w_out; + uint8_t* outptr2 = outptr1 + w_out; + uint8_t* outptr3 = outptr2 + w_out; + + if (i + 3 >= h_in) { + uint8_t* ptr = zerobuff + w_in - stride_w; + switch ((i + 3) - h_in) { + case 3: + inptr0 = zerobuff; + outptr0 = ptr; + case 2: + inptr1 = zerobuff; + outptr1 = ptr; + case 1: + inptr2 = zerobuff; + outptr2 = ptr; + case 0: + inptr3 = zerobuff; + outptr3 = ptr; + default: + break; + } + } + + int j = 0; + for (; j < w_in - 7; j += 8) { +#ifdef __aarch64__ + asm volatile( + "ld1 {v0.8b}, [%[inptr0]], #8 
\n" // v0={00,01,02, 03, 04, 05, + // 06, 07}" + "ld1 {v1.8b}, [%[inptr1]], #8 \n" // v0={10,11,12, 13, 14, 15, + // 16, 17}" + "ld1 {v2.8b}, [%[inptr2]], #8 \n" // v0={20,21,22, 23, 24, 25, + // 26, 27}" + "ld1 {v3.8b}, [%[inptr3]], #8 \n" // v0={30,31,32, 33, 34, 35, + // 36, 37}" + + "rev64 v4.8b, v0.8b \n" //@ reverse 07 06 05 04 03 02 + // 01 00 + "rev64 v5.8b, v1.8b \n" //@ reverse 07 06 05 04 03 02 + // 01 00 + "rev64 v6.8b, v2.8b \n" //@ reverse 07 06 05 04 03 02 + // 01 00 + "rev64 v7.8b, v3.8b \n" //@ reverse 07 06 05 04 03 02 + // 01 00 + + "st1 {v4.8b}, [%[outptr0]] \n" // 00 10 20 30 04 14 24 34 + "st1 {v5.8b}, [%[outptr1]] \n" // 02 12 22 32 + "st1 {v6.8b}, [%[outptr2]] \n" // 01 11 21 31 + "st1 {v7.8b}, [%[outptr3]] \n" // 03 13 23 33 + + "sub %[outptr0], %[outptr0], %[stride_w] \n" //@ ptr - stride_w + "sub %[outptr1], %[outptr1], %[stride_w] \n" + "sub %[outptr2], %[outptr2], %[stride_w] \n" + "sub %[outptr3], %[outptr3], %[stride_w] \n" + + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), + [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3), + [stride_w] "+r"(stride_w) + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +#else + asm volatile( + "vld1.8 {d0}, [%[inptr0]]! @ zip load r0, d0 =00 01 02 03 04 05 " + "06 07\n" + "vld1.8 {d4}, [%[inptr1]]! @ zip load r1, d2 =10 11 12 13 14 15 " + "16 17\n" + "vld1.8 {d8}, [%[inptr2]]! @ zip load r1, d4 =20 21 22 23 24 25 " + "26 27\n" + "vld1.8 {d12}, [%[inptr3]]! @ zip load r1, d6 = 30 31 32 33 34 35 " + "36 37\n" + + "vrev64.8 d1, d0 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d5, d4 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d9, d8 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d13, d12 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + + "vst1.32 {d1}, [%[outptr0]] @ write d0(q0,low),r00,r10 20 30\n" + "vst1.32 {d5}, [%[outptr1]] @ write d4(q0,low),r01,r11 21 31\n" + "vst1.32 {d9}, [%[outptr2]] @ write d4(q0,low),r01,r11 21 31\n" + "vst1.32 {d13}, [%[outptr3]] @ write d4(q0,low),r01,r11 21 31\n" + + "sub %[outptr0], %[stride_w] @ ptr - stride_w \n" + "sub %[outptr1], %[stride_w] @ ptr - stride_w \n" + "sub %[outptr2], %[stride_w] @ ptr - stride_w \n" + "sub %[outptr3], %[stride_w] @ ptr - stride_w \n" + + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), + [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3), + [stride_w] "+r"(stride_w) + : + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); +#endif + } + outptr3 += stride_w - 1; + outptr2 += stride_w - 1; + outptr1 += stride_w - 1; + outptr0 += stride_w - 1; + for (; j < w_in; j++) { + if (i + 3 >= h_in) { + switch ((i + 3) - h_in) { + case 0: + *outptr2-- = *inptr2++; + case 1: + *outptr1-- = *inptr1++; + case 2: + *outptr0-- = *inptr0++; + case 3: + default: + break; + } + } else { + *outptr3-- = *inptr3++; + *outptr2-- = *inptr2++; + *outptr1-- = *inptr1++; + *outptr0-- = *inptr0++; + } + } + } +} +/* +1 2 3 +4 5 6 +7 8 9 +rotate: +3 6 9 +2 5 8 +1 4 7 +*/ +// dst = (h_out - 1) * w_out +void rotate_hwc1_270(const uint8_t* src, + uint8_t* dst, + int w_in, + int h_in, + int w_out, + int h_out) { + int stride_h = 4 * w_in; + int stride_h_w = 4 * w_in - 8; + int hout = h_out - 1; + int stride_out = 4 * w_out; + + int i = 0; +// block 8*8. 
-- 8*8 +#pragma omp parallel for + for (i = 0; i < h_in - 7; i += 8) { + const uint8_t* inptr0 = src + i * w_in; + const uint8_t* inptr1 = inptr0 + w_in; + const uint8_t* inptr2 = inptr1 + w_in; + const uint8_t* inptr3 = inptr2 + w_in; + int j = 0; + for (; j < w_in - 7; j += 8) { + uint8_t* outptr0 = dst + (hout - j) * w_out + i; + uint8_t* outptr1 = outptr0 - w_out; + uint8_t* outptr2 = outptr1 - w_out; + uint8_t* outptr3 = outptr2 - w_out; + uint8_t* outptr4 = outptr3 - w_out; + uint8_t* outptr5 = outptr4 - w_out; + uint8_t* outptr6 = outptr5 - w_out; + uint8_t* outptr7 = outptr6 - w_out; + +#ifdef __aarch64__ + asm volatile(INPUT_C1 ADD_INPUT TRANS_C1_8 INPUT_C1 TRANS_C1_16 TRANS_C1_8 + SUB_INPUT TRANS_C1 STORE_C1 + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), + [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3), + [outptr4] "+r"(outptr4), + [outptr5] "+r"(outptr5), + [outptr6] "+r"(outptr6), + [outptr7] "+r"(outptr7) + : [stride_h] "r"(stride_h), [stride_h_w] "r"(stride_h_w) + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); +#else + asm volatile(INPUT_C1 ADD_INPUT TRANS_C1 STORE_C1_4 ADD_OUTPUT STORE_C1_8 + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), + [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3) + : [stride_h] "r"(stride_h), + [stride_h_w] "r"(stride_h_w), + [stride_out] "r"(stride_out) + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); +#endif + } + const uint8_t* inptr4 = inptr3 + w_in; + const uint8_t* inptr5 = inptr4 + w_in; + const uint8_t* inptr6 = inptr5 + w_in; + const uint8_t* inptr7 = inptr6 + w_in; + for (; j < w_in; j++) { + uint8_t* outptr = dst + (hout - j) * w_out + i; + *outptr++ = *inptr0++; + *outptr++ = *inptr1++; + *outptr++ = *inptr2++; + *outptr++ = *inptr3++; + *outptr++ = *inptr4++; + *outptr++ = *inptr5++; + *outptr++ = *inptr6++; + *outptr++ = *inptr7++; + } + } + for (; i < h_in; i++) { + const uint8_t* inptr0 = src + i * w_in; + for (int j = 0; j < w_in; j++) { + uint8_t* outptr0 = dst + (hout - j) * w_out + i; + *outptr0 = *inptr0++; + } + } +} + +void rotate_hwc3_90(const uint8_t* src, + uint8_t* dst, + int w_in, + int h_in, + int w_out, + int h_out) { + int win = w_in * 3; + int wout = w_out * 3; + int stride_h = 4 * win; + int stride_h_w = 4 * win - 24; + int stride_out = 4 * wout; + int ww = w_out - 8; + uint8_t zerobuff[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + // block 4*8. 
-- 8*4 + int i = 0; +#pragma omp parallel for + for (i = 0; i < h_in - 7; i += 8) { + const uint8_t* inptr0 = src + i * win; + const uint8_t* inptr1 = inptr0 + win; + const uint8_t* inptr2 = inptr1 + win; + const uint8_t* inptr3 = inptr2 + win; + int j = 0; + for (; j < w_in - 7; j += 8) { + uint8_t* outptr0 = dst + j * wout + (ww - i) * 3; + uint8_t* outptr1 = outptr0 + wout; + uint8_t* outptr2 = outptr1 + wout; + uint8_t* outptr3 = outptr2 + wout; + uint8_t* outptr4 = outptr3 + wout; + uint8_t* outptr5 = outptr4 + wout; + uint8_t* outptr6 = outptr5 + wout; + uint8_t* outptr7 = outptr6 + wout; +#ifdef __aarch64__ + asm volatile(INPUT_C3 ADD_INPUT TRANS_C3_8 INPUT_C3 TRANS_C3_16 TRANS_C3 + REVERSE_C3 STORE_C3 + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), + [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3), + [outptr4] "+r"(outptr4), + [outptr5] "+r"(outptr5), + [outptr6] "+r"(outptr6), + [outptr7] "+r"(outptr7), + [stride_h] "+r"(stride_h), + [stride_h_w] "+r"(stride_h_w) + : + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25", + "v26", + "v27", + "v28", + "v29", + "v30", + "v31"); +#else + asm volatile(INPUT_C3 ADD_INPUT TRANS_C3 REVERSE_C3 STORE_C3_R_4 + ADD_OUTPUT STORE_C3_R_8 + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), + [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3) + : [stride_h] "r"(stride_h), + [stride_h_w] "r"(stride_h_w), + [stride_out] "r"(stride_out) + : "q0", + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14"); +#endif + } + const uint8_t* inptr4 = inptr3 + win; + const uint8_t* inptr5 = inptr4 + win; + const uint8_t* inptr6 = inptr5 + win; + const uint8_t* inptr7 = inptr6 + win; + for (; j < w_in; j++) { + int tmpx = (ww - i) * 3; + uint8_t* outptr = dst + j * wout + tmpx; + *outptr++ = *inptr7++; + *outptr++ = *inptr7++; + *outptr++ = *inptr7++; + + *outptr++ = *inptr6++; + *outptr++ = *inptr6++; + *outptr++ = *inptr6++; + + *outptr++ = *inptr5++; + *outptr++ = *inptr5++; + *outptr++ = *inptr5++; + + *outptr++ = *inptr4++; + *outptr++ = *inptr4++; + *outptr++ = *inptr4++; + + *outptr++ = *inptr3++; + *outptr++ = *inptr3++; + *outptr++ = *inptr3++; + + *outptr++ = *inptr2++; + *outptr++ = *inptr2++; + *outptr++ = *inptr2++; + + *outptr++ = *inptr1++; + *outptr++ = *inptr1++; + *outptr++ = *inptr1++; + + *outptr++ = *inptr0++; + *outptr++ = *inptr0++; + *outptr++ = *inptr0++; + } + } + // remain + ww = w_out - 1; + for (; i < h_in; i++) { + const uint8_t* inptr0 = src + i * win; + for (int j = 0; j < w_in; j++) { + uint8_t* outptr0 = dst + j * wout + (ww - i) * 3; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + } + } +} + +void rotate_hwc3_180(const uint8_t* src, + uint8_t* dst, + int w_in, + int h_in, + int w_out, + int h_out) { + int win = w_in * 3; + uint8_t zerobuff[30000]; + memset(zerobuff, 0, win * sizeof(uint8_t)); + int stride_w = 24; +#pragma omp parallel for + for (int i = 0; i < h_in; i += 4) { + const uint8_t* inptr0 = src + i * win; + const uint8_t* inptr1 = inptr0 + win; + const uint8_t* inptr2 = inptr1 + win; + const uint8_t* inptr3 = inptr2 + win; + + uint8_t* outptr0 = 
dst + (h_in - i) * win - stride_w; // last col + uint8_t* outptr1 = outptr0 - win; + uint8_t* outptr2 = outptr1 - win; + uint8_t* outptr3 = outptr2 - win; + + if (i + 3 >= h_in) { + uint8_t* ptr = zerobuff + win - stride_w; + switch ((i + 3) - h_in) { + case 3: + inptr0 = zerobuff; + outptr0 = ptr; + case 2: + inptr1 = zerobuff; + outptr1 = ptr; + case 1: + inptr2 = zerobuff; + outptr2 = ptr; + case 0: + inptr3 = zerobuff; + outptr3 = ptr; + default: + break; + } + } + + int j = 0; + for (; j < w_in - 7; j += 8) { +#ifdef __aarch64__ + asm volatile( + "ld3 {v0.8b, v1.8b, v2.8b}, [%[inptr0]], #24 \n" // v0={00,01,02, + // 03, 04, 05, + // 06, 07}" + "ld3 {v3.8b, v4.8b, v5.8b}, [%[inptr1]], #24 \n" // v0={10,11,12, + // 13, 14, 15, + // 16, 17}" + "ld3 {v6.8b, v7.8b, v8.8b}, [%[inptr2]], #24 \n" // v0={20,21,22, + // 23, 24, 25, + // 26, 27}" + "ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]], #24 \n" // v0={30,31,32, + // 33, 34, 35, + // 36, 37}" + + "rev64 v12.8b, v0.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 b + "rev64 v13.8b, v1.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 g + "rev64 v14.8b, v2.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 r + + "rev64 v15.8b, v3.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v16.8b, v4.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v17.8b, v5.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + + "rev64 v18.8b, v6.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v19.8b, v7.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v20.8b, v8.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + + "rev64 v21.8b, v9.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v22.8b, v10.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v23.8b, v11.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + + "st3 {v12.8b, v13.8b, v14.8b}, [%[outptr0]] \n" // 00 10 + // 20 30 + // 04 14 + // 24 34 + "st3 {v15.8b, v16.8b, v17.8b}, [%[outptr1]] \n" // 02 12 + // 22 32 + "st3 {v18.8b, v19.8b, v20.8b}, [%[outptr2]] \n" // 01 11 + // 21 31 + "st3 {v21.8b, v22.8b, v23.8b}, [%[outptr3]] \n" // 03 13 + // 23 33 + + "sub %[outptr0], %[outptr0], %[stride_w] \n" //@ ptr - stride_w + "sub %[outptr1], %[outptr1], %[stride_w] \n" + "sub %[outptr2], %[outptr2], %[stride_w] \n" + "sub %[outptr3], %[outptr3], %[stride_w] \n" + + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), + [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3), + [stride_w] "+r"(stride_w) + : + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23"); +#else + asm volatile( + "vld3.8 {d0, d1, d2}, [%[inptr0]]! @ zip load r0, d0 =00 01 02 03 " + "\n" + "vld3.8 {d3, d4, d5}, [%[inptr1]]! @ zip load r1, d2 =10 11 12 13 " + "\n" + "vld3.8 {d6, d7, d8}, [%[inptr2]]! @ zip load r1, d4 =20 21 22 23 " + "\n" + "vld3.8 {d9, d10, d11}, [%[inptr3]]! 
@ zip load r1, d6 = 30 31 32 " + "\n" + + "vrev64.8 d12, d0 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d13, d1 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d14, d2 @ reverse 07 06 05 04 03 02 01 00 \n" + + "vrev64.8 d15, d3 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d16, d4 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d17, d5 @ reverse 07 06 05 04 03 02 01 00 \n" + + "vrev64.8 d18, d6 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d19, d7 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d20, d8 @ reverse 07 06 05 04 03 02 01 00 \n" + + "vrev64.8 d21, d9 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d22, d10 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + "vrev64.8 d23, d11 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + + "vst3.8 {d12, d13, d14}, [%[outptr0]] @ write \n" + "vst3.8 {d15, d16, d17}, [%[outptr1]] @ write \n" + "vst3.8 {d18, d19, d20}, [%[outptr2]] @ write \n" + "vst3.8 {d21, d22, d23}, [%[outptr3]] @ write \n" + + "sub %[outptr0], %[stride_w] @ ptr - stride_w \n" + "sub %[outptr1], %[stride_w] @ ptr - stride_w \n" + "sub %[outptr2], %[stride_w] @ ptr - stride_w \n" + "sub %[outptr3], %[stride_w] @ ptr - stride_w \n" + + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), + [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3), + [stride_w] "+r"(stride_w) + : + : "q0", + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12"); +#endif + } + outptr3 += stride_w - 3; + outptr2 += stride_w - 3; + outptr1 += stride_w - 3; + outptr0 += stride_w - 3; + for (; j < w_in; j++) { + if (i + 3 >= h_in) { + switch ((i + 3) - h_in) { + case 0: + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + outptr2 -= 6; + case 1: + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + outptr1 -= 6; + case 2: + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + outptr0 -= 6; + case 3: + // inptr3 = zerobuff; + default: + break; + } + } else { + *outptr3++ = *inptr3++; + *outptr3++ = *inptr3++; + *outptr3++ = *inptr3++; + outptr3 -= 6; + + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + outptr2 -= 6; + + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + outptr1 -= 6; + + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + outptr0 -= 6; + } + } + } +} + +void rotate_hwc3_270(const uint8_t* src, + uint8_t* dst, + int w_in, + int h_in, + int w_out, + int h_out) { + int win = w_in * 3; + int wout = w_out * 3; + int stride_h = 4 * win; + int stride_h_w = 4 * win - 24; + int stride_out = 4 * wout; + int hout = h_out - 1; + // block 8*8. 
-- 8*8 + int i = 0; +#pragma omp parallel for + for (i = 0; i < h_in - 7; i += 8) { + const uint8_t* inptr0 = src + i * win; + const uint8_t* inptr1 = inptr0 + win; + const uint8_t* inptr2 = inptr1 + win; + const uint8_t* inptr3 = inptr2 + win; + + int j = 0; + for (; j < w_in - 7; j += 8) { + uint8_t* outptr0 = dst + (hout - j) * wout + i * 3; + uint8_t* outptr1 = outptr0 - wout; + uint8_t* outptr2 = outptr1 - wout; + uint8_t* outptr3 = outptr2 - wout; + uint8_t* outptr4 = outptr3 - wout; + uint8_t* outptr5 = outptr4 - wout; + uint8_t* outptr6 = outptr5 - wout; + uint8_t* outptr7 = outptr6 - wout; +#ifdef __aarch64__ + asm volatile(INPUT_C3 ADD_INPUT TRANS_C3_8 INPUT_C3 TRANS_C3_16 TRANS_C3 + MOV_C3 STORE_C3 + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), + [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3), + [outptr4] "+r"(outptr4), + [outptr5] "+r"(outptr5), + [outptr6] "+r"(outptr6), + [outptr7] "+r"(outptr7), + [stride_h] "+r"(stride_h), + [stride_h_w] "+r"(stride_h_w) + : + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25", + "v26", + "v27", + "v28", + "v29", + "v30", + "v31"); +#else + asm volatile(INPUT_C3 ADD_INPUT TRANS_C3 STORE_C3_4 SUB_OUTPUT STORE_C3_8 + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), + [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3) + : [stride_h] "r"(stride_h), + [stride_out] "r"(stride_out), + [stride_h_w] "r"(stride_h_w) + : "q0", + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14"); +#endif + } + const uint8_t* inptr4 = inptr3 + win; + const uint8_t* inptr5 = inptr4 + win; + const uint8_t* inptr6 = inptr5 + win; + const uint8_t* inptr7 = inptr6 + win; + for (; j < w_in; j++) { + int tmpx = i * 3; + uint8_t* outptr = dst + (hout - j) * wout + tmpx; + *outptr++ = *inptr0++; + *outptr++ = *inptr0++; + *outptr++ = *inptr0++; + + *outptr++ = *inptr1++; + *outptr++ = *inptr1++; + *outptr++ = *inptr1++; + + *outptr++ = *inptr2++; + *outptr++ = *inptr2++; + *outptr++ = *inptr2++; + + *outptr++ = *inptr3++; + *outptr++ = *inptr3++; + *outptr++ = *inptr3++; + + *outptr++ = *inptr4++; + *outptr++ = *inptr4++; + *outptr++ = *inptr4++; + + *outptr++ = *inptr5++; + *outptr++ = *inptr5++; + *outptr++ = *inptr5++; + + *outptr++ = *inptr6++; + *outptr++ = *inptr6++; + *outptr++ = *inptr6++; + + *outptr++ = *inptr7++; + *outptr++ = *inptr7++; + *outptr++ = *inptr7++; + } + } + for (; i < h_in; i++) { + const uint8_t* inptr0 = src + i * win; + for (int j = 0; j < w_in; j++) { + uint8_t* outptr0 = dst + (hout - j) * wout + i * 3; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + } + } +} + +void rotate_hwc4_90(const uint8_t* src, + uint8_t* dst, + int w_in, + int h_in, + int w_out, + int h_out) { + int win = w_in * 4; + int wout = w_out * 4; + int hremain = h_in % 8; + int stride_h = 4 * win; + int stride_h_w = 4 * win - 32; + int ww = w_out - 8; + // block 8*8. 
-- 8*8 + int i = 0; +#pragma omp parallel for + for (i = 0; i < h_in - 7; i += 8) { + const uint8_t* inptr0 = src + i * win; + const uint8_t* inptr1 = inptr0 + win; + const uint8_t* inptr2 = inptr1 + win; + const uint8_t* inptr3 = inptr2 + win; + const uint8_t* inptr4 = inptr3 + win; + const uint8_t* inptr5 = inptr4 + win; + const uint8_t* inptr6 = inptr5 + win; + const uint8_t* inptr7 = inptr6 + win; + + int j = 0; + for (; j < w_in; j++) { + int tmpx = (ww - i) * 4; + uint8_t* outptr = dst + j * wout + tmpx; + *outptr++ = *inptr7++; + *outptr++ = *inptr7++; + *outptr++ = *inptr7++; + *outptr++ = *inptr7++; + + *outptr++ = *inptr6++; + *outptr++ = *inptr6++; + *outptr++ = *inptr6++; + *outptr++ = *inptr6++; + + *outptr++ = *inptr5++; + *outptr++ = *inptr5++; + *outptr++ = *inptr5++; + *outptr++ = *inptr5++; + + *outptr++ = *inptr4++; + *outptr++ = *inptr4++; + *outptr++ = *inptr4++; + *outptr++ = *inptr4++; + + *outptr++ = *inptr3++; + *outptr++ = *inptr3++; + *outptr++ = *inptr3++; + *outptr++ = *inptr3++; + + *outptr++ = *inptr2++; + *outptr++ = *inptr2++; + *outptr++ = *inptr2++; + *outptr++ = *inptr2++; + + *outptr++ = *inptr1++; + *outptr++ = *inptr1++; + *outptr++ = *inptr1++; + *outptr++ = *inptr1++; + + *outptr++ = *inptr0++; + *outptr++ = *inptr0++; + *outptr++ = *inptr0++; + *outptr++ = *inptr0++; + } + } + ww = w_out - 1; + for (; i < h_in; i++) { + const uint8_t* inptr0 = src + i * win; + for (int j = 0; j < w_in; j++) { + uint8_t* outptr0 = dst + j * wout + (ww - i) * 4; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + } + } +} + +void rotate_hwc4_180(const uint8_t* src, + uint8_t* dst, + int w_in, + int h_in, + int w_out, + int h_out) { + int win = w_in * 4; + uint8_t zerobuff[40000]; + memset(zerobuff, 0, win * sizeof(uint8_t)); + int stride_w = 32; +#pragma omp parallel for + for (int i = 0; i < h_in; i += 4) { + const uint8_t* inptr0 = src + i * win; + const uint8_t* inptr1 = inptr0 + win; + const uint8_t* inptr2 = inptr1 + win; + const uint8_t* inptr3 = inptr2 + win; + + uint8_t* outptr0 = dst + (h_in - i) * win - stride_w; // last col + uint8_t* outptr1 = outptr0 - win; + uint8_t* outptr2 = outptr1 - win; + uint8_t* outptr3 = outptr2 - win; + + if (i + 3 >= h_in) { + uint8_t* ptr = zerobuff + win - stride_w; + switch ((i + 3) - h_in) { + case 3: + inptr0 = zerobuff; + outptr0 = ptr; + case 2: + inptr1 = zerobuff; + outptr1 = ptr; + case 1: + inptr2 = zerobuff; + outptr2 = ptr; + case 0: + inptr3 = zerobuff; + outptr3 = ptr; + default: + break; + } + } + + int j = 0; + for (; j < w_in - 7; j += 8) { +#ifdef __aarch64__ + asm volatile( + "ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[inptr0]], #32 \n" // v0={00,01,02, + // 03, 04, 05, + // 06, 07}" + "ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [%[inptr1]], #32 \n" // v0={10,11,12, + // 13, 14, 15, + // 16, 17}" + "ld4 {v8.8b, v9.8b, v10.8b, v11.8b}, [%[inptr2]], #32 \n" // v0={20,21,22, + // 23, 24, 25, + // 26, 27}" + "ld4 {v12.8b, v13.8b, v14.8b, v15.8b}, [%[inptr3]], #32 \n" // v0={30,31,32, + // 33, 34, 35, + // 36, 37}" + "rev64 v16.8b, v0.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 b + "rev64 v17.8b, v1.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 g + "rev64 v18.8b, v2.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 r + "rev64 v19.8b, v3.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 a + + "rev64 v20.8b, v4.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v21.8b, v5.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v22.8b, v6.8b \n" //@ reverse 07 06 05 
04 03 + // 02 01 00 + "rev64 v23.8b, v7.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + + "rev64 v24.8b, v8.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v25.8b, v9.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v26.8b, v10.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v27.8b, v11.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + + "rev64 v28.8b, v12.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v29.8b, v13.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v30.8b, v14.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v31.8b, v15.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + + "st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[outptr0]] \n" // 00 10 + "st4 {v20.8b, v21.8b, v22.8b, v23.8b}, [%[outptr1]] \n" // 02 12 + "st4 {v24.8b, v25.8b, v26.8b, v27.8b}, [%[outptr2]] \n" // 01 11 + "st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [%[outptr3]] \n" // 03 13 + + "sub %[outptr0], %[outptr0], %[stride_w] \n" //@ ptr - stride_w + "sub %[outptr1], %[outptr1], %[stride_w] \n" + "sub %[outptr2], %[outptr2], %[stride_w] \n" + "sub %[outptr3], %[outptr3], %[stride_w] \n" + + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), + [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3), + [stride_w] "+r"(stride_w) + : + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25", + "v26", + "v27", + "v28", + "v29", + "v30", + "v31"); +#else + asm volatile( + "vld4.8 {d0, d1, d2, d3}, [%[inptr0]]! @ zip load r0, d0 =00 01 " + "02 03 " + "04 05 06 07\n" + "vld4.8 {d4, d5, d6, d7}, [%[inptr1]]! @ zip load r1, d2 =10 11 " + "12 13 " + "14 15 16 17\n" + "vld4.8 {d8, d9, d10, d11}, [%[inptr2]]! @ zip load r1, d4 =20 21 " + "22 23 " + "24 25 26 27\n" + "vld4.8 {d12, d13, d14, d15}, [%[inptr3]]! 
@ zip load r1, d6 = 30 " + "31 32 " + "33 34 35 36 37\n" + + "vrev64.8 d16, d0 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d17, d1 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d18, d2 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d19, d3 @ reverse 07 06 05 04 03 02 01 00 \n" + + "vrev64.8 d20, d4 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d21, d5 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d22, d6 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d23, d7 @ reverse 07 06 05 04 03 02 01 00 \n" + + "vrev64.8 d24, d8 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d25, d9 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d26, d10 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + "vrev64.8 d27, d11 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + + "vrev64.8 d28, d12 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + "vrev64.8 d29, d13 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + "vrev64.8 d30, d14 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + "vrev64.8 d31, d15 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + + "vst4.8 {d16, d17, d18, d19}, [%[outptr0]] @ write \n" + "vst4.8 {d20, d21, d22, d23}, [%[outptr1]] @ write \n" + "vst4.8 {d24, d25, d26, d27}, [%[outptr2]] @ write \n" + "vst4.8 {d28, d29, d30, d31}, [%[outptr3]] @ write \n" + + "sub %[outptr0], %[stride_w] @ ptr - stride_w \n" + "sub %[outptr1], %[stride_w] @ ptr - stride_w \n" + "sub %[outptr2], %[stride_w] @ ptr - stride_w \n" + "sub %[outptr3], %[stride_w] @ ptr - stride_w \n" + + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), + [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3), + [stride_w] "+r"(stride_w) + : + : "q0", + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); +#endif + } + outptr3 += stride_w - 4; + outptr2 += stride_w - 4; + outptr1 += stride_w - 4; + outptr0 += stride_w - 4; + for (; j < w_in; j++) { + if (i + 3 >= h_in) { + switch ((i + 3) - h_in) { + case 0: + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + outptr2 -= 8; + case 1: + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + outptr1 -= 8; + case 2: + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + outptr0 -= 8; + case 3: + // inptr3 = zerobuff; + default: + break; + } + } else { + *outptr3++ = *inptr3++; + *outptr3++ = *inptr3++; + *outptr3++ = *inptr3++; + *outptr3++ = *inptr3++; + outptr3 -= 8; + + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + outptr2 -= 8; + + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + outptr1 -= 8; + + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + outptr0 -= 8; + } + } + } +} + +void rotate_hwc4_270(const uint8_t* src, + uint8_t* dst, + int w_in, + int h_in, + int w_out, + int h_out) { + int win = w_in * 4; + int wout = w_out * 4; + int hremain = h_in % 8; + int stride_h = 4 * win; + int stride_h_w = 4 * win - 32; + int hout = h_out - 1; + // block 8*8. 
-- 8*8 + int i = 0; +#pragma omp parallel for + for (i = 0; i < h_in - 7; i += 8) { + const uint8_t* inptr0 = src + i * win; + const uint8_t* inptr1 = inptr0 + win; + const uint8_t* inptr2 = inptr1 + win; + const uint8_t* inptr3 = inptr2 + win; + const uint8_t* inptr4 = inptr3 + win; + const uint8_t* inptr5 = inptr4 + win; + const uint8_t* inptr6 = inptr5 + win; + const uint8_t* inptr7 = inptr6 + win; + int j = 0; + for (; j < w_in; j++) { + int tmpx = i * 4; + uint8_t* outptr = dst + (hout - j) * wout + tmpx; + *outptr++ = *inptr0++; + *outptr++ = *inptr0++; + *outptr++ = *inptr0++; + *outptr++ = *inptr0++; + + *outptr++ = *inptr1++; + *outptr++ = *inptr1++; + *outptr++ = *inptr1++; + *outptr++ = *inptr1++; + + *outptr++ = *inptr2++; + *outptr++ = *inptr2++; + *outptr++ = *inptr2++; + *outptr++ = *inptr2++; + + *outptr++ = *inptr3++; + *outptr++ = *inptr3++; + *outptr++ = *inptr3++; + *outptr++ = *inptr3++; + + *outptr++ = *inptr4++; + *outptr++ = *inptr4++; + *outptr++ = *inptr4++; + *outptr++ = *inptr4++; + + *outptr++ = *inptr5++; + *outptr++ = *inptr5++; + *outptr++ = *inptr5++; + *outptr++ = *inptr5++; + + *outptr++ = *inptr6++; + *outptr++ = *inptr6++; + *outptr++ = *inptr6++; + *outptr++ = *inptr6++; + + *outptr++ = *inptr7++; + *outptr++ = *inptr7++; + *outptr++ = *inptr7++; + *outptr++ = *inptr7++; + } + } + for (; i < h_in; i++) { + const uint8_t* inptr0 = src + i * win; + for (int j = 0; j < w_in; j++) { + uint8_t* outptr0 = dst + (hout - j) * wout + i * 4; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + } + } +} +} // namespace cv +} // namespace utils +} // namespace lite +} // namespace paddle diff --git a/lite/utils/cv/image_rotate.h b/lite/utils/cv/image_rotate.h new file mode 100644 index 0000000000000000000000000000000000000000..8335fca28051c3ba0ae5070464c32d5e804361f4 --- /dev/null +++ b/lite/utils/cv/image_rotate.h @@ -0,0 +1,32 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +namespace paddle { +namespace lite { +namespace utils { +namespace cv { +void rotate_hwc1( + const uint8_t* src, uint8_t* dst, int srcw, int srch, float degree); +void rotate_hwc3( + const uint8_t* src, uint8_t* dst, int srcw, int srch, float degree); +void rotate_hwc4( + const uint8_t* src, uint8_t* dst, int srcw, int srch, float degree); +} // namespace cv +} // namespace utils +} // namespace lite +} // namespace paddle diff --git a/lite/utils/cv/paddle_image_preprocess.cc b/lite/utils/cv/paddle_image_preprocess.cc new file mode 100644 index 0000000000000000000000000000000000000000..0bccfe2804a9ba17473575815bfe4b2e9635f234 --- /dev/null +++ b/lite/utils/cv/paddle_image_preprocess.cc @@ -0,0 +1,407 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/utils/cv/paddle_image_preprocess.h" +#include +#include +#include +#include "lite/utils/cv/image2tensor.h" +#include "lite/utils/cv/image_convert.h" +#include "lite/utils/cv/image_flip.h" +#include "lite/utils/cv/image_resize.h" +#include "lite/utils/cv/image_rotate.h" +namespace paddle { +namespace lite { +namespace utils { +namespace cv { + +#define PI 3.14159265f +#define Degrees2Radians(degrees) ((degrees) * (SK_ScalarPI / 180)) +#define Radians2Degrees(radians) ((radians) * (180 / SK_ScalarPI)) +#define ScalarNearlyZero (1.0f / (1 << 12)) +// init +ImagePreprocess::ImagePreprocess(ImageFormat srcFormat, + ImageFormat dstFormat, + TransParam param) { + this->srcFormat_ = srcFormat; + this->dstFormat_ = dstFormat; + this->transParam_ = param; +} +void ImagePreprocess::imageCovert(const uint8_t* src, uint8_t* dst) { + ImageConvert img_convert; + img_convert.choose(src, + dst, + this->srcFormat_, + this->dstFormat_, + this->transParam_.iw, + this->transParam_.ih); +} + +void ImagePreprocess::imageCovert(const uint8_t* src, + uint8_t* dst, + ImageFormat srcFormat, + ImageFormat dstFormat) { + ImageConvert img_convert; + img_convert.choose(src, + dst, + srcFormat, + dstFormat, + this->transParam_.iw, + this->transParam_.ih); +} + +void ImagePreprocess::imageResize(const uint8_t* src, + uint8_t* dst, + ImageFormat srcFormat, + int srcw, + int srch, + int dstw, + int dsth) { + resize(src, dst, srcFormat, srcw, srch, dstw, dsth); + /* + int size = srcw * srch; + if (srcw == dstw && srch == dsth) { + if (srcFormat == NV12 || srcFormat == NV21) { + size = srcw * (floor(1.5 * srch)); + } else if (srcFormat == BGR || srcFormat == RGB) { + size = 3 * srcw * srch; + } else if (srcFormat == BGRA || srcFormat == RGBA) { + size = 4 * srcw * srch; + } + memcpy(dst, src, sizeof(uint8_t) * size); + return; + } + double scale_x = static_cast(srcw / dstw); + double scale_y = static_cast(srch / dsth); + + int* buf = new int[dstw * 2 + dsth * 2]; + + int* xofs = buf; + int* yofs = buf + dstw; + int16_t* ialpha = reinterpret_cast(buf + dstw + dsth); + int16_t* ibeta = reinterpret_cast(buf + 2 * dstw + dsth); + + compute_xy( + srcw, srch, dstw, dsth, scale_x, scale_y, xofs, yofs, ialpha, ibeta); + + int w_out = dstw; + int w_in = srcw; + int num = 1; + int orih = dsth; + if (srcFormat == GRAY) { + num = 1; + } else if (srcFormat == NV12 || srcFormat == NV21) { + num = 1; + int hout = static_cast(0.5 * dsth); + dsth += hout; + } else if (srcFormat == BGR || srcFormat == RGB) { + w_in = srcw * 3; + w_out = dstw * 3; + num = 3; + + } else if (srcFormat == BGRA || srcFormat == RGBA) { + w_in = srcw * 4; + w_out = dstw * 4; + num = 4; + } + + int* xofs1 = nullptr; + int* yofs1 = nullptr; + int16_t* ialpha1 = nullptr; + if (orih < dsth) { // uv + int tmp = dsth - orih; + int w = dstw / 2; + xofs1 = new int[w]; + yofs1 = new int[tmp]; + ialpha1 = new int16_t[srcw]; + compute_xy(srcw / 2, + srch / 2, + w, + tmp, + scale_x, + scale_y, + xofs1, + yofs1, + ialpha1, + ibeta + orih); + } + int cnt = w_out >> 3; + int remain = w_out % 8; + int32x4_t _v2 = vdupq_n_s32(2); + #pragma omp 
parallel for + for (int dy = 0; dy < dsth; dy++) { + int16_t* rowsbuf0 = new int16_t[w_out]; + int16_t* rowsbuf1 = new int16_t[w_out]; + int sy = yofs[dy]; + if (dy >= orih) { + xofs = xofs1; + yofs = yofs1; + ialpha = ialpha1; + } + if (sy < 0) { + memset(rowsbuf0, 0, sizeof(uint16_t) * w_out); + const uint8_t* S1 = src + srcw * (sy + 1); + const int16_t* ialphap = ialpha; + int16_t* rows1p = rowsbuf1; + for (int dx = 0; dx < dstw; dx++) { + int sx = xofs[dx] * num; // num = 4 + int16_t a0 = ialphap[0]; + int16_t a1 = ialphap[1]; + + const uint8_t* S1pl = S1 + sx; + const uint8_t* S1pr = S1 + sx + num; + if (sx < 0) { + S1pl = S1; + } + for (int i = 0; i < num; i++) { + if (sx < 0) { + *rows1p++ = ((*S1pl++) * a1) >> 4; + } else { + *rows1p++ = ((*S1pl++) * a0 + (*S1pr++) * a1) >> 4; + } + } + ialphap += 2; + } + } else { + // hresize two rows + const uint8_t* S0 = src + w_in * (sy); + const uint8_t* S1 = src + w_in * (sy + 1); + const int16_t* ialphap = ialpha; + int16_t* rows0p = rowsbuf0; + int16_t* rows1p = rowsbuf1; + for (int dx = 0; dx < dstw; dx++) { + int sx = xofs[dx] * num; // num = 4 + int16_t a0 = ialphap[0]; + int16_t a1 = ialphap[1]; + + const uint8_t* S0pl = S0 + sx; + const uint8_t* S0pr = S0 + sx + num; + const uint8_t* S1pl = S1 + sx; + const uint8_t* S1pr = S1 + sx + num; + if (sx < 0) { + S0pl = S0; + S1pl = S1; + } + for (int i = 0; i < num; i++) { + if (sx < 0) { + *rows0p = ((*S0pl++) * a1) >> 4; + *rows1p = ((*S1pl++) * a1) >> 4; + rows0p++; + rows1p++; + } else { + *rows0p++ = ((*S0pl++) * a0 + (*S0pr++) * a1) >> 4; + *rows1p++ = ((*S1pl++) * a0 + (*S1pr++) * a1) >> 4; + } + } + ialphap += 2; + } + } + int ind = dy * 2; + int16_t b0 = ibeta[ind]; + int16_t b1 = ibeta[ind + 1]; + int16x8_t _b0 = vdupq_n_s16(b0); + int16x8_t _b1 = vdupq_n_s16(b1); + uint8_t* dp_ptr = dst + dy * w_out; + int16_t* rows0p = rowsbuf0; + int16_t* rows1p = rowsbuf1; + int re_cnt = cnt; + if (re_cnt > 0) { + #ifdef __aarch64__ + asm volatile( + "1: \n" + "ld1 {v0.8h}, [%[rows0p]], #16 \n" + "ld1 {v1.8h}, [%[rows1p]], #16 \n" + "orr v6.16b, %w[_v2].16b, %w[_v2].16b \n" + "orr v7.16b, %w[_v2].16b, %w[_v2].16b \n" + "smull v2.4s, v0.4h, %w[_b0].4h \n" + "smull2 v4.4s, v0.8h, %w[_b0].8h \n" + "smull v3.4s, v1.4h, %w[_b1].4h \n" + "smull2 v5.4s, v1.8h, %w[_b1].8h \n" + + "ssra v6.4s, v2.4s, #16 \n" + "ssra v7.4s, v4.4s, #16 \n" + "ssra v6.4s, v3.4s, #16 \n" + "ssra v7.4s, v5.4s, #16 \n" + + "shrn v0.4h, v6.4s, #2 \n" + "shrn2 v0.8h, v7.4s, #2 \n" + "subs %w[cnt], %w[cnt], #1 \n" + "sqxtun v1.8b, v0.8h \n" + "st1 {v1.8b}, [%[dp]], #8 \n" + "bne 1b \n" + : [rows0p] "+r"(rows0p), + [rows1p] "+r"(rows1p), + [cnt] "+r"(re_cnt), + [dp] "+r"(dp_ptr) + : [_b0] "w"(_b0), [_b1] "w"(_b1), [_v2] "w"(_v2) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); + #else + asm volatile( + "mov r4, #2 \n" + "vdup.s32 q12, r4 \n" + "0: \n" + "vld1.s16 {d2-d3}, [%[rows0p]]!\n" + "vld1.s16 {d6-d7}, [%[rows1p]]!\n" + "vorr.s32 q10, q12, q12 \n" + "vorr.s32 q11, q12, q12 \n" + + "vmull.s16 q0, d2, %[_b0] \n" + "vmull.s16 q1, d3, %[_b0] \n" + "vmull.s16 q2, d6, %[_b1] \n" + "vmull.s16 q3, d7, %[_b1] \n" + + "vsra.s32 q10, q0, #16 \n" + "vsra.s32 q11, q1, #16 \n" + "vsra.s32 q10, q2, #16 \n" + "vsra.s32 q11, q3, #16 \n" + + "vshrn.s32 d20, q10, #2 \n" + "vshrn.s32 d21, q11, #2 \n" + "subs %[cnt], #1 \n" + "vqmovun.s16 d20, q10 \n" + "vst1.8 {d20}, [%[dp]]! 
\n" + "bne 0b \n" + : [rows0p] "+r"(rows0p), + [rows1p] "+r"(rows1p), + [cnt] "+r"(re_cnt), + [dp] "+r"(dp_ptr) + : [_b0] "w"(_b0), [_b1] "w"(_b1) + : "cc", + "memory", + "r4", + "q0", + "q1", + "q2", + "q3", + "q8", + "q9", + "q10", + "q11", + "q12"); + + #endif // __aarch64__ + } + for (int i = 0; i < remain; i++) { + // D[x] = (rows0[x]*b0 + rows1[x]*b1) >> + // INTER_RESIZE_COEF_BITS; + *dp_ptr++ = + (uint8_t)(((int16_t)((b0 * (int16_t)(*rows0p++)) >> 16) + + (int16_t)((b1 * (int16_t)(*rows1p++)) >> 16) + 2) >> + 2); + } + } + delete[] buf; + */ +} + +void ImagePreprocess::imageResize(const uint8_t* src, uint8_t* dst) { + int srcw = this->transParam_.iw; + int srch = this->transParam_.ih; + int dstw = this->transParam_.ow; + int dsth = this->transParam_.oh; + auto srcFormat = this->dstFormat_; + resize(src, dst, srcFormat, srcw, srch, dstw, dsth); +} + +void ImagePreprocess::imageRotate(const uint8_t* src, + uint8_t* dst, + ImageFormat srcFormat, + int srcw, + int srch, + float degree) { + if (degree != 90 && degree != 180 && degree != 270) { + printf("this degree: %f not support \n", degree); + } + if (srcFormat == GRAY) { + rotate_hwc1(src, dst, srcw, srch, degree); + } else if (srcFormat == BGR || srcFormat == RGB) { + rotate_hwc3(src, dst, srcw, srch, degree); + } else if (srcFormat == BGRA || srcFormat == RGBA) { + rotate_hwc4(src, dst, srcw, srch, degree); + } else { + printf("this srcFormat: %d does not support! \n", srcFormat); + return; + } +} + +void ImagePreprocess::imageRotate(const uint8_t* src, uint8_t* dst) { + auto srcw = this->transParam_.ow; + auto srch = this->transParam_.oh; + auto srcFormat = this->dstFormat_; + auto degree = this->transParam_.rotate_param; + if (degree != 90 && degree != 180 && degree != 270) { + printf("this degree: %f not support \n", degree); + } + ImagePreprocess::imageRotate(src, dst, srcFormat, srcw, srch, degree); +} + +void ImagePreprocess::imageFlip(const uint8_t* src, + uint8_t* dst, + ImageFormat srcFormat, + int srcw, + int srch, + FlipParam flip_param) { + if (srcFormat == GRAY) { + flip_hwc1(src, dst, srcw, srch, flip_param); + } else if (srcFormat == BGR || srcFormat == RGB) { + flip_hwc3(src, dst, srcw, srch, flip_param); + } else if (srcFormat == BGRA || srcFormat == RGBA) { + flip_hwc4(src, dst, srcw, srch, flip_param); + } else { + printf("this srcFormat: %d does not support! 
\n", srcFormat); + return; + } +} + +void ImagePreprocess::imageFlip(const uint8_t* src, uint8_t* dst) { + auto srcw = this->transParam_.ow; + auto srch = this->transParam_.oh; + auto srcFormat = this->dstFormat_; + auto flip_param = this->transParam_.flip_param; + ImagePreprocess::imageFlip(src, dst, srcFormat, srcw, srch, flip_param); +} + +void ImagePreprocess::image2Tensor(const uint8_t* src, + Tensor* dstTensor, + ImageFormat srcFormat, + int srcw, + int srch, + LayoutType layout, + float* means, + float* scales) { + Image2Tensor img2tensor; + img2tensor.choose( + src, dstTensor, srcFormat, layout, srcw, srch, means, scales); +} + +void ImagePreprocess::image2Tensor(const uint8_t* src, + Tensor* dstTensor, + LayoutType layout, + float* means, + float* scales) { + Image2Tensor img2tensor; + img2tensor.choose(src, + dstTensor, + this->dstFormat_, + layout, + this->transParam_.ow, + this->transParam_.oh, + means, + scales); +} + +} // namespace cv +} // namespace utils +} // namespace lite +} // namespace paddle diff --git a/lite/utils/cv/paddle_image_preprocess.h b/lite/utils/cv/paddle_image_preprocess.h new file mode 100644 index 0000000000000000000000000000000000000000..11673e19041883bfa6ca7a45f03ca3feca76dd20 --- /dev/null +++ b/lite/utils/cv/paddle_image_preprocess.h @@ -0,0 +1,217 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+
+#include
+#include
+#include
+#include "lite/api/paddle_api.h"
+#include "lite/api/paddle_place.h"
+namespace paddle {
+namespace lite {
+namespace utils {
+namespace cv {
+typedef paddle::lite_api::Tensor Tensor;
+typedef paddle::lite_api::DataLayoutType LayoutType;
+// color enum
+enum ImageFormat {
+  RGBA = 0,
+  BGRA,
+  RGB,
+  BGR,
+  GRAY,
+  NV21 = 11,
+  NV12,
+};
+// flip enum
+enum FlipParam {
+  X = 0,  // flip along the X axis
+  Y,      // flip along the Y axis
+  XY      // flip along both the X and Y axes
+};
+// transform param
+typedef struct {
+  int ih;                // input height
+  int iw;                // input width
+  int oh;                // output height
+  int ow;                // output width
+  FlipParam flip_param;  // flip, support X, Y, XY
+  float rotate_param;    // rotate, support 90, 180, 270
+} TransParam;
+
+class ImagePreprocess {
+ public:
+  /*
+   * init
+   * param srcFormat: input image color format
+   * param dstFormat: output image color format
+   * param param: input image parameter, e.g. input size
+   */
+  ImagePreprocess(ImageFormat srcFormat,
+                  ImageFormat dstFormat,
+                  TransParam param);
+
+  /*
+   * image color convert
+   * support NV12/NV21_to_BGR(RGB), NV12/NV21_to_BGRA(RGBA),
+   * BGR(RGB) and BGRA(RGBA) transform,
+   * BGR(RGB) and RGB(BGR) transform,
+   * BGR(RGB) and RGBA(BGRA) transform,
+   * BGR(RGB) and GRAY transform
+   * param src: input image data
+   * param dst: output image data
+   */
+  void imageCovert(const uint8_t* src, uint8_t* dst);
+  /*
+   * image color convert
+   * support NV12/NV21_to_BGR(RGB), NV12/NV21_to_BGRA(RGBA),
+   * BGR(RGB) and BGRA(RGBA) transform,
+   * BGR(RGB) and RGB(BGR) transform,
+   * BGR(RGB) and RGBA(BGRA) transform,
+   * BGR(RGB) and GRAY transform
+   * param src: input image data
+   * param dst: output image data
+   * param srcFormat: input image format, support GRAY, NV12(NV21),
+   * BGR(RGB) and BGRA(RGBA)
+   * param dstFormat: output image format, support GRAY, BGR(RGB) and
+   * BGRA(RGBA)
+   */
+  void imageCovert(const uint8_t* src,
+                   uint8_t* dst,
+                   ImageFormat srcFormat,
+                   ImageFormat dstFormat);
+  /*
+   * image resize, use bilinear method
+   * support image format: 1-channel image (e.g. GRAY), 2-channel image (e.g.
+   * NV12, NV21), 3-channel image (e.g. BGR), 4-channel image (e.g. BGRA)
+   * param src: input image data
+   * param dst: output image data
+   */
+  void imageResize(const uint8_t* src, uint8_t* dst);
+  /*
+   * image resize, use bilinear method
+   * support image format: 1-channel image (e.g. GRAY), 2-channel image (e.g.
+   * NV12, NV21), 3-channel image (e.g. BGR), 4-channel image (e.g. BGRA)
+   * param src: input image data
+   * param dst: output image data
+   * param srcFormat: input image format, support GRAY, NV12(NV21),
+   * BGR(RGB) and BGRA(RGBA)
+   * param srcw: input image width
+   * param srch: input image height
+   * param dstw: output image width
+   * param dsth: output image height
+   */
+  void imageResize(const uint8_t* src,
+                   uint8_t* dst,
+                   ImageFormat srcFormat,
+                   int srcw,
+                   int srch,
+                   int dstw,
+                   int dsth);
+
+  /*
+   * image rotate
+   * support 90, 180 and 270 degree rotation
+   * color format support 1-channel image, 3-channel image and 4-channel image
+   * param src: input image data
+   * param dst: output image data
+   */
+  void imageRotate(const uint8_t* src, uint8_t* dst);
+  /*
+   * image rotate
+   * support 90, 180 and 270 degree rotation
+   * color format support 1-channel image, 3-channel image and 4-channel image
+   * param src: input image data
+   * param dst: output image data
+   * param srcFormat: input image format, support GRAY, BGR(RGB) and BGRA(RGBA)
+   * param srcw: input image width
+   * param srch: input image height
+   * param degree: rotate degree, support 90, 180 and 270
+   */
+  void imageRotate(const uint8_t* src,
+                   uint8_t* dst,
+                   ImageFormat srcFormat,
+                   int srcw,
+                   int srch,
+                   float degree);
+  /*
+   * image flip
+   * support X, Y and XY flip process
+   * color format support 1-channel image, 3-channel image and 4-channel image
+   * param src: input image data
+   * param dst: output image data
+   */
+  void imageFlip(const uint8_t* src, uint8_t* dst);
+  /*
+   * image flip
+   * support X, Y and XY flip process
+   * color format support 1-channel image, 3-channel image and 4-channel image
+   * param src: input image data
+   * param dst: output image data
+   * param srcFormat: input image format, support GRAY, BGR(RGB) and BGRA(RGBA)
+   * param srcw: input image width
+   * param srch: input image height
+   * param flip_param: flip parameter, support X, Y and XY
+   */
+  void imageFlip(const uint8_t* src,
+                 uint8_t* dst,
+                 ImageFormat srcFormat,
+                 int srcw,
+                 int srch,
+                 FlipParam flip_param);
+  /*
+   * change image data to tensor data
+   * support image format BGR(RGB) and BGRA(RGBA), data layout NHWC and
+   * NCHW
+   * param src: input image data
+   * param dstTensor: output tensor data
+   * param layout: output tensor layout, support NHWC and NCHW
+   * param means: means of image
+   * param scales: scales of image
+   */
+  void image2Tensor(const uint8_t* src,
+                    Tensor* dstTensor,
+                    LayoutType layout,
+                    float* means,
+                    float* scales);
+  /*
+   * change image data to tensor data
+   * support image format BGR(RGB) and BGRA(RGBA), data layout NHWC and
+   * NCHW
+   * param src: input image data
+   * param dstTensor: output tensor data
+   * param srcFormat: input image format, support BGR(RGB) and BGRA(RGBA)
+   * param srcw: input image width
+   * param srch: input image height
+   * param layout: output tensor layout, support NHWC and NCHW
+   * param means: means of image
+   * param scales: scales of image
+   */
+  void image2Tensor(const uint8_t* src,
+                    Tensor* dstTensor,
+                    ImageFormat srcFormat,
+                    int srcw,
+                    int srch,
+                    LayoutType layout,
+                    float* means,
+                    float* scales);
+
+ private:
+  ImageFormat srcFormat_;
+  ImageFormat dstFormat_;
+  TransParam transParam_;
+};
+}  // namespace cv
+}  // namespace utils
+}  // namespace lite
+}  // namespace paddle
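The class above is the entire public surface of the new CV module, so a short usage sketch may help reviewers see how the pieces are meant to be chained. This is not part of the patch: the frame sizes, buffer names and the preprocess_frame wrapper are illustrative assumptions, and buffer sizes follow the format conventions used throughout the patch (NV21 holds 1.5 * w * h bytes, BGR holds 3 * w * h bytes).

#include <cstdint>
#include <vector>

#include "lite/utils/cv/paddle_image_preprocess.h"

using namespace paddle::lite::utils::cv;

// Hypothetical driver (not in the patch): convert an NV21 camera frame to BGR,
// scale it to the model input size, then rotate it by 90 degrees, using only
// the convenience overloads that read sizes and formats from TransParam.
void preprocess_frame(const uint8_t* nv21_frame) {
  TransParam tparam;
  tparam.iw = 1920;            // input width (assumed camera resolution)
  tparam.ih = 1080;            // input height
  tparam.ow = 640;             // output width expected by the model
  tparam.oh = 480;             // output height
  tparam.rotate_param = 90.f;  // must be 90, 180 or 270
  tparam.flip_param = X;       // only used by imageFlip

  ImagePreprocess preprocess(NV21, BGR, tparam);

  std::vector<uint8_t> bgr(3 * tparam.iw * tparam.ih);        // full-size BGR
  std::vector<uint8_t> bgr_small(3 * tparam.ow * tparam.oh);  // resized BGR
  std::vector<uint8_t> bgr_rot(3 * tparam.ow * tparam.oh);    // rotated BGR

  preprocess.imageCovert(nv21_frame, bgr.data());        // NV21 -> BGR, iw x ih
  preprocess.imageResize(bgr.data(), bgr_small.data());  // iw x ih -> ow x oh
  preprocess.imageRotate(bgr_small.data(), bgr_rot.data());  // 90-degree rotate
  // A call to preprocess.image2Tensor(...) with per-channel means/scales would
  // typically follow, writing into an input tensor obtained from the predictor.
}

Because the single-argument overloads pull their geometry from transParam_ and their color format from dstFormat_, the same ImagePreprocess object can be reused for every frame of a stream; only the source and destination buffers change per call.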