Unverified commit 515f9a6a, authored by H HappyAngel, committed by GitHub

[arm] add cv unit_test (#4250)

add cv_ut. test=develop
add Anakin implementation
add image_profiler test
Parent 339c2e53
@@ -91,14 +91,24 @@ ImagePreprocess::ImagePreprocess(ImageFormat srcFormat, ImageFormat dstFormat, T
// Method 2
void ImagePreprocess::imageCovert(const uint8_t* src,
                                  uint8_t* dst, ImageFormat srcFormat, ImageFormat dstFormat);
// Method 3
void ImagePreprocess::imageCovert(const uint8_t* src,
                                  uint8_t* dst, ImageFormat srcFormat, ImageFormat dstFormat,
                                  int srcw, int srch);
```
+ For the first `imageCovert` interface, the default parameters come from member variables of the `ImagePreprocess` class, so the following member variables must be assigned when the `ImagePreprocess` object is constructed:
    - param srcFormat: the `srcFormat_` member variable of the `ImagePreprocess` class
    - param dstFormat: the `dstFormat_` member variable of the `ImagePreprocess` class
    - param srcw: the `iw` field of the `transParam_` struct member of the `ImagePreprocess` class
    - param srch: the `ih` field of the `transParam_` struct member of the `ImagePreprocess` class
- For the second `imageCovert` interface, the default parameters also come from member variables of the `ImagePreprocess` class, so the following member variables must be assigned when the `ImagePreprocess` object is constructed:
    - param srcw: the `iw` field of the `transParam_` struct member of the `ImagePreprocess` class
    - param srch: the `ih` field of the `transParam_` struct member of the `ImagePreprocess` class
- The third `imageCovert` interface can be used directly, since every parameter is passed explicitly (see the usage sketch below).
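
Below is a minimal usage sketch of the three overloads. It is not part of the original document: the header path, the namespace, and the Method 1 signature (only `src`/`dst`, with everything else taken from the members listed above) are assumptions and should be checked against `paddle_image_preprocess.h`.

```
// Illustrative sketch only; names marked "assumed" are not taken from this diff.
#include <cstdint>
#include "lite/utils/cv/paddle_image_preprocess.h"  // assumed header path

using paddle::lite::utils::cv::ImageFormat;      // assumed namespace
using paddle::lite::utils::cv::ImagePreprocess;
using paddle::lite::utils::cv::TransParam;

void convert_nv12_to_bgr(const uint8_t* nv12, uint8_t* bgr, int w, int h) {
  TransParam tparam;
  tparam.iw = w;  // becomes the default srcw (transParam_.iw)
  tparam.ih = h;  // becomes the default srch (transParam_.ih)
  ImagePreprocess preprocess(ImageFormat::NV12, ImageFormat::BGR, tparam);

  // Method 1 (assumed signature): formats and size all come from the members.
  preprocess.imageCovert(nv12, bgr);
  // Method 2: formats explicit, size still taken from transParam_.iw / transParam_.ih.
  preprocess.imageCovert(nv12, bgr, ImageFormat::NV12, ImageFormat::BGR);
  // Method 3: fully explicit, independent of the members set above.
  preprocess.imageCovert(nv12, bgr, ImageFormat::NV12, ImageFormat::BGR, w, h);
}
```
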
### Resize
The `Resize` function supports the color spaces GRAY, NV12 (NV21), RGB (BGR), and RGBA (BGRA).
......
add_subdirectory(kernels)
add_subdirectory(math)
add_subdirectory(cv)
add_subdirectory(cv/anakin)
add_subdirectory(api)
if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_MLU) AND LITE_WITH_ARM)
lite_cc_test(image_convert_test SRCS image_convert_test.cc DEPS paddle_cv_arm)
lite_cc_test(image_profiler_test SRCS image_profiler_test.cc DEPS paddle_cv_arm anakin_cv_arm)
endif()
if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_MLU) AND LITE_WITH_ARM)
lite_cc_library(anakin_cv_arm SRCS
bgr_resize.cc
bgr_flip_hwc.cc
bgr_rotate_hwc.cc
bgr_to_tensor_hwc.cc
bgra_resize.cc
bgra_flip_hwc.cc
bgra_rotate_hwc.cc
bgra_to_tensor_hwc.cc
cv_utils.cc
nv12_to_bgr.cc
nv12_to_bgra.cc
nv21_to_bgr.cc
nv21_to_bgra.cc
nv21_resize.cc
DEPS paddle_api place)
endif()
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/tests/cv/anakin/cv_utils.h"
void flip_x_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
void flip_y_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
void flip_xy_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
// flip_num == 1: flip along the x axis; flip_num == -1: flip along the y axis; flip_num == 0: flip along both axes
void bgr_flip_hwc(
const uint8_t* src, uint8_t* dst, int w_in, int h_in, int flip_num) {
if (flip_num == 1) { // x
flip_x_hwc(src, dst, w_in, h_in);
}
if (flip_num == -1) { // y
flip_y_hwc(src, dst, w_in, h_in);
}
if (flip_num == 0) { // xy
flip_xy_hwc(src, dst, w_in, h_in);
}
}
/*
bgr1 bgr2 bgr3
bgr4 bgr5 bgr6
bgr7 bgr8 bgr9
flip x:
bgr7 bgr8 bgr9
bgr4 bgr5 bgr6
bgr1 bgr2 bgr3
*/
#ifdef __aarch64__
void flip_x_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
int h = h_in - 1;
int win = w_in * 3;
uint8_t zerobuff[win]; // NOLINT
memset(zerobuff, 0, win * sizeof(uint8_t));
uint8_t zerobuff2[win]; // NOLINT
memset(zerobuff2, 0, win * sizeof(uint8_t));
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * win;
const uint8_t* inptr1 = inptr0 + win;
const uint8_t* inptr2 = inptr1 + win;
const uint8_t* inptr3 = inptr2 + win;
uint8_t* outptr0 = dst + (h - i) * win; // last
uint8_t* outptr1 = outptr0 - win;
uint8_t* outptr2 = outptr1 - win;
uint8_t* outptr3 = outptr2 - win;
asm volatile(
"prfm pldl1keep, [%[ptr0]] \n"
"prfm pldl1keep, [%[ptr1]] \n"
"prfm pldl1keep, [%[ptr2]] \n"
"prfm pldl1keep, [%[ptr3]] \n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
outptr0 = zerobuff2;
case 2:
inptr1 = zerobuff;
outptr1 = zerobuff2;
case 1:
inptr2 = zerobuff;
outptr2 = zerobuff2;
case 0:
inptr3 = zerobuff;
outptr3 = zerobuff2;
default:
break;
}
}
int j = 0;
for (; j < w_in - 7; j += 8) {
asm volatile(
"ld3 {v0.8b, v1.8b, v2.8b}, [%[inptr0]], #24 \n" // v0={00,01,02,
// 03, 04, 05,
// 06, 07}"
"ld3 {v3.8b, v4.8b, v5.8b}, [%[inptr1]], #24 \n" // v0={10,11,12,
// 13, 14, 15,
// 16, 17}"
"ld3 {v6.8b, v7.8b, v8.8b}, [%[inptr2]], #24 \n" // v0={20,21,22,
// 23, 24, 25,
// 26, 27}"
"ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]], #24 \n" // v0={30,31,32,
// 33, 34, 35,
// 36, 37}"
"prfm pldl1keep, [%[inptr0]] \n"
"prfm pldl1keep, [%[inptr1]] \n"
"prfm pldl1keep, [%[inptr2]] \n"
"prfm pldl1keep, [%[inptr3]] \n"
"st3 {v0.8b, v1.8b, v2.8b}, [%[outptr0]], #24 \n" // 00
// 10
// 20
// 30
// 04
// 14
// 24
// 34
"st3 {v3.8b, v4.8b, v5.8b}, [%[outptr1]], #24 \n" // 02
// 12
// 22
// 32
"st3 {v6.8b, v7.8b, v8.8b}, [%[outptr2]], #24 \n" // 01
// 11
// 21
// 31
"st3 {v9.8b, v10.8b, v11.8b}, [%[outptr3]], #24 \n" // 03 13 23 33
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3)
:
: "v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11");
}
for (; j < w_in; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
case 1:
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
case 2:
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
}
}
}
}
#else
void flip_x_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
int win = w_in * 3;
uint8_t zerobuff[win]; // NOLINT
memset(zerobuff, 0, win * sizeof(uint8_t));
uint8_t zerobuff2[win]; // NOLINT
memset(zerobuff2, 0, win * sizeof(uint8_t));
int h = h_in - 1;
// 4*8
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * win;
const uint8_t* inptr1 = inptr0 + win;
const uint8_t* inptr2 = inptr1 + win;
const uint8_t* inptr3 = inptr2 + win;
uint8_t* outptr0 = dst + (h - i) * win; // last
uint8_t* outptr1 = outptr0 - win;
uint8_t* outptr2 = outptr1 - win;
uint8_t* outptr3 = outptr2 - win;
asm volatile(
"pld [%[ptr0]] @ preload a, 64byte\n"
"pld [%[ptr1]] @ preload a, 64byte\n"
"pld [%[ptr2]] @ preload a, 64byte\n"
"pld [%[ptr3]] @ preload a, 64byte\n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
outptr0 = zerobuff2;
case 2:
inptr1 = zerobuff;
outptr1 = zerobuff2;
case 1:
inptr2 = zerobuff;
outptr2 = zerobuff2;
case 0:
inptr3 = zerobuff;
outptr3 = zerobuff2;
default:
break;
}
}
int j = 0;
for (; j < w_in - 7; j += 8) {
asm volatile(
"vld3.8 {d0, d1, d2}, [%[inptr0]]! @ zip load r0, d0 =00 01 02 03 "
"04 05 06 07\n"
"vld3.8 {d3, d4, d5}, [%[inptr1]]! @ zip load r1, d2 =10 11 12 13 "
"14 15 16 17\n"
"vld3.8 {d6, d7, d8}, [%[inptr2]]! @ zip load r1, d4 =20 21 22 23 "
"24 25 26 27\n"
"vld3.8 {d9, d10, d11}, [%[inptr3]]! @ zip load r1, d6 = 30 31 32 "
"33 34 35 36 37\n"
"pld [%[inptr0]] @ preload a, 64byte\n"
"pld [%[inptr1]] @ preload a, 64byte\n"
"pld [%[inptr2]] @ preload a, 64byte\n"
"pld [%[inptr3]] @ preload a, 64byte\n"
"vst3.8 {d0, d1, d2}, [%[outptr0]]! @ write d0(q0,low),r00,r10 "
"20 30\n"
"vst3.8 {d3, d4, d5}, [%[outptr1]]! @ write d4(q0,low),r01,r11 "
"21 31\n"
"vst3.8 {d6, d7, d8}, [%[outptr2]]! @ write d4(q0,low),r01,r11 "
"21 31\n"
"vst3.8 {d9, d10, d11}, [%[outptr3]]! @ write "
"d4(q0,low),r01,r11 21 31\n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3)
:
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
}
for (; j < w_in; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
case 1:
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
case 2:
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
}
}
}
}
#endif
/*
bgr1 bgr2 bgr3
bgr4 bgr5 bgr6
bgr7 bgr8 bgr9
flip:
bgr3 bgr2 bgr1
bgr6 bgr5 bgr4
bgr9 bgr8 bgr7
*/
#ifdef __aarch64__
void flip_y_hwc(const uint8_t* src, uint8_t* dst, int w, int h_in) {
int w_in = w * 3;
uint8_t zerobuff[w_in]; // NOLINT
memset(zerobuff, 0, w_in * sizeof(uint8_t));
uint8_t zerobuff2[w_in]; // NOLINT
memset(zerobuff2, 0, w_in * sizeof(uint8_t));
int stride_w = 24;
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * w_in;
const uint8_t* inptr1 = inptr0 + w_in;
const uint8_t* inptr2 = inptr1 + w_in;
const uint8_t* inptr3 = inptr2 + w_in;
uint8_t* outptr0 = dst + (i + 1) * w_in - stride_w; // last col
uint8_t* outptr1 = outptr0 + w_in;
uint8_t* outptr2 = outptr1 + w_in;
uint8_t* outptr3 = outptr2 + w_in;
asm volatile(
"prfm pldl1keep, [%[ptr0]] \n"
"prfm pldl1keep, [%[ptr1]] \n"
"prfm pldl1keep, [%[ptr2]] \n"
"prfm pldl1keep, [%[ptr3]] \n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
int j = 0;
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
outptr0 = zerobuff2;
case 2:
inptr1 = zerobuff;
outptr1 = zerobuff2;
case 1:
inptr2 = zerobuff;
outptr2 = zerobuff2;
case 0:
inptr3 = zerobuff;
outptr3 = zerobuff2;
default:
break;
}
}
for (; j < w - 7; j += 8) {
asm volatile(
"ld3 {v0.8b, v1.8b, v2.8b}, [%[inptr0]], #24 \n" // v0={00,01,02,
// 03, 04, 05,
// 06, 07}"
"ld3 {v3.8b, v4.8b, v5.8b}, [%[inptr1]], #24 \n" // v0={10,11,12,
// 13, 14, 15,
// 16, 17}"
"ld3 {v6.8b, v7.8b, v8.8b}, [%[inptr2]], #24 \n" // v0={20,21,22,
// 23, 24, 25,
// 26, 27}"
"ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]], #24 \n" // v0={30,31,32,
// 33, 34, 35,
// 36, 37}"
"rev64 v12.8b, v0.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 b
"rev64 v13.8b, v1.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 g
"rev64 v14.8b, v2.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 r
"rev64 v15.8b, v3.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v16.8b, v4.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v17.8b, v5.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v18.8b, v6.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v19.8b, v7.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v20.8b, v8.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v21.8b, v9.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v22.8b, v10.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v23.8b, v11.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"prfm pldl1keep, [%[inptr0]] \n"
"prfm pldl1keep, [%[inptr1]] \n"
"prfm pldl1keep, [%[inptr2]] \n"
"prfm pldl1keep, [%[inptr3]] \n"
"st3 {v12.8b, v13.8b, v14.8b}, [%[outptr0]] \n" // 00 10
// 20 30
// 04 14
// 24 34
"st3 {v15.8b, v16.8b, v17.8b}, [%[outptr1]] \n" // 02 12
// 22 32
"st3 {v18.8b, v19.8b, v20.8b}, [%[outptr2]] \n" // 01 11
// 21 31
"st3 {v21.8b, v22.8b, v23.8b}, [%[outptr3]] \n" // 03 13
// 23 33
"sub %[outptr0], %[outptr0], %[stride_w] \n" //@ ptr - stride_w
"sub %[outptr1], %[outptr1], %[stride_w] \n"
"sub %[outptr2], %[outptr2], %[stride_w] \n"
"sub %[outptr3], %[outptr3], %[stride_w] \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[stride_w] "+r"(stride_w)
:
: "v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23");
}
outptr3 += stride_w - 3;
outptr2 += stride_w - 3;
outptr1 += stride_w - 3;
outptr0 += stride_w - 3;
for (; j < w; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 6;
case 1:
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 6;
case 2:
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 6;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
outptr3 -= 6;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 6;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 6;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 6;
}
}
}
}
#else
void flip_y_hwc(const uint8_t* src, uint8_t* dst, int w, int h_in) {
int w_in = w * 3;
uint8_t zerobuff[w_in]; // NOLINT
memset(zerobuff, 0, w_in * sizeof(uint8_t));
uint8_t zerobuff2[w_in]; // NOLINT
memset(zerobuff2, 0, w_in * sizeof(uint8_t));
int stride_w = 24;
// 4*8
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * w_in;
const uint8_t* inptr1 = inptr0 + w_in;
const uint8_t* inptr2 = inptr1 + w_in;
const uint8_t* inptr3 = inptr2 + w_in;
uint8_t* outptr0 = dst + (i + 1) * w_in - stride_w; // last
uint8_t* outptr1 = outptr0 + w_in;
uint8_t* outptr2 = outptr1 + w_in;
uint8_t* outptr3 = outptr2 + w_in;
asm volatile(
"pld [%[ptr0]] @ preload a, 64byte\n"
"pld [%[ptr1]] @ preload a, 64byte\n"
"pld [%[ptr2]] @ preload a, 64byte\n"
"pld [%[ptr3]] @ preload a, 64byte\n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
outptr0 = zerobuff2;
case 2:
inptr1 = zerobuff;
outptr1 = zerobuff2;
case 1:
inptr2 = zerobuff;
outptr2 = zerobuff2;
case 0:
inptr3 = zerobuff;
outptr3 = zerobuff2;
default:
break;
}
}
int j = 0;
for (; j < w - 7; j += 8) {
asm volatile(
"vld3.8 {d0, d1, d2}, [%[inptr0]]! @ zip load r0, d0 =00 01 02 03 "
"04 05 06 07\n"
"vld3.8 {d3, d4, d5}, [%[inptr1]]! @ zip load r1, d2 =10 11 12 13 "
"14 15 16 17\n"
"vld3.8 {d6, d7, d8}, [%[inptr2]]! @ zip load r1, d4 =20 21 22 23 "
"24 25 26 27\n"
"vld3.8 {d9, d10, d11}, [%[inptr3]]! @ zip load r1, d6 = 30 31 32 "
"33 34 35 36 37\n"
"vrev64.8 d12, d0 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d13, d1 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d14, d2 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d15, d3 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d16, d4 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d17, d5 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d18, d6 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d19, d7 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d20, d8 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d21, d9 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d22, d10 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d23, d11 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"pld [%[inptr0]] @ preload a, 64byte\n"
"pld [%[inptr1]] @ preload a, 64byte\n"
"pld [%[inptr2]] @ preload a, 64byte\n"
"pld [%[inptr3]] @ preload a, 64byte\n"
"vst3.8 {d12, d13, d14}, [%[outptr0]] @ write "
"d0(q0,low),r00,r10 20 30\n"
"vst3.8 {d15, d16, d17}, [%[outptr1]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"vst3.8 {d18, d19, d20}, [%[outptr2]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"vst3.8 {d21, d22, d23}, [%[outptr3]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"sub %[outptr0], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr1], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr2], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr3], %[stride_w] @ ptr - stride_w \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[stride_w] "+r"(stride_w)
:
: "q0",
"q1",
"q2",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12");
}
outptr3 += stride_w - 3;
outptr2 += stride_w - 3;
outptr1 += stride_w - 3;
outptr0 += stride_w - 3;
for (; j < w; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 6;
case 1:
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 6;
case 2:
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 6;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
outptr3 -= 6;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 6;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 6;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 6;
}
}
}
}
#endif
/*
bgr1 bgr2 bgr3
bgr4 bgr5 bgr6
bgr7 bgr8 bgr9
flip:
bgr9 bgr8 bgr7
bgr6 bgr5 bgr4
bgr3 bgr2 bgr1
*/
#ifdef __aarch64__
void flip_xy_hwc(const uint8_t* src, uint8_t* dst, int w, int h_in) {
int stride_w = 24;
int w_in = w * 3;
uint8_t zerobuff[w_in]; // NOLINT
memset(zerobuff, 0, w_in * sizeof(uint8_t));
uint8_t zerobuff2[w_in]; // NOLINT
memset(zerobuff2, 0, w_in * sizeof(uint8_t));
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * w_in;
const uint8_t* inptr1 = inptr0 + w_in;
const uint8_t* inptr2 = inptr1 + w_in;
const uint8_t* inptr3 = inptr2 + w_in;
uint8_t* outptr0 = dst + (h_in - i) * w_in - stride_w; // last col
uint8_t* outptr1 = outptr0 - w_in;
uint8_t* outptr2 = outptr1 - w_in;
uint8_t* outptr3 = outptr2 - w_in;
asm volatile(
"prfm pldl1keep, [%[ptr0]] \n"
"prfm pldl1keep, [%[ptr1]] \n"
"prfm pldl1keep, [%[ptr2]] \n"
"prfm pldl1keep, [%[ptr3]] \n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
outptr0 = zerobuff2;
case 2:
inptr1 = zerobuff;
outptr1 = zerobuff2;
case 1:
inptr2 = zerobuff;
outptr2 = zerobuff2;
case 0:
inptr3 = zerobuff;
outptr3 = zerobuff2;
default:
break;
}
}
int j = 0;
for (; j < w - 7; j += 8) {
asm volatile(
"ld3 {v0.8b, v1.8b, v2.8b}, [%[inptr0]], #24 \n" // v0={00,01,02,
// 03, 04, 05,
// 06, 07}"
"ld3 {v3.8b, v4.8b, v5.8b}, [%[inptr1]], #24 \n" // v0={10,11,12,
// 13, 14, 15,
// 16, 17}"
"ld3 {v6.8b, v7.8b, v8.8b}, [%[inptr2]], #24 \n" // v0={20,21,22,
// 23, 24, 25,
// 26, 27}"
"ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]], #24 \n" // v0={30,31,32,
// 33, 34, 35,
// 36, 37}"
"rev64 v12.8b, v0.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 b
"rev64 v13.8b, v1.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 g
"rev64 v14.8b, v2.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 r
"rev64 v15.8b, v3.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v16.8b, v4.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v17.8b, v5.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v18.8b, v6.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v19.8b, v7.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v20.8b, v8.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v21.8b, v9.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v22.8b, v10.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v23.8b, v11.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"prfm pldl1keep, [%[inptr0]] \n"
"prfm pldl1keep, [%[inptr1]] \n"
"prfm pldl1keep, [%[inptr2]] \n"
"prfm pldl1keep, [%[inptr3]] \n"
"st3 {v12.8b, v13.8b, v14.8b}, [%[outptr0]] \n" // 00 10
// 20 30
// 04 14
// 24 34
"st3 {v15.8b, v16.8b, v17.8b}, [%[outptr1]] \n" // 02 12
// 22 32
"st3 {v18.8b, v19.8b, v20.8b}, [%[outptr2]] \n" // 01 11
// 21 31
"st3 {v21.8b, v22.8b, v23.8b}, [%[outptr3]] \n" // 03 13
// 23 33
"sub %[outptr0], %[outptr0], %[stride_w] \n" //@ ptr - stride_w
"sub %[outptr1], %[outptr1], %[stride_w] \n"
"sub %[outptr2], %[outptr2], %[stride_w] \n"
"sub %[outptr3], %[outptr3], %[stride_w] \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[stride_w] "+r"(stride_w)
:
: "v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23");
}
outptr3 += stride_w - 3;
outptr2 += stride_w - 3;
outptr1 += stride_w - 3;
outptr0 += stride_w - 3;
for (; j < w; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 6;
case 1:
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 6;
case 2:
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 6;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
outptr3 -= 6;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 6;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 6;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 6;
}
}
}
}
#else
void flip_xy_hwc(const uint8_t* src, uint8_t* dst, int w, int h_in) {
int w_in = w * 3;
uint8_t zerobuff[w_in]; // NOLINT
memset(zerobuff, 0, w_in * sizeof(uint8_t));
uint8_t zerobuff2[w_in]; // NOLINT
memset(zerobuff2, 0, w_in * sizeof(uint8_t));
int stride_w = 24;
// 4*8
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * w_in;
const uint8_t* inptr1 = inptr0 + w_in;
const uint8_t* inptr2 = inptr1 + w_in;
const uint8_t* inptr3 = inptr2 + w_in;
uint8_t* outptr0 = dst + (h_in - i) * w_in - stride_w; // last
uint8_t* outptr1 = outptr0 - w_in;
uint8_t* outptr2 = outptr1 - w_in;
uint8_t* outptr3 = outptr2 - w_in;
asm volatile(
"pld [%[ptr0]] @ preload a, 64byte\n"
"pld [%[ptr1]] @ preload a, 64byte\n"
"pld [%[ptr2]] @ preload a, 64byte\n"
"pld [%[ptr3]] @ preload a, 64byte\n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
outptr0 = zerobuff2;
case 2:
inptr1 = zerobuff;
outptr1 = zerobuff2;
case 1:
inptr2 = zerobuff;
outptr2 = zerobuff2;
case 0:
inptr3 = zerobuff;
outptr3 = zerobuff2;
default:
break;
}
}
int j = 0;
for (; j < w - 7; j += 8) {
asm volatile(
"vld3.8 {d0, d1, d2}, [%[inptr0]]! @ zip load r0, d0 =00 01 02 03 "
"04 05 06 07\n"
"vld3.8 {d3, d4, d5}, [%[inptr1]]! @ zip load r1, d2 =10 11 12 13 "
"14 15 16 17\n"
"vld3.8 {d6, d7, d8}, [%[inptr2]]! @ zip load r1, d4 =20 21 22 23 "
"24 25 26 27\n"
"vld3.8 {d9, d10, d11}, [%[inptr3]]! @ zip load r1, d6 = 30 31 32 "
"33 34 35 36 37\n"
"vrev64.8 d12, d0 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d13, d1 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d14, d2 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d15, d3 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d16, d4 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d17, d5 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d18, d6 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d19, d7 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d20, d8 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d21, d9 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d22, d10 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d23, d11 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"pld [%[inptr0]] @ preload a, 64byte\n"
"pld [%[inptr1]] @ preload a, 64byte\n"
"pld [%[inptr2]] @ preload a, 64byte\n"
"pld [%[inptr3]] @ preload a, 64byte\n"
"vst3.8 {d12, d13, d14}, [%[outptr0]] @ write "
"d0(q0,low),r00,r10 20 30\n"
"vst3.8 {d15, d16, d17}, [%[outptr1]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"vst3.8 {d18, d19, d20}, [%[outptr2]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"vst3.8 {d21, d22, d23}, [%[outptr3]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"sub %[outptr0], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr1], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr2], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr3], %[stride_w] @ ptr - stride_w \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[stride_w] "+r"(stride_w)
:
: "q0",
"q1",
"q2",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12");
}
outptr3 += stride_w - 3;
outptr2 += stride_w - 3;
outptr1 += stride_w - 3;
outptr0 += stride_w - 3;
for (; j < w; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 6;
case 1:
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 6;
case 2:
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 6;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
outptr3 -= 6;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 6;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 6;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 6;
}
}
}
}
#endif
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <limits.h>
#include <math.h>
#include "lite/tests/cv/anakin/cv_utils.h"
void resize_three_channel(
const uint8_t* src, int w_in, int h_in, uint8_t* dst, int w_out, int h_out);
void bgr_resize(const uint8_t* src,
uint8_t* dst,
int w_in,
int h_in,
int w_out,
int h_out) {
if (w_out == w_in && h_out == h_in) {
memcpy(dst, src, sizeof(char) * w_in * h_in * 3);
return;
}
// y
resize_three_channel(src, w_in * 3, h_in, dst, w_out * 3, h_out);
}
void resize_three_channel(const uint8_t* src,
int w_in,
int h_in,
uint8_t* dst,
int w_out,
int h_out) {
const int resize_coef_bits = 11;
const int resize_coef_scale = 1 << resize_coef_bits;
double scale_x = static_cast<double>(w_in) / w_out;
double scale_y = static_cast<double>(h_in) / h_out;
int* buf = new int[w_out * 2 + h_out * 2];
int* xofs = buf; // new int[w];
int* yofs = buf + w_out; // new int[h];
int16_t* ialpha =
reinterpret_cast<int16_t*>(buf + w_out + h_out); // new int16_t[w * 2];
int16_t* ibeta =
reinterpret_cast<int16_t*>(buf + w_out * 2 + h_out); // new short[h * 2];
float fx = 0.f;
float fy = 0.f;
int sx = 0;
int sy = 0;
#define SATURATE_CAST_SHORT(X) \
(int16_t)::std::min( \
::std::max(static_cast<int>(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), \
SHRT_MAX);
for (int dx = 0; dx < w_out / 3; dx++) {
fx = static_cast<float>((dx + 0.5) * scale_x - 0.5);
sx = floor(fx);
fx -= sx;
if (sx < 0) {
sx = 0;
fx = 0.f;
}
if (sx >= w_in - 1) {
sx = w_in - 2;
fx = 1.f;
}
xofs[dx] = sx * 3;
float a0 = (1.f - fx) * resize_coef_scale;
float a1 = fx * resize_coef_scale;
ialpha[dx * 2] = SATURATE_CAST_SHORT(a0);
ialpha[dx * 2 + 1] = SATURATE_CAST_SHORT(a1);
}
for (int dy = 0; dy < h_out; dy++) {
fy = static_cast<float>((dy + 0.5) * scale_y - 0.5);
sy = floor(fy);
fy -= sy;
if (sy < 0) {
sy = 0;
fy = 0.f;
}
if (sy >= h_in - 1) {
sy = h_in - 2;
fy = 1.f;
}
yofs[dy] = sy;
float b0 = (1.f - fy) * resize_coef_scale;
float b1 = fy * resize_coef_scale;
ibeta[dy * 2] = SATURATE_CAST_SHORT(b0);
ibeta[dy * 2 + 1] = SATURATE_CAST_SHORT(b1);
}
#undef SATURATE_CAST_SHORT
// loop body
int16_t* rowsbuf0 = new int16_t[w_out + 1];
int16_t* rowsbuf1 = new int16_t[w_out + 1];
int16_t* rows0 = rowsbuf0;
int16_t* rows1 = rowsbuf1;
int prev_sy1 = -1;
for (int dy = 0; dy < h_out; dy++) {
int sy = yofs[dy];
if (sy == prev_sy1) {
// hresize one row
int16_t* rows0_old = rows0;
rows0 = rows1;
rows1 = rows0_old;
const uint8_t* S1 = src + w_in * (sy + 1);
const int16_t* ialphap = ialpha;
int16_t* rows1p = rows1;
for (int dx = 0; dx < w_out / 3; dx++) {
int sx = xofs[dx];
int16_t a0 = ialphap[0];
int16_t a1 = ialphap[1];
const uint8_t* S1p = S1 + sx;
int tmp = dx * 3;
rows1p[tmp] = (S1p[0] * a0 + S1p[3] * a1) >> 4;
rows1p[tmp + 1] = (S1p[1] * a0 + S1p[4] * a1) >> 4;
rows1p[tmp + 2] = (S1p[2] * a0 + S1p[5] * a1) >> 4;
ialphap += 2;
}
} else {
// hresize two rows
const uint8_t* S0 = src + w_in * (sy);
const uint8_t* S1 = src + w_in * (sy + 1);
const int16_t* ialphap = ialpha;
int16_t* rows0p = rows0;
int16_t* rows1p = rows1;
for (int dx = 0; dx < w_out / 3; dx++) {
int sx = xofs[dx];
int16_t a0 = ialphap[0];
int16_t a1 = ialphap[1];
const uint8_t* S0p = S0 + sx;
const uint8_t* S1p = S1 + sx;
int tmp = dx * 3;
rows0p[tmp] = (S0p[0] * a0 + S0p[3] * a1) >> 4;
rows1p[tmp] = (S1p[0] * a0 + S1p[3] * a1) >> 4;
rows0p[tmp + 1] = (S0p[1] * a0 + S0p[4] * a1) >> 4;
rows1p[tmp + 1] = (S1p[1] * a0 + S1p[4] * a1) >> 4;
rows0p[tmp + 2] = (S0p[2] * a0 + S0p[5] * a1) >> 4;
rows1p[tmp + 2] = (S1p[2] * a0 + S1p[5] * a1) >> 4;
ialphap += 2;
}
}
prev_sy1 = sy + 1;
// vresize
int16_t b0 = ibeta[0];
int16_t b1 = ibeta[1];
int16_t* rows0p = rows0;
int16_t* rows1p = rows1;
uint8_t* dp_ptr = dst + w_out * (dy);
int cnt = w_out >> 3;
int remain = w_out - (cnt << 3);
int16x4_t _b0 = vdup_n_s16(b0);
int16x4_t _b1 = vdup_n_s16(b1);
int32x4_t _v2 = vdupq_n_s32(2);
for (cnt = w_out >> 3; cnt > 0; cnt--) {
int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p + 4);
int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p + 4);
int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0);
int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
int32x4_t _acc = _v2;
_acc = vsraq_n_s32(
_acc, _rows0p_sr4_mb0, 16);  // _acc += _rows0p_sr4_mb0 >> 16
_acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
int32x4_t _acc_1 = _v2;
_acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
_acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
int16x4_t _acc16 = vshrn_n_s32(_acc, 2); // _acc >> 2
int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
uint8x8_t _dout = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));
vst1_u8(dp_ptr, _dout);
dp_ptr += 8;
rows0p += 8;
rows1p += 8;
}
for (; remain; --remain) {
// D[x] = (rows0[x]*b0 + rows1[x]*b1) >> INTER_RESIZE_COEF_BITS;
*dp_ptr++ =
(uint8_t)(((int16_t)((b0 * (int16_t)(*rows0p++)) >> 16) +
(int16_t)((b1 * (int16_t)(*rows1p++)) >> 16) + 2) >>
2);
}
ibeta += 2;
}
delete[] buf;
delete[] rowsbuf0;
delete[] rowsbuf1;
}
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/tests/cv/anakin/cv_utils.h"
void rotate90_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
void rotate270_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
void rotate180_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
void bgr_rotate_hwc(
const uint8_t* src, uint8_t* dst, int w_in, int h_in, int angle) {
if (angle == 90) {
rotate90_hwc(src, dst, w_in, h_in);
}
if (angle == 270) {
rotate270_hwc(src, dst, w_in, h_in);
}
if (angle == 180) {
rotate180_hwc(src, dst, w_in, h_in);
}
}
/*
bgr1 bgr2 bgr3
bgr4 bgr5 bgr6
bgr7 bgr8 bgr9
rotate:
bgr7 bgr4 bgr1
bgr8 bgr5 bgr2
bgr9 bgr6 bgr3
*/
#ifdef __aarch64__
void rotate90_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
int w_out = h_in;
int h_out = w_in;
int win = w_in * 3;
int wout = w_out * 3;
int stride_h = 4 * win;
int stride_h_w = 4 * win - 24;
int ww = w_out - 8;
// block 8*8. -- 8*8
int i = 0;
for (i = 0; i < h_in - 7; i += 8) {
const uint8_t* inptr0 = src + i * win;
const uint8_t* inptr1 = inptr0 + win;
const uint8_t* inptr2 = inptr1 + win;
const uint8_t* inptr3 = inptr2 + win;
asm volatile(
"prfm pldl1keep, [%[ptr0]] \n"
"prfm pldl1keep, [%[ptr0], #64] \n"
"prfm pldl1keep, [%[ptr1]] \n"
"prfm pldl1keep, [%[ptr1], #64] \n"
"prfm pldl1keep, [%[ptr2]] \n"
"prfm pldl1keep, [%[ptr2], #64] \n"
"prfm pldl1keep, [%[ptr3]] \n"
"prfm pldl1keep, [%[ptr3], #64] \n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
int j = 0;
for (; j < w_in - 7; j += 8) {
uint8_t* outptr0 = dst + j * wout + (ww - i) * 3;
uint8_t* outptr1 = outptr0 + wout;
uint8_t* outptr2 = outptr1 + wout;
uint8_t* outptr3 = outptr2 + wout;
uint8_t* outptr4 = outptr3 + wout;
uint8_t* outptr5 = outptr4 + wout;
uint8_t* outptr6 = outptr5 + wout;
uint8_t* outptr7 = outptr6 + wout;
asm volatile(
"ld3 {v0.8b, v1.8b, v2.8b}, [%[inptr0]] \n" // v0={00,01,02, 03,
// 04, 05, 06, 07}"
"ld3 {v3.8b, v4.8b, v5.8b}, [%[inptr1]] \n" // v0={10,11,12, 13,
// 14, 15, 16, 17}"
"ld3 {v6.8b, v7.8b, v8.8b}, [%[inptr2]] \n" // v0={20,21,22, 23,
// 24, 25, 26, 27}"
"ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]] \n" // v0={30,31,32,
// 33, 34, 35, 36,
// 37}"
"add %[inptr0], %[inptr0], %[stride_h] \n" // 4 + 4*w_in
"add %[inptr1], %[inptr1], %[stride_h] \n" // 5
"add %[inptr2], %[inptr2], %[stride_h] \n" // 6
"add %[inptr3], %[inptr3], %[stride_h] \n" // 7
// b
"trn1 v12.8b, v0.8b, v3.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v15.8b, v6.8b, v9.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
"trn2 v18.8b, v0.8b, v3.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v21.8b, v6.8b, v9.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
// g
"trn1 v13.8b, v1.8b, v4.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v16.8b, v7.8b, v10.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
"trn2 v19.8b, v1.8b, v4.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v22.8b, v7.8b, v10.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
// r
"trn1 v14.8b, v2.8b, v5.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v17.8b, v8.8b, v11.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
"trn2 v20.8b, v2.8b, v5.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v23.8b, v8.8b, v11.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
// b1
"trn1 v24.4h, v12.4h, v15.4h \n" // v0={00 10 20 30 04 14
// 24 34}
"trn1 v27.4h, v18.4h, v21.4h \n" // v2={01 11 21 31 05 15
// 25 35}
"trn2 v0.4h, v12.4h, v15.4h \n" // v1={02 12 22 32 06 16
// 26 36}
"trn2 v3.4h, v18.4h, v21.4h \n" // v3={03 13 23 33 07 17
// 27 37}
// g1
"trn1 v25.4h, v13.4h, v16.4h \n" // v0={00 10 20 30 04 14
// 24 34}
"trn1 v28.4h, v19.4h, v22.4h \n" // v2={01 11 21 31 05 15
// 25 35}
"trn2 v1.4h, v13.4h, v16.4h \n" // v1={02 12 22 32 06 16
// 26 36}
"trn2 v4.4h, v19.4h, v22.4h \n" // v3={03 13 23 33 07 17
// 27 37}
// r1
"trn1 v26.4h, v14.4h, v17.4h \n" // v0={00 10 20 30 04 14
// 24 34}
"trn1 v29.4h, v20.4h, v23.4h \n" // v2={01 11 21 31 05 15
// 25 35}
"trn2 v2.4h, v14.4h, v17.4h \n" // v1={02 12 22 32 06 16
// 26 36}
"trn2 v5.4h, v20.4h, v23.4h \n" // v3={03 13 23 33 07 17
// 27 37}
"ld3 {v12.8b, v13.8b, v14.8b}, [%[inptr0]] \n" // v0={00,01,02,
// 03, 04, 05, 06,
// 07}"
"ld3 {v15.8b, v16.8b, v17.8b}, [%[inptr1]] \n" // v0={10,11,12,
// 13, 14, 15, 16,
// 17}"
"ld3 {v6.8b, v7.8b, v8.8b}, [%[inptr2]] \n" // v0={20,21,22, 23,
// 24, 25, 26, 27}"
"ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]] \n" // v0={30,31,32,
// 33, 34, 35, 36,
// 37}"
"sub %[inptr0], %[inptr0], %[stride_h_w] \n" // 4 - 4*w_in + 8
"sub %[inptr1], %[inptr1], %[stride_h_w] \n" // 5
"sub %[inptr2], %[inptr2], %[stride_h_w] \n" // 6
"sub %[inptr3], %[inptr3], %[stride_h_w] \n" // 7
// b2
"trn1 v18.8b, v12.8b, v15.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v21.8b, v6.8b, v9.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
// g2
"trn1 v19.8b, v13.8b, v16.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v22.8b, v7.8b, v10.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
// r2
"trn1 v20.8b, v14.8b, v17.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v23.8b, v8.8b, v11.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
"trn2 v12.8b, v12.8b, v15.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v13.8b, v13.8b, v16.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v14.8b, v14.8b, v17.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v15.8b, v6.8b, v9.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
"trn2 v16.8b, v7.8b, v10.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
"trn2 v17.8b, v8.8b, v11.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
// b2
"trn1 v6.4h, v18.4h, v21.4h \n" // v0={00 10 20 30 04 14
// 24 34}
// g2
"trn1 v7.4h, v19.4h, v22.4h \n" // v0={00 10 20 30 04 14
// 24 34}
// r2
"trn1 v8.4h, v20.4h, v23.4h \n" // v0={00 10 20 30 04 14
// 24 34}
// bgr
"trn1 v9.4h, v12.4h, v15.4h \n" // v2={01 11 21 31 05 15
// 25 35}
"trn1 v10.4h, v13.4h, v16.4h \n" // v2={01 11 21 31 05 15
// 25 35}
"trn1 v11.4h, v14.4h, v17.4h \n" // v2={01 11 21 31 05 15
// 25 35}
// bgr
"trn2 v18.4h, v18.4h, v21.4h \n" // v1={02 12 22 32 06 16
// 26 36}
"trn2 v19.4h, v19.4h, v22.4h \n" // v1={02 12 22 32 06 16
// 26 36}
"trn2 v20.4h, v20.4h, v23.4h \n" // v1={02 12 22 32 06 16
// 26 36}
// bgr
"trn2 v21.4h, v12.4h, v15.4h \n" // v3={03 13 23 33 07 17
// 27 37}
"trn2 v22.4h, v13.4h, v16.4h \n" // v3={03 13 23 33 07 17
// 27 37}
"trn2 v23.4h, v14.4h, v17.4h \n" // v3={03 13 23 33 07 17
// 27 37}
// b1 b2
"trn1 v12.2s, v24.2s, v6.2s \n" // v8={00 10 20 30 40 50
// 60 70} b
"trn1 v13.2s, v25.2s, v7.2s \n" // v6={00 10 20 30 40 50
// 60 70} g
"trn1 v14.2s, v26.2s, v8.2s \n" // v6={00 10 20 30 40 50
// 60 70} r
// b1 b2
"trn2 v15.2s, v24.2s, v6.2s \n" // v8={04 14 24 34 44 54
// 64 74} b
"trn2 v16.2s, v25.2s, v7.2s \n" // v6={04 14 24 34 44 54
// 64 74} g
"trn2 v17.2s, v26.2s, v8.2s \n" // v6={04 14 24 34 44 54
// 64 74} r
// b1 b2
"trn1 v6.2s, v27.2s, v9.2s \n" // v8={01 11 20 30 40 50
// 60 70} b
"trn1 v7.2s, v28.2s, v10.2s \n" // v6={01 10 20 30 40 50
// 60 70} g
"trn1 v8.2s, v29.2s, v11.2s \n" // v6={01 10 20 30 40 50
// 60 70} r
"rev64 v12.8b, v12.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 b
"rev64 v13.8b, v13.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 g
"rev64 v14.8b, v14.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 r
"rev64 v15.8b, v15.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 b
"rev64 v16.8b, v16.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 g
"rev64 v17.8b, v17.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 r
// b1 b2
"trn2 v24.2s, v27.2s, v9.2s \n" // v8={05 10 20 30 40 50
// 60 70} b
"trn2 v25.2s, v28.2s, v10.2s \n" // v6={05 10 20 30 40 50
// 60 70} g
"trn2 v26.2s, v29.2s, v11.2s \n" // v6={05 10 20 30 40 50
// 60 70} r
// "st3 {v12.8b, v13.8b, v14.8b}, [%[outptr0]], #24 \n"
// //00 10 20 30 04 14 24 34
// "st3 {v15.8b, v16.8b, v17.8b}, [%[outptr4]], #24 \n"
// //02 12 22 32
"st3 {v12.8b, v13.8b, v14.8b}, [%[outptr0]], #24 \n" // 00 10 20 30 04 14 24 34
"st3 {v15.8b, v16.8b, v17.8b}, [%[outptr4]], #24 \n" // 02 12 22 32
// b1 b2
"trn1 v9.2s, v0.2s, v18.2s \n" // v8={02 11 20 30 40 50
// 60 70} b
"trn1 v10.2s, v1.2s, v19.2s \n" // v6={02 10 20 30 40 50
// 60 70} g
"trn1 v11.2s, v2.2s, v20.2s \n" // v6={02 10 20 30 40 50
// 60 70} r
"trn2 v27.2s, v0.2s, v18.2s \n" // v8={06 11 20 30 40 50
// 60 70} b
"trn2 v28.2s, v1.2s, v19.2s \n" // v6={06 10 20 30 40 50
// 60 70} g
"trn2 v29.2s, v2.2s, v20.2s \n" // v6={06 10 20 30 40 50
// 60 70} r
// b1 b2
"trn1 v0.2s, v3.2s, v21.2s \n" // v8={03 11 20 30 40 50
// 60 70} b
"trn1 v1.2s, v4.2s, v22.2s \n" // v6={03 10 20 30 40 50
// 60 70} g
"trn1 v2.2s, v5.2s, v23.2s \n" // v6={03 10 20 30 40 50
// 60 70} r
"trn2 v18.2s, v3.2s, v21.2s \n" // v8={07 11 20 30 40 50
// 60 70} b
"trn2 v19.2s, v4.2s, v22.2s \n" // v6={07 10 20 30 40 50
// 60 70} g
"trn2 v20.2s, v5.2s, v23.2s \n" // v6={07 10 20 30 40 50
// 60 70} r
"rev64 v6.8b, v6.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00 b
"rev64 v7.8b, v7.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00 g
"rev64 v8.8b, v8.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00 r
"rev64 v24.8b, v24.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 b
"rev64 v25.8b, v25.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 g
"rev64 v26.8b, v26.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 r
"rev64 v9.8b, v9.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00 b
"rev64 v10.8b, v10.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 g
"rev64 v11.8b, v11.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 r
"rev64 v27.8b, v27.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 b
"rev64 v28.8b, v28.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 g
"rev64 v29.8b, v29.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 r
"rev64 v0.8b, v0.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00 b
"rev64 v1.8b, v1.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00 g
"rev64 v2.8b, v2.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00 r
"rev64 v18.8b, v18.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 b
"rev64 v19.8b, v19.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 g
"rev64 v20.8b, v20.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 r
"st3 {v6.8b, v7.8b, v8.8b}, [%[outptr1]], #24 \n" // 00
// 10
// 20
// 30
// 04
// 14
// 24
// 34
"st3 {v24.8b, v25.8b, v26.8b}, [%[outptr5]], #24 \n" // 02 12 22 32
"st3 {v9.8b, v10.8b, v11.8b}, [%[outptr2]], #24 \n" // 00
// 10
// 20
// 30
// 04
// 14
// 24
// 34
"st3 {v27.8b, v28.8b, v29.8b}, [%[outptr6]], #24 \n" // 02 12 22 32
"st3 {v0.8b, v1.8b, v2.8b}, [%[outptr3]], #24 \n" // 00
// 10
// 20
// 30
// 04
// 14
// 24
// 34
"st3 {v18.8b, v19.8b, v20.8b}, [%[outptr7]], #24 \n" // 02 12 22 32
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[outptr4] "+r"(outptr4),
[outptr5] "+r"(outptr5),
[outptr6] "+r"(outptr6),
[outptr7] "+r"(outptr7),
[stride_h] "+r"(stride_h),
[stride_h_w] "+r"(stride_h_w)
:
: "v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30");
}
const uint8_t* inptr4 = inptr3 + win;
const uint8_t* inptr5 = inptr4 + win;
const uint8_t* inptr6 = inptr5 + win;
const uint8_t* inptr7 = inptr6 + win;
for (; j < w_in; j++) {
int tmpx = (ww - i) * 3;
uint8_t* outptr = dst + j * wout + tmpx;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
}
}
for (; i < h_in; i++) {
const uint8_t* inptr0 = src + i * win;
for (int j = 0; j < w_in; j++) {
uint8_t* outptr0 = dst + j * wout + (w_out - 1 - i) * 3;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
}
}
}
#else
void rotate90_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
int w_out = h_in;
int h_out = w_in;
int win = w_in * 3;
int wout = w_out * 3;
int hremain = h_in % 8;
int stride_h = 4 * win;
int stride_h_w = 4 * win - 24;
int ww = w_out - 8;
// block 8*8. -- 8*8
int i = 0;
for (i = 0; i < h_in - 7; i += 8) {
const uint8_t* inptr0 = src + i * win;
const uint8_t* inptr1 = inptr0 + win;
const uint8_t* inptr2 = inptr1 + win;
const uint8_t* inptr3 = inptr2 + win;
const uint8_t* inptr4 = inptr3 + win;
const uint8_t* inptr5 = inptr4 + win;
const uint8_t* inptr6 = inptr5 + win;
const uint8_t* inptr7 = inptr6 + win;
asm volatile(
"pld [%[ptr0]] @ preload a, 64byte\n"
"pld [%[ptr0], #64] @ preload a, 64byte\n"
"pld [%[ptr1]] @ preload a, 64byte\n"
"pld [%[ptr1], #64] @ preload a, 64byte\n"
"pld [%[ptr2]] @ preload a, 64byte\n"
"pld [%[ptr2], #64] @ preload a, 64byte\n"
"pld [%[ptr3]] @ preload a, 64byte\n"
"pld [%[ptr3], #64] @ preload a, 64byte\n"
"pld [%[ptr4]] @ preload a, 64byte\n"
"pld [%[ptr4], #64] @ preload a, 64byte\n"
"pld [%[ptr5]] @ preload a, 64byte\n"
"pld [%[ptr5], #64] @ preload a, 64byte\n"
"pld [%[ptr6]] @ preload a, 64byte\n"
"pld [%[ptr6], #64] @ preload a, 64byte\n"
"pld [%[ptr7]] @ preload a, 64byte\n"
"pld [%[ptr7], #64] @ preload a, 64byte\n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3),
[ptr4] "r"(inptr4),
[ptr5] "r"(inptr5),
[ptr6] "r"(inptr6),
[ptr7] "r"(inptr7)
: "memory");
int j = 0;
for (; j < w_in; j++) {
int tmpx = (ww - i) * 3;
uint8_t* outptr = dst + j * wout + tmpx;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
}
}
ww = w_out - 1;
for (; i < h_in; i++) {
const uint8_t* inptr0 = src + i * win;
for (int j = 0; j < w_in; j++) {
uint8_t* outptr0 = dst + j * wout + (ww - i) * 3;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
}
}
}
#endif
/*
bgr1 bgr2 bgr3
bgr4 bgr5 bgr6
bgr7 bgr8 bgr9
rotate:
bgr3 bgr6 bgr9
bgr2 bgr5 bgr8
bgr1 bgr4 bgr7
*/
// dst = (h_out - 1) * w_out
// Similar to rotate90: write the rotate90 result out in reverse order, or equivalently rotate90 first and then flip along the y axis
#ifdef __aarch64__
void rotate270_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
int w_out = h_in;
int h_out = w_in;
int win = w_in * 3;
int wout = w_out * 3;
int stride_h = 4 * win;
int stride_h_w = 4 * win - 24;
int hout = h_out - 1;
// block 8*8. -- 8*8
int i = 0;
for (; i < h_in - 7; i += 8) {
const uint8_t* inptr0 = src + i * win;
const uint8_t* inptr1 = inptr0 + win;
const uint8_t* inptr2 = inptr1 + win;
const uint8_t* inptr3 = inptr2 + win;
asm volatile(
"prfm pldl1keep, [%[ptr0]] \n"
"prfm pldl1keep, [%[ptr0], #64] \n"
"prfm pldl1keep, [%[ptr1]] \n"
"prfm pldl1keep, [%[ptr1], #64] \n"
"prfm pldl1keep, [%[ptr2]] \n"
"prfm pldl1keep, [%[ptr2], #64] \n"
"prfm pldl1keep, [%[ptr3]] \n"
"prfm pldl1keep, [%[ptr3], #64] \n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
int j = 0;
for (; j < w_in - 7; j += 8) {
uint8_t* outptr0 = dst + (hout - j) * wout + i * 3;
uint8_t* outptr1 = outptr0 - wout;
uint8_t* outptr2 = outptr1 - wout;
uint8_t* outptr3 = outptr2 - wout;
uint8_t* outptr4 = outptr3 - wout;
uint8_t* outptr5 = outptr4 - wout;
uint8_t* outptr6 = outptr5 - wout;
uint8_t* outptr7 = outptr6 - wout;
asm volatile(
"ld3 {v0.8b, v1.8b, v2.8b}, [%[inptr0]] \n" // v0={00,01,02, 03,
// 04, 05, 06, 07}"
"ld3 {v3.8b, v4.8b, v5.8b}, [%[inptr1]] \n" // v0={10,11,12, 13,
// 14, 15, 16, 17}"
"ld3 {v6.8b, v7.8b, v8.8b}, [%[inptr2]] \n" // v0={20,21,22, 23,
// 24, 25, 26, 27}"
"ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]] \n" // v0={30,31,32,
// 33, 34, 35, 36,
// 37}"
"add %[inptr0], %[inptr0], %[stride_h] \n" // 4 + 4*w_in
"add %[inptr1], %[inptr1], %[stride_h] \n" // 5
"add %[inptr2], %[inptr2], %[stride_h] \n" // 6
"add %[inptr3], %[inptr3], %[stride_h] \n" // 7
// b
"trn1 v12.8b, v0.8b, v3.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v15.8b, v6.8b, v9.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
"trn2 v18.8b, v0.8b, v3.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v21.8b, v6.8b, v9.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
// g
"trn1 v13.8b, v1.8b, v4.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v16.8b, v7.8b, v10.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
"trn2 v19.8b, v1.8b, v4.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v22.8b, v7.8b, v10.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
// r
"trn1 v14.8b, v2.8b, v5.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v17.8b, v8.8b, v11.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
"trn2 v20.8b, v2.8b, v5.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v23.8b, v8.8b, v11.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
// b1
"trn1 v24.4h, v12.4h, v15.4h \n" // v0={00 10 20 30 04 14
// 24 34}
"trn1 v27.4h, v18.4h, v21.4h \n" // v2={01 11 21 31 05 15
// 25 35}
"trn2 v0.4h, v12.4h, v15.4h \n" // v1={02 12 22 32 06 16
// 26 36}
"trn2 v3.4h, v18.4h, v21.4h \n" // v3={03 13 23 33 07 17
// 27 37}
// g1
"trn1 v25.4h, v13.4h, v16.4h \n" // v0={00 10 20 30 04 14
// 24 34}
"trn1 v28.4h, v19.4h, v22.4h \n" // v2={01 11 21 31 05 15
// 25 35}
"trn2 v1.4h, v13.4h, v16.4h \n" // v1={02 12 22 32 06 16
// 26 36}
"trn2 v4.4h, v19.4h, v22.4h \n" // v3={03 13 23 33 07 17
// 27 37}
// r1
"trn1 v26.4h, v14.4h, v17.4h \n" // v0={00 10 20 30 04 14
// 24 34}
"trn1 v29.4h, v20.4h, v23.4h \n" // v2={01 11 21 31 05 15
// 25 35}
"trn2 v2.4h, v14.4h, v17.4h \n" // v1={02 12 22 32 06 16
// 26 36}
"trn2 v5.4h, v20.4h, v23.4h \n" // v3={03 13 23 33 07 17
// 27 37}
"ld3 {v12.8b, v13.8b, v14.8b}, [%[inptr0]] \n" // v0={00,01,02,
// 03, 04, 05, 06,
// 07}"
"ld3 {v15.8b, v16.8b, v17.8b}, [%[inptr1]] \n" // v0={10,11,12,
// 13, 14, 15, 16,
// 17}"
"ld3 {v6.8b, v7.8b, v8.8b}, [%[inptr2]] \n" // v0={20,21,22, 23,
// 24, 25, 26, 27}"
"ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]] \n" // v0={30,31,32,
// 33, 34, 35, 36,
// 37}"
"sub %[inptr0], %[inptr0], %[stride_h_w] \n" // 4 - 4*w_in + 8
"sub %[inptr1], %[inptr1], %[stride_h_w] \n" // 5
"sub %[inptr2], %[inptr2], %[stride_h_w] \n" // 6
"sub %[inptr3], %[inptr3], %[stride_h_w] \n" // 7
// b2
"trn1 v18.8b, v12.8b, v15.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v21.8b, v6.8b, v9.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
// g2
"trn1 v19.8b, v13.8b, v16.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v22.8b, v7.8b, v10.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
// r2
"trn1 v20.8b, v14.8b, v17.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v23.8b, v8.8b, v11.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
"trn2 v12.8b, v12.8b, v15.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v13.8b, v13.8b, v16.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v14.8b, v14.8b, v17.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v15.8b, v6.8b, v9.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
"trn2 v16.8b, v7.8b, v10.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
"trn2 v17.8b, v8.8b, v11.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
// b2
"trn1 v6.4h, v18.4h, v21.4h \n" // v0={00 10 20 30 04 14
// 24 34}
// g2
"trn1 v7.4h, v19.4h, v22.4h \n" // v0={00 10 20 30 04 14
// 24 34}
// r2
"trn1 v8.4h, v20.4h, v23.4h \n" // v0={00 10 20 30 04 14
// 24 34}
// bgr
"trn1 v9.4h, v12.4h, v15.4h \n" // v2={01 11 21 31 05 15
// 25 35}
"trn1 v10.4h, v13.4h, v16.4h \n" // v2={01 11 21 31 05 15
// 25 35}
"trn1 v11.4h, v14.4h, v17.4h \n" // v2={01 11 21 31 05 15
// 25 35}
// bgr
"trn2 v18.4h, v18.4h, v21.4h \n" // v1={02 12 22 32 06 16
// 26 36}
"trn2 v19.4h, v19.4h, v22.4h \n" // v1={02 12 22 32 06 16
// 26 36}
"trn2 v20.4h, v20.4h, v23.4h \n" // v1={02 12 22 32 06 16
// 26 36}
// bgr
"trn2 v21.4h, v12.4h, v15.4h \n" // v3={03 13 23 33 07 17
// 27 37}
"trn2 v22.4h, v13.4h, v16.4h \n" // v3={03 13 23 33 07 17
// 27 37}
"trn2 v23.4h, v14.4h, v17.4h \n" // v3={03 13 23 33 07 17
// 27 37}
// b1 b2
"trn1 v12.2s, v24.2s, v6.2s \n" // v8={00 10 20 30 40 50
// 60 70} b
"trn1 v13.2s, v25.2s, v7.2s \n" // v6={00 10 20 30 40 50
// 60 70} g
"trn1 v14.2s, v26.2s, v8.2s \n" // v6={00 10 20 30 40 50
// 60 70} r
// b1 b2
"trn2 v15.2s, v24.2s, v6.2s \n" // v8={04 14 24 34 44 54
// 64 74} b
"trn2 v16.2s, v25.2s, v7.2s \n" // v6={04 14 24 34 44 54
// 64 74} g
"trn2 v17.2s, v26.2s, v8.2s \n" // v6={04 14 24 34 44 54
// 64 74} r
// b1 b2
"trn1 v6.2s, v27.2s, v9.2s \n" // v8={01 11 20 30 40 50
// 60 70} b
"trn1 v7.2s, v28.2s, v10.2s \n" // v6={01 10 20 30 40 50
// 60 70} g
"trn1 v8.2s, v29.2s, v11.2s \n" // v6={01 10 20 30 40 50
// 60 70} r
// b1 b2
"trn2 v24.2s, v27.2s, v9.2s \n" // v8={05 10 20 30 40 50
// 60 70} b
"trn2 v25.2s, v28.2s, v10.2s \n" // v6={05 10 20 30 40 50
// 60 70} g
"trn2 v26.2s, v29.2s, v11.2s \n" // v6={05 10 20 30 40 50
// 60 70} r
"st3 {v12.8b, v13.8b, v14.8b}, [%[outptr0]], #24 \n" // 00 10 20 30 04 14 24 34
"st3 {v15.8b, v16.8b, v17.8b}, [%[outptr4]], #24 \n" // 02 12 22 32
// b1 b2
"trn1 v9.2s, v0.2s, v18.2s \n" // v8={02 11 20 30 40 50
// 60 70} b
"trn1 v10.2s, v1.2s, v19.2s \n" // v6={02 10 20 30 40 50
// 60 70} g
"trn1 v11.2s, v2.2s, v20.2s \n" // v6={02 10 20 30 40 50
// 60 70} r
"trn2 v27.2s, v0.2s, v18.2s \n" // v8={06 11 20 30 40 50
// 60 70} b
"trn2 v28.2s, v1.2s, v19.2s \n" // v6={06 10 20 30 40 50
// 60 70} g
"trn2 v29.2s, v2.2s, v20.2s \n" // v6={06 10 20 30 40 50
// 60 70} r
// b1 b2
"trn1 v0.2s, v3.2s, v21.2s \n" // v8={03 11 20 30 40 50
// 60 70} b
"trn1 v1.2s, v4.2s, v22.2s \n" // v6={03 10 20 30 40 50
// 60 70} g
"trn1 v2.2s, v5.2s, v23.2s \n" // v6={03 10 20 30 40 50
// 60 70} r
"trn2 v18.2s, v3.2s, v21.2s \n" // v8={07 11 20 30 40 50
// 60 70} b
"trn2 v19.2s, v4.2s, v22.2s \n" // v6={07 10 20 30 40 50
// 60 70} g
"trn2 v20.2s, v5.2s, v23.2s \n" // v6={07 10 20 30 40 50
// 60 70} r
"st3 {v6.8b, v7.8b, v8.8b}, [%[outptr1]], #24 \n" // 00
// 10
// 20
// 30
// 04
// 14
// 24
// 34
"st3 {v24.8b, v25.8b, v26.8b}, [%[outptr5]], #24 \n" // 02 12 22 32
"st3 {v9.8b, v10.8b, v11.8b}, [%[outptr2]], #24 \n" // 00
// 10
// 20
// 30
// 04
// 14
// 24
// 34
"st3 {v27.8b, v28.8b, v29.8b}, [%[outptr6]], #24 \n" // 02 12 22 32
"st3 {v0.8b, v1.8b, v2.8b}, [%[outptr3]], #24 \n" // 00
// 10
// 20
// 30
// 04
// 14
// 24
// 34
"st3 {v18.8b, v19.8b, v20.8b}, [%[outptr7]], #24 \n" // 02 12 22 32
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[outptr4] "+r"(outptr4),
[outptr5] "+r"(outptr5),
[outptr6] "+r"(outptr6),
[outptr7] "+r"(outptr7),
[stride_h] "+r"(stride_h),
[stride_h_w] "+r"(stride_h_w)
:
: "v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29");
}
const uint8_t* inptr4 = inptr3 + win;
const uint8_t* inptr5 = inptr4 + win;
const uint8_t* inptr6 = inptr5 + win;
const uint8_t* inptr7 = inptr6 + win;
for (; j < w_in; j++) {
int tmpx = i * 3;
uint8_t* outptr = dst + (hout - j) * wout + tmpx;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
}
}
for (; i < h_in; i++) {
const uint8_t* inptr0 = src + i * win;
for (int j = 0; j < w_in; j++) {
uint8_t* outptr0 = dst + (hout - j) * wout + i * 3;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
}
}
}
#else
void rotate270_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
int w_out = h_in;
int h_out = w_in;
int win = w_in * 3;
int wout = w_out * 3;
int hremain = h_in % 8;
int stride_h = 4 * win;
int stride_h_w = 4 * win - 24;
int hout = h_out - 1;
// block 8*8. -- 8*8
int i = 0;
for (; i < h_in - 7; i += 8) {
const uint8_t* inptr0 = src + i * win;
const uint8_t* inptr1 = inptr0 + win;
const uint8_t* inptr2 = inptr1 + win;
const uint8_t* inptr3 = inptr2 + win;
const uint8_t* inptr4 = inptr3 + win;
const uint8_t* inptr5 = inptr4 + win;
const uint8_t* inptr6 = inptr5 + win;
const uint8_t* inptr7 = inptr6 + win;
asm volatile(
"pld [%[ptr0]] @ preload a, 64byte\n"
"pld [%[ptr0], #64] @ preload a, 64byte\n"
"pld [%[ptr1]] @ preload a, 64byte\n"
"pld [%[ptr1], #64] @ preload a, 64byte\n"
"pld [%[ptr2]] @ preload a, 64byte\n"
"pld [%[ptr2], #64] @ preload a, 64byte\n"
"pld [%[ptr3]] @ preload a, 64byte\n"
"pld [%[ptr3], #64] @ preload a, 64byte\n"
"pld [%[ptr4]] @ preload a, 64byte\n"
"pld [%[ptr4], #64] @ preload a, 64byte\n"
"pld [%[ptr5]] @ preload a, 64byte\n"
"pld [%[ptr5], #64] @ preload a, 64byte\n"
"pld [%[ptr6]] @ preload a, 64byte\n"
"pld [%[ptr6], #64] @ preload a, 64byte\n"
"pld [%[ptr7]] @ preload a, 64byte\n"
"pld [%[ptr7], #64] @ preload a, 64byte\n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3),
[ptr4] "r"(inptr4),
[ptr5] "r"(inptr5),
[ptr6] "r"(inptr6),
[ptr7] "r"(inptr7)
: "memory");
int j = 0;
for (; j < w_in; j++) {
int tmpx = i * 3;
uint8_t* outptr = dst + (hout - j) * wout + tmpx;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
}
}
for (; i < h_in; i++) {
const uint8_t* inptr0 = src + i * win;
for (int j = 0; j < w_in; j++) {
uint8_t* outptr0 = dst + (hout - j) * wout + i * 3;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
}
}
}
#endif
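// The mapping realized by the rotate270_hwc paths above (a sketch for
// reference): with w_out = h_in and h_out = w_in, every pixel moves as
//   dst(h_out - 1 - j, i, c) = src(i, j, c), c in {b, g, r},
// i.e. a 90-degree counter-clockwise rotation of the HWC image.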
/*
bgr1 bgr2 bgr3
bgr4 bgr5 bgr6
bgr7 bgr8 bgr9
rotate:
bgr9 bgr8 bgr7
bgr6 bgr5 bgr4
bgr3 bgr2 bgr1
*/
// flip both x and y (i.e. rotate 180 degrees)
#ifdef __aarch64__
void rotate180_hwc(const uint8_t* src, uint8_t* dst, int w, int h_in) {
int w_in = w * 3;
uint8_t zerobuff[w_in]; // NOLINT
memset(zerobuff, 0, w_in * sizeof(uint8_t));
int stride_w = 24;
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * w_in;
const uint8_t* inptr1 = inptr0 + w_in;
const uint8_t* inptr2 = inptr1 + w_in;
const uint8_t* inptr3 = inptr2 + w_in;
uint8_t* outptr0 = dst + (h_in - i) * w_in - stride_w; // last col
uint8_t* outptr1 = outptr0 - w_in;
uint8_t* outptr2 = outptr1 - w_in;
uint8_t* outptr3 = outptr2 - w_in;
asm volatile(
"prfm pldl1keep, [%[ptr0]] \n"
"prfm pldl1keep, [%[ptr1]] \n"
"prfm pldl1keep, [%[ptr2]] \n"
"prfm pldl1keep, [%[ptr3]] \n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
int j = 0;
for (; j < w - 7; j += 8) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
outptr0 = zerobuff;
case 2:
inptr1 = zerobuff;
outptr1 = zerobuff;
case 1:
inptr2 = zerobuff;
outptr2 = zerobuff;
case 0:
inptr3 = zerobuff;
outptr3 = zerobuff;
default:
break;
}
}
asm volatile(
"ld3 {v0.8b, v1.8b, v2.8b}, [%[inptr0]], #24 \n" // v0={00,01,02,
// 03, 04, 05,
// 06, 07}"
"ld3 {v3.8b, v4.8b, v5.8b}, [%[inptr1]], #24 \n" // v0={10,11,12,
// 13, 14, 15,
// 16, 17}"
"ld3 {v6.8b, v7.8b, v8.8b}, [%[inptr2]], #24 \n" // v0={20,21,22,
// 23, 24, 25,
// 26, 27}"
"ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]], #24 \n" // v0={30,31,32,
// 33, 34, 35,
// 36, 37}"
"rev64 v12.8b, v0.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 b
"rev64 v13.8b, v1.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 g
"rev64 v14.8b, v2.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 r
"rev64 v15.8b, v3.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v16.8b, v4.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v17.8b, v5.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v18.8b, v6.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v19.8b, v7.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v20.8b, v8.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v21.8b, v9.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v22.8b, v10.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v23.8b, v11.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"prfm pldl1keep, [%[inptr0]] \n"
"prfm pldl1keep, [%[inptr1]] \n"
"prfm pldl1keep, [%[inptr2]] \n"
"prfm pldl1keep, [%[inptr3]] \n"
"st3 {v12.8b, v13.8b, v14.8b}, [%[outptr0]] \n" // 00 10
// 20 30
// 04 14
// 24 34
"st3 {v15.8b, v16.8b, v17.8b}, [%[outptr1]] \n" // 02 12
// 22 32
"st3 {v18.8b, v19.8b, v20.8b}, [%[outptr2]] \n" // 01 11
// 21 31
"st3 {v21.8b, v22.8b, v23.8b}, [%[outptr3]] \n" // 03 13
// 23 33
"sub %[outptr0], %[outptr0], %[stride_w] \n" //@ ptr - stride_w
"sub %[outptr1], %[outptr1], %[stride_w] \n"
"sub %[outptr2], %[outptr2], %[stride_w] \n"
"sub %[outptr3], %[outptr3], %[stride_w] \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[stride_w] "+r"(stride_w)
:
: "v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23");
}
outptr3 += stride_w - 3;
outptr2 += stride_w - 3;
outptr1 += stride_w - 3;
outptr0 += stride_w - 3;
for (; j < w; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 6;
case 1:
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 6;
case 2:
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 6;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
outptr3 -= 6;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 6;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 6;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 6;
}
}
}
}
#else
void rotate180_hwc(const uint8_t* src, uint8_t* dst, int w, int h_in) {
int w_in = w * 3;
uint8_t zerobuff[w_in]; // NOLINT
memset(zerobuff, 0, w_in * sizeof(uint8_t));
int stride_w = 24;
// 4*8
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * w_in;
const uint8_t* inptr1 = inptr0 + w_in;
const uint8_t* inptr2 = inptr1 + w_in;
const uint8_t* inptr3 = inptr2 + w_in;
uint8_t* outptr0 = dst + (h_in - i) * w_in - stride_w; // last
uint8_t* outptr1 = outptr0 - w_in;
uint8_t* outptr2 = outptr1 - w_in;
uint8_t* outptr3 = outptr2 - w_in;
asm volatile(
"pld [%[ptr0]] @ preload a, 64byte\n"
"pld [%[ptr1]] @ preload a, 64byte\n"
"pld [%[ptr2]] @ preload a, 64byte\n"
"pld [%[ptr3]] @ preload a, 64byte\n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
int j = 0;
for (; j < w - 7; j += 8) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
outptr0 = zerobuff;
case 2:
inptr1 = zerobuff;
outptr1 = zerobuff;
case 1:
inptr2 = zerobuff;
outptr2 = zerobuff;
case 0:
inptr3 = zerobuff;
outptr3 = zerobuff;
default:
break;
}
}
asm volatile(
"vld3.8 {d0, d1, d2}, [%[inptr0]]! @ zip load r0, d0 =00 01 02 03 "
"04 05 06 07\n"
"vld3.8 {d3, d4, d5}, [%[inptr1]]! @ zip load r1, d2 =10 11 12 13 "
"14 15 16 17\n"
"vld3.8 {d6, d7, d8}, [%[inptr2]]! @ zip load r1, d4 =20 21 22 23 "
"24 25 26 27\n"
"vld3.8 {d9, d10, d11}, [%[inptr3]]! @ zip load r1, d6 = 30 31 32 "
"33 34 35 36 37\n"
"vrev64.8 d12, d0 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d13, d1 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d14, d2 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d15, d3 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d16, d4 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d17, d5 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d18, d6 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d19, d7 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d20, d8 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d21, d9 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d22, d10 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d23, d11 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"pld [%[inptr0]] @ preload a, 64byte\n"
"pld [%[inptr1]] @ preload a, 64byte\n"
"pld [%[inptr2]] @ preload a, 64byte\n"
"pld [%[inptr3]] @ preload a, 64byte\n"
"vst3.8 {d12, d13, d14}, [%[outptr0]] @ write "
"d0(q0,low),r00,r10 20 30\n"
"vst3.8 {d15, d16, d17}, [%[outptr1]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"vst3.8 {d18, d19, d20}, [%[outptr2]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"vst3.8 {d21, d22, d23}, [%[outptr3]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"sub %[outptr0], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr1], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr2], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr3], %[stride_w] @ ptr - stride_w \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[stride_w] "+r"(stride_w)
:
: "q0",
"q1",
"q2",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12");
}
outptr3 += stride_w - 3;
outptr2 += stride_w - 3;
outptr1 += stride_w - 3;
outptr0 += stride_w - 3;
for (; j < w; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 6;
case 1:
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 6;
case 2:
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 6;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
outptr3 -= 6;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 6;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 6;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 6;
}
}
}
}
#endif
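// A minimal scalar sketch of what the rotate180_hwc paths above compute for
// 3-channel HWC data: dst(i, j, c) = src(h_in - 1 - i, w - 1 - j, c).
// Illustrative reference only; the helper name below is not part of the
// original file.
static void rotate180_hwc_ref(const uint8_t* src, uint8_t* dst, int w, int h_in) {
  for (int i = 0; i < h_in; i++) {
    for (int j = 0; j < w; j++) {
      const uint8_t* sp = src + ((h_in - 1 - i) * w + (w - 1 - j)) * 3;
      uint8_t* dp = dst + (i * w + j) * 3;
      dp[0] = sp[0];  // b
      dp[1] = sp[1];  // g
      dp[2] = sp[2];  // r
    }
  }
}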
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/tests/cv/anakin/cv_utils.h"
void bgr_to_tensor_hwc(const uint8_t* bgr,
Tensor& output, // NOLINT
int width,
int height,
float* means,
float* scales) {
int size = width * height;
float* ptr0 = output.mutable_data<float>();
float r_means = means[0];
float g_means = means[1];
float b_means = means[2];
float r_scales = scales[0];
float g_scales = scales[1];
float b_scales = scales[2];
int w = width;
int dim8 = w >> 3;
int remain = w - (dim8 << 3);
float32x4_t vrmean = vdupq_n_f32(r_means);
float32x4_t vgmean = vdupq_n_f32(g_means);
float32x4_t vbmean = vdupq_n_f32(b_means);
float32x4_t vrscale = vdupq_n_f32(r_scales);
float32x4_t vgscale = vdupq_n_f32(g_scales);
float32x4_t vbscale = vdupq_n_f32(b_scales);
for (int i = 0; i < height; i++) {
const uint8_t* ptr_bgr = bgr + i * width * 3;
float* ptr0_b = ptr0 + i * width;
float* ptr1_g = ptr0_b + size;
float* ptr2_r = ptr1_g + size;
for (int j = 0; j < dim8; j++) {
uint8x8x3_t vbgr = vld3_u8(ptr_bgr);
uint8x8_t vb = vbgr.val[0];
uint8x8_t vg = vbgr.val[1];
uint8x8_t vr = vbgr.val[2];
uint16x8_t vb_16 = vmovl_u8(vb);
uint16x8_t vg_16 = vmovl_u8(vg);
uint16x8_t vr_16 = vmovl_u8(vr);
uint32x4_t vb_low_32 = vmovl_u16(vget_low_u16(vb_16));
uint32x4_t vg_low_32 = vmovl_u16(vget_low_u16(vg_16));
uint32x4_t vr_low_32 = vmovl_u16(vget_low_u16(vr_16));
uint32x4_t vb_high_32 = vmovl_u16(vget_high_u16(vb_16));
uint32x4_t vg_high_32 = vmovl_u16(vget_high_u16(vg_16));
uint32x4_t vr_high_32 = vmovl_u16(vget_high_u16(vr_16));
float32x4_t vb_low_f32 = vcvtq_f32_u32(vb_low_32);
float32x4_t vr_low_f32 = vcvtq_f32_u32(vr_low_32);
float32x4_t vg_low_f32 = vcvtq_f32_u32(vg_low_32);
float32x4_t vb_high_f32 = vcvtq_f32_u32(vb_high_32);
float32x4_t vg_high_f32 = vcvtq_f32_u32(vg_high_32);
float32x4_t vr_high_f32 = vcvtq_f32_u32(vr_high_32);
vb_low_f32 = vsubq_f32(vb_low_f32, vbmean);
vg_low_f32 = vsubq_f32(vg_low_f32, vgmean);
vr_low_f32 = vsubq_f32(vr_low_f32, vrmean);
vb_high_f32 = vsubq_f32(vb_high_f32, vbmean);
vg_high_f32 = vsubq_f32(vg_high_f32, vgmean);
vr_high_f32 = vsubq_f32(vr_high_f32, vrmean);
vb_low_f32 = vmulq_f32(vb_low_f32, vbscale);
vg_low_f32 = vmulq_f32(vg_low_f32, vgscale);
vr_low_f32 = vmulq_f32(vr_low_f32, vrscale);
vb_high_f32 = vmulq_f32(vb_high_f32, vbscale);
vg_high_f32 = vmulq_f32(vg_high_f32, vgscale);
vr_high_f32 = vmulq_f32(vr_high_f32, vrscale);
vst1q_f32(ptr0_b, vb_low_f32);
vst1q_f32(ptr1_g, vg_low_f32);
vst1q_f32(ptr2_r, vr_low_f32);
ptr_bgr += 24;
vst1q_f32(ptr0_b + 4, vb_high_f32);
vst1q_f32(ptr1_g + 4, vg_high_f32);
vst1q_f32(ptr2_r + 4, vr_high_f32);
ptr0_b += 8;
ptr1_g += 8;
ptr2_r += 8;
}
for (int j = 0; j < remain; j++) {
*ptr0_b++ = (*ptr_bgr - b_means) * b_scales; // NOLINT
ptr_bgr++;
*ptr1_g++ = (*ptr_bgr - g_means) * g_scales; // NOLINT
ptr_bgr++;
*ptr2_r++ = (*ptr_bgr - r_means) * r_scales; // NOLINT
ptr_bgr++;
}
}
}
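// Hedged usage sketch for bgr_to_tensor_hwc (the Tensor setup below is an
// assumption about how a lite test tensor is typically prepared, not code
// taken from this file):
//   Tensor out;
//   out.Resize({1, 3, height, width});        // planar float output
//   float means[3] = {127.5f, 127.5f, 127.5f};
//   float scales[3] = {1.f / 127.5f, 1.f / 127.5f, 1.f / 127.5f};
//   bgr_to_tensor_hwc(bgr_ptr, out, width, height, means, scales);
// Each output value is (pixel - mean) * scale, with the interleaved BGR rows
// split into three contiguous planes of width * height floats.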
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/tests/cv/anakin/cv_utils.h"
void flip_x_hwc_bgra(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
void flip_y_hwc_bgra(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
void flip_xy_hwc_bgra(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
// x: flip_num = 1 y: flip_num = -1 xy: flip_num = 0;
void bgra_flip_hwc(
const uint8_t* src, uint8_t* dst, int w_in, int h_in, int flip_num) {
if (flip_num == 1) { // x
flip_x_hwc_bgra(src, dst, w_in, h_in);
}
if (flip_num == -1) { // y
flip_y_hwc_bgra(src, dst, w_in, h_in);
}
if (flip_num == 0) { // xy
flip_xy_hwc_bgra(src, dst, w_in, h_in);
}
}
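// Hedged usage sketch of the flip_num convention above (buffers are
// placeholders; dst must hold w_in * h_in * 4 bytes):
//   bgra_flip_hwc(src, dst, w_in, h_in, 1);   // x: flip up/down
//   bgra_flip_hwc(src, dst, w_in, h_in, -1);  // y: flip left/right
//   bgra_flip_hwc(src, dst, w_in, h_in, 0);   // xy: flip both axes (180-degree rotation)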
/*
bgr1 bgr2 bgr3
bgr4 bgr5 bgr6
bgr7 bgr8 bgr9
rotate:
bgr7 bgr8 bgr9
bgr4 bgr5 bgr6
bgr1 bgr2 bgr3
*/
#ifdef __aarch64__
void flip_x_hwc_bgra(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
int h = h_in - 1;
int win = w_in * 4;
uint8_t zerobuff[win]; // NOLINT
memset(zerobuff, 0, win * sizeof(uint8_t));
uint8_t zerobuff2[win]; // NOLINT
memset(zerobuff2, 0, win * sizeof(uint8_t));
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * win;
const uint8_t* inptr1 = inptr0 + win;
const uint8_t* inptr2 = inptr1 + win;
const uint8_t* inptr3 = inptr2 + win;
uint8_t* outptr0 = dst + (h - i) * win; // last
uint8_t* outptr1 = outptr0 - win;
uint8_t* outptr2 = outptr1 - win;
uint8_t* outptr3 = outptr2 - win;
asm volatile(
"prfm pldl1keep, [%[ptr0]] \n"
"prfm pldl1keep, [%[ptr1]] \n"
"prfm pldl1keep, [%[ptr2]] \n"
"prfm pldl1keep, [%[ptr3]] \n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
outptr0 = zerobuff2;
case 2:
inptr1 = zerobuff;
outptr1 = zerobuff2;
case 1:
inptr2 = zerobuff;
outptr2 = zerobuff2;
case 0:
inptr3 = zerobuff;
outptr3 = zerobuff2;
default:
break;
}
}
int j = 0;
for (; j < w_in - 7; j += 8) {
asm volatile(
"ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[inptr0]], #32 \n" // v0={00,01,02,
// 03,
// 04,
// 05,
// 06,
// 07}"
"ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [%[inptr1]], #32 \n" // v0={10,11,12,
// 13,
// 14,
// 15,
// 16,
// 17}"
"ld4 {v8.8b, v9.8b, v10.8b, v11.8b}, [%[inptr2]], #32 \n" // v0={20,21,22,
// 23,
// 24,
// 25,
// 26,
// 27}"
"ld4 {v12.8b, v13.8b, v14.8b, v15.8b}, [%[inptr3]], #32 \n" // v0={30,31,32,
// 33,
// 34,
// 35,
// 36,
// 37}"
"prfm pldl1keep, [%[inptr0]] \n"
"prfm pldl1keep, [%[inptr1]] \n"
"prfm pldl1keep, [%[inptr2]] \n"
"prfm pldl1keep, [%[inptr3]] \n"
"st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[outptr0]], #32 \n" // 00 10 20
// 30 04 14
// 24 34
"st4 {v4.8b, v5.8b, v6.8b, v7.8b}, [%[outptr1]], #32 \n" // 02 12 22 32
"st4 {v8.8b, v9.8b, v10.8b, v11.8b}, [%[outptr2]], #32 \n" // 01 11 21 31
"st4 {v12.8b, v13.8b, v14.8b, v15.8b}, [%[outptr3]], #32 "
" \n" // 03 13 23 33
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3)
:
: "v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15");
}
for (; j < w_in; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
case 1:
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
case 2:
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
}
}
}
}
#else
void flip_x_hwc_bgra(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
// uint8_t zerobuff[8] = {0, 0, 0, 0, 0, 0, 0, 0};
int win = w_in * 4;
uint8_t zerobuff[win]; // NOLINT
memset(zerobuff, 0, win * sizeof(uint8_t));
uint8_t zerobuff2[win]; // NOLINT
memset(zerobuff2, 0, win * sizeof(uint8_t));
int h = h_in - 1;
// 4*8
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * win;
const uint8_t* inptr1 = inptr0 + win;
const uint8_t* inptr2 = inptr1 + win;
const uint8_t* inptr3 = inptr2 + win;
uint8_t* outptr0 = dst + (h - i) * win; // last
uint8_t* outptr1 = outptr0 - win;
uint8_t* outptr2 = outptr1 - win;
uint8_t* outptr3 = outptr2 - win;
asm volatile(
"pld [%[ptr0]] @ preload a, 64byte\n"
"pld [%[ptr1]] @ preload a, 64byte\n"
"pld [%[ptr2]] @ preload a, 64byte\n"
"pld [%[ptr3]] @ preload a, 64byte\n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
outptr0 = zerobuff2;
case 2:
inptr1 = zerobuff;
outptr1 = zerobuff2;
case 1:
inptr2 = zerobuff;
outptr2 = zerobuff2;
case 0:
inptr3 = zerobuff;
outptr3 = zerobuff2;
default:
break;
}
}
int j = 0;
for (; j < w_in - 7; j += 8) {
asm volatile(
"vld4.8 {d0, d1, d2, d3}, [%[inptr0]]! @ zip load r0, d0 =00 01 "
"02 03 04 05 06 07\n"
"vld4.8 {d4, d5, d6, d7}, [%[inptr1]]! @ zip load r1, d2 =10 11 "
"12 13 14 15 16 17\n"
"vld4.8 {d8, d9, d10, d11}, [%[inptr2]]! @ zip load r1, d4 =20 21 "
"22 23 24 25 26 27\n"
"vld4.8 {d12, d13, d14, d15}, [%[inptr3]]! @ zip load r1, d6 = 30 "
"31 32 33 34 35 36 37\n"
"pld [%[inptr0]] @ preload a, 64byte\n"
"pld [%[inptr1]] @ preload a, 64byte\n"
"pld [%[inptr2]] @ preload a, 64byte\n"
"pld [%[inptr3]] @ preload a, 64byte\n"
"vst4.8 {d0, d1, d2, d3}, [%[outptr0]]! @ write "
"d0(q0,low),r00,r10 20 30\n"
"vst4.8 {d4, d5, d6, d7}, [%[outptr1]]! @ write "
"d4(q0,low),r01,r11 21 31\n"
"vst4.8 {d8, d9, d10, d11}, [%[outptr2]]! @ write "
"d4(q0,low),r01,r11 21 31\n"
"vst4.8 {d12, d13, d14, d15}, [%[outptr3]]! @ write "
"d4(q0,low),r01,r11 21 31\n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3)
:
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
}
for (; j < w_in; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
case 1:
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
case 2:
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
}
}
}
}
#endif
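// The mapping realized by the flip_x_hwc_bgra paths above (sketch for
// reference): dst(h_in - 1 - i, j, c) = src(i, j, c), c in {b, g, r, a},
// i.e. rows are exchanged top-to-bottom while pixels inside a row keep order.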
/*
bgr1 bgr2 bgr3
bgr4 bgr5 bgr6
bgr7 bgr8 bgr9
flip:
bgr3 bgr2 bgr1
bgr6 bgr5 bgr4
bgr9 bgr8 bgr7
*/
#ifdef __aarch64__
void flip_y_hwc_bgra(const uint8_t* src, uint8_t* dst, int w, int h_in) {
// uint8_t zerobuff[24] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
// 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int w_in = w * 4;
uint8_t zerobuff[w_in]; // NOLINT
memset(zerobuff, 0, w_in * sizeof(uint8_t));
uint8_t zerobuff2[w_in]; // NOLINT
memset(zerobuff2, 0, w_in * sizeof(uint8_t));
int stride_w = 32;
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * w_in;
const uint8_t* inptr1 = inptr0 + w_in;
const uint8_t* inptr2 = inptr1 + w_in;
const uint8_t* inptr3 = inptr2 + w_in;
uint8_t* outptr0 = dst + (i + 1) * w_in - stride_w; // last col
uint8_t* outptr1 = outptr0 + w_in;
uint8_t* outptr2 = outptr1 + w_in;
uint8_t* outptr3 = outptr2 + w_in;
asm volatile(
"prfm pldl1keep, [%[ptr0]] \n"
"prfm pldl1keep, [%[ptr1]] \n"
"prfm pldl1keep, [%[ptr2]] \n"
"prfm pldl1keep, [%[ptr3]] \n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
outptr0 = zerobuff2;
case 2:
inptr1 = zerobuff;
outptr1 = zerobuff2;
case 1:
inptr2 = zerobuff;
outptr2 = zerobuff2;
case 0:
inptr3 = zerobuff;
outptr3 = zerobuff2;
default:
break;
}
}
int j = 0;
for (; j < w - 7; j += 8) {
asm volatile(
"ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[inptr0]], #32 \n" // v0={00,01,02,
// 03,
// 04,
// 05,
// 06,
// 07}"
"ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [%[inptr1]], #32 \n" // v0={10,11,12,
// 13,
// 14,
// 15,
// 16,
// 17}"
"ld4 {v8.8b, v9.8b, v10.8b, v11.8b}, [%[inptr2]], #32 \n" // v0={20,21,22,
// 23,
// 24,
// 25,
// 26,
// 27}"
"ld4 {v12.8b, v13.8b, v14.8b, v15.8b}, [%[inptr3]], #32 \n" // v0={30,31,32,
// 33,
// 34,
// 35,
// 36,
// 37}"
"rev64 v16.8b, v0.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 b
"rev64 v17.8b, v1.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 g
"rev64 v18.8b, v2.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 r
"rev64 v19.8b, v3.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v20.8b, v4.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v21.8b, v5.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v22.8b, v6.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v23.8b, v7.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v0.8b, v8.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00
"rev64 v1.8b, v9.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00
"rev64 v2.8b, v10.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v3.8b, v11.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v4.8b, v12.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v5.8b, v13.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v6.8b, v14.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v7.8b, v15.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"prfm pldl1keep, [%[inptr0]] \n"
"prfm pldl1keep, [%[inptr1]] \n"
"prfm pldl1keep, [%[inptr2]] \n"
"prfm pldl1keep, [%[inptr3]] \n"
"st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[outptr0]] \n" // 00 10 20 30 04 14 24 34
"st4 {v20.8b, v21.8b, v22.8b, v23.8b}, [%[outptr1]] \n" // 02 12 22 32
"st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[outptr2]] \n" // 01
// 11
// 21
// 31
"st4 {v4.8b, v5.8b, v6.8b, v7.8b}, [%[outptr3]] \n" // 03 13 23 33
"sub %[outptr0], %[outptr0], %[stride_w] \n" //@ ptr - stride_w
"sub %[outptr1], %[outptr1], %[stride_w] \n"
"sub %[outptr2], %[outptr2], %[stride_w] \n"
"sub %[outptr3], %[outptr3], %[stride_w] \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[stride_w] "+r"(stride_w)
:
: "v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23");
}
outptr3 += stride_w - 4;
outptr2 += stride_w - 4;
outptr1 += stride_w - 4;
outptr0 += stride_w - 4;
for (; j < w; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 8;
case 1:
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 8;
case 2:
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 8;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
outptr3 -= 8;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 8;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 8;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 8;
}
}
}
}
#else
void flip_y_hwc_bgra(const uint8_t* src, uint8_t* dst, int w, int h_in) {
int w_in = w * 4;
uint8_t zerobuff[w_in]; // NOLINT
memset(zerobuff, 0, w_in * sizeof(uint8_t));
uint8_t zerobuff2[w_in]; // NOLINT
memset(zerobuff2, 0, w_in * sizeof(uint8_t));
int stride_w = 32;
// 4*8
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * w_in;
const uint8_t* inptr1 = inptr0 + w_in;
const uint8_t* inptr2 = inptr1 + w_in;
const uint8_t* inptr3 = inptr2 + w_in;
uint8_t* outptr0 = dst + (i + 1) * w_in - stride_w; // last
uint8_t* outptr1 = outptr0 + w_in;
uint8_t* outptr2 = outptr1 + w_in;
uint8_t* outptr3 = outptr2 + w_in;
asm volatile(
"pld [%[ptr0]] @ preload a, 64byte\n"
"pld [%[ptr1]] @ preload a, 64byte\n"
"pld [%[ptr2]] @ preload a, 64byte\n"
"pld [%[ptr3]] @ preload a, 64byte\n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
int j = 0;
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
outptr0 = zerobuff2;
case 2:
inptr1 = zerobuff;
outptr1 = zerobuff2;
case 1:
inptr2 = zerobuff;
outptr2 = zerobuff2;
case 0:
inptr3 = zerobuff;
outptr3 = zerobuff2;
default:
break;
}
}
for (; j < w - 7; j += 8) {
asm volatile(
"vld4.8 {d0, d1, d2, d3}, [%[inptr0]]! @ zip load r0, d0 =00 01 "
"02 03 04 05 06 07\n"
"vld4.8 {d4, d5, d6, d7}, [%[inptr1]]! @ zip load r1, d2 =10 11 "
"12 13 14 15 16 17\n"
"vld4.8 {d8, d9, d10, d11}, [%[inptr2]]! @ zip load r1, d4 =20 21 "
"22 23 24 25 26 27\n"
"vld4.8 {d12, d13, d14, d15}, [%[inptr3]]! @ zip load r1, d6 = 30 "
"31 32 33 34 35 36 37\n"
"vrev64.8 d16, d0 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d17, d1 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d18, d2 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d19, d3 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d20, d4 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d21, d5 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d22, d6 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d23, d7 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d0, d8 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d1, d9 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d2, d10 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d3, d11 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d4, d12 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d5, d13 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d6, d14 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d7, d15 @ reverse 07 06 05 04 03 02 01 00 \n"
"pld [%[inptr0]] @ preload a, 64byte\n"
"pld [%[inptr1]] @ preload a, 64byte\n"
"pld [%[inptr2]] @ preload a, 64byte\n"
"pld [%[inptr3]] @ preload a, 64byte\n"
"vst4.8 {d16, d17, d18, d19}, [%[outptr0]] @ write "
"d0(q0,low),r00,r10 20 30\n"
"vst4.8 {d20, d21, d22, d23}, [%[outptr1]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"vst4.8 {d0, d1, d2, d3}, [%[outptr2]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"vst4.8 {d4, d5, d6, d7}, [%[outptr3]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"sub %[outptr0], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr1], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr2], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr3], %[stride_w] @ ptr - stride_w \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[stride_w] "+r"(stride_w)
:
: "q0",
"q1",
"q2",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12");
}
outptr3 += stride_w - 4;
outptr2 += stride_w - 4;
outptr1 += stride_w - 4;
outptr0 += stride_w - 4;
for (; j < w; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 8;
case 1:
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 8;
case 2:
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 8;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
outptr3 -= 8;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 8;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 8;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 8;
}
}
}
}
#endif
/*
bgr1 bgr2 bgr3
bgr4 bgr5 bgr6
bgr7 bgr8 bgr9
flip:
bgr9 bgr8 bgr7
bgr6 bgr5 bgr4
bgr3 bgr2 bgr1
*/
#ifdef __aarch64__
void flip_xy_hwc_bgra(const uint8_t* src, uint8_t* dst, int w, int h_in) {
int stride_w = 32;
int w_in = w * 4;
uint8_t zerobuff[w_in]; // NOLINT
memset(zerobuff, 0, w_in * sizeof(uint8_t));
uint8_t zerobuff2[w_in]; // NOLINT
memset(zerobuff2, 0, w_in * sizeof(uint8_t));
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * w_in;
const uint8_t* inptr1 = inptr0 + w_in;
const uint8_t* inptr2 = inptr1 + w_in;
const uint8_t* inptr3 = inptr2 + w_in;
uint8_t* outptr0 = dst + (h_in - i) * w_in - stride_w; // last col
uint8_t* outptr1 = outptr0 - w_in;
uint8_t* outptr2 = outptr1 - w_in;
uint8_t* outptr3 = outptr2 - w_in;
asm volatile(
"prfm pldl1keep, [%[ptr0]] \n"
"prfm pldl1keep, [%[ptr1]] \n"
"prfm pldl1keep, [%[ptr2]] \n"
"prfm pldl1keep, [%[ptr3]] \n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
outptr0 = zerobuff2;
case 2:
inptr1 = zerobuff;
outptr1 = zerobuff2;
case 1:
inptr2 = zerobuff;
outptr2 = zerobuff2;
case 0:
inptr3 = zerobuff;
outptr3 = zerobuff2;
default:
break;
}
}
int j = 0;
for (; j < w - 7; j += 8) {
asm volatile(
"ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[inptr0]], #32 \n" // v0={00,01,02,
// 03,
// 04,
// 05,
// 06,
// 07}"
"ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [%[inptr1]], #32 \n" // v0={10,11,12,
// 13,
// 14,
// 15,
// 16,
// 17}"
"ld4 {v8.8b, v9.8b, v10.8b, v11.8b}, [%[inptr2]], #32 \n" // v0={20,21,22,
// 23,
// 24,
// 25,
// 26,
// 27}"
"ld4 {v12.8b, v13.8b, v14.8b, v15.8b}, [%[inptr3]], #32 \n" // v0={30,31,32,
// 33,
// 34,
// 35,
// 36,
// 37}"
"rev64 v16.8b, v0.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 b
"rev64 v17.8b, v1.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 g
"rev64 v18.8b, v2.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 r
"rev64 v19.8b, v3.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v20.8b, v4.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v21.8b, v5.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v22.8b, v6.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v23.8b, v7.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v0.8b, v8.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00
"rev64 v1.8b, v9.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00
"rev64 v2.8b, v10.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v3.8b, v11.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v4.8b, v12.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v5.8b, v13.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v6.8b, v14.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v7.8b, v15.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"prfm pldl1keep, [%[inptr0]] \n"
"prfm pldl1keep, [%[inptr1]] \n"
"prfm pldl1keep, [%[inptr2]] \n"
"prfm pldl1keep, [%[inptr3]] \n"
"st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[outptr0]] \n" // 00 10 20 30 04 14 24 34
"st4 {v20.8b, v21.8b, v22.8b, v23.8b}, [%[outptr1]] \n" // 02 12 22 32
"st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[outptr2]] \n" // 01
// 11
// 21
// 31
"st4 {v4.8b, v5.8b, v6.8b, v7.8b}, [%[outptr3]] \n" // 03 13 23 33
"sub %[outptr0], %[outptr0], %[stride_w] \n" //@ ptr - stride_w
"sub %[outptr1], %[outptr1], %[stride_w] \n"
"sub %[outptr2], %[outptr2], %[stride_w] \n"
"sub %[outptr3], %[outptr3], %[stride_w] \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[stride_w] "+r"(stride_w)
:
: "v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23");
}
outptr3 += stride_w - 4;
outptr2 += stride_w - 4;
outptr1 += stride_w - 4;
outptr0 += stride_w - 4;
for (; j < w; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 8;
case 1:
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 8;
case 2:
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 8;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
outptr3 -= 8;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 8;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 8;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 8;
}
}
}
}
#else
void flip_xy_hwc_bgra(const uint8_t* src, uint8_t* dst, int w, int h_in) {
int w_in = w * 4;
uint8_t zerobuff[w_in]; // NOLINT
memset(zerobuff, 0, w_in * sizeof(uint8_t));
uint8_t zerobuff2[w_in]; // NOLINT
memset(zerobuff2, 0, w_in * sizeof(uint8_t));
int stride_w = 32;
// 4*8
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * w_in;
const uint8_t* inptr1 = inptr0 + w_in;
const uint8_t* inptr2 = inptr1 + w_in;
const uint8_t* inptr3 = inptr2 + w_in;
uint8_t* outptr0 = dst + (h_in - i) * w_in - stride_w; // last
uint8_t* outptr1 = outptr0 - w_in;
uint8_t* outptr2 = outptr1 - w_in;
uint8_t* outptr3 = outptr2 - w_in;
asm volatile(
"pld [%[ptr0]] @ preload a, 64byte\n"
"pld [%[ptr1]] @ preload a, 64byte\n"
"pld [%[ptr2]] @ preload a, 64byte\n"
"pld [%[ptr3]] @ preload a, 64byte\n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
outptr0 = zerobuff2;
case 2:
inptr1 = zerobuff;
outptr1 = zerobuff2;
case 1:
inptr2 = zerobuff;
outptr2 = zerobuff2;
case 0:
inptr3 = zerobuff;
outptr3 = zerobuff2;
default:
break;
}
}
int j = 0;
for (; j < w - 7; j += 8) {
asm volatile(
"vld4.8 {d0, d1, d2, d3}, [%[inptr0]]! @ zip load r0, d0 =00 01 "
"02 03 04 05 06 07\n"
"vld4.8 {d4, d5, d6, d7}, [%[inptr1]]! @ zip load r1, d2 =10 11 "
"12 13 14 15 16 17\n"
"vld4.8 {d8, d9, d10, d11}, [%[inptr2]]! @ zip load r1, d4 =20 21 "
"22 23 24 25 26 27\n"
"vld4.8 {d12, d13, d14, d15}, [%[inptr3]]! @ zip load r1, d6 = 30 "
"31 32 33 34 35 36 37\n"
"vrev64.8 d16, d0 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d17, d1 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d18, d2 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d19, d3 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d20, d4 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d21, d5 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d22, d6 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d23, d7 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d0, d8 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d1, d9 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d2, d10 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d3, d11 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d4, d12 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d5, d13 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d6, d14 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d7, d15 @ reverse 07 06 05 04 03 02 01 00 \n"
"pld [%[inptr0]] @ preload a, 64byte\n"
"pld [%[inptr1]] @ preload a, 64byte\n"
"pld [%[inptr2]] @ preload a, 64byte\n"
"pld [%[inptr3]] @ preload a, 64byte\n"
"vst4.8 {d16, d17, d18, d19}, [%[outptr0]] @ write "
"d0(q0,low),r00,r10 20 30\n"
"vst4.8 {d20, d21, d22, d23}, [%[outptr1]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"vst4.8 {d0, d1, d2, d3}, [%[outptr2]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"vst4.8 {d4, d5, d6, d7}, [%[outptr3]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"sub %[outptr0], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr1], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr2], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr3], %[stride_w] @ ptr - stride_w \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[stride_w] "+r"(stride_w)
:
: "q0",
"q1",
"q2",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12");
}
outptr3 += stride_w - 4;
outptr2 += stride_w - 4;
outptr1 += stride_w - 4;
outptr0 += stride_w - 4;
for (; j < w; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 8;
case 1:
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 8;
case 2:
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 8;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
outptr3 -= 8;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 8;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 8;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 8;
}
}
}
}
#endif
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <limits.h>
#include <math.h>
#include "lite/tests/cv/anakin/cv_utils.h"
void resize_four_channel(
const uint8_t* src, int w_in, int h_in, uint8_t* dst, int w_out, int h_out);
void bgra_resize(const uint8_t* src,
uint8_t* dst,
int w_in,
int h_in,
int w_out,
int h_out) {
if (w_out == w_in && h_out == h_in) {
memcpy(dst, src, sizeof(char) * w_in * h_in * 4);
return;
}
// y
resize_four_channel(src, w_in * 4, h_in, dst, w_out * 4, h_out);
}
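// Hedged usage sketch (pointers and sizes are placeholders): shrink a BGRA
// image to half size. Note that the wrapper above forwards byte widths
// (w * 4), so resize_four_channel interpolates the four channels of each
// pixel together.
//   bgra_resize(src_bgra, dst_bgra, w_in, h_in, w_in / 2, h_in / 2);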
void resize_four_channel(const uint8_t* src,
int w_in,
int h_in,
uint8_t* dst,
int w_out,
int h_out) {
const int resize_coef_bits = 11;
const int resize_coef_scale = 1 << resize_coef_bits;
double scale_x = static_cast<double>(w_in) / w_out;
double scale_y = static_cast<double>(h_in) / h_out;
int* buf = new int[w_out * 2 + h_out * 2];
int* xofs = buf; // new int[w];
int* yofs = buf + w_out; // new int[h];
int16_t* ialpha =
reinterpret_cast<int16_t*>(buf + w_out + h_out); // new int16_t[w * 2];
int16_t* ibeta = reinterpret_cast<int16_t*>(buf + w_out * 2 +
h_out); // new int16_t[h * 2];
float fx = 0.f;
float fy = 0.f;
  int sx = 0;
  int sy = 0;
#define SATURATE_CAST_int16_t(X) \
(int16_t)::std::min( \
::std::max(static_cast<int>(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), \
SHRT_MAX);
for (int dx = 0; dx < w_out / 4; dx++) {
fx = static_cast<float>((dx + 0.5) * scale_x - 0.5);
sx = floor(fx);
fx -= sx;
if (sx < 0) {
sx = 0;
fx = 0.f;
}
if (sx >= w_in - 1) {
sx = w_in - 2;
fx = 1.f;
}
xofs[dx] = sx * 4;
float a0 = (1.f - fx) * resize_coef_scale;
float a1 = fx * resize_coef_scale;
ialpha[dx * 2] = SATURATE_CAST_int16_t(a0);
ialpha[dx * 2 + 1] = SATURATE_CAST_int16_t(a1);
}
for (int dy = 0; dy < h_out; dy++) {
fy = static_cast<float>((dy + 0.5) * scale_y - 0.5);
sy = floor(fy);
fy -= sy;
if (sy < 0) {
sy = 0;
fy = 0.f;
}
if (sy >= h_in - 1) {
sy = h_in - 2;
fy = 1.f;
}
yofs[dy] = sy;
float b0 = (1.f - fy) * resize_coef_scale;
float b1 = fy * resize_coef_scale;
ibeta[dy * 2] = SATURATE_CAST_int16_t(b0);
ibeta[dy * 2 + 1] = SATURATE_CAST_int16_t(b1);
}
#undef SATURATE_CAST_int16_t
// loop body
int16_t* rowsbuf0 = new int16_t[w_out + 1];
int16_t* rowsbuf1 = new int16_t[w_out + 1];
int16_t* rows0 = rowsbuf0;
int16_t* rows1 = rowsbuf1;
int prev_sy1 = -1;
for (int dy = 0; dy < h_out; dy++) {
int sy = yofs[dy];
if (sy == prev_sy1) {
// hresize one row
int16_t* rows0_old = rows0;
rows0 = rows1;
rows1 = rows0_old;
const uint8_t* S1 = src + w_in * (sy + 1);
const int16_t* ialphap = ialpha;
int16_t* rows1p = rows1;
for (int dx = 0; dx < w_out / 4; dx++) {
int sx = xofs[dx];
int16_t a0 = ialphap[0];
int16_t a1 = ialphap[1];
const uint8_t* S1p = S1 + sx;
int tmp = dx * 4;
rows1p[tmp] = (S1p[0] * a0 + S1p[4] * a1) >> 4;
rows1p[tmp + 1] = (S1p[1] * a0 + S1p[5] * a1) >> 4;
rows1p[tmp + 2] = (S1p[2] * a0 + S1p[6] * a1) >> 4;
rows1p[tmp + 3] = (S1p[3] * a0 + S1p[7] * a1) >> 4;
ialphap += 2;
}
} else {
// hresize two rows
const uint8_t* S0 = src + w_in * (sy);
const uint8_t* S1 = src + w_in * (sy + 1);
const int16_t* ialphap = ialpha;
int16_t* rows0p = rows0;
int16_t* rows1p = rows1;
for (int dx = 0; dx < w_out / 4; dx++) {
int sx = xofs[dx];
int16_t a0 = ialphap[0];
int16_t a1 = ialphap[1];
const uint8_t* S0p = S0 + sx;
const uint8_t* S1p = S1 + sx;
int tmp = dx * 4;
rows0p[tmp] = (S0p[0] * a0 + S0p[4] * a1) >> 4;
rows1p[tmp] = (S1p[0] * a0 + S1p[4] * a1) >> 4;
rows0p[tmp + 1] = (S0p[1] * a0 + S0p[5] * a1) >> 4;
rows1p[tmp + 1] = (S1p[1] * a0 + S1p[5] * a1) >> 4;
rows0p[tmp + 2] = (S0p[2] * a0 + S0p[6] * a1) >> 4;
rows1p[tmp + 2] = (S1p[2] * a0 + S1p[6] * a1) >> 4;
rows0p[tmp + 3] = (S0p[3] * a0 + S0p[7] * a1) >> 4;
rows1p[tmp + 3] = (S1p[3] * a0 + S1p[7] * a1) >> 4;
ialphap += 2;
}
}
prev_sy1 = sy + 1;
// vresize
int16_t b0 = ibeta[0];
int16_t b1 = ibeta[1];
int16_t* rows0p = rows0;
int16_t* rows1p = rows1;
uint8_t* dp_ptr = dst + w_out * (dy);
int cnt = w_out >> 3;
int remain = w_out - (cnt << 3);
int16x4_t _b0 = vdup_n_s16(b0);
int16x4_t _b1 = vdup_n_s16(b1);
int32x4_t _v2 = vdupq_n_s32(2);
for (cnt = w_out >> 3; cnt > 0; cnt--) {
int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p + 4);
int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p + 4);
int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0);
int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
int32x4_t _acc = _v2;
_acc = vsraq_n_s32(
_acc, _rows0p_sr4_mb0, 16); // _acc >> 16 + _rows0p_sr4_mb0 >> 16
_acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
int32x4_t _acc_1 = _v2;
_acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
_acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
int16x4_t _acc16 = vshrn_n_s32(_acc, 2); // _acc >> 2
int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
uint8x8_t _dout = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));
vst1_u8(dp_ptr, _dout);
dp_ptr += 8;
rows0p += 8;
rows1p += 8;
}
for (; remain; --remain) {
// D[x] = (rows0[x]*b0 + rows1[x]*b1) >> INTER_RESIZE_COEF_BITS;
*dp_ptr++ =
(uint8_t)(((int16_t)((b0 * (int16_t)(*rows0p++)) >> 16) +
(int16_t)((b1 * (int16_t)(*rows1p++)) >> 16) + 2) >>
2);
}
ibeta += 2;
}
delete[] buf;
delete[] rowsbuf0;
delete[] rowsbuf1;
}
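// Sketch of the fixed-point scheme used by resize_four_channel above:
//   a0 + a1 = b0 + b1 = 2^11 (resize_coef_scale)
//   rows[x] = (S0 * a0 + S1 * a1) >> 4     -> horizontally interpolated value
//                                             scaled by 2^7, fits in int16_t
//   dst[x] ~= (rows0[x] * b0 + rows1[x] * b1 + 2 * 2^16) >> 18
// which recovers the bilinear result, since 2^7 * 2^11 = 2^18. The NEON loop
// realizes the last step with two vsraq_n_s32(acc, ..., 16) into an
// accumulator preloaded with 2, followed by vshrn_n_s32(acc, 2); the scalar
// tail does the same arithmetic per element.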
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/tests/cv/anakin/cv_utils.h"
void rotate90_hwc_bgra(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
void rotate270_hwc_bgra(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
void rotate180_hwc_bgra(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
void bgra_rotate_hwc(
const uint8_t* src, uint8_t* dst, int w_in, int h_in, int angle) {
if (angle == 90) {
rotate90_hwc_bgra(src, dst, w_in, h_in);
}
if (angle == 270) {
rotate270_hwc_bgra(src, dst, w_in, h_in);
}
if (angle == 180) {
rotate180_hwc_bgra(src, dst, w_in, h_in);
}
}
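// Hedged usage sketch of the angle convention above (only 90, 180 and 270 are
// handled; dst must hold w_in * h_in * 4 bytes):
//   bgra_rotate_hwc(src, dst, w_in, h_in, 90);   // result is h_in x w_in
//   bgra_rotate_hwc(src, dst, w_in, h_in, 180);  // result stays w_in x h_in
//   bgra_rotate_hwc(src, dst, w_in, h_in, 270);  // result is h_in x w_in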
/*
bgr1 bgr2 bgr3
bgr4 bgr5 bgr6
bgr7 bgr8 bgr9
rotate:
bgr7 bgr4 bgr1
bgr8 bgr5 bgr2
bgr9 bgr6 bgr3
*/
void rotate90_hwc_bgra(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
int w_out = h_in;
int h_out = w_in;
int win = w_in * 4;
int wout = w_out * 4;
int hremain = h_in % 8;
int stride_h = 4 * win;
int stride_h_w = 4 * win - 32;
int ww = w_out - 8;
// block 8*8. -- 8*8
int i = 0;
for (i = 0; i < h_in - 7; i += 8) {
const uint8_t* inptr0 = src + i * win;
const uint8_t* inptr1 = inptr0 + win;
const uint8_t* inptr2 = inptr1 + win;
const uint8_t* inptr3 = inptr2 + win;
const uint8_t* inptr4 = inptr3 + win;
const uint8_t* inptr5 = inptr4 + win;
const uint8_t* inptr6 = inptr5 + win;
const uint8_t* inptr7 = inptr6 + win;
#ifdef __aarch64__
asm volatile(
"prfm pldl1keep, [%[ptr0]] \n"
"prfm pldl1keep, [%[ptr0], #64] \n"
"prfm pldl1keep, [%[ptr1]] \n"
"prfm pldl1keep, [%[ptr1], #64] \n"
"prfm pldl1keep, [%[ptr2]] \n"
"prfm pldl1keep, [%[ptr2], #64] \n"
"prfm pldl1keep, [%[ptr3]] \n"
"prfm pldl1keep, [%[ptr3], #64] \n"
"prfm pldl1keep, [%[ptr4]] \n"
"prfm pldl1keep, [%[ptr4], #64] \n"
"prfm pldl1keep, [%[ptr5]] \n"
"prfm pldl1keep, [%[ptr5], #64] \n"
"prfm pldl1keep, [%[ptr6]] \n"
"prfm pldl1keep, [%[ptr6], #64] \n"
"prfm pldl1keep, [%[ptr7]] \n"
"prfm pldl1keep, [%[ptr7], #64] \n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3),
[ptr4] "r"(inptr4),
[ptr5] "r"(inptr5),
[ptr6] "r"(inptr6),
[ptr7] "r"(inptr7)
: "memory");
#else
asm volatile(
"pld [%[ptr0]] @ preload a, 64byte\n"
"pld [%[ptr0], #64] @ preload a, 64byte\n"
"pld [%[ptr1]] @ preload a, 64byte\n"
"pld [%[ptr1], #64] @ preload a, 64byte\n"
"pld [%[ptr2]] @ preload a, 64byte\n"
"pld [%[ptr2], #64] @ preload a, 64byte\n"
"pld [%[ptr3]] @ preload a, 64byte\n"
"pld [%[ptr3], #64] @ preload a, 64byte\n"
"pld [%[ptr4]] @ preload a, 64byte\n"
"pld [%[ptr4], #64] @ preload a, 64byte\n"
"pld [%[ptr5]] @ preload a, 64byte\n"
"pld [%[ptr5], #64] @ preload a, 64byte\n"
"pld [%[ptr6]] @ preload a, 64byte\n"
"pld [%[ptr6], #64] @ preload a, 64byte\n"
"pld [%[ptr7]] @ preload a, 64byte\n"
"pld [%[ptr7], #64] @ preload a, 64byte\n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3),
[ptr4] "r"(inptr4),
[ptr5] "r"(inptr5),
[ptr6] "r"(inptr6),
[ptr7] "r"(inptr7)
: "memory");
#endif
int j = 0;
for (; j < w_in; j++) {
int tmpx = (ww - i) * 4;
uint8_t* outptr = dst + j * wout + tmpx;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
}
}
ww = w_out - 1;
for (; i < h_in; i++) {
const uint8_t* inptr0 = src + i * win;
for (int j = 0; j < w_in; j++) {
uint8_t* outptr0 = dst + j * wout + (ww - i) * 4;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
}
}
}
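// A minimal scalar sketch of the mapping rotate90_hwc_bgra implements
// (clockwise 90 degrees, 4-channel HWC): dst(j, h_in - 1 - i, c) = src(i, j, c).
// Illustrative reference only; the helper name below is not part of the
// original file.
static void rotate90_hwc_bgra_ref(const uint8_t* src,
                                  uint8_t* dst,
                                  int w_in,
                                  int h_in) {
  int w_out = h_in;  // the rotated image is h_in pixels wide
  for (int i = 0; i < h_in; i++) {
    for (int j = 0; j < w_in; j++) {
      const uint8_t* sp = src + (i * w_in + j) * 4;
      uint8_t* dp = dst + (j * w_out + (h_in - 1 - i)) * 4;
      dp[0] = sp[0];
      dp[1] = sp[1];
      dp[2] = sp[2];
      dp[3] = sp[3];
    }
  }
}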
/*
bgr1 bgr2 bgr3
bgr4 bgr5 bgr6
bgr7 bgr8 bgr9
rotate:
bgr3 bgr6 bgr9
bgr2 bgr5 bgr8
bgr1 bgr4 bgr7
*/
// dst = (h_out - 1) * w_out
// Similar to rotate90, but the result is written out in reverse order;
// equivalently, rotate90 first and then flip along the Y axis.
void rotate270_hwc_bgra(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
int w_out = h_in;
int h_out = w_in;
int win = w_in * 4;
int wout = w_out * 4;
int hremain = h_in % 8;
int stride_h = 4 * win;
int stride_h_w = 4 * win - 32;
int hout = h_out - 1;
// block 8*8. -- 8*8
int i = 0;
for (; i < h_in - 7; i += 8) {
const uint8_t* inptr0 = src + i * win;
const uint8_t* inptr1 = inptr0 + win;
const uint8_t* inptr2 = inptr1 + win;
const uint8_t* inptr3 = inptr2 + win;
const uint8_t* inptr4 = inptr3 + win;
const uint8_t* inptr5 = inptr4 + win;
const uint8_t* inptr6 = inptr5 + win;
const uint8_t* inptr7 = inptr6 + win;
int j = 0;
#ifdef __aarch64__
asm volatile(
"prfm pldl1keep, [%[ptr0]] \n"
"prfm pldl1keep, [%[ptr0], #64] \n"
"prfm pldl1keep, [%[ptr1]] \n"
"prfm pldl1keep, [%[ptr1], #64] \n"
"prfm pldl1keep, [%[ptr2]] \n"
"prfm pldl1keep, [%[ptr2], #64] \n"
"prfm pldl1keep, [%[ptr3]] \n"
"prfm pldl1keep, [%[ptr3], #64] \n"
"prfm pldl1keep, [%[ptr4]] \n"
"prfm pldl1keep, [%[ptr4], #64] \n"
"prfm pldl1keep, [%[ptr5]] \n"
"prfm pldl1keep, [%[ptr5], #64] \n"
"prfm pldl1keep, [%[ptr6]] \n"
"prfm pldl1keep, [%[ptr6], #64] \n"
"prfm pldl1keep, [%[ptr7]] \n"
"prfm pldl1keep, [%[ptr7], #64] \n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3),
[ptr4] "r"(inptr4),
[ptr5] "r"(inptr5),
[ptr6] "r"(inptr6),
[ptr7] "r"(inptr7)
: "memory");
#else
asm volatile(
"pld [%[ptr0]] @ preload a, 64byte\n"
"pld [%[ptr0], #64] @ preload a, 64byte\n"
"pld [%[ptr1]] @ preload a, 64byte\n"
"pld [%[ptr1], #64] @ preload a, 64byte\n"
"pld [%[ptr2]] @ preload a, 64byte\n"
"pld [%[ptr2], #64] @ preload a, 64byte\n"
"pld [%[ptr3]] @ preload a, 64byte\n"
"pld [%[ptr3], #64] @ preload a, 64byte\n"
"pld [%[ptr4]] @ preload a, 64byte\n"
"pld [%[ptr4], #64] @ preload a, 64byte\n"
"pld [%[ptr5]] @ preload a, 64byte\n"
"pld [%[ptr5], #64] @ preload a, 64byte\n"
"pld [%[ptr6]] @ preload a, 64byte\n"
"pld [%[ptr6], #64] @ preload a, 64byte\n"
"pld [%[ptr7]] @ preload a, 64byte\n"
"pld [%[ptr7], #64] @ preload a, 64byte\n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3),
[ptr4] "r"(inptr4),
[ptr5] "r"(inptr5),
[ptr6] "r"(inptr6),
[ptr7] "r"(inptr7)
: "memory");
#endif
for (; j < w_in; j++) {
int tmpx = i * 4;
uint8_t* outptr = dst + (hout - j) * wout + tmpx;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
}
}
for (; i < h_in; i++) {
const uint8_t* inptr0 = src + i * win;
for (int j = 0; j < w_in; j++) {
uint8_t* outptr0 = dst + (hout - j) * wout + i * 4;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
}
}
}
/*
bgr1 bgr2 bgr3
bgr4 bgr5 bgr6
bgr7 bgr8 bgr9
rotate:
bgr9 bgr8 bgr7
bgr6 bgr5 bgr4
bgr3 bgr2 bgr1
*/
// flip both x and y (i.e. rotate 180 degrees)
void rotate180_hwc_bgra(const uint8_t* src, uint8_t* dst, int w, int h_in) {
int w_in = w * 4;
uint8_t zerobuff[w_in]; // NOLINT
memset(zerobuff, 0, w_in * sizeof(uint8_t));
int stride_w = 4;
// 4*8
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * w_in;
const uint8_t* inptr1 = inptr0 + w_in;
const uint8_t* inptr2 = inptr1 + w_in;
const uint8_t* inptr3 = inptr2 + w_in;
uint8_t* outptr0 = dst + (h_in - i) * w_in - stride_w; // last
uint8_t* outptr1 = outptr0 - w_in;
uint8_t* outptr2 = outptr1 - w_in;
uint8_t* outptr3 = outptr2 - w_in;
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
case 2:
inptr1 = zerobuff;
case 1:
inptr2 = zerobuff;
case 0:
inptr3 = zerobuff;
default:
break;
}
}
#ifdef __aarch64__
asm volatile(
"prfm pldl1keep, [%[ptr0]] \n"
"prfm pldl1keep, [%[ptr0], #64] \n"
"prfm pldl1keep, [%[ptr1]] \n"
"prfm pldl1keep, [%[ptr1], #64] \n"
"prfm pldl1keep, [%[ptr2]] \n"
"prfm pldl1keep, [%[ptr2], #64] \n"
"prfm pldl1keep, [%[ptr3]] \n"
"prfm pldl1keep, [%[ptr3], #64] \n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
#else
asm volatile(
"pld [%[ptr0]] @ preload a, 64byte\n"
"pld [%[ptr0], #64] @ preload a, 64byte\n"
"pld [%[ptr1]] @ preload a, 64byte\n"
"pld [%[ptr1], #64] @ preload a, 64byte\n"
"pld [%[ptr2]] @ preload a, 64byte\n"
"pld [%[ptr2], #64] @ preload a, 64byte\n"
"pld [%[ptr3]] @ preload a, 64byte\n"
"pld [%[ptr3], #64] @ preload a, 64byte\n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
#endif
int j = 0;
for (; j < w; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 8;
case 1:
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 8;
case 2:
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 8;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
outptr3 -= 8;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 8;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 8;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 8;
}
}
}
}
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/tests/cv/anakin/cv_utils.h"
void bgra_to_tensor_hwc(const uint8_t* bgr,
Tensor& output, // NOLINT
int width,
int height,
float* means,
float* scales) {
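  // Layout sketch of the loops below (a restatement, no new behaviour): input
  // is HWC BGRA, 4 bytes per pixel; output is planar float, blue plane first,
  // each plane width * height elements long:
  //   b[i * width + j] = (bgr[(i * width + j) * 4 + 0] - means[2]) * scales[2]
  //   g[i * width + j] = (bgr[(i * width + j) * 4 + 1] - means[1]) * scales[1]
  //   r[i * width + j] = (bgr[(i * width + j) * 4 + 2] - means[0]) * scales[0]
  // The alpha byte is read and discarded.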
int size = width * height;
float* ptr0 = output.mutable_data<float>();
float r_means = means[0];
float g_means = means[1];
float b_means = means[2];
float r_scales = scales[0];
float g_scales = scales[1];
float b_scales = scales[2];
int dim8 = width >> 3;
  int remain = width - (dim8 << 3);
float32x4_t vrmean = vdupq_n_f32(r_means);
float32x4_t vgmean = vdupq_n_f32(g_means);
float32x4_t vbmean = vdupq_n_f32(b_means);
float32x4_t vrscale = vdupq_n_f32(r_scales);
float32x4_t vgscale = vdupq_n_f32(g_scales);
float32x4_t vbscale = vdupq_n_f32(b_scales);
for (int i = 0; i < height; i++) {
const uint8_t* ptr_bgr = bgr + i * width * 4;
float* ptr0_b = ptr0 + i * width;
float* ptr1_g = ptr0_b + size;
float* ptr2_r = ptr1_g + size;
for (int j = 0; j < dim8; j++) {
uint8x8x4_t vbgr = vld4_u8(ptr_bgr);
uint8x8_t vb = vbgr.val[0];
uint8x8_t vg = vbgr.val[1];
uint8x8_t vr = vbgr.val[2];
uint16x8_t vb_16 = vmovl_u8(vb);
uint16x8_t vg_16 = vmovl_u8(vg);
uint16x8_t vr_16 = vmovl_u8(vr);
uint32x4_t vb_low_32 = vmovl_u16(vget_low_u16(vb_16));
uint32x4_t vg_low_32 = vmovl_u16(vget_low_u16(vg_16));
uint32x4_t vr_low_32 = vmovl_u16(vget_low_u16(vr_16));
uint32x4_t vb_high_32 = vmovl_u16(vget_high_u16(vb_16));
uint32x4_t vg_high_32 = vmovl_u16(vget_high_u16(vg_16));
uint32x4_t vr_high_32 = vmovl_u16(vget_high_u16(vr_16));
float32x4_t vb_low_f32 = vcvtq_f32_u32(vb_low_32);
float32x4_t vr_low_f32 = vcvtq_f32_u32(vr_low_32);
float32x4_t vg_low_f32 = vcvtq_f32_u32(vg_low_32);
float32x4_t vb_high_f32 = vcvtq_f32_u32(vb_high_32);
float32x4_t vg_high_f32 = vcvtq_f32_u32(vg_high_32);
float32x4_t vr_high_f32 = vcvtq_f32_u32(vr_high_32);
vb_low_f32 = vsubq_f32(vb_low_f32, vbmean);
vg_low_f32 = vsubq_f32(vg_low_f32, vgmean);
vr_low_f32 = vsubq_f32(vr_low_f32, vrmean);
vb_high_f32 = vsubq_f32(vb_high_f32, vbmean);
vg_high_f32 = vsubq_f32(vg_high_f32, vgmean);
vr_high_f32 = vsubq_f32(vr_high_f32, vrmean);
vb_low_f32 = vmulq_f32(vb_low_f32, vbscale);
vg_low_f32 = vmulq_f32(vg_low_f32, vgscale);
vr_low_f32 = vmulq_f32(vr_low_f32, vrscale);
vb_high_f32 = vmulq_f32(vb_high_f32, vbscale);
vg_high_f32 = vmulq_f32(vg_high_f32, vgscale);
vr_high_f32 = vmulq_f32(vr_high_f32, vrscale);
vst1q_f32(ptr0_b, vb_low_f32);
vst1q_f32(ptr1_g, vg_low_f32);
vst1q_f32(ptr2_r, vr_low_f32);
ptr_bgr += 32;
vst1q_f32(ptr0_b + 4, vb_high_f32);
vst1q_f32(ptr1_g + 4, vg_high_f32);
vst1q_f32(ptr2_r + 4, vr_high_f32);
ptr0_b += 8;
ptr1_g += 8;
ptr2_r += 8;
}
for (int j = 0; j < remain; j++) {
*ptr0_b++ = (*ptr_bgr - b_means) * b_scales;
ptr_bgr++;
*ptr1_g++ = (*ptr_bgr - g_means) * g_scales;
ptr_bgr++;
*ptr2_r++ = (*ptr_bgr - r_means) * r_scales;
ptr_bgr++;
ptr_bgr++;
}
}
}
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/tests/cv/anakin/cv_utils.h"
void image_basic_convert(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
ImageFormat dstFormat,
int srcw,
int srch,
int out_size) {
if (srcFormat == dstFormat) {
// copy
memcpy(dst, src, sizeof(uint8_t) * out_size);
return;
} else {
if (srcFormat == ImageFormat::NV12 &&
(dstFormat == ImageFormat::BGR || dstFormat == ImageFormat::RGB)) {
nv12_to_bgr(src, dst, srcw, srch);
} else if (srcFormat == ImageFormat::NV21 &&
(dstFormat == ImageFormat::BGR ||
dstFormat == ImageFormat::RGB)) {
nv21_to_bgr(src, dst, srcw, srch);
} else if (srcFormat == ImageFormat::NV12 &&
(dstFormat == ImageFormat::BGRA ||
dstFormat == ImageFormat::RGBA)) {
nv12_to_bgra(src, dst, srcw, srch);
} else if (srcFormat == ImageFormat::NV21 &&
(dstFormat == ImageFormat::BGRA ||
dstFormat == ImageFormat::RGBA)) {
nv21_to_bgra(src, dst, srcw, srch);
} else {
printf("bais-anakin srcFormat: %d, dstFormat: %d does not support! \n",
srcFormat,
dstFormat);
}
}
}
void image_basic_resize(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
int srcw,
int srch,
int dstw,
int dsth) {
int size = srcw * srch;
if (srcw == dstw && srch == dsth) {
if (srcFormat == ImageFormat::NV12 || srcFormat == ImageFormat::NV21) {
size = srcw * (static_cast<int>(1.5 * srch));
} else if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) {
size = 3 * srcw * srch;
} else if (srcFormat == ImageFormat::BGRA ||
srcFormat == ImageFormat::RGBA) {
size = 4 * srcw * srch;
}
memcpy(dst, src, sizeof(uint8_t) * size);
return;
} else {
if (srcFormat == ImageFormat::NV12 || srcFormat == ImageFormat::NV21) {
nv21_resize(src, dst, srcw, srch, dstw, dsth);
} else if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) {
bgr_resize(src, dst, srcw, srch, dstw, dsth);
} else if (srcFormat == ImageFormat::BGRA ||
srcFormat == ImageFormat::RGBA) {
bgra_resize(src, dst, srcw, srch, dstw, dsth);
} else {
printf("anakin doesn't support this type: %d\n",
static_cast<int>(srcFormat));
}
}
}
void image_basic_flip(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
int srcw,
int srch,
int flip_num) {
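  // Remap the incoming flip code (-1: both axes, 0: x, 1: y) onto the
  // convention used by the hwc kernels (1: x, -1: y, 0: both axes), as
  // documented in cv_utils.h.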
if (flip_num == -1) {
flip_num = 0; // xy
} else if (flip_num == 0) {
flip_num = 1; // x
} else if (flip_num == 1) {
flip_num = -1; // y
}
if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) {
bgr_flip_hwc(src, dst, srcw, srch, flip_num);
} else if (srcFormat == ImageFormat::BGRA || srcFormat == ImageFormat::RGBA) {
bgra_flip_hwc(src, dst, srcw, srch, flip_num);
} else {
printf("anakin doesn't support this type: %d\n",
static_cast<int>(srcFormat));
}
}
void image_basic_rotate(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
int srcw,
int srch,
float rotate_num) {
if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) {
bgr_rotate_hwc(src, dst, srcw, srch, rotate_num);
} else if (srcFormat == ImageFormat::BGRA || srcFormat == ImageFormat::RGBA) {
bgra_rotate_hwc(src, dst, srcw, srch, rotate_num);
} else {
printf("anakin doesn't support this type: %d\n",
static_cast<int>(srcFormat));
}
}
void image_basic_to_tensor(const uint8_t* in_data,
Tensor dst,
ImageFormat srcFormat,
LayoutType layout,
int srcw,
int srch,
float* means,
float* scales) {
if (layout == LayoutType::kNCHW &&
(srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB)) {
bgr_to_tensor_hwc(in_data, dst, srcw, srch, means, scales);
} else if (layout == LayoutType::kNCHW && (srcFormat == ImageFormat::BGRA ||
srcFormat == ImageFormat::RGBA)) {
bgra_to_tensor_hwc(in_data, dst, srcw, srch, means, scales);
} else {
printf("anakin doesn't support this type: %d\n",
static_cast<int>(srcFormat));
}
}
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <arm_neon.h>
#include "lite/core/tensor.h"
#include "lite/utils/cv/paddle_image_preprocess.h"
typedef paddle::lite::utils::cv::ImageFormat ImageFormat;
typedef paddle::lite::utils::cv::FlipParam FlipParam;
typedef paddle::lite::Tensor Tensor;
typedef paddle::lite_api::DataLayoutType LayoutType;
void rotate(const uint8_t* src, uint8_t* dst, int srcw, int srch, int angle);
void bgr_rotate_hwc(
const uint8_t* src, uint8_t* dst, int srcw, int srch, int angle);
void bgra_rotate_hwc(
const uint8_t* src, uint8_t* dst, int srcw, int srch, int angle);
// x: flip_num = 1 y: flip_num = -1 xy: flip_num = 0;
void flip(const uint8_t* src, uint8_t* dst, int srcw, int srch, int flip_num);
// x: flip_num = 1 y: flip_num = -1 xy: flip_num = 0;
void bgr_flip_hwc(
const uint8_t* src, uint8_t* dst, int srcw, int srch, int flip_num);
// x: flip_num = 1 y: flip_num = -1 xy: flip_num = 0;
void bgra_flip_hwc(
const uint8_t* src, uint8_t* dst, int srcw, int srch, int flip_num);
// y_w = srcw, y_h = srch uv_w = srcw uv_h = 1/2 * srch
void nv21_resize(
const uint8_t* src, uint8_t* dst, int srcw, int srch, int dstw, int dsth);
void bgr_resize(
const uint8_t* src, uint8_t* dst, int srcw, int srch, int dstw, int dsth);
void bgra_resize(
const uint8_t* src, uint8_t* dst, int srcw, int srch, int dstw, int dsth);
// nv21(yvu) to BGR: store hwc dsth * dstw = srch * srcw y_w = srcw
// y_h = srch uv_w = srcw uv_h = 1/2 * srch
void nv21_to_bgr(const uint8_t* src, uint8_t* dst, int srcw, int srch);
// nv12(yuv) to BGR:store hwc dsth * dstw = srch * srcw y_w = srcw
// y_h = srch uv_w = srcw uv_h = 1/2 * srch
void nv12_to_bgr(const uint8_t* src, uint8_t* dst, int srcw, int srch);
// nv21(yvu) to BGRA: store hwc dsth * dstw = srch * srcw y_w = srcw
// y_h = srch uv_w = srcw uv_h = 1/2 * srch
void nv21_to_bgra(const uint8_t* src, uint8_t* dst, int srcw, int srch);
// nv12(yuv) to BGRA:store hwc dsth * dstw = srch * srcw y_w = srcw
// y_h = srch uv_w = srcw uv_h = 1/2 * srch
void nv12_to_bgra(const uint8_t* src, uint8_t* dst, int srcw, int srch);
// bgr output.w == width output.h == height/3
void bgr_to_tensor_hcw(const uint8_t* bgr,
Tensor& output, // NOLINT
int width,
int height,
float* means,
float* scales);
// bgr output.w == width / 3 output.h == height
void bgr_to_tensor_hwc(const uint8_t* bgr,
Tensor& output, // NOLINT
int width,
int height,
float* means,
float* scales);
// bgra output.w == width / 4 output.h == height
void bgra_to_tensor_hwc(const uint8_t* bgr,
Tensor& output, // NOLINT
int width,
int height,
float* means,
float* scales);
// yvu y_w = width, y_h = height uv_w = width uv_h = 1/2 * height
void nv21_to_tensor(const uint8_t* nv21,
Tensor& output, // NOLINT
int width,
int height,
float* means,
float* scales);
// yuv y_w = width, y_h = height uv_w = width uv_h = 1/2 * height
void nv12_to_tensor(const uint8_t* nv12,
Tensor& output, // NOLINT
int width,
int height,
float* means,
float* scales);
// clang-format on
void image_basic_convert(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
ImageFormat dstFormat,
int srcw,
int srch,
int out_size);
void image_basic_resize(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
int srcw,
int srch,
int dstw,
int dsth);
void image_basic_flip(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
int srcw,
int srch,
int flip_num);
void image_basic_rotate(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
int srcw,
int srch,
float rotate_num);
void image_basic_to_tensor(const uint8_t* in_data,
Tensor dst,
ImageFormat srcFormat,
LayoutType layout,
int srcw,
int srch,
float* means,
float* scales);
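// A minimal usage sketch of the image_basic_* wrappers above (dimensions and
// buffer names are illustrative assumptions, not taken from the tests):
//   int srcw = 640, srch = 480;
//   std::vector<uint8_t> nv12(srcw * srch * 3 / 2);
//   std::vector<uint8_t> bgr(srcw * srch * 3);
//   std::vector<uint8_t> bgr_half((srcw / 2) * (srch / 2) * 3);
//   image_basic_convert(nv12.data(), bgr.data(), ImageFormat::NV12,
//                       ImageFormat::BGR, srcw, srch, srcw * srch * 3);
//   image_basic_resize(bgr.data(), bgr_half.data(), ImageFormat::BGR,
//                      srcw, srch, srcw / 2, srch / 2);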
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <math.h>
#include "lite/tests/cv/anakin/cv_utils.h"
/*
R = Y + 1.402*(V-128);
G = Y - 0.34414*(U-128) - 0.71414*(V-128);
B = Y + 1.772*(U-128);
float: a*b = ((a << 7)*b )>>7
ra = 1.402 *128 = 179.456 = 179
ga = 0.34414 * 128 = 44.04992 = 44
gb = 0.71414 * 128 = 91.40992 = 91
ba = 1.772 * 128 = 226.816 = 227
*/
// yuv store hwc bgrbgr dsth * dstw = srch * srcw y_w = srcw
// y_h = srch uv_w = srcw uv_h = 1/2 * srch
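// Worked example of the fixed-point math above (pixel values are illustrative
// assumptions): Y = 100, U = 50, V = 200
//   R = 100 + ((179 * (200 - 128)) >> 7) = 100 + 100 = 200
//   G = 100 - ((44 * (50 - 128) + 91 * (200 - 128)) >> 7) = 100 - 24 = 76
//   B = 100 + ((227 * (50 - 128)) >> 7) = 100 - 139, clamped to 0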
void nv12_to_bgr(const unsigned char* src,
unsigned char* dst,
int srcw,
int srch) {
int y_h = srch;
  int vu_h = srch / 2;
const unsigned char* y = src;
const unsigned char* vu = src + y_h * srcw;
int wout = srcw * 3;
int16x8_t bias = vdupq_n_s16(128);
int16x8_t ga = vdupq_n_s16(44);
int16x8_t ra = vdupq_n_s16(179);
int16x8_t ba = vdupq_n_s16(227);
int16x8_t gb = vdupq_n_s16(91);
int16x8_t zero = vdupq_n_s16(0);
int16x8_t max = vdupq_n_s16(255);
uint8_t* zerobuf = new uint8_t[srcw];
uint8_t* writebuf = new uint8_t[wout];
memset(zerobuf, 0, sizeof(uint8_t) * srcw);
for (int i = 0; i < y_h; i += 2) {
const unsigned char* ptr_y1 = y + i * srcw;
const unsigned char* ptr_y2 = ptr_y1 + srcw;
const unsigned char* ptr_vu = vu + (i / 2) * srcw;
unsigned char* ptr_bgr1 = dst + i * wout;
unsigned char* ptr_bgr2 = ptr_bgr1 + wout;
if (i + 2 > y_h) {
ptr_y2 = zerobuf;
ptr_bgr2 = writebuf;
}
// 2*16
#ifdef __aarch64__
asm volatile(
"prfm pldl1keep, [%[ptr_y1]] \n"
"prfm pldl1keep, [%[ptr_y1], #64] \n"
"prfm pldl1keep, [%[ptr_y2]] \n"
"prfm pldl1keep, [%[ptr_y2], #64] \n"
"prfm pldl1keep, [%[ptr_vu]] \n"
"prfm pldl1keep, [%[ptr_vu], #64] \n"
:
: [ptr_y1] "r"(ptr_y1), [ptr_y2] "r"(ptr_y2), [ptr_vu] "r"(ptr_vu)
: "memory");
#else
asm volatile(
"pld [%[ptr_y1]] @ preload a, 64byte\n"
"pld [%[ptr_y1], #128] @ preload a, 64byte\n"
"pld [%[ptr_y2]] @ preload a, 64byte\n"
"pld [%[ptr_y2], #128] @ preload a, 64byte\n"
"pld [%[ptr_vu]] @ preload a, 64byte\n"
"pld [%[ptr_vu], #128] @ preload a, 64byte\n"
:
: [ptr_y1] "r"(ptr_y1), [ptr_y2] "r"(ptr_y2), [ptr_vu] "r"(ptr_vu)
: "memory");
#endif
int j = 0;
for (; j < srcw - 15; j += 16) {
uint8x8x2_t y1 = vld2_u8(ptr_y1); // d8 = y0y2y4y6...y14 d9 =
// y1y3y5...y15
uint8x8x2_t vu =
vld2_u8(ptr_vu); // d0 = v0v1v2v3v4v5...v7 d1 = u0u1u2...u7
uint8x8x2_t y2 = vld2_u8(ptr_y2);
uint16x8_t v = vmovl_u8(vu.val[1]);
uint16x8_t u = vmovl_u8(vu.val[0]);
int16x8_t v_s = vreinterpretq_s16_u16(v);
int16x8_t u_s = vreinterpretq_s16_u16(u);
int16x8_t v_bias = vsubq_s16(v_s, bias);
int16x8_t u_bias = vsubq_s16(u_s, bias);
// G = Y - 0.34414*(U-128) - 0.71414*(V-128);
int16x8_t g0 = vmulq_s16(ga, u_bias);
// R = Y + 1.402*(V-128);
int16x8_t r0 = vmulq_s16(ra, v_bias);
// B = Y + 1.772*(U-128);
int16x8_t b0 = vmulq_s16(ba, u_bias);
g0 = vmlaq_s16(g0, gb, v_bias);
int16x8_t y1_0_8 = vreinterpretq_s16_u16(vmovl_u8(y1.val[0]));
int16x8_t y1_1_8 = vreinterpretq_s16_u16(vmovl_u8(y1.val[1]));
int16x8_t y2_0_8 = vreinterpretq_s16_u16(vmovl_u8(y2.val[0]));
int16x8_t y2_1_8 = vreinterpretq_s16_u16(vmovl_u8(y2.val[1]));
int16x8_t r0_bias = vshrq_n_s16(r0, 7); // r0 / 128
int16x8_t b0_bias = vshrq_n_s16(b0, 7);
int16x8_t g0_bias = vshrq_n_s16(g0, 7);
int16x8_t r0_1 = vaddq_s16(y1_0_8, r0_bias);
int16x8_t b0_1 = vaddq_s16(y1_0_8, b0_bias);
int16x8_t g0_1 = vsubq_s16(y1_0_8, g0_bias); // g0_1 = y1_0_8 - g0_1
int16x8_t r0_2 = vaddq_s16(y1_1_8, r0_bias);
int16x8_t b0_2 = vaddq_s16(y1_1_8, b0_bias);
int16x8_t g0_2 = vsubq_s16(y1_1_8, g0_bias);
r0_1 = vmaxq_s16(r0_1, zero);
b0_1 = vmaxq_s16(b0_1, zero);
g0_1 = vmaxq_s16(g0_1, zero);
r0_2 = vmaxq_s16(r0_2, zero);
b0_2 = vmaxq_s16(b0_2, zero);
g0_2 = vmaxq_s16(g0_2, zero);
r0_1 = vminq_s16(r0_1, max);
b0_1 = vminq_s16(b0_1, max);
g0_1 = vminq_s16(g0_1, max);
r0_2 = vminq_s16(r0_2, max);
b0_2 = vminq_s16(b0_2, max);
g0_2 = vminq_s16(g0_2, max);
uint8x8_t r00 = vreinterpret_u8_s8(vmovn_s16(r0_1));
uint8x8_t b00 = vreinterpret_u8_s8(vmovn_s16(b0_1));
uint8x8_t g00 = vreinterpret_u8_s8(vmovn_s16(g0_1));
uint8x8_t r01 = vreinterpret_u8_s8(vmovn_s16(r0_2));
uint8x8_t b01 = vreinterpret_u8_s8(vmovn_s16(b0_2));
uint8x8_t g01 = vreinterpret_u8_s8(vmovn_s16(g0_2));
int16x8_t r1_1 = vaddq_s16(y2_0_8, r0_bias);
int16x8_t b1_1 = vaddq_s16(y2_0_8, b0_bias);
int16x8_t g1_1 = vsubq_s16(y2_0_8, g0_bias); // g0_1 = y1_0_8 - g0_1
int16x8_t r1_2 = vaddq_s16(y2_1_8, r0_bias);
int16x8_t b1_2 = vaddq_s16(y2_1_8, b0_bias);
int16x8_t g1_2 = vsubq_s16(y2_1_8, g0_bias);
uint8x8x2_t r00_0 = vtrn_u8(r00, r01); // 014589 236710
uint8x8x2_t b00_0 = vtrn_u8(b00, b01);
uint8x8x2_t g00_0 = vtrn_u8(g00, g01);
r1_1 = vmaxq_s16(r1_1, zero);
b1_1 = vmaxq_s16(b1_1, zero);
g1_1 = vmaxq_s16(g1_1, zero);
r1_2 = vmaxq_s16(r1_2, zero);
b1_2 = vmaxq_s16(b1_2, zero);
g1_2 = vmaxq_s16(g1_2, zero);
uint16x4_t r0_16 = vreinterpret_u16_u8(r00_0.val[0]);
uint16x4_t r1_16 = vreinterpret_u16_u8(r00_0.val[1]);
uint16x4_t b0_16 = vreinterpret_u16_u8(b00_0.val[0]);
uint16x4_t b1_16 = vreinterpret_u16_u8(b00_0.val[1]);
uint16x4_t g0_16 = vreinterpret_u16_u8(g00_0.val[0]);
uint16x4_t g1_16 = vreinterpret_u16_u8(g00_0.val[1]);
uint16x4x2_t r00_1 = vtrn_u16(r0_16, r1_16); // 012389 456710
uint16x4x2_t b00_1 = vtrn_u16(b0_16, b1_16);
uint16x4x2_t g00_1 = vtrn_u16(g0_16, g1_16);
r1_1 = vminq_s16(r1_1, max);
b1_1 = vminq_s16(b1_1, max);
g1_1 = vminq_s16(g1_1, max);
r1_2 = vminq_s16(r1_2, max);
b1_2 = vminq_s16(b1_2, max);
g1_2 = vminq_s16(g1_2, max);
uint32x2_t r0_32 = vreinterpret_u32_u16(r00_1.val[0]);
uint32x2_t r1_32 = vreinterpret_u32_u16(r00_1.val[1]);
uint32x2_t b0_32 = vreinterpret_u32_u16(b00_1.val[0]);
uint32x2_t b1_32 = vreinterpret_u32_u16(b00_1.val[1]);
uint32x2_t g0_32 = vreinterpret_u32_u16(g00_1.val[0]);
uint32x2_t g1_32 = vreinterpret_u32_u16(g00_1.val[1]);
uint32x2x2_t r00_2 = vtrn_u32(r0_32, r1_32); // 01234567 8910
uint32x2x2_t b00_2 = vtrn_u32(b0_32, b1_32);
uint32x2x2_t g00_2 = vtrn_u32(g0_32, g1_32);
r00 = vreinterpret_u8_s8(vmovn_s16(r1_1));
b00 = vreinterpret_u8_s8(vmovn_s16(b1_1));
g00 = vreinterpret_u8_s8(vmovn_s16(g1_1));
r01 = vreinterpret_u8_s8(vmovn_s16(r1_2));
b01 = vreinterpret_u8_s8(vmovn_s16(b1_2));
g01 = vreinterpret_u8_s8(vmovn_s16(g1_2));
uint8x8_t r0_8 = vreinterpret_u8_u32(r00_2.val[0]);
uint8x8_t b0_8 = vreinterpret_u8_u32(b00_2.val[0]);
uint8x8_t g0_8 = vreinterpret_u8_u32(g00_2.val[0]);
uint8x8_t r1_8 = vreinterpret_u8_u32(r00_2.val[1]);
uint8x8_t b1_8 = vreinterpret_u8_u32(b00_2.val[1]);
uint8x8_t g1_8 = vreinterpret_u8_u32(g00_2.val[1]);
uint8x8x3_t v_bgr;
v_bgr.val[0] = b0_8;
v_bgr.val[1] = g0_8;
v_bgr.val[2] = r0_8;
r00_0 = vtrn_u8(r00, r01); // 014589 236710
b00_0 = vtrn_u8(b00, b01);
g00_0 = vtrn_u8(g00, g01);
vst3_u8(ptr_bgr1, v_bgr);
r0_16 = vreinterpret_u16_u8(r00_0.val[0]);
r1_16 = vreinterpret_u16_u8(r00_0.val[1]);
b0_16 = vreinterpret_u16_u8(b00_0.val[0]);
b1_16 = vreinterpret_u16_u8(b00_0.val[1]);
g0_16 = vreinterpret_u16_u8(g00_0.val[0]);
g1_16 = vreinterpret_u16_u8(g00_0.val[1]);
ptr_bgr1 += 24;
uint8x8x3_t v_bgr1;
v_bgr1.val[0] = b1_8;
v_bgr1.val[1] = g1_8;
v_bgr1.val[2] = r1_8;
r00_1 = vtrn_u16(r0_16, r1_16); // 012389 456710
b00_1 = vtrn_u16(b0_16, b1_16);
g00_1 = vtrn_u16(g0_16, g1_16);
vst3_u8(ptr_bgr1, v_bgr1);
r0_32 = vreinterpret_u32_u16(r00_1.val[0]);
r1_32 = vreinterpret_u32_u16(r00_1.val[1]);
b0_32 = vreinterpret_u32_u16(b00_1.val[0]);
b1_32 = vreinterpret_u32_u16(b00_1.val[1]);
g0_32 = vreinterpret_u32_u16(g00_1.val[0]);
g1_32 = vreinterpret_u32_u16(g00_1.val[1]);
ptr_bgr1 += 24;
r00_2 = vtrn_u32(r0_32, r1_32); // 01234567 8910
b00_2 = vtrn_u32(b0_32, b1_32);
g00_2 = vtrn_u32(g0_32, g1_32);
ptr_vu += 16;
ptr_y1 += 16;
ptr_y2 += 16;
r0_8 = vreinterpret_u8_u32(r00_2.val[0]);
b0_8 = vreinterpret_u8_u32(b00_2.val[0]);
g0_8 = vreinterpret_u8_u32(g00_2.val[0]);
r1_8 = vreinterpret_u8_u32(r00_2.val[1]);
b1_8 = vreinterpret_u8_u32(b00_2.val[1]);
g1_8 = vreinterpret_u8_u32(g00_2.val[1]);
v_bgr.val[0] = b0_8;
v_bgr.val[1] = g0_8;
v_bgr.val[2] = r0_8;
v_bgr1.val[0] = b1_8;
v_bgr1.val[1] = g1_8;
v_bgr1.val[2] = r1_8;
vst3_u8(ptr_bgr2, v_bgr);
vst3_u8(ptr_bgr2 + 24, v_bgr1);
ptr_bgr2 += 48;
}
// two data
for (; j < srcw; j += 2) {
unsigned char _y0 = ptr_y1[0];
unsigned char _y1 = ptr_y1[1];
unsigned char _v = ptr_vu[1];
unsigned char _u = ptr_vu[0];
unsigned char _y0_1 = ptr_y2[0];
unsigned char _y1_1 = ptr_y2[1];
int ra = floor((179 * (_v - 128)) >> 7);
int ga = floor((44 * (_u - 128) + 91 * (_v - 128)) >> 7);
int ba = floor((227 * (_u - 128)) >> 7);
int r = _y0 + ra;
int g = _y0 - ga;
int b = _y0 + ba;
int r1 = _y1 + ra;
int g1 = _y1 - ga;
int b1 = _y1 + ba;
r = r < 0 ? 0 : (r > 255) ? 255 : r;
g = g < 0 ? 0 : (g > 255) ? 255 : g;
b = b < 0 ? 0 : (b > 255) ? 255 : b;
r1 = r1 < 0 ? 0 : (r1 > 255) ? 255 : r1;
g1 = g1 < 0 ? 0 : (g1 > 255) ? 255 : g1;
b1 = b1 < 0 ? 0 : (b1 > 255) ? 255 : b1;
*ptr_bgr1++ = b;
*ptr_bgr1++ = g;
*ptr_bgr1++ = r;
int r2 = _y0_1 + ra;
int g2 = _y0_1 - ga;
int b2 = _y0_1 + ba;
int r3 = _y1_1 + ra;
int g3 = _y1_1 - ga;
int b3 = _y1_1 + ba;
r2 = r2 < 0 ? 0 : (r2 > 255) ? 255 : r2;
g2 = g2 < 0 ? 0 : (g2 > 255) ? 255 : g2;
b2 = b2 < 0 ? 0 : (b2 > 255) ? 255 : b2;
r3 = r3 < 0 ? 0 : (r3 > 255) ? 255 : r3;
g3 = g3 < 0 ? 0 : (g3 > 255) ? 255 : g3;
b3 = b3 < 0 ? 0 : (b3 > 255) ? 255 : b3;
*ptr_bgr1++ = b1;
*ptr_bgr1++ = g1;
*ptr_bgr1++ = r1;
*ptr_bgr2++ = b2;
*ptr_bgr2++ = g2;
*ptr_bgr2++ = r2;
ptr_y1 += 2;
ptr_y2 += 2;
ptr_vu += 2;
*ptr_bgr2++ = b3;
*ptr_bgr2++ = g3;
*ptr_bgr2++ = r3;
}
  }
  delete[] zerobuf;
  delete[] writebuf;
}
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <math.h>
#include "lite/tests/cv/anakin/cv_utils.h"
/*
R = Y + 1.402*(V-128);
G = Y - 0.34414*(U-128) - 0.71414*(V-128);
B = Y + 1.772*(U-128);
A = 255
float compute a*b = ((a << 7)*b )>>7
ra = 1.402 *128 = 179.456 = 179
ga = 0.34414 * 128 = 44.04992 = 44
gb = 0.71414 * 128 = 91.40992 = 91
ba = 1.772 * 128 = 226.816 = 227
*/
// yuv store hwc bgrabgra dsth * dstw = srch * srcw y_w = srcw
// y_h = srch uv_w = srcw uv_h = 1/2 * srch
void nv12_to_bgra(const unsigned char* src,
unsigned char* dst,
int srcw,
int srch) {
int y_h = srch;
  int vu_h = srch / 2;
const unsigned char* y = src;
const unsigned char* vu = src + y_h * srcw;
int wout = srcw * 4;
int16x8_t bias = vdupq_n_s16(128);
int16x8_t ga = vdupq_n_s16(44);
int16x8_t ra = vdupq_n_s16(179);
int16x8_t ba = vdupq_n_s16(227);
int16x8_t gb = vdupq_n_s16(91);
int16x8_t zero = vdupq_n_s16(0);
int16x8_t max = vdupq_n_s16(255);
uint8x8_t a_8 = vdup_n_u8(255);
for (int i = 0; i < y_h; i += 2) {
const unsigned char* ptr_y1 = y + i * srcw;
const unsigned char* ptr_y2 = ptr_y1 + srcw;
const unsigned char* ptr_vu = vu + (i / 2) * srcw;
unsigned char* ptr_bgr1 = dst + i * wout;
unsigned char* ptr_bgr2 = ptr_bgr1 + wout;
// 2*16
#ifdef __aarch64__
asm volatile(
"prfm pldl1keep, [%[ptr_y1]] \n"
"prfm pldl1keep, [%[ptr_y1], #64] \n"
"prfm pldl1keep, [%[ptr_y2]] \n"
"prfm pldl1keep, [%[ptr_y2], #64] \n"
"prfm pldl1keep, [%[ptr_vu]] \n"
"prfm pldl1keep, [%[ptr_vu], #64] \n"
:
: [ptr_y1] "r"(ptr_y1), [ptr_y2] "r"(ptr_y2), [ptr_vu] "r"(ptr_vu)
: "memory");
#else
asm volatile(
"pld [%[ptr_y1]] @ preload a, 64byte\n"
"pld [%[ptr_y1], #128] @ preload a, 64byte\n"
"pld [%[ptr_y2]] @ preload a, 64byte\n"
"pld [%[ptr_y2], #128] @ preload a, 64byte\n"
"pld [%[ptr_vu]] @ preload a, 64byte\n"
"pld [%[ptr_vu], #128] @ preload a, 64byte\n"
:
: [ptr_y1] "r"(ptr_y1), [ptr_y2] "r"(ptr_y2), [ptr_vu] "r"(ptr_vu)
: "memory");
#endif
int j = 0;
for (; j < srcw - 15; j += 16) {
uint8x8x2_t y1 = vld2_u8(ptr_y1); // d8 = y0y2y4y6...y14 d9 =
// y1y3y5...y15
uint8x8x2_t vu =
vld2_u8(ptr_vu); // d0 = v0v1v2v3v4v5...v7 d1 = u0u1u2...u7
uint8x8x2_t y2 = vld2_u8(ptr_y2);
uint16x8_t v = vmovl_u8(vu.val[1]);
uint16x8_t u = vmovl_u8(vu.val[0]);
int16x8_t v_s = vreinterpretq_s16_u16(v);
int16x8_t u_s = vreinterpretq_s16_u16(u);
int16x8_t v_bias = vsubq_s16(v_s, bias);
int16x8_t u_bias = vsubq_s16(u_s, bias);
// G = Y - 0.34414*(U-128) - 0.71414*(V-128);
int16x8_t g0 = vmulq_s16(ga, u_bias);
// R = Y + 1.402*(V-128);
int16x8_t r0 = vmulq_s16(ra, v_bias);
// B = Y + 1.772*(U-128);
int16x8_t b0 = vmulq_s16(ba, u_bias);
g0 = vmlaq_s16(g0, gb, v_bias);
int16x8_t y1_0_8 = vreinterpretq_s16_u16(vmovl_u8(y1.val[0]));
int16x8_t y1_1_8 = vreinterpretq_s16_u16(vmovl_u8(y1.val[1]));
int16x8_t y2_0_8 = vreinterpretq_s16_u16(vmovl_u8(y2.val[0]));
int16x8_t y2_1_8 = vreinterpretq_s16_u16(vmovl_u8(y2.val[1]));
int16x8_t r0_bias = vshrq_n_s16(r0, 7); // r0 / 128
int16x8_t b0_bias = vshrq_n_s16(b0, 7);
int16x8_t g0_bias = vshrq_n_s16(g0, 7);
int16x8_t r0_1 = vaddq_s16(y1_0_8, r0_bias);
int16x8_t b0_1 = vaddq_s16(y1_0_8, b0_bias);
int16x8_t g0_1 = vsubq_s16(y1_0_8, g0_bias); // g0_1 = y1_0_8 - g0_1
int16x8_t r0_2 = vaddq_s16(y1_1_8, r0_bias);
int16x8_t b0_2 = vaddq_s16(y1_1_8, b0_bias);
int16x8_t g0_2 = vsubq_s16(y1_1_8, g0_bias);
r0_1 = vmaxq_s16(r0_1, zero);
b0_1 = vmaxq_s16(b0_1, zero);
g0_1 = vmaxq_s16(g0_1, zero);
r0_2 = vmaxq_s16(r0_2, zero);
b0_2 = vmaxq_s16(b0_2, zero);
g0_2 = vmaxq_s16(g0_2, zero);
r0_1 = vminq_s16(r0_1, max);
b0_1 = vminq_s16(b0_1, max);
g0_1 = vminq_s16(g0_1, max);
r0_2 = vminq_s16(r0_2, max);
b0_2 = vminq_s16(b0_2, max);
g0_2 = vminq_s16(g0_2, max);
uint8x8_t r00 = vreinterpret_u8_s8(vmovn_s16(r0_1));
uint8x8_t b00 = vreinterpret_u8_s8(vmovn_s16(b0_1));
uint8x8_t g00 = vreinterpret_u8_s8(vmovn_s16(g0_1));
uint8x8_t r01 = vreinterpret_u8_s8(vmovn_s16(r0_2));
uint8x8_t b01 = vreinterpret_u8_s8(vmovn_s16(b0_2));
uint8x8_t g01 = vreinterpret_u8_s8(vmovn_s16(g0_2));
int16x8_t r1_1 = vaddq_s16(y2_0_8, r0_bias);
int16x8_t b1_1 = vaddq_s16(y2_0_8, b0_bias);
int16x8_t g1_1 = vsubq_s16(y2_0_8, g0_bias); // g0_1 = y1_0_8 - g0_1
int16x8_t r1_2 = vaddq_s16(y2_1_8, r0_bias);
int16x8_t b1_2 = vaddq_s16(y2_1_8, b0_bias);
int16x8_t g1_2 = vsubq_s16(y2_1_8, g0_bias);
uint8x8x2_t r00_0 = vtrn_u8(r00, r01); // 014589 236710
uint8x8x2_t b00_0 = vtrn_u8(b00, b01);
uint8x8x2_t g00_0 = vtrn_u8(g00, g01);
r1_1 = vmaxq_s16(r1_1, zero);
b1_1 = vmaxq_s16(b1_1, zero);
g1_1 = vmaxq_s16(g1_1, zero);
r1_2 = vmaxq_s16(r1_2, zero);
b1_2 = vmaxq_s16(b1_2, zero);
g1_2 = vmaxq_s16(g1_2, zero);
uint16x4_t r0_16 = vreinterpret_u16_u8(r00_0.val[0]);
uint16x4_t r1_16 = vreinterpret_u16_u8(r00_0.val[1]);
uint16x4_t b0_16 = vreinterpret_u16_u8(b00_0.val[0]);
uint16x4_t b1_16 = vreinterpret_u16_u8(b00_0.val[1]);
uint16x4_t g0_16 = vreinterpret_u16_u8(g00_0.val[0]);
uint16x4_t g1_16 = vreinterpret_u16_u8(g00_0.val[1]);
uint16x4x2_t r00_1 = vtrn_u16(r0_16, r1_16); // 012389 456710
uint16x4x2_t b00_1 = vtrn_u16(b0_16, b1_16);
uint16x4x2_t g00_1 = vtrn_u16(g0_16, g1_16);
r1_1 = vminq_s16(r1_1, max);
b1_1 = vminq_s16(b1_1, max);
g1_1 = vminq_s16(g1_1, max);
r1_2 = vminq_s16(r1_2, max);
b1_2 = vminq_s16(b1_2, max);
g1_2 = vminq_s16(g1_2, max);
uint32x2_t r0_32 = vreinterpret_u32_u16(r00_1.val[0]);
uint32x2_t r1_32 = vreinterpret_u32_u16(r00_1.val[1]);
uint32x2_t b0_32 = vreinterpret_u32_u16(b00_1.val[0]);
uint32x2_t b1_32 = vreinterpret_u32_u16(b00_1.val[1]);
uint32x2_t g0_32 = vreinterpret_u32_u16(g00_1.val[0]);
uint32x2_t g1_32 = vreinterpret_u32_u16(g00_1.val[1]);
uint32x2x2_t r00_2 = vtrn_u32(r0_32, r1_32); // 01234567 8910
uint32x2x2_t b00_2 = vtrn_u32(b0_32, b1_32);
uint32x2x2_t g00_2 = vtrn_u32(g0_32, g1_32);
r00 = vreinterpret_u8_s8(vmovn_s16(r1_1));
b00 = vreinterpret_u8_s8(vmovn_s16(b1_1));
g00 = vreinterpret_u8_s8(vmovn_s16(g1_1));
r01 = vreinterpret_u8_s8(vmovn_s16(r1_2));
b01 = vreinterpret_u8_s8(vmovn_s16(b1_2));
g01 = vreinterpret_u8_s8(vmovn_s16(g1_2));
uint8x8_t r0_8 = vreinterpret_u8_u32(r00_2.val[0]);
uint8x8_t b0_8 = vreinterpret_u8_u32(b00_2.val[0]);
uint8x8_t g0_8 = vreinterpret_u8_u32(g00_2.val[0]);
uint8x8_t r1_8 = vreinterpret_u8_u32(r00_2.val[1]);
uint8x8_t b1_8 = vreinterpret_u8_u32(b00_2.val[1]);
uint8x8_t g1_8 = vreinterpret_u8_u32(g00_2.val[1]);
uint8x8x4_t v_bgr;
v_bgr.val[0] = b0_8;
v_bgr.val[1] = g0_8;
v_bgr.val[2] = r0_8;
v_bgr.val[3] = a_8;
r00_0 = vtrn_u8(r00, r01); // 014589 236710
b00_0 = vtrn_u8(b00, b01);
g00_0 = vtrn_u8(g00, g01);
vst4_u8(ptr_bgr1, v_bgr);
r0_16 = vreinterpret_u16_u8(r00_0.val[0]);
r1_16 = vreinterpret_u16_u8(r00_0.val[1]);
b0_16 = vreinterpret_u16_u8(b00_0.val[0]);
b1_16 = vreinterpret_u16_u8(b00_0.val[1]);
g0_16 = vreinterpret_u16_u8(g00_0.val[0]);
g1_16 = vreinterpret_u16_u8(g00_0.val[1]);
ptr_bgr1 += 32;
uint8x8x4_t v_bgr1;
v_bgr1.val[0] = b1_8;
v_bgr1.val[1] = g1_8;
v_bgr1.val[2] = r1_8;
v_bgr1.val[3] = a_8;
r00_1 = vtrn_u16(r0_16, r1_16); // 012389 456710
b00_1 = vtrn_u16(b0_16, b1_16);
g00_1 = vtrn_u16(g0_16, g1_16);
vst4_u8(ptr_bgr1, v_bgr1);
r0_32 = vreinterpret_u32_u16(r00_1.val[0]);
r1_32 = vreinterpret_u32_u16(r00_1.val[1]);
b0_32 = vreinterpret_u32_u16(b00_1.val[0]);
b1_32 = vreinterpret_u32_u16(b00_1.val[1]);
g0_32 = vreinterpret_u32_u16(g00_1.val[0]);
g1_32 = vreinterpret_u32_u16(g00_1.val[1]);
ptr_bgr1 += 32;
r00_2 = vtrn_u32(r0_32, r1_32); // 01234567 8910
b00_2 = vtrn_u32(b0_32, b1_32);
g00_2 = vtrn_u32(g0_32, g1_32);
ptr_vu += 16;
ptr_y1 += 16;
ptr_y2 += 16;
r0_8 = vreinterpret_u8_u32(r00_2.val[0]);
b0_8 = vreinterpret_u8_u32(b00_2.val[0]);
g0_8 = vreinterpret_u8_u32(g00_2.val[0]);
r1_8 = vreinterpret_u8_u32(r00_2.val[1]);
b1_8 = vreinterpret_u8_u32(b00_2.val[1]);
g1_8 = vreinterpret_u8_u32(g00_2.val[1]);
v_bgr.val[0] = b0_8;
v_bgr.val[1] = g0_8;
v_bgr.val[2] = r0_8;
v_bgr1.val[0] = b1_8;
v_bgr1.val[1] = g1_8;
v_bgr1.val[2] = r1_8;
vst4_u8(ptr_bgr2, v_bgr);
vst4_u8(ptr_bgr2 + 32, v_bgr1);
ptr_bgr2 += 64;
}
// two data
for (; j < srcw; j += 2) {
unsigned char _y0 = ptr_y1[0];
unsigned char _y1 = ptr_y1[1];
unsigned char _v = ptr_vu[1];
unsigned char _u = ptr_vu[0];
unsigned char _y0_1 = ptr_y2[0];
unsigned char _y1_1 = ptr_y2[1];
int ra = floor((179 * (_v - 128)) >> 7);
int ga = floor((44 * (_u - 128) + 91 * (_v - 128)) >> 7);
int ba = floor((227 * (_u - 128)) >> 7);
int r = _y0 + ra;
int g = _y0 - ga;
int b = _y0 + ba;
int r1 = _y1 + ra;
int g1 = _y1 - ga;
int b1 = _y1 + ba;
r = r < 0 ? 0 : (r > 255) ? 255 : r;
g = g < 0 ? 0 : (g > 255) ? 255 : g;
b = b < 0 ? 0 : (b > 255) ? 255 : b;
r1 = r1 < 0 ? 0 : (r1 > 255) ? 255 : r1;
g1 = g1 < 0 ? 0 : (g1 > 255) ? 255 : g1;
b1 = b1 < 0 ? 0 : (b1 > 255) ? 255 : b1;
*ptr_bgr1++ = b;
*ptr_bgr1++ = g;
*ptr_bgr1++ = r;
*ptr_bgr1++ = 255;
int r2 = _y0_1 + ra;
int g2 = _y0_1 - ga;
int b2 = _y0_1 + ba;
int r3 = _y1_1 + ra;
int g3 = _y1_1 - ga;
int b3 = _y1_1 + ba;
r2 = r2 < 0 ? 0 : (r2 > 255) ? 255 : r2;
g2 = g2 < 0 ? 0 : (g2 > 255) ? 255 : g2;
b2 = b2 < 0 ? 0 : (b2 > 255) ? 255 : b2;
r3 = r3 < 0 ? 0 : (r3 > 255) ? 255 : r3;
g3 = g3 < 0 ? 0 : (g3 > 255) ? 255 : g3;
b3 = b3 < 0 ? 0 : (b3 > 255) ? 255 : b3;
*ptr_bgr1++ = b1;
*ptr_bgr1++ = g1;
*ptr_bgr1++ = r1;
*ptr_bgr1++ = 255;
*ptr_bgr2++ = b2;
*ptr_bgr2++ = g2;
*ptr_bgr2++ = r2;
*ptr_bgr2++ = 255;
ptr_y1 += 2;
ptr_y2 += 2;
ptr_vu += 2;
*ptr_bgr2++ = b3;
*ptr_bgr2++ = g3;
*ptr_bgr2++ = r3;
*ptr_bgr2++ = 255;
}
}
}
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <limits.h>
#include <math.h>
#include "lite/tests/cv/anakin/cv_utils.h"
void resize_one_channel(
const uint8_t* src, int w_in, int h_in, uint8_t* dst, int w_out, int h_out);
void resize_one_channel_uv(
const uint8_t* src, int w_in, int h_in, uint8_t* dst, int w_out, int h_out);
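// Layout note for nv21_resize below (a restatement, no new behaviour): an
// NV21 buffer is a full-size Y plane followed by an interleaved VU plane of
// half the height, so the Y plane is resized as one channel and the VU plane
// as interleaved pairs.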
void nv21_resize(const uint8_t* src,
uint8_t* dst,
int w_in,
int h_in,
int w_out,
int h_out) {
if (w_out == w_in && h_out == h_in) {
printf("nv21_resize equal \n");
memcpy(dst, src, sizeof(uint8_t) * w_in * static_cast<int>(1.5 * h_in));
return;
}
int y_h = h_in;
int uv_h = h_in / 2;
const uint8_t* y_ptr = src;
const uint8_t* uv_ptr = src + y_h * w_in;
// out
int dst_y_h = h_out;
int dst_uv_h = h_out / 2;
uint8_t* dst_ptr = dst + dst_y_h * w_out;
resize_one_channel(y_ptr, w_in, y_h, dst, w_out, dst_y_h);
resize_one_channel_uv(uv_ptr, w_in, uv_h, dst_ptr, w_out, dst_uv_h);
}
void resize_one_channel(const uint8_t* src,
int w_in,
int h_in,
uint8_t* dst,
int w_out,
int h_out) {
const int resize_coef_bits = 11;
const int resize_coef_scale = 1 << resize_coef_bits;
double scale_x = static_cast<double>(w_in) / w_out;
double scale_y = static_cast<double>(h_in) / h_out;
int* buf = new int[w_out * 2 + h_out * 2];
int* xofs = buf; // new int[w];
int* yofs = buf + w_out; // new int[h];
int16_t* ialpha =
reinterpret_cast<int16_t*>(buf + w_out + h_out); // new short[w * 2];
int16_t* ibeta =
reinterpret_cast<int16_t*>(buf + w_out * 2 + h_out); // new short[h * 2];
float fx = 0.f;
float fy = 0.f;
int sx = 0;
int sy = 0;
#define SATURATE_CAST_SHORT(X) \
(int16_t)::std::min( \
::std::max(static_cast<int>(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), \
SHRT_MAX);
for (int dx = 0; dx < w_out; dx++) {
fx = static_cast<float>((dx + 0.5) * scale_x - 0.5);
sx = floor(fx);
fx -= sx;
if (sx < 0) {
sx = 0;
fx = 0.f;
}
if (sx >= w_in - 1) {
sx = w_in - 2;
fx = 1.f;
}
xofs[dx] = sx;
float a0 = (1.f - fx) * resize_coef_scale;
float a1 = fx * resize_coef_scale;
ialpha[dx * 2] = SATURATE_CAST_SHORT(a0);
ialpha[dx * 2 + 1] = SATURATE_CAST_SHORT(a1);
}
for (int dy = 0; dy < h_out; dy++) {
fy = static_cast<float>((dy + 0.5) * scale_y - 0.5);
sy = floor(fy);
fy -= sy;
if (sy < 0) {
sy = 0;
fy = 0.f;
}
if (sy >= h_in - 1) {
sy = h_in - 2;
fy = 1.f;
}
yofs[dy] = sy;
float b0 = (1.f - fy) * resize_coef_scale;
float b1 = fy * resize_coef_scale;
ibeta[dy * 2] = SATURATE_CAST_SHORT(b0);
ibeta[dy * 2 + 1] = SATURATE_CAST_SHORT(b1);
}
#undef SATURATE_CAST_SHORT
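  // Fixed-point sketch of the bilinear interpolation below (a restatement of
  // the shifts used in the code): ialpha/ibeta carry 11 fractional bits, the
  // horizontal pass keeps 7 of them (>> 4), the vertical multiply-accumulate
  // drops 16, and the final "+ 2, >> 2" rounds back to an 8-bit pixel:
  //   rows[dx] = (S[sx] * a0 + S[sx + 1] * a1) >> 4
  //   D[dx] = (((rows0[dx] * b0) >> 16) + ((rows1[dx] * b1) >> 16) + 2) >> 2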
// loop body
int16_t* rowsbuf0 = new int16_t[w_out + 1];
int16_t* rowsbuf1 = new int16_t[w_out + 1];
int16_t* rows0 = rowsbuf0;
int16_t* rows1 = rowsbuf1;
int prev_sy1 = -1;
for (int dy = 0; dy < h_out; dy++) {
int sy = yofs[dy];
if (sy == prev_sy1) {
// hresize one row
int16_t* rows0_old = rows0;
rows0 = rows1;
rows1 = rows0_old;
const uint8_t* S1 = src + w_in * (sy + 1);
const int16_t* ialphap = ialpha;
int16_t* rows1p = rows1;
for (int dx = 0; dx < w_out; dx++) {
int sx = xofs[dx];
int16_t a0 = ialphap[0];
int16_t a1 = ialphap[1];
const uint8_t* S1p = S1 + sx;
rows1p[dx] = (S1p[0] * a0 + S1p[1] * a1) >> 4;
ialphap += 2;
}
} else {
// hresize two rows
const uint8_t* S0 = src + w_in * (sy);
const uint8_t* S1 = src + w_in * (sy + 1);
const int16_t* ialphap = ialpha;
int16_t* rows0p = rows0;
int16_t* rows1p = rows1;
for (int dx = 0; dx < w_out; dx++) {
int sx = xofs[dx];
int16_t a0 = ialphap[0];
int16_t a1 = ialphap[1];
const uint8_t* S0p = S0 + sx;
const uint8_t* S1p = S1 + sx;
rows0p[dx] = (S0p[0] * a0 + S0p[1] * a1) >> 4;
rows1p[dx] = (S1p[0] * a0 + S1p[1] * a1) >> 4;
ialphap += 2;
}
}
prev_sy1 = sy + 1;
// vresize
int16_t b0 = ibeta[0];
int16_t b1 = ibeta[1];
int16_t* rows0p = rows0;
int16_t* rows1p = rows1;
uint8_t* dp_ptr = dst + w_out * (dy);
int cnt = w_out >> 3;
int remain = w_out - (cnt << 3);
int16x4_t _b0 = vdup_n_s16(b0);
int16x4_t _b1 = vdup_n_s16(b1);
int32x4_t _v2 = vdupq_n_s32(2);
// #pragma omp parallel for
#if 1 // __aarch64__
for (cnt = w_out >> 3; cnt > 0; cnt--) {
int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p + 4);
int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p + 4);
int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0);
int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
int32x4_t _acc = _v2;
_acc = vsraq_n_s32(_acc, _rows0p_sr4_mb0, 16);
_acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
int32x4_t _acc_1 = _v2;
_acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
_acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
int16x4_t _acc16 = vshrn_n_s32(_acc, 2);
int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
uint8x8_t _dout = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));
vst1_u8(dp_ptr, _dout);
dp_ptr += 8;
rows0p += 8;
rows1p += 8;
}
#else
// #pragma omp parallel for
if (cnt > 0) {
      asm volatile(
          "mov r4, #2 \n"
          "vdup.s32 q12, r4 \n"
          "0: \n"
          "pld [%[rows0p], #128] \n"
          "pld [%[rows1p], #128] \n"
          "vld1.s16 {d2-d3}, [%[rows0p]]!\n"
          "vld1.s16 {d6-d7}, [%[rows1p]]!\n"
          "pld [%[rows0p], #128] \n"
          "pld [%[rows1p], #128] \n"
          "vmull.s16 q0, d2, %[_b0] \n"
          "vmull.s16 q1, d3, %[_b0] \n"
          "vmull.s16 q2, d6, %[_b1] \n"
          "vmull.s16 q3, d7, %[_b1] \n"
          "vld1.s16 {d2-d3}, [%[rows0p]]!\n"
          "vld1.s16 {d6-d7}, [%[rows1p]]!\n"
"vorr.s32 q10, q12, q12 \n"
"vorr.s32 q11, q12, q12 \n"
"vsra.s32 q10, q0, #16 \n"
"vsra.s32 q11, q1, #16 \n"
"vsra.s32 q10, q2, #16 \n"
"vsra.s32 q11, q3, #16 \n"
"vmull.s16 q0, d2, %[_b0] \n"
"vmull.s16 q1, d3, %[_b0] \n"
"vmull.s16 q2, d6, %[_b1] \n"
"vmull.s16 q3, d7, %[_b1] \n"
"vsra.s32 q10, q0, #16 \n"
"vsra.s32 q11, q1, #16 \n"
"vsra.s32 q10, q2, #16 \n"
"vsra.s32 q11, q3, #16 \n"
"vshrn.s32 d20, q10, #2 \n"
"vshrn.s32 d21, q11, #2 \n"
"vqmovun.s16 d20, q10 \n"
"vst1.8 {d20}, [%[dp]]! \n"
"subs %[cnt], #1 \n"
"bne 0b \n"
"sub %[rows0p], #16 \n"
"sub %[rows1p], #16 \n"
: [rows0p] "+r"(rows0p),
[rows1p] "+r"(rows1p),
[_b0] "+w"(_b0),
[_b1] "+w"(_b1),
[cnt] "+r"(cnt),
[dp] "+r"(dp_ptr)
:
: "r4", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12");
}
#endif // __aarch64__
for (; remain; --remain) {
// D[x] = (rows0[x]*b0 + rows1[x]*b1) >>
// INTER_RESIZE_COEF_BITS;
*dp_ptr++ =
(uint8_t)(((int16_t)((b0 * (int16_t)(*rows0p++)) >> 16) +
(int16_t)((b1 * (int16_t)(*rows1p++)) >> 16) + 2) >>
2);
}
ibeta += 2;
}
delete[] buf;
delete[] rowsbuf0;
delete[] rowsbuf1;
}
void resize_one_channel_uv(const uint8_t* src,
int w_in,
int h_in,
uint8_t* dst,
int w_out,
int h_out) {
const int resize_coef_bits = 11;
const int resize_coef_scale = 1 << resize_coef_bits;
double scale_x = static_cast<double>(w_in) / w_out;
double scale_y = static_cast<double>(h_in) / h_out;
int* buf = new int[w_out * 2 + h_out * 2];
int* xofs = buf; // new int[w];
int* yofs = buf + w_out; // new int[h];
int16_t* ialpha =
reinterpret_cast<int16_t*>(buf + w_out + h_out); // new int16_t[w * 2];
int16_t* ibeta = reinterpret_cast<int16_t*>(buf + w_out * 2 +
h_out); // new int16_t[h * 2];
float fx = 0.f;
float fy = 0.f;
  int sx = 0;
  int sy = 0;
#define SATURATE_CAST_SHORT(X) \
(int16_t)::std::min( \
::std::max(static_cast<int>(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), \
SHRT_MAX);
for (int dx = 0; dx < w_out / 2; dx++) {
fx = static_cast<float>((dx + 0.5) * scale_x - 0.5);
sx = floor(fx);
fx -= sx;
if (sx < 0) {
sx = 0;
fx = 0.f;
}
if (sx >= w_in - 1) {
sx = w_in - 2;
fx = 1.f;
}
xofs[dx] = sx;
float a0 = (1.f - fx) * resize_coef_scale;
float a1 = fx * resize_coef_scale;
ialpha[dx * 2] = SATURATE_CAST_SHORT(a0);
ialpha[dx * 2 + 1] = SATURATE_CAST_SHORT(a1);
}
for (int dy = 0; dy < h_out; dy++) {
fy = static_cast<float>((dy + 0.5) * scale_y - 0.5);
sy = floor(fy);
fy -= sy;
if (sy < 0) {
sy = 0;
fy = 0.f;
}
if (sy >= h_in - 1) {
sy = h_in - 2;
fy = 1.f;
}
yofs[dy] = sy;
float b0 = (1.f - fy) * resize_coef_scale;
float b1 = fy * resize_coef_scale;
ibeta[dy * 2] = SATURATE_CAST_SHORT(b0);
ibeta[dy * 2 + 1] = SATURATE_CAST_SHORT(b1);
}
#undef SATURATE_CAST_SHORT
// loop body
int16_t* rowsbuf0 = new int16_t[w_out + 1];
int16_t* rowsbuf1 = new int16_t[w_out + 1];
int16_t* rows0 = rowsbuf0;
int16_t* rows1 = rowsbuf1;
int prev_sy1 = -1;
for (int dy = 0; dy < h_out; dy++) {
int sy = yofs[dy];
if (sy == prev_sy1) {
// hresize one row
int16_t* rows0_old = rows0;
rows0 = rows1;
rows1 = rows0_old;
const uint8_t* S1 = src + w_in * (sy + 1);
const int16_t* ialphap = ialpha;
int16_t* rows1p = rows1;
for (int dx = 0; dx < w_out / 2; dx++) {
int sx = xofs[dx] * 2;
int16_t a0 = ialphap[0];
int16_t a1 = ialphap[1];
const uint8_t* S1p = S1 + sx;
int tmp = dx * 2;
rows1p[tmp] = (S1p[0] * a0 + S1p[2] * a1) >> 4;
rows1p[tmp + 1] = (S1p[1] * a0 + S1p[3] * a1) >> 4;
ialphap += 2;
}
} else {
// hresize two rows
const uint8_t* S0 = src + w_in * (sy);
const uint8_t* S1 = src + w_in * (sy + 1);
const int16_t* ialphap = ialpha;
int16_t* rows0p = rows0;
int16_t* rows1p = rows1;
for (int dx = 0; dx < w_out / 2; dx++) {
int sx = xofs[dx] * 2;
int16_t a0 = ialphap[0];
int16_t a1 = ialphap[1];
const uint8_t* S0p = S0 + sx;
const uint8_t* S1p = S1 + sx;
int tmp = dx * 2;
rows0p[tmp] = (S0p[0] * a0 + S0p[2] * a1) >> 4;
rows1p[tmp] = (S1p[0] * a0 + S1p[2] * a1) >> 4;
rows0p[tmp + 1] = (S0p[1] * a0 + S0p[3] * a1) >> 4;
rows1p[tmp + 1] = (S1p[1] * a0 + S1p[3] * a1) >> 4;
ialphap += 2;
}
}
prev_sy1 = sy + 1;
// vresize
int16_t b0 = ibeta[0];
int16_t b1 = ibeta[1];
int16_t* rows0p = rows0;
int16_t* rows1p = rows1;
uint8_t* dp_ptr = dst + w_out * (dy);
int cnt = w_out >> 3;
int remain = w_out - (cnt << 3);
int16x4_t _b0 = vdup_n_s16(b0);
int16x4_t _b1 = vdup_n_s16(b1);
int32x4_t _v2 = vdupq_n_s32(2);
for (cnt = w_out >> 3; cnt > 0; cnt--) {
int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p + 4);
int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p + 4);
int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0);
int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
int32x4_t _acc = _v2;
_acc = vsraq_n_s32(
_acc, _rows0p_sr4_mb0, 16); // _acc >> 16 + _rows0p_sr4_mb0 >> 16
_acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
int32x4_t _acc_1 = _v2;
_acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
_acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
int16x4_t _acc16 = vshrn_n_s32(_acc, 2); // _acc >> 2
int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
uint8x8_t _dout = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));
vst1_u8(dp_ptr, _dout);
dp_ptr += 8;
rows0p += 8;
rows1p += 8;
}
for (; remain; --remain) {
// D[x] = (rows0[x]*b0 + rows1[x]*b1) >> INTER_RESIZE_COEF_BITS;
*dp_ptr++ =
(uint8_t)(((int16_t)((b0 * (int16_t)(*rows0p++)) >> 16) +
(int16_t)((b1 * (int16_t)(*rows1p++)) >> 16) + 2) >>
2);
}
ibeta += 2;
}
delete[] buf;
delete[] rowsbuf0;
delete[] rowsbuf1;
}
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <math.h>
#include "lite/tests/cv/anakin/cv_utils.h"
/*
R = Y + 1.402*(V-128);
G = Y - 0.34414*(U-128) - 0.71414*(V-128);
B = Y + 1.772*(U-128);
float compute: a*b = ((a << 7)*b )>>7
ra = 1.402 *128 = 179.456 = 179
ga = 0.34414 * 128 = 44.04992 = 44
gb = 0.71414 * 128 = 91.40992 = 91
ba = 1.772 * 128 = 226.816 = 227
*/
// yvu store hwc bgrbgr dsth * dstw = srch * srcw y_w = srcw
// y_h = srch uv_w = srcw uv_h = 1/2 * srch
void nv21_to_bgr(const unsigned char* src,
unsigned char* dst,
int srcw,
int srch) {
int y_h = srch;
int wout = srcw * 3;
const unsigned char* y = src;
const unsigned char* vu = src + y_h * srcw;
int16x8_t bias = vdupq_n_s16(128);
int16x8_t ga = vdupq_n_s16(44);
int16x8_t ra = vdupq_n_s16(179);
int16x8_t ba = vdupq_n_s16(227);
int16x8_t gb = vdupq_n_s16(91);
int16x8_t zero = vdupq_n_s16(0);
int16x8_t max = vdupq_n_s16(255);
for (int i = 0; i < y_h; i += 2) {
const unsigned char* ptr_y1 = y + i * srcw;
const unsigned char* ptr_y2 = ptr_y1 + srcw;
const unsigned char* ptr_vu = vu + (i / 2) * srcw;
unsigned char* ptr_bgr1 = dst + i * wout;
unsigned char* ptr_bgr2 = ptr_bgr1 + wout;
// 2*16
#ifdef __aarch64__
asm volatile(
"prfm pldl1keep, [%[ptr_y1]] \n"
"prfm pldl1keep, [%[ptr_y1], #64] \n"
"prfm pldl1keep, [%[ptr_y2]] \n"
"prfm pldl1keep, [%[ptr_y2], #64] \n"
"prfm pldl1keep, [%[ptr_vu]] \n"
"prfm pldl1keep, [%[ptr_vu], #64] \n"
:
: [ptr_y1] "r"(ptr_y1), [ptr_y2] "r"(ptr_y2), [ptr_vu] "r"(ptr_vu)
: "memory");
#else
asm volatile(
"pld [%[ptr_y1]] @ preload a, 64byte\n"
"pld [%[ptr_y1], #128] @ preload a, 64byte\n"
"pld [%[ptr_y2]] @ preload a, 64byte\n"
"pld [%[ptr_y2], #128] @ preload a, 64byte\n"
"pld [%[ptr_vu]] @ preload a, 64byte\n"
"pld [%[ptr_vu], #128] @ preload a, 64byte\n"
:
: [ptr_y1] "r"(ptr_y1), [ptr_y2] "r"(ptr_y2), [ptr_vu] "r"(ptr_vu)
: "memory");
#endif
int j = 0;
for (; j < srcw - 15; j += 16) {
uint8x8x2_t y1 = vld2_u8(ptr_y1); // d8 = y0y2y4y6...y14 d9 =
// y1y3y5...y15
uint8x8x2_t vu =
vld2_u8(ptr_vu); // d0 = v0v1v2v3v4v5...v7 d1 = u0u1u2...u7
uint8x8x2_t y2 = vld2_u8(ptr_y2);
uint16x8_t v = vmovl_u8(vu.val[0]);
uint16x8_t u = vmovl_u8(vu.val[1]);
int16x8_t v_s = vreinterpretq_s16_u16(v);
int16x8_t u_s = vreinterpretq_s16_u16(u);
int16x8_t v_bias = vsubq_s16(v_s, bias);
int16x8_t u_bias = vsubq_s16(u_s, bias);
// G = Y - 0.34414*(U-128) - 0.71414*(V-128);
int16x8_t g0 = vmulq_s16(ga, u_bias);
// R = Y + 1.402*(V-128);
int16x8_t r0 = vmulq_s16(ra, v_bias);
// B = Y + 1.772*(U-128);
int16x8_t b0 = vmulq_s16(ba, u_bias);
g0 = vmlaq_s16(g0, gb, v_bias);
int16x8_t y1_0_8 = vreinterpretq_s16_u16(vmovl_u8(y1.val[0]));
int16x8_t y1_1_8 = vreinterpretq_s16_u16(vmovl_u8(y1.val[1]));
int16x8_t y2_0_8 = vreinterpretq_s16_u16(vmovl_u8(y2.val[0]));
int16x8_t y2_1_8 = vreinterpretq_s16_u16(vmovl_u8(y2.val[1]));
int16x8_t r0_bias = vshrq_n_s16(r0, 7); // r0 / 128
int16x8_t b0_bias = vshrq_n_s16(b0, 7);
int16x8_t g0_bias = vshrq_n_s16(g0, 7);
int16x8_t r0_1 = vaddq_s16(y1_0_8, r0_bias);
int16x8_t b0_1 = vaddq_s16(y1_0_8, b0_bias);
int16x8_t g0_1 = vsubq_s16(y1_0_8, g0_bias); // g0_1 = y1_0_8 - g0_1
int16x8_t r0_2 = vaddq_s16(y1_1_8, r0_bias);
int16x8_t b0_2 = vaddq_s16(y1_1_8, b0_bias);
int16x8_t g0_2 = vsubq_s16(y1_1_8, g0_bias);
r0_1 = vmaxq_s16(r0_1, zero);
b0_1 = vmaxq_s16(b0_1, zero);
g0_1 = vmaxq_s16(g0_1, zero);
r0_2 = vmaxq_s16(r0_2, zero);
b0_2 = vmaxq_s16(b0_2, zero);
g0_2 = vmaxq_s16(g0_2, zero);
r0_1 = vminq_s16(r0_1, max);
b0_1 = vminq_s16(b0_1, max);
g0_1 = vminq_s16(g0_1, max);
r0_2 = vminq_s16(r0_2, max);
b0_2 = vminq_s16(b0_2, max);
g0_2 = vminq_s16(g0_2, max);
uint8x8_t r00 = vreinterpret_u8_s8(vmovn_s16(r0_1));
uint8x8_t b00 = vreinterpret_u8_s8(vmovn_s16(b0_1));
uint8x8_t g00 = vreinterpret_u8_s8(vmovn_s16(g0_1));
uint8x8_t r01 = vreinterpret_u8_s8(vmovn_s16(r0_2));
uint8x8_t b01 = vreinterpret_u8_s8(vmovn_s16(b0_2));
uint8x8_t g01 = vreinterpret_u8_s8(vmovn_s16(g0_2));
int16x8_t r1_1 = vaddq_s16(y2_0_8, r0_bias);
int16x8_t b1_1 = vaddq_s16(y2_0_8, b0_bias);
int16x8_t g1_1 = vsubq_s16(y2_0_8, g0_bias); // g0_1 = y1_0_8 - g0_1
int16x8_t r1_2 = vaddq_s16(y2_1_8, r0_bias);
int16x8_t b1_2 = vaddq_s16(y2_1_8, b0_bias);
int16x8_t g1_2 = vsubq_s16(y2_1_8, g0_bias);
uint8x8x2_t r00_0 = vtrn_u8(r00, r01); // 014589 236710
uint8x8x2_t b00_0 = vtrn_u8(b00, b01);
uint8x8x2_t g00_0 = vtrn_u8(g00, g01);
r1_1 = vmaxq_s16(r1_1, zero);
b1_1 = vmaxq_s16(b1_1, zero);
g1_1 = vmaxq_s16(g1_1, zero);
r1_2 = vmaxq_s16(r1_2, zero);
b1_2 = vmaxq_s16(b1_2, zero);
g1_2 = vmaxq_s16(g1_2, zero);
uint16x4_t r0_16 = vreinterpret_u16_u8(r00_0.val[0]);
uint16x4_t r1_16 = vreinterpret_u16_u8(r00_0.val[1]);
uint16x4_t b0_16 = vreinterpret_u16_u8(b00_0.val[0]);
uint16x4_t b1_16 = vreinterpret_u16_u8(b00_0.val[1]);
uint16x4_t g0_16 = vreinterpret_u16_u8(g00_0.val[0]);
uint16x4_t g1_16 = vreinterpret_u16_u8(g00_0.val[1]);
uint16x4x2_t r00_1 = vtrn_u16(r0_16, r1_16); // 012389 456710
uint16x4x2_t b00_1 = vtrn_u16(b0_16, b1_16);
uint16x4x2_t g00_1 = vtrn_u16(g0_16, g1_16);
r1_1 = vminq_s16(r1_1, max);
b1_1 = vminq_s16(b1_1, max);
g1_1 = vminq_s16(g1_1, max);
r1_2 = vminq_s16(r1_2, max);
b1_2 = vminq_s16(b1_2, max);
g1_2 = vminq_s16(g1_2, max);
uint32x2_t r0_32 = vreinterpret_u32_u16(r00_1.val[0]);
uint32x2_t r1_32 = vreinterpret_u32_u16(r00_1.val[1]);
uint32x2_t b0_32 = vreinterpret_u32_u16(b00_1.val[0]);
uint32x2_t b1_32 = vreinterpret_u32_u16(b00_1.val[1]);
uint32x2_t g0_32 = vreinterpret_u32_u16(g00_1.val[0]);
uint32x2_t g1_32 = vreinterpret_u32_u16(g00_1.val[1]);
uint32x2x2_t r00_2 = vtrn_u32(r0_32, r1_32); // 01234567 8910
uint32x2x2_t b00_2 = vtrn_u32(b0_32, b1_32);
uint32x2x2_t g00_2 = vtrn_u32(g0_32, g1_32);
r00 = vreinterpret_u8_s8(vmovn_s16(r1_1));
b00 = vreinterpret_u8_s8(vmovn_s16(b1_1));
g00 = vreinterpret_u8_s8(vmovn_s16(g1_1));
r01 = vreinterpret_u8_s8(vmovn_s16(r1_2));
b01 = vreinterpret_u8_s8(vmovn_s16(b1_2));
g01 = vreinterpret_u8_s8(vmovn_s16(g1_2));
uint8x8_t r0_8 = vreinterpret_u8_u32(r00_2.val[0]);
uint8x8_t b0_8 = vreinterpret_u8_u32(b00_2.val[0]);
uint8x8_t g0_8 = vreinterpret_u8_u32(g00_2.val[0]);
uint8x8_t r1_8 = vreinterpret_u8_u32(r00_2.val[1]);
uint8x8_t b1_8 = vreinterpret_u8_u32(b00_2.val[1]);
uint8x8_t g1_8 = vreinterpret_u8_u32(g00_2.val[1]);
uint8x8x3_t v_bgr;
v_bgr.val[0] = b0_8;
v_bgr.val[1] = g0_8;
v_bgr.val[2] = r0_8;
r00_0 = vtrn_u8(r00, r01); // 014589 236710
b00_0 = vtrn_u8(b00, b01);
g00_0 = vtrn_u8(g00, g01);
vst3_u8(ptr_bgr1, v_bgr);
r0_16 = vreinterpret_u16_u8(r00_0.val[0]);
r1_16 = vreinterpret_u16_u8(r00_0.val[1]);
b0_16 = vreinterpret_u16_u8(b00_0.val[0]);
b1_16 = vreinterpret_u16_u8(b00_0.val[1]);
g0_16 = vreinterpret_u16_u8(g00_0.val[0]);
g1_16 = vreinterpret_u16_u8(g00_0.val[1]);
ptr_bgr1 += 24;
uint8x8x3_t v_bgr1;
v_bgr1.val[0] = b1_8;
v_bgr1.val[1] = g1_8;
v_bgr1.val[2] = r1_8;
r00_1 = vtrn_u16(r0_16, r1_16); // 012389 456710
b00_1 = vtrn_u16(b0_16, b1_16);
g00_1 = vtrn_u16(g0_16, g1_16);
vst3_u8(ptr_bgr1, v_bgr1);
r0_32 = vreinterpret_u32_u16(r00_1.val[0]);
r1_32 = vreinterpret_u32_u16(r00_1.val[1]);
b0_32 = vreinterpret_u32_u16(b00_1.val[0]);
b1_32 = vreinterpret_u32_u16(b00_1.val[1]);
g0_32 = vreinterpret_u32_u16(g00_1.val[0]);
g1_32 = vreinterpret_u32_u16(g00_1.val[1]);
ptr_bgr1 += 24;
r00_2 = vtrn_u32(r0_32, r1_32); // 01234567 8910
b00_2 = vtrn_u32(b0_32, b1_32);
g00_2 = vtrn_u32(g0_32, g1_32);
ptr_vu += 16;
ptr_y1 += 16;
ptr_y2 += 16;
r0_8 = vreinterpret_u8_u32(r00_2.val[0]);
b0_8 = vreinterpret_u8_u32(b00_2.val[0]);
g0_8 = vreinterpret_u8_u32(g00_2.val[0]);
r1_8 = vreinterpret_u8_u32(r00_2.val[1]);
b1_8 = vreinterpret_u8_u32(b00_2.val[1]);
g1_8 = vreinterpret_u8_u32(g00_2.val[1]);
v_bgr.val[0] = b0_8;
v_bgr.val[1] = g0_8;
v_bgr.val[2] = r0_8;
v_bgr1.val[0] = b1_8;
v_bgr1.val[1] = g1_8;
v_bgr1.val[2] = r1_8;
vst3_u8(ptr_bgr2, v_bgr);
vst3_u8(ptr_bgr2 + 24, v_bgr1);
ptr_bgr2 += 48;
}
// two data
for (; j < srcw; j += 2) {
unsigned char _y0 = ptr_y1[0];
unsigned char _y1 = ptr_y1[1];
unsigned char _v = ptr_vu[0];
unsigned char _u = ptr_vu[1];
unsigned char _y0_1 = ptr_y2[0];
unsigned char _y1_1 = ptr_y2[1];
int ra = floor((179 * (_v - 128)) >> 7);
int ga = floor((44 * (_u - 128) + 91 * (_v - 128)) >> 7);
int ba = floor((227 * (_u - 128)) >> 7);
int r = _y0 + ra;
int g = _y0 - ga;
int b = _y0 + ba;
int r1 = _y1 + ra;
int g1 = _y1 - ga;
int b1 = _y1 + ba;
r = r < 0 ? 0 : (r > 255) ? 255 : r;
g = g < 0 ? 0 : (g > 255) ? 255 : g;
b = b < 0 ? 0 : (b > 255) ? 255 : b;
r1 = r1 < 0 ? 0 : (r1 > 255) ? 255 : r1;
g1 = g1 < 0 ? 0 : (g1 > 255) ? 255 : g1;
b1 = b1 < 0 ? 0 : (b1 > 255) ? 255 : b1;
*ptr_bgr1++ = b;
*ptr_bgr1++ = g;
*ptr_bgr1++ = r;
int r2 = _y0_1 + ra;
int g2 = _y0_1 - ga;
int b2 = _y0_1 + ba;
int r3 = _y1_1 + ra;
int g3 = _y1_1 - ga;
int b3 = _y1_1 + ba;
r2 = r2 < 0 ? 0 : (r2 > 255) ? 255 : r2;
g2 = g2 < 0 ? 0 : (g2 > 255) ? 255 : g2;
b2 = b2 < 0 ? 0 : (b2 > 255) ? 255 : b2;
r3 = r3 < 0 ? 0 : (r3 > 255) ? 255 : r3;
g3 = g3 < 0 ? 0 : (g3 > 255) ? 255 : g3;
b3 = b3 < 0 ? 0 : (b3 > 255) ? 255 : b3;
*ptr_bgr1++ = b1;
*ptr_bgr1++ = g1;
*ptr_bgr1++ = r1;
*ptr_bgr2++ = b2;
*ptr_bgr2++ = g2;
*ptr_bgr2++ = r2;
ptr_y1 += 2;
ptr_y2 += 2;
ptr_vu += 2;
*ptr_bgr2++ = b3;
*ptr_bgr2++ = g3;
*ptr_bgr2++ = r3;
}
}
}
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <math.h>
#include "lite/tests/cv/anakin/cv_utils.h"
/*
R = Y + 1.402*(V-128);
G = Y - 0.34414*(U-128) - 0.71414*(V-128);
B = Y + 1.772*(U-128);
A = 255
float-compute: a*b = ((a << 7)*b )>>7
ra = 1.402 *128 = 179.456 = 179
ga = 0.34414 * 128 = 44.04992 = 44
gb = 0.71414 * 128 = 91.40992 = 91
ba = 1.772 * 128 = 226.816 = 227
*/
// yvu store hwc bgrabgra dsth * dstw = srch * srcw y_w = srcw
// y_h = srch uv_w = srcw uv_h = 1/2 * srch
void nv21_to_bgra(const unsigned char* src,
unsigned char* dst,
int srcw,
int srch) {
int y_h = srch;
  int vu_h = srch / 2;
const unsigned char* y = src;
const unsigned char* vu = src + y_h * srcw;
int wout = srcw * 4;
int16x8_t bias = vdupq_n_s16(128);
int16x8_t ga = vdupq_n_s16(44);
int16x8_t ra = vdupq_n_s16(179);
int16x8_t ba = vdupq_n_s16(227);
int16x8_t gb = vdupq_n_s16(91);
int16x8_t zero = vdupq_n_s16(0);
int16x8_t max = vdupq_n_s16(255);
uint8x8_t a_8 = vdup_n_u8(255);
for (int i = 0; i < y_h; i += 2) {
const unsigned char* ptr_y1 = y + i * srcw;
const unsigned char* ptr_y2 = ptr_y1 + srcw;
const unsigned char* ptr_vu = vu + (i / 2) * srcw;
unsigned char* ptr_bgr1 = dst + i * wout;
unsigned char* ptr_bgr2 = ptr_bgr1 + wout;
// 2*16
#ifdef __aarch64__
asm volatile(
"prfm pldl1keep, [%[ptr_y1]] \n"
"prfm pldl1keep, [%[ptr_y1], #64] \n"
"prfm pldl1keep, [%[ptr_y2]] \n"
"prfm pldl1keep, [%[ptr_y2], #64] \n"
"prfm pldl1keep, [%[ptr_vu]] \n"
"prfm pldl1keep, [%[ptr_vu], #64] \n"
:
: [ptr_y1] "r"(ptr_y1), [ptr_y2] "r"(ptr_y2), [ptr_vu] "r"(ptr_vu)
: "memory");
#else
asm volatile(
"pld [%[ptr_y1]] @ preload a, 64byte\n"
"pld [%[ptr_y1], #128] @ preload a, 64byte\n"
"pld [%[ptr_y2]] @ preload a, 64byte\n"
"pld [%[ptr_y2], #128] @ preload a, 64byte\n"
"pld [%[ptr_vu]] @ preload a, 64byte\n"
"pld [%[ptr_vu], #128] @ preload a, 64byte\n"
:
: [ptr_y1] "r"(ptr_y1), [ptr_y2] "r"(ptr_y2), [ptr_vu] "r"(ptr_vu)
: "memory");
#endif
int j = 0;
for (; j < srcw - 15; j += 16) {
uint8x8x2_t y1 = vld2_u8(ptr_y1); // d8 = y0y2y4y6...y14 d9 =
// y1y3y5...y15
uint8x8x2_t vu =
vld2_u8(ptr_vu); // d0 = v0v1v2v3v4v5...v7 d1 = u0u1u2...u7
uint8x8x2_t y2 = vld2_u8(ptr_y2);
uint16x8_t v = vmovl_u8(vu.val[0]);
uint16x8_t u = vmovl_u8(vu.val[1]);
int16x8_t v_s = vreinterpretq_s16_u16(v);
int16x8_t u_s = vreinterpretq_s16_u16(u);
int16x8_t v_bias = vsubq_s16(v_s, bias);
int16x8_t u_bias = vsubq_s16(u_s, bias);
// G = Y - 0.34414*(U-128) - 0.71414*(V-128);
int16x8_t g0 = vmulq_s16(ga, u_bias);
// R = Y + 1.402*(V-128);
int16x8_t r0 = vmulq_s16(ra, v_bias);
// B = Y + 1.772*(U-128);
int16x8_t b0 = vmulq_s16(ba, u_bias);
g0 = vmlaq_s16(g0, gb, v_bias);
int16x8_t y1_0_8 = vreinterpretq_s16_u16(vmovl_u8(y1.val[0]));
int16x8_t y1_1_8 = vreinterpretq_s16_u16(vmovl_u8(y1.val[1]));
int16x8_t y2_0_8 = vreinterpretq_s16_u16(vmovl_u8(y2.val[0]));
int16x8_t y2_1_8 = vreinterpretq_s16_u16(vmovl_u8(y2.val[1]));
int16x8_t r0_bias = vshrq_n_s16(r0, 7); // r0 / 128
int16x8_t b0_bias = vshrq_n_s16(b0, 7);
int16x8_t g0_bias = vshrq_n_s16(g0, 7);
int16x8_t r0_1 = vaddq_s16(y1_0_8, r0_bias);
int16x8_t b0_1 = vaddq_s16(y1_0_8, b0_bias);
int16x8_t g0_1 = vsubq_s16(y1_0_8, g0_bias); // g0_1 = y1_0_8 - g0_1
int16x8_t r0_2 = vaddq_s16(y1_1_8, r0_bias);
int16x8_t b0_2 = vaddq_s16(y1_1_8, b0_bias);
int16x8_t g0_2 = vsubq_s16(y1_1_8, g0_bias);
r0_1 = vmaxq_s16(r0_1, zero);
b0_1 = vmaxq_s16(b0_1, zero);
g0_1 = vmaxq_s16(g0_1, zero);
r0_2 = vmaxq_s16(r0_2, zero);
b0_2 = vmaxq_s16(b0_2, zero);
g0_2 = vmaxq_s16(g0_2, zero);
r0_1 = vminq_s16(r0_1, max);
b0_1 = vminq_s16(b0_1, max);
g0_1 = vminq_s16(g0_1, max);
r0_2 = vminq_s16(r0_2, max);
b0_2 = vminq_s16(b0_2, max);
g0_2 = vminq_s16(g0_2, max);
uint8x8_t r00 = vreinterpret_u8_s8(vmovn_s16(r0_1));
uint8x8_t b00 = vreinterpret_u8_s8(vmovn_s16(b0_1));
uint8x8_t g00 = vreinterpret_u8_s8(vmovn_s16(g0_1));
uint8x8_t r01 = vreinterpret_u8_s8(vmovn_s16(r0_2));
uint8x8_t b01 = vreinterpret_u8_s8(vmovn_s16(b0_2));
uint8x8_t g01 = vreinterpret_u8_s8(vmovn_s16(g0_2));
int16x8_t r1_1 = vaddq_s16(y2_0_8, r0_bias);
int16x8_t b1_1 = vaddq_s16(y2_0_8, b0_bias);
int16x8_t g1_1 = vsubq_s16(y2_0_8, g0_bias); // g0_1 = y1_0_8 - g0_1
int16x8_t r1_2 = vaddq_s16(y2_1_8, r0_bias);
int16x8_t b1_2 = vaddq_s16(y2_1_8, b0_bias);
int16x8_t g1_2 = vsubq_s16(y2_1_8, g0_bias);
uint8x8x2_t r00_0 = vtrn_u8(r00, r01); // 014589 236710
uint8x8x2_t b00_0 = vtrn_u8(b00, b01);
uint8x8x2_t g00_0 = vtrn_u8(g00, g01);
r1_1 = vmaxq_s16(r1_1, zero);
b1_1 = vmaxq_s16(b1_1, zero);
g1_1 = vmaxq_s16(g1_1, zero);
r1_2 = vmaxq_s16(r1_2, zero);
b1_2 = vmaxq_s16(b1_2, zero);
g1_2 = vmaxq_s16(g1_2, zero);
uint16x4_t r0_16 = vreinterpret_u16_u8(r00_0.val[0]);
uint16x4_t r1_16 = vreinterpret_u16_u8(r00_0.val[1]);
uint16x4_t b0_16 = vreinterpret_u16_u8(b00_0.val[0]);
uint16x4_t b1_16 = vreinterpret_u16_u8(b00_0.val[1]);
uint16x4_t g0_16 = vreinterpret_u16_u8(g00_0.val[0]);
uint16x4_t g1_16 = vreinterpret_u16_u8(g00_0.val[1]);
uint16x4x2_t r00_1 = vtrn_u16(r0_16, r1_16); // 012389 456710
uint16x4x2_t b00_1 = vtrn_u16(b0_16, b1_16);
uint16x4x2_t g00_1 = vtrn_u16(g0_16, g1_16);
r1_1 = vminq_s16(r1_1, max);
b1_1 = vminq_s16(b1_1, max);
g1_1 = vminq_s16(g1_1, max);
r1_2 = vminq_s16(r1_2, max);
b1_2 = vminq_s16(b1_2, max);
g1_2 = vminq_s16(g1_2, max);
uint32x2_t r0_32 = vreinterpret_u32_u16(r00_1.val[0]);
uint32x2_t r1_32 = vreinterpret_u32_u16(r00_1.val[1]);
uint32x2_t b0_32 = vreinterpret_u32_u16(b00_1.val[0]);
uint32x2_t b1_32 = vreinterpret_u32_u16(b00_1.val[1]);
uint32x2_t g0_32 = vreinterpret_u32_u16(g00_1.val[0]);
uint32x2_t g1_32 = vreinterpret_u32_u16(g00_1.val[1]);
uint32x2x2_t r00_2 = vtrn_u32(r0_32, r1_32); // 01234567 8910
uint32x2x2_t b00_2 = vtrn_u32(b0_32, b1_32);
uint32x2x2_t g00_2 = vtrn_u32(g0_32, g1_32);
r00 = vreinterpret_u8_s8(vmovn_s16(r1_1));
b00 = vreinterpret_u8_s8(vmovn_s16(b1_1));
g00 = vreinterpret_u8_s8(vmovn_s16(g1_1));
r01 = vreinterpret_u8_s8(vmovn_s16(r1_2));
b01 = vreinterpret_u8_s8(vmovn_s16(b1_2));
g01 = vreinterpret_u8_s8(vmovn_s16(g1_2));
uint8x8_t r0_8 = vreinterpret_u8_u32(r00_2.val[0]);
uint8x8_t b0_8 = vreinterpret_u8_u32(b00_2.val[0]);
uint8x8_t g0_8 = vreinterpret_u8_u32(g00_2.val[0]);
uint8x8_t r1_8 = vreinterpret_u8_u32(r00_2.val[1]);
uint8x8_t b1_8 = vreinterpret_u8_u32(b00_2.val[1]);
uint8x8_t g1_8 = vreinterpret_u8_u32(g00_2.val[1]);
uint8x8x4_t v_bgr;
v_bgr.val[0] = b0_8;
v_bgr.val[1] = g0_8;
v_bgr.val[2] = r0_8;
v_bgr.val[3] = a_8;
r00_0 = vtrn_u8(r00, r01); // 014589 236710
b00_0 = vtrn_u8(b00, b01);
g00_0 = vtrn_u8(g00, g01);
vst4_u8(ptr_bgr1, v_bgr);
r0_16 = vreinterpret_u16_u8(r00_0.val[0]);
r1_16 = vreinterpret_u16_u8(r00_0.val[1]);
b0_16 = vreinterpret_u16_u8(b00_0.val[0]);
b1_16 = vreinterpret_u16_u8(b00_0.val[1]);
g0_16 = vreinterpret_u16_u8(g00_0.val[0]);
g1_16 = vreinterpret_u16_u8(g00_0.val[1]);
ptr_bgr1 += 32;
uint8x8x4_t v_bgr1;
v_bgr1.val[0] = b1_8;
v_bgr1.val[1] = g1_8;
v_bgr1.val[2] = r1_8;
v_bgr1.val[3] = a_8;
r00_1 = vtrn_u16(r0_16, r1_16); // 012389 456710
b00_1 = vtrn_u16(b0_16, b1_16);
g00_1 = vtrn_u16(g0_16, g1_16);
vst4_u8(ptr_bgr1, v_bgr1);
r0_32 = vreinterpret_u32_u16(r00_1.val[0]);
r1_32 = vreinterpret_u32_u16(r00_1.val[1]);
b0_32 = vreinterpret_u32_u16(b00_1.val[0]);
b1_32 = vreinterpret_u32_u16(b00_1.val[1]);
g0_32 = vreinterpret_u32_u16(g00_1.val[0]);
g1_32 = vreinterpret_u32_u16(g00_1.val[1]);
ptr_bgr1 += 32;
r00_2 = vtrn_u32(r0_32, r1_32); // 01234567 8910
b00_2 = vtrn_u32(b0_32, b1_32);
g00_2 = vtrn_u32(g0_32, g1_32);
ptr_vu += 16;
ptr_y1 += 16;
ptr_y2 += 16;
r0_8 = vreinterpret_u8_u32(r00_2.val[0]);
b0_8 = vreinterpret_u8_u32(b00_2.val[0]);
g0_8 = vreinterpret_u8_u32(g00_2.val[0]);
r1_8 = vreinterpret_u8_u32(r00_2.val[1]);
b1_8 = vreinterpret_u8_u32(b00_2.val[1]);
g1_8 = vreinterpret_u8_u32(g00_2.val[1]);
v_bgr.val[0] = b0_8;
v_bgr.val[1] = g0_8;
v_bgr.val[2] = r0_8;
v_bgr1.val[0] = b1_8;
v_bgr1.val[1] = g1_8;
v_bgr1.val[2] = r1_8;
vst4_u8(ptr_bgr2, v_bgr);
vst4_u8(ptr_bgr2 + 32, v_bgr1);
ptr_bgr2 += 64;
}
// two data
for (; j < srcw; j += 2) {
unsigned char _y0 = ptr_y1[0];
unsigned char _y1 = ptr_y1[1];
unsigned char _v = ptr_vu[0];
unsigned char _u = ptr_vu[1];
unsigned char _y0_1 = ptr_y2[0];
unsigned char _y1_1 = ptr_y2[1];
int ra = floor((179 * (_v - 128)) >> 7);
int ga = floor((44 * (_u - 128) + 91 * (_v - 128)) >> 7);
int ba = floor((227 * (_u - 128)) >> 7);
int r = _y0 + ra;
int g = _y0 - ga;
int b = _y0 + ba;
int r1 = _y1 + ra;
int g1 = _y1 - ga;
int b1 = _y1 + ba;
r = r < 0 ? 0 : (r > 255) ? 255 : r;
g = g < 0 ? 0 : (g > 255) ? 255 : g;
b = b < 0 ? 0 : (b > 255) ? 255 : b;
r1 = r1 < 0 ? 0 : (r1 > 255) ? 255 : r1;
g1 = g1 < 0 ? 0 : (g1 > 255) ? 255 : g1;
b1 = b1 < 0 ? 0 : (b1 > 255) ? 255 : b1;
*ptr_bgr1++ = b;
*ptr_bgr1++ = g;
*ptr_bgr1++ = r;
*ptr_bgr1++ = 255;
int r2 = _y0_1 + ra;
int g2 = _y0_1 - ga;
int b2 = _y0_1 + ba;
int r3 = _y1_1 + ra;
int g3 = _y1_1 - ga;
int b3 = _y1_1 + ba;
r2 = r2 < 0 ? 0 : (r2 > 255) ? 255 : r2;
g2 = g2 < 0 ? 0 : (g2 > 255) ? 255 : g2;
b2 = b2 < 0 ? 0 : (b2 > 255) ? 255 : b2;
r3 = r3 < 0 ? 0 : (r3 > 255) ? 255 : r3;
g3 = g3 < 0 ? 0 : (g3 > 255) ? 255 : g3;
b3 = b3 < 0 ? 0 : (b3 > 255) ? 255 : b3;
*ptr_bgr1++ = b1;
*ptr_bgr1++ = g1;
*ptr_bgr1++ = r1;
*ptr_bgr1++ = 255;
*ptr_bgr2++ = b2;
*ptr_bgr2++ = g2;
*ptr_bgr2++ = r2;
*ptr_bgr2++ = 255;
ptr_y1 += 2;
ptr_y2 += 2;
ptr_vu += 2;
*ptr_bgr2++ = b3;
*ptr_bgr2++ = g3;
*ptr_bgr2++ = r3;
*ptr_bgr2++ = 255;
}
}
}
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <math.h>
#include <random>
#include "lite/core/context.h"
#include "lite/core/profile/timer.h"
#include "lite/tests/cv/anakin/cv_utils.h"
#include "lite/tests/utils/tensor_utils.h"
#include "lite/utils/cv/paddle_image_preprocess.h"
#include "time.h" // NOLINT
DEFINE_int32(cluster, 3, "cluster id");
DEFINE_int32(threads, 1, "threads num");
DEFINE_int32(warmup, 0, "warmup times");
DEFINE_int32(repeats, 10, "repeats times");
DEFINE_bool(basic_test, false, "do all tests");
DEFINE_bool(check_result, true, "check the result");
DEFINE_int32(srcFormat, 12, "input image format NV12");
DEFINE_int32(dstFormat, 3, "output image format BGR");
DEFINE_int32(srch, 1920, "input height");
DEFINE_int32(srcw, 1080, "input width");
DEFINE_int32(dsth, 960, "output height");
DEFINE_int32(dstw, 540, "output width");
DEFINE_int32(angle, 90, "rotate angle");
DEFINE_int32(flip_num, 0, "flip x");
DEFINE_int32(layout, 1, "layout nchw");
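// Example invocation (editor's sketch; assumes the built test binary is named
// image_profiler_test -- adjust to whatever name your build produces):
//   ./image_profiler_test --basic_test=true --repeats=20 \
//       --srcFormat=12 --dstFormat=3 --srcw=1080 --srch=1920 \
//       --dstw=540 --dsth=960 --angle=90 --flip_num=0 --layout=1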
typedef paddle::lite::utils::cv::ImageFormat ImageFormat;
typedef paddle::lite::utils::cv::FlipParam FlipParam;
typedef paddle::lite_api::DataLayoutType LayoutType;
typedef paddle::lite::utils::cv::TransParam TransParam;
typedef paddle::lite::utils::cv::ImagePreprocess ImagePreprocess;
typedef paddle::lite_api::Tensor Tensor_api;
typedef paddle::lite::Tensor Tensor;
using paddle::lite::profile::Timer;
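// Fills a uint8 buffer with pseudo-random values in [0, 255]; the seed is a
// fixed local, so every call (and every run) produces the same test data.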
void fill_tensor_host_rand(uint8_t* dio, int64_t size) {
  unsigned int seed = 256;
for (int64_t i = 0; i < size; ++i) {
dio[i] = rand_r(&seed) % 256; // -128;
}
}
void print_int8(uint8_t* ptr, int size, int width) {
for (int i = 0; i < size; i++) {
printf("%d ", *ptr++);
if ((i + 1) % width == 0) {
printf("\n");
}
}
printf("\n");
}
void print_int(int* ptr, int size, int width) {
for (int i = 0; i < size; i++) {
printf("%d ", *ptr++);
if ((i + 1) % width == 0) {
printf("\n");
}
}
printf("\n");
}
void print_fp32(const float* ptr, int size, int width) {
for (int i = 0; i < size; i++) {
printf("%f ", *ptr++);
if ((i + 1) % width == 0) {
printf("\n");
}
}
printf("\n");
}
#ifdef LITE_WITH_ARM
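// Convert benchmark/check: runs the reference image_basic_convert and
// ImagePreprocess::imageConvert on the same random input, times both, and
// verifies the outputs agree.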
void test_convert(const std::vector<int>& cluster_id,
const std::vector<int>& thread_num,
int srcw,
int srch,
int dstw,
int dsth,
ImageFormat srcFormat,
ImageFormat dstFormat,
float rotate,
FlipParam flip,
LayoutType layout,
int test_iter = 10) {
for (auto& cls : cluster_id) {
for (auto& th : thread_num) {
std::unique_ptr<paddle::lite::KernelContext> ctx1(
new paddle::lite::KernelContext);
auto& ctx = ctx1->As<paddle::lite::ARMContext>();
ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(cls), th);
LOG(INFO) << "cluster: " << cls << ", threads: " << th;
int size = 3 * srch * srcw;
if (srcFormat == ImageFormat::NV12 || srcFormat == ImageFormat::NV21) {
size = ceil(1.5 * srch) * srcw;
} else if (srcFormat == ImageFormat::BGRA ||
srcFormat == ImageFormat::RGBA) {
size = 4 * srch * srcw;
} else if (srcFormat == ImageFormat::GRAY) {
size = srch * srcw;
}
uint8_t* src = new uint8_t[size];
fill_tensor_host_rand(src, size);
int out_size = srch * srcw;
if (dstFormat == ImageFormat::NV12 || dstFormat == ImageFormat::NV21) {
out_size = ceil(1.5 * srch) * srcw;
} else if (dstFormat == ImageFormat::BGR ||
dstFormat == ImageFormat::RGB) {
out_size = 3 * srch * srcw;
} else if (dstFormat == ImageFormat::BGRA ||
dstFormat == ImageFormat::RGBA) {
out_size = 4 * srch * srcw;
} else if (dstFormat == ImageFormat::GRAY) {
out_size = srch * srcw;
}
uint8_t* basic_dst = new uint8_t[out_size];
uint8_t* lite_dst = new uint8_t[out_size];
Timer t_basic, t_lite;
LOG(INFO) << "basic Convert compute";
for (int i = 0; i < test_iter; i++) {
t_basic.Start();
image_basic_convert(src,
basic_dst,
(ImageFormat)srcFormat,
(ImageFormat)dstFormat,
srcw,
srch,
out_size);
t_basic.Stop();
}
LOG(INFO) << "image baisc Convert avg time : " << t_basic.LapTimes().Avg()
<< ", min time: " << t_basic.LapTimes().Min()
<< ", max time: " << t_basic.LapTimes().Max();
LOG(INFO) << "lite Convert compute";
TransParam tparam;
tparam.ih = srch;
tparam.iw = srcw;
tparam.oh = srch;
tparam.ow = srcw;
tparam.flip_param = flip;
tparam.rotate_param = rotate;
ImagePreprocess image_preprocess(srcFormat, dstFormat, tparam);
for (int i = 0; i < test_iter; ++i) {
t_lite.Start();
image_preprocess.imageConvert(src, lite_dst);
t_lite.Stop();
}
LOG(INFO) << "image Convert avg time : " << t_lite.LapTimes().Avg()
<< ", min time: " << t_lite.LapTimes().Min()
<< ", max time: " << t_lite.LapTimes().Max();
LOG(INFO) << "basic Convert compute";
double max_ratio = 0;
double max_diff = 0;
const double eps = 1e-6f;
if (FLAGS_check_result) {
LOG(INFO) << "diff, image convert size: " << out_size;
uint8_t* diff_v = new uint8_t[out_size];
for (int i = 0; i < out_size; i++) {
uint8_t a = lite_dst[i];
uint8_t b = basic_dst[i];
uint8_t diff1 = a - b;
uint8_t diff = diff1 > 0 ? diff1 : -diff1;
diff_v[i] = diff;
if (max_diff < diff) {
max_diff = diff;
max_ratio = 2.0 * max_diff / (a + b + eps);
}
}
if (std::abs(max_ratio) >= 1e-5f) {
int width = size / srch;
printf("din: \n");
print_int8(src, size, width);
width = out_size / srch;
printf("saber result: \n");
print_int8(lite_dst, out_size, width);
printf("basic result: \n");
print_int8(basic_dst, out_size, width);
printf("diff result: \n");
print_int8(diff_v, out_size, width);
}
delete[] diff_v;
LOG(INFO) << "compare result, max diff: " << max_diff
<< ", max ratio: " << max_ratio;
bool rst = std::abs(max_ratio) < 1e-5f;
CHECK_EQ(rst, true) << "compute result error";
}
LOG(INFO) << "image convert end";
}
}
}
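// Resize benchmark/check: image_basic_resize vs. ImagePreprocess::imageResize,
// allowing a +/-1 per-pixel difference to absorb float -> int rounding.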
void test_resize(const std::vector<int>& cluster_id,
const std::vector<int>& thread_num,
int srcw,
int srch,
int dstw,
int dsth,
ImageFormat srcFormat,
ImageFormat dstFormat,
float rotate,
FlipParam flip,
LayoutType layout,
int test_iter = 10) {
test_iter = 1;
for (auto& cls : cluster_id) {
for (auto& th : thread_num) {
std::unique_ptr<paddle::lite::KernelContext> ctx1(
new paddle::lite::KernelContext);
auto& ctx = ctx1->As<paddle::lite::ARMContext>();
ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(cls), th);
LOG(INFO) << "cluster: " << cls << ", threads: " << th;
int size = 3 * srch * srcw;
if (srcFormat == ImageFormat::NV12 || srcFormat == ImageFormat::NV21) {
size = ceil(1.5 * srch) * srcw;
} else if (srcFormat == ImageFormat::BGRA ||
srcFormat == ImageFormat::RGBA) {
size = 4 * srch * srcw;
} else if (srcFormat == ImageFormat::GRAY) {
size = srch * srcw;
}
uint8_t* src = new uint8_t[size];
fill_tensor_host_rand(src, size);
int out_size = dsth * dstw;
if (dstFormat == ImageFormat::NV12 || dstFormat == ImageFormat::NV21) {
out_size = ceil(1.5 * dsth) * dstw;
} else if (dstFormat == ImageFormat::BGR ||
dstFormat == ImageFormat::RGB) {
out_size = 3 * dsth * dstw;
} else if (dstFormat == ImageFormat::BGRA ||
dstFormat == ImageFormat::RGBA) {
out_size = 4 * dsth * dstw;
} else if (dstFormat == ImageFormat::GRAY) {
out_size = dsth * dstw;
}
uint8_t* basic_dst = new uint8_t[out_size];
uint8_t* lite_dst = new uint8_t[out_size];
Timer t_rotate;
Timer t_basic, t_lite;
LOG(INFO) << "baisc resize compute";
for (int i = 0; i < test_iter; i++) {
t_basic.Start();
image_basic_resize(
src, basic_dst, (ImageFormat)dstFormat, srcw, srch, dstw, dsth);
t_basic.Stop();
}
LOG(INFO) << "image baisc Resize avg time : " << t_basic.LapTimes().Avg()
<< ", min time: " << t_basic.LapTimes().Min()
<< ", max time: " << t_basic.LapTimes().Max();
LOG(INFO) << "lite resize compute";
TransParam tparam;
tparam.ih = srch;
tparam.iw = srcw;
tparam.oh = dsth;
tparam.ow = dstw;
tparam.flip_param = flip;
tparam.rotate_param = rotate;
ImagePreprocess image_preprocess(srcFormat, dstFormat, tparam);
for (int i = 0; i < test_iter; ++i) {
t_rotate.Start();
image_preprocess.imageResize(src, lite_dst);
t_rotate.Stop();
}
LOG(INFO) << "image Resize avg time : " << t_rotate.LapTimes().Avg()
<< ", min time: " << t_rotate.LapTimes().Min()
<< ", max time: " << t_rotate.LapTimes().Max();
double max_ratio = 0;
double max_diff = 0;
const double eps = 1e-6f;
if (FLAGS_check_result) {
LOG(INFO) << "diff, image Resize size: " << out_size;
int* diff_v = new int[out_size];
for (int i = 0; i < out_size; i++) {
uint8_t a = lite_dst[i];
uint8_t b = basic_dst[i];
          int diff1 = a - b;  // basic resize and saber resize may differ by
                              // {-1, 1} due to float -> int rounding
int diff = 0;
if (diff1 < -1 || diff1 > 1) diff = diff1 < 0 ? -diff1 : diff1;
diff_v[i] = diff;
if (diff > 1 && max_diff < diff) {
max_diff = diff;
printf("i: %d, lite: %d, basic: %d \n", i, a, b);
max_ratio = 2.0 * max_diff / (a + b + eps);
}
}
if (std::abs(max_ratio) >= 1e-5f) {
int width = size / srcw;
printf("din: \n");
print_int8(src, size, width);
width = out_size / dstw;
printf("saber result: \n");
print_int8(lite_dst, out_size, width);
printf("basic result: \n");
print_int8(basic_dst, out_size, width);
printf("diff result: \n");
print_int(diff_v, out_size, width);
}
delete[] diff_v;
LOG(INFO) << "compare result, max diff: " << max_diff
<< ", max ratio: " << max_ratio;
bool rst = std::abs(max_ratio) < 1e-5f;
CHECK_EQ(rst, true) << "compute result error";
}
LOG(INFO) << "image Resize end";
}
}
}
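// Flip benchmark/check: image_basic_flip vs. ImagePreprocess::imageFlip on the
// same input, timed over test_iter runs and compared element-wise.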
void test_flip(const std::vector<int>& cluster_id,
const std::vector<int>& thread_num,
int srcw,
int srch,
int dstw,
int dsth,
ImageFormat srcFormat,
ImageFormat dstFormat,
float rotate,
FlipParam flip,
LayoutType layout,
int test_iter = 10) {
for (auto& cls : cluster_id) {
for (auto& th : thread_num) {
std::unique_ptr<paddle::lite::KernelContext> ctx1(
new paddle::lite::KernelContext);
auto& ctx = ctx1->As<paddle::lite::ARMContext>();
ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(cls), th);
LOG(INFO) << "cluster: " << cls << ", threads: " << th;
int size = 3 * srch * srcw;
if (srcFormat == ImageFormat::NV12 || srcFormat == ImageFormat::NV21) {
size = ceil(1.5 * srch) * srcw;
} else if (srcFormat == ImageFormat::BGRA ||
srcFormat == ImageFormat::RGBA) {
size = 4 * srch * srcw;
} else if (srcFormat == ImageFormat::GRAY) {
size = srch * srcw;
}
uint8_t* src = new uint8_t[size];
fill_tensor_host_rand(src, size);
int out_size = srch * srcw;
if (dstFormat == ImageFormat::NV12 || dstFormat == ImageFormat::NV21) {
out_size = ceil(1.5 * srch) * srcw;
} else if (dstFormat == ImageFormat::BGR ||
dstFormat == ImageFormat::RGB) {
out_size = 3 * srch * srcw;
} else if (dstFormat == ImageFormat::BGRA ||
dstFormat == ImageFormat::RGBA) {
out_size = 4 * srch * srcw;
} else if (dstFormat == ImageFormat::GRAY) {
out_size = srch * srcw;
}
uint8_t* basic_dst = new uint8_t[out_size];
uint8_t* lite_dst = new uint8_t[out_size];
LOG(INFO) << "basic flip compute";
Timer t_basic, t_lite;
for (int i = 0; i < test_iter; i++) {
t_basic.Start();
image_basic_flip(
src, basic_dst, (ImageFormat)dstFormat, srcw, srch, flip);
t_basic.Stop();
}
LOG(INFO) << "image baisc flip avg time : " << t_basic.LapTimes().Avg()
<< ", min time: " << t_basic.LapTimes().Min()
<< ", max time: " << t_basic.LapTimes().Max();
LOG(INFO) << "lite flip compute";
TransParam tparam;
tparam.ih = srch;
tparam.iw = srcw;
tparam.oh = srch;
tparam.ow = srcw;
tparam.flip_param = flip;
tparam.rotate_param = rotate;
ImagePreprocess image_preprocess(srcFormat, dstFormat, tparam);
for (int i = 0; i < test_iter; ++i) {
t_lite.Start();
image_preprocess.imageFlip(src, lite_dst);
t_lite.Stop();
}
LOG(INFO) << "image flip avg time : " << t_lite.LapTimes().Avg()
<< ", min time: " << t_lite.LapTimes().Min()
<< ", max time: " << t_lite.LapTimes().Max();
double max_ratio = 0;
double max_diff = 0;
const double eps = 1e-6f;
if (FLAGS_check_result) {
LOG(INFO) << "diff, image flip size: " << out_size;
uint8_t* diff_v = new uint8_t[out_size];
for (int i = 0; i < out_size; i++) {
uint8_t a = lite_dst[i];
uint8_t b = basic_dst[i];
uint8_t diff1 = a - b;
uint8_t diff = diff1 > 0 ? diff1 : -diff1;
diff_v[i] = diff;
if (max_diff < diff) {
max_diff = diff;
max_ratio = 2.0 * max_diff / (a + b + eps);
}
}
if (std::abs(max_ratio) >= 1e-5f) {
int width = size / srch;
printf("din: \n");
print_int8(src, size, width);
width = out_size / srch;
printf("saber result: \n");
print_int8(lite_dst, out_size, width);
printf("basic result: \n");
print_int8(basic_dst, out_size, width);
printf("diff result: \n");
print_int8(diff_v, out_size, width);
}
delete[] diff_v;
LOG(INFO) << "compare result, max diff: " << max_diff
<< ", max ratio: " << max_ratio;
bool rst = std::abs(max_ratio) < 1e-5f;
CHECK_EQ(rst, true) << "compute result error";
}
LOG(INFO) << "image flip end";
}
}
}
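// Rotate benchmark/check: image_basic_rotate vs. ImagePreprocess::imageRotate
// (90/180/270 degrees), timed and compared element-wise.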
void test_rotate(const std::vector<int>& cluster_id,
const std::vector<int>& thread_num,
int srcw,
int srch,
int dstw,
int dsth,
ImageFormat srcFormat,
ImageFormat dstFormat,
float rotate,
FlipParam flip,
LayoutType layout,
int test_iter = 10) {
for (auto& cls : cluster_id) {
for (auto& th : thread_num) {
std::unique_ptr<paddle::lite::KernelContext> ctx1(
new paddle::lite::KernelContext);
auto& ctx = ctx1->As<paddle::lite::ARMContext>();
ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(cls), th);
LOG(INFO) << "cluster: " << cls << ", threads: " << th;
int size = 3 * srch * srcw;
if (srcFormat == ImageFormat::NV12 || srcFormat == ImageFormat::NV21) {
size = ceil(1.5 * srch) * srcw;
} else if (srcFormat == ImageFormat::BGRA ||
srcFormat == ImageFormat::RGBA) {
size = 4 * srch * srcw;
} else if (srcFormat == ImageFormat::GRAY) {
size = srch * srcw;
}
uint8_t* src = new uint8_t[size];
fill_tensor_host_rand(src, size);
int out_size = srch * srcw;
if (dstFormat == ImageFormat::NV12 || dstFormat == ImageFormat::NV21) {
out_size = ceil(1.5 * srch) * srcw;
} else if (dstFormat == ImageFormat::BGR ||
dstFormat == ImageFormat::RGB) {
out_size = 3 * srch * srcw;
} else if (dstFormat == ImageFormat::BGRA ||
dstFormat == ImageFormat::RGBA) {
out_size = 4 * srch * srcw;
} else if (dstFormat == ImageFormat::GRAY) {
out_size = srch * srcw;
}
uint8_t* basic_dst = new uint8_t[out_size];
uint8_t* lite_dst = new uint8_t[out_size];
LOG(INFO) << "basic rotate compute";
Timer t_basic, t_lite;
for (int i = 0; i < test_iter; i++) {
t_basic.Start();
image_basic_rotate(
src, basic_dst, (ImageFormat)dstFormat, srcw, srch, rotate);
t_basic.Stop();
}
LOG(INFO) << "image baisc rotate avg time : " << t_basic.LapTimes().Avg()
<< ", min time: " << t_basic.LapTimes().Min()
<< ", max time: " << t_basic.LapTimes().Max();
LOG(INFO) << "lite rotate compute";
TransParam tparam;
tparam.ih = srch;
tparam.iw = srcw;
tparam.oh = srch;
tparam.ow = srcw;
tparam.flip_param = flip;
tparam.rotate_param = rotate;
ImagePreprocess image_preprocess(srcFormat, dstFormat, tparam);
for (int i = 0; i < test_iter; ++i) {
t_lite.Start();
image_preprocess.imageRotate(src, lite_dst);
t_lite.Stop();
}
LOG(INFO) << "image rotate avg time : " << t_lite.LapTimes().Avg()
<< ", min time: " << t_lite.LapTimes().Min()
<< ", max time: " << t_lite.LapTimes().Max();
double max_ratio = 0;
double max_diff = 0;
const double eps = 1e-6f;
if (FLAGS_check_result) {
LOG(INFO) << "diff, image rotate size: " << out_size;
uint8_t* diff_v = new uint8_t[out_size];
for (int i = 0; i < out_size; i++) {
uint8_t a = lite_dst[i];
uint8_t b = basic_dst[i];
uint8_t diff1 = a - b;
uint8_t diff = diff1 > 0 ? diff1 : -diff1;
diff_v[i] = diff;
if (max_diff < diff) {
max_diff = diff;
max_ratio = 2.0 * max_diff / (a + b + eps);
}
}
if (std::abs(max_ratio) >= 1e-5f) {
int width = size / srch;
printf("din: \n");
print_int8(src, size, width);
width = out_size / srch;
printf("saber result: \n");
print_int8(lite_dst, out_size, width);
printf("basic result: \n");
print_int8(basic_dst, out_size, width);
printf("diff result: \n");
print_int8(diff_v, out_size, width);
}
delete[] diff_v;
LOG(INFO) << "compare result, max diff: " << max_diff
<< ", max ratio: " << max_ratio;
bool rst = std::abs(max_ratio) < 1e-5f;
CHECK_EQ(rst, true) << "compute result error";
}
LOG(INFO) << "image rotate end";
}
}
}
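// Image-to-tensor benchmark/check: image_basic_to_tensor vs.
// ImagePreprocess::image2Tensor, converting uint8 image data into a float
// tensor with the given means/scales and layout, allowing a +/-1 difference.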
void test_to_tensor(const std::vector<int>& cluster_id,
const std::vector<int>& thread_num,
int srcw,
int srch,
int dstw,
int dsth,
ImageFormat srcFormat,
ImageFormat dstFormat,
float rotate,
FlipParam flip,
LayoutType layout,
int test_iter = 10) {
for (auto& cls : cluster_id) {
for (auto& th : thread_num) {
std::unique_ptr<paddle::lite::KernelContext> ctx1(
new paddle::lite::KernelContext);
auto& ctx = ctx1->As<paddle::lite::ARMContext>();
ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(cls), th);
LOG(INFO) << "cluster: " << cls << ", threads: " << th;
int size = 3 * srch * srcw;
if (srcFormat == ImageFormat::NV12 || srcFormat == ImageFormat::NV21) {
size = ceil(1.5 * srch) * srcw;
} else if (srcFormat == ImageFormat::BGRA ||
srcFormat == ImageFormat::RGBA) {
size = 4 * srch * srcw;
} else if (srcFormat == ImageFormat::GRAY) {
size = srch * srcw;
}
uint8_t* src = new uint8_t[size];
fill_tensor_host_rand(src, size);
int out_size = srch * srcw;
int resize = dstw * dsth;
if (dstFormat == ImageFormat::NV12 || dstFormat == ImageFormat::NV21) {
out_size = ceil(1.5 * srch) * srcw;
resize = ceil(1.5 * dsth) * dstw;
} else if (dstFormat == ImageFormat::BGR ||
dstFormat == ImageFormat::RGB) {
out_size = 3 * srch * srcw;
resize = 3 * dsth * dstw;
} else if (dstFormat == ImageFormat::BGRA ||
dstFormat == ImageFormat::RGBA) {
out_size = 4 * srch * srcw;
resize = 4 * dsth * dstw;
} else if (dstFormat == ImageFormat::GRAY) {
out_size = srch * srcw;
resize = dsth * dstw;
}
// out
std::vector<int64_t> shape_out = {1, 3, dsth, dstw};
Tensor tensor;
Tensor tensor_basic;
tensor.Resize(shape_out);
tensor_basic.Resize(shape_out);
tensor.set_precision(PRECISION(kFloat));
tensor_basic.set_precision(PRECISION(kFloat));
float means[3] = {127.5f, 127.5f, 127.5f};
float scales[3] = {1 / 127.5f, 1 / 127.5f, 1 / 127.5f};
Timer t_basic, t_lite;
LOG(INFO) << "basic to tensor compute: ";
for (int i = 0; i < test_iter; i++) {
t_basic.Start();
image_basic_to_tensor(src,
tensor_basic,
(ImageFormat)dstFormat,
layout,
dstw,
dsth,
means,
scales);
t_basic.Stop();
}
LOG(INFO) << "image baisc to_tensor avg time : "
<< t_basic.LapTimes().Avg()
<< ", min time: " << t_basic.LapTimes().Min()
<< ", max time: " << t_basic.LapTimes().Max();
LOG(INFO) << "lite to_tensor compute";
TransParam tparam;
tparam.ih = srch;
tparam.iw = srcw;
tparam.oh = dsth;
tparam.ow = dstw;
tparam.flip_param = flip;
tparam.rotate_param = rotate;
Tensor_api dst_tensor(&tensor);
dst_tensor.Resize(shape_out);
ImagePreprocess image_preprocess(srcFormat, dstFormat, tparam);
for (int i = 0; i < test_iter; ++i) {
t_lite.Start();
image_preprocess.image2Tensor(src,
&dst_tensor,
(ImageFormat)dstFormat,
dstw,
dsth,
layout,
means,
scales);
t_lite.Stop();
}
LOG(INFO) << "image tensor avg time : " << t_lite.LapTimes().Avg()
<< ", min time: " << t_lite.LapTimes().Min()
<< ", max time: " << t_lite.LapTimes().Max();
double max_ratio = 0;
double max_diff = 0;
const double eps = 1e-6f;
if (FLAGS_check_result) {
max_ratio = 0;
max_diff = 0;
LOG(INFO) << "diff, iamge to tensor size: " << tensor.numel();
const float* ptr_a = tensor.data<float>();
const float* ptr_b = tensor_basic.data<float>();
int ss = tensor.numel();
float* diff_v = new float[ss];
for (int i = 0; i < ss; i++) {
int a = ptr_a[i];
int b = ptr_b[i];
int diff1 = a - b;
int diff = 0;
if (diff1 < -1 || diff1 > 1) diff = diff1 < 0 ? -diff1 : diff1;
diff_v[i] = diff;
if (max_diff < diff) {
max_diff = diff;
max_ratio = 2.0 * max_diff / (a + b + eps);
}
}
if (std::abs(max_ratio) >= 1e-5f) {
int width = resize / srch;
printf("din: \n");
print_int8(src, resize, width);
printf("saber result: \n");
print_fp32(ptr_a, resize, width);
printf("basic result: \n");
print_fp32(ptr_b, resize, width);
printf("diff result: \n");
print_fp32(diff_v, resize, width);
        }
        delete[] diff_v;
        LOG(INFO) << "compare result, max diff: " << max_diff
<< ", max ratio: " << max_ratio;
bool rst = std::abs(max_ratio) < 1e-5f;
CHECK_EQ(rst, true) << "compute result error";
LOG(INFO) << "iamge to tensor end";
}
}
}
}
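// Logs the configuration of one test case (formats, sizes, rotation angle,
// flip mode and layout) and initializes DeviceInfo.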
void print_info(ImageFormat srcFormat,
ImageFormat dstFormat,
int srcw,
int srch,
int dstw,
int dsth,
float rotate_num,
int flip_num,
int layout) {
paddle::lite::DeviceInfo::Init();
LOG(INFO) << " input tensor size, num= " << 1 << ", channel= " << 1
<< ", height= " << srch << ", width= " << srcw
<< ", srcFormat= " << (ImageFormat)srcFormat;
// RGBA = 0, BGRA, RGB, BGR, GRAY, NV21 = 11, NV12,
if (srcFormat == ImageFormat::NV21) {
LOG(INFO) << "srcFormat: NV21";
}
if (srcFormat == ImageFormat::NV12) {
LOG(INFO) << "srcFormat: NV12";
}
if (srcFormat == ImageFormat::GRAY) {
LOG(INFO) << "srcFormat: GRAY";
}
if (srcFormat == ImageFormat::BGRA) {
LOG(INFO) << "srcFormat: BGRA";
}
if (srcFormat == ImageFormat::BGR) {
LOG(INFO) << "srcFormat: BGR";
}
if (srcFormat == ImageFormat::RGBA) {
LOG(INFO) << "srcFormat: RGBA";
}
if (srcFormat == ImageFormat::RGB) {
LOG(INFO) << "srcFormat: RGB";
}
LOG(INFO) << " output tensor size, num=" << 1 << ", channel=" << 1
<< ", height=" << dsth << ", width=" << dstw
<< ", dstFormat= " << (ImageFormat)dstFormat;
if (dstFormat == ImageFormat::NV21) {
LOG(INFO) << "dstFormat: NV21";
}
if (dstFormat == ImageFormat::NV12) {
LOG(INFO) << "dstFormat: NV12";
}
if (dstFormat == ImageFormat::GRAY) {
LOG(INFO) << "dstFormat: GRAY";
}
if (dstFormat == ImageFormat::BGRA) {
LOG(INFO) << "dstFormat: BGRA";
}
if (dstFormat == ImageFormat::BGR) {
LOG(INFO) << "dstFormat: BGR";
}
if (dstFormat == ImageFormat::RGBA) {
LOG(INFO) << "dstFormat: RGBA";
}
if (dstFormat == ImageFormat::RGB) {
LOG(INFO) << "dstFormat: RGB";
}
LOG(INFO) << "Rotate = " << rotate_num;
if (flip_num == -1) {
LOG(INFO) << "Flip XY";
} else if (flip_num == 0) {
LOG(INFO) << "Flip X";
} else if (flip_num == 1) {
LOG(INFO) << "Flip Y";
}
if (layout == 1) {
LOG(INFO) << "Layout NCHW";
} else if (layout == 3) {
LOG(INFO) << "Layout NHWC";
}
}
#if 0
TEST(TestImageConvertRand, test_func_image_convert_preprocess) {
if (FLAGS_basic_test) {
for (auto w : {1, 4, 8, 16, 112, 224, 1092}) {
for (auto h : {1, 4, 16, 112, 224}) {
for (auto rotate : {180}) {
for (auto flip : {0}) {
for (auto srcFormat : {12}) {
for (auto dstFormat : {0, 1, 2, 3}) {
for (auto layout : {1}) {
// RGBA = 0, BGRA, RGB, BGR, GRAY, NV21 = 11, NV12
if ((srcFormat == ImageFormat::RGB ||
srcFormat == ImageFormat::BGR) &&
(dstFormat == ImageFormat::RGBA ||
dstFormat == ImageFormat::BGRA)) {
                    continue;  // anakin does not support this conversion
}
print_info((ImageFormat)srcFormat,
(ImageFormat)dstFormat,
w,
h,
w,
h,
rotate,
flip,
layout);
test_convert({FLAGS_cluster},
{1},
w,
h,
w,
h,
(ImageFormat)srcFormat,
(ImageFormat)dstFormat,
rotate,
(FlipParam)flip,
(LayoutType)layout,
FLAGS_repeats);
}
}
}
}
}
}
}
}
}
#endif
#if 0
TEST(TestImageResizeRand, test_func_image_resize_preprocess) {
if (FLAGS_basic_test) {
for (auto w : {8, 16, 112, 224, 1092}) {
for (auto h : {4, 16, 112, 224}) {
for (auto ww : {8, 32, 112}) {
for (auto hh : {8, 112}) {
for (auto rotate : {180}) {
for (auto flip : {0}) {
for (auto srcFormat : {0, 1, 2, 3, 11, 12}) {
for (auto layout : {1}) {
auto dstFormat = srcFormat;
print_info((ImageFormat)srcFormat,
(ImageFormat)dstFormat,
w,
h,
ww,
hh,
rotate,
flip,
layout);
test_resize({FLAGS_cluster},
{1},
w,
h,
ww,
hh,
(ImageFormat)srcFormat,
(ImageFormat)dstFormat,
rotate,
(FlipParam)flip,
(LayoutType)layout,
FLAGS_repeats);
}
}
}
}
}
}
}
}
}
}
#endif
#if 1
TEST(TestImageFlipRand, test_func_image_flip_preprocess) {
if (FLAGS_basic_test) {
for (auto w : {1, 8, 16, 112, 224, 1092}) {
for (auto h : {1, 16, 112, 224}) {
for (auto rotate : {90}) {
for (auto flip : {-1, 0, 1}) {
for (auto srcFormat : {0, 1, 2, 3}) {
for (auto layout : {1}) {
auto dstFormat = srcFormat;
print_info((ImageFormat)srcFormat,
(ImageFormat)dstFormat,
w,
h,
w,
h,
rotate,
flip,
layout);
test_flip({FLAGS_cluster},
{1},
w,
h,
w,
h,
(ImageFormat)srcFormat,
(ImageFormat)dstFormat,
rotate,
(FlipParam)flip,
(LayoutType)layout,
FLAGS_repeats);
}
}
}
}
}
}
}
}
#endif
#if 1
TEST(TestImageRotateRand, test_func_image_rotate_preprocess) {
if (FLAGS_basic_test) {
for (auto w : {1, 8, 16, 112, 224, 1092}) {
for (auto h : {1, 16, 112, 224}) {
for (auto rotate : {90, 180, 270}) {
for (auto flip : {0}) {
for (auto srcFormat : {0, 1, 2, 3}) {
for (auto layout : {1}) {
auto dstFormat = srcFormat;
print_info((ImageFormat)srcFormat,
(ImageFormat)dstFormat,
w,
h,
w,
h,
rotate,
flip,
layout);
test_rotate({FLAGS_cluster},
{1},
w,
h,
w,
h,
(ImageFormat)srcFormat,
(ImageFormat)dstFormat,
rotate,
(FlipParam)flip,
(LayoutType)layout,
FLAGS_repeats);
}
}
}
}
}
}
}
}
#endif
#if 1
TEST(TestImageToTensorRand, test_func_image_to_tensor_preprocess) {
if (FLAGS_basic_test) {
for (auto w : {1, 8, 16, 112, 224, 1092}) {
for (auto h : {1, 16, 112, 224}) {
for (auto rotate : {90}) {
for (auto flip : {0}) {
for (auto srcFormat : {0, 1, 2, 3}) {
for (auto layout : {1}) {
auto dstFormat = srcFormat;
print_info((ImageFormat)srcFormat,
(ImageFormat)dstFormat,
w,
h,
w,
h,
rotate,
flip,
layout);
test_to_tensor({FLAGS_cluster},
{1},
w,
h,
w,
h,
(ImageFormat)srcFormat,
(ImageFormat)dstFormat,
rotate,
(FlipParam)flip,
(LayoutType)layout,
FLAGS_repeats);
}
}
}
}
}
}
}
}
#endif
#if 1
TEST(TestImageConvertCustom, test_func_image_preprocess_custom) {
LOG(INFO) << "print info";
print_info((ImageFormat)FLAGS_srcFormat,
(ImageFormat)FLAGS_dstFormat,
FLAGS_srcw,
FLAGS_srch,
FLAGS_dstw,
FLAGS_dsth,
FLAGS_angle,
FLAGS_flip_num,
FLAGS_layout);
test_convert({FLAGS_cluster},
{1},
FLAGS_srcw,
FLAGS_srch,
FLAGS_dstw,
FLAGS_dsth,
(ImageFormat)FLAGS_srcFormat,
(ImageFormat)FLAGS_dstFormat,
FLAGS_angle,
(FlipParam)FLAGS_flip_num,
(LayoutType)FLAGS_layout,
FLAGS_repeats);
test_resize({FLAGS_cluster},
{1},
FLAGS_srcw,
FLAGS_srch,
FLAGS_dstw,
FLAGS_dsth,
(ImageFormat)FLAGS_dstFormat,
(ImageFormat)FLAGS_dstFormat,
FLAGS_angle,
(FlipParam)FLAGS_flip_num,
(LayoutType)FLAGS_layout,
FLAGS_repeats);
test_flip({FLAGS_cluster},
{1},
FLAGS_srcw,
FLAGS_srch,
FLAGS_dstw,
FLAGS_dsth,
(ImageFormat)FLAGS_dstFormat,
(ImageFormat)FLAGS_dstFormat,
FLAGS_angle,
(FlipParam)FLAGS_flip_num,
(LayoutType)FLAGS_layout,
FLAGS_repeats);
test_rotate({FLAGS_cluster},
{1},
FLAGS_srcw,
FLAGS_srch,
FLAGS_dstw,
FLAGS_dsth,
(ImageFormat)FLAGS_dstFormat,
(ImageFormat)FLAGS_dstFormat,
FLAGS_angle,
(FlipParam)FLAGS_flip_num,
(LayoutType)FLAGS_layout,
FLAGS_repeats);
test_to_tensor({FLAGS_cluster},
{1},
FLAGS_srcw,
FLAGS_srch,
FLAGS_dstw,
FLAGS_dsth,
(ImageFormat)FLAGS_dstFormat,
(ImageFormat)FLAGS_dstFormat,
FLAGS_angle,
(FlipParam)FLAGS_flip_num,
(LayoutType)FLAGS_layout,
FLAGS_repeats);
}
#endif
#endif
...@@ -131,7 +131,7 @@ void ImageConvert::choose(const uint8_t* src, ...@@ -131,7 +131,7 @@ void ImageConvert::choose(const uint8_t* src,
impl_(src, dst, srcw, srch); impl_(src, dst, srcw, srch);
} }
/* /*
nv21(yvu) to BGR: store hwc dsth * dstw = srch * (srcw) nv12(yuv) to BGR: store hwc dsth * dstw = srch * (srcw)
y_w = srcw, y_h = srch uv_w = srcw uv_h = 1/2 * srch y_w = srcw, y_h = srch uv_w = srcw uv_h = 1/2 * srch
R = Y + 1.402*(V-128); R = Y + 1.402*(V-128);
G = Y - 0.34414*(U-128) - 0.71414*(V-128); G = Y - 0.34414*(U-128) - 0.71414*(V-128);
...@@ -141,16 +141,8 @@ ra = 1.402 *128 = 179.456 = 179 ...@@ -141,16 +141,8 @@ ra = 1.402 *128 = 179.456 = 179
ga = 0.34414 * 128 = 44.04992 = 44 ga = 0.34414 * 128 = 44.04992 = 44
gb = 0.71414 * 128 = 91.40992 = 91 gb = 0.71414 * 128 = 91.40992 = 91
ba = 1.772 * 128 = 226.816 = 227 ba = 1.772 * 128 = 226.816 = 227
nv12bgr, nv21tobgr
*/ */
void nv_to_bgr(const uint8_t* src, inline void nv12_to_bgr(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
uint8_t* dst,
int srcw,
int srch,
int x_num,
int y_num) {
// nv21 x = 0, y = 1
// nv12 x = 1, y = 0
int y_h = srch; int y_h = srch;
int wout = srcw * 3; int wout = srcw * 3;
const uint8_t* y = src; const uint8_t* y = src;
...@@ -181,6 +173,698 @@ void nv_to_bgr(const uint8_t* src, ...@@ -181,6 +173,698 @@ void nv_to_bgr(const uint8_t* src,
ptr_bgr2 = writebuf; ptr_bgr2 = writebuf;
} }
int j = 0; int j = 0;
#ifdef __aarch64__
asm volatile(
"prfm pldl1keep, [%[ptr_y1]] \n"
"prfm pldl1keep, [%[ptr_y1], #64] \n"
"prfm pldl1keep, [%[ptr_y2]] \n"
"prfm pldl1keep, [%[ptr_y2], #64] \n"
"prfm pldl1keep, [%[ptr_vu]] \n"
"prfm pldl1keep, [%[ptr_vu], #64] \n"
:
: [ptr_y1] "r"(ptr_y1), [ptr_y2] "r"(ptr_y2), [ptr_vu] "r"(ptr_vu)
: "memory");
#else
asm volatile(
"pld [%[ptr_y1]] @ preload a, 64byte\n"
"pld [%[ptr_y1], #128] @ preload a, 64byte\n"
"pld [%[ptr_y2]] @ preload a, 64byte\n"
"pld [%[ptr_y2], #128] @ preload a, 64byte\n"
"pld [%[ptr_vu]] @ preload a, 64byte\n"
"pld [%[ptr_vu], #128] @ preload a, 64byte\n"
:
: [ptr_y1] "r"(ptr_y1), [ptr_y2] "r"(ptr_y2), [ptr_vu] "r"(ptr_vu)
: "memory");
#endif
for (; j < srcw - 15; j += 16) {
uint8x8x2_t y1 = vld2_u8(ptr_y1); // d8 = y0y2y4y6...y14 d9 =
// y1y3y5...y15
uint8x8x2_t vu =
vld2_u8(ptr_vu); // d0 = v0v1v2v3v4v5...v7 d1 = u0u1u2...u7
uint8x8x2_t y2 = vld2_u8(ptr_y2);
uint16x8_t v = vmovl_u8(vu.val[1]);
uint16x8_t u = vmovl_u8(vu.val[0]);
int16x8_t v_s = vreinterpretq_s16_u16(v);
int16x8_t u_s = vreinterpretq_s16_u16(u);
int16x8_t v_bias = vsubq_s16(v_s, bias);
int16x8_t u_bias = vsubq_s16(u_s, bias);
// G = Y - 0.34414*(U-128) - 0.71414*(V-128);
int16x8_t g0 = vmulq_s16(ga, u_bias);
// R = Y + 1.402*(V-128);
int16x8_t r0 = vmulq_s16(ra, v_bias);
// B = Y + 1.772*(U-128);
int16x8_t b0 = vmulq_s16(ba, u_bias);
g0 = vmlaq_s16(g0, gb, v_bias);
int16x8_t y1_0_8 = vreinterpretq_s16_u16(vmovl_u8(y1.val[0]));
int16x8_t y1_1_8 = vreinterpretq_s16_u16(vmovl_u8(y1.val[1]));
int16x8_t y2_0_8 = vreinterpretq_s16_u16(vmovl_u8(y2.val[0]));
int16x8_t y2_1_8 = vreinterpretq_s16_u16(vmovl_u8(y2.val[1]));
int16x8_t r0_bias = vshrq_n_s16(r0, 7); // r0 / 128
int16x8_t b0_bias = vshrq_n_s16(b0, 7);
int16x8_t g0_bias = vshrq_n_s16(g0, 7);
int16x8_t r0_1 = vaddq_s16(y1_0_8, r0_bias);
int16x8_t b0_1 = vaddq_s16(y1_0_8, b0_bias);
int16x8_t g0_1 = vsubq_s16(y1_0_8, g0_bias); // g0_1 = y1_0_8 - g0_1
int16x8_t r0_2 = vaddq_s16(y1_1_8, r0_bias);
int16x8_t b0_2 = vaddq_s16(y1_1_8, b0_bias);
int16x8_t g0_2 = vsubq_s16(y1_1_8, g0_bias);
r0_1 = vmaxq_s16(r0_1, zero);
b0_1 = vmaxq_s16(b0_1, zero);
g0_1 = vmaxq_s16(g0_1, zero);
r0_2 = vmaxq_s16(r0_2, zero);
b0_2 = vmaxq_s16(b0_2, zero);
g0_2 = vmaxq_s16(g0_2, zero);
r0_1 = vminq_s16(r0_1, max);
b0_1 = vminq_s16(b0_1, max);
g0_1 = vminq_s16(g0_1, max);
r0_2 = vminq_s16(r0_2, max);
b0_2 = vminq_s16(b0_2, max);
g0_2 = vminq_s16(g0_2, max);
uint8x8_t r00 = vreinterpret_u8_s8(vmovn_s16(r0_1));
uint8x8_t b00 = vreinterpret_u8_s8(vmovn_s16(b0_1));
uint8x8_t g00 = vreinterpret_u8_s8(vmovn_s16(g0_1));
uint8x8_t r01 = vreinterpret_u8_s8(vmovn_s16(r0_2));
uint8x8_t b01 = vreinterpret_u8_s8(vmovn_s16(b0_2));
uint8x8_t g01 = vreinterpret_u8_s8(vmovn_s16(g0_2));
int16x8_t r1_1 = vaddq_s16(y2_0_8, r0_bias);
int16x8_t b1_1 = vaddq_s16(y2_0_8, b0_bias);
int16x8_t g1_1 = vsubq_s16(y2_0_8, g0_bias); // g0_1 = y1_0_8 - g0_1
int16x8_t r1_2 = vaddq_s16(y2_1_8, r0_bias);
int16x8_t b1_2 = vaddq_s16(y2_1_8, b0_bias);
int16x8_t g1_2 = vsubq_s16(y2_1_8, g0_bias);
uint8x8x2_t r00_0 = vtrn_u8(r00, r01); // 014589 236710
uint8x8x2_t b00_0 = vtrn_u8(b00, b01);
uint8x8x2_t g00_0 = vtrn_u8(g00, g01);
r1_1 = vmaxq_s16(r1_1, zero);
b1_1 = vmaxq_s16(b1_1, zero);
g1_1 = vmaxq_s16(g1_1, zero);
r1_2 = vmaxq_s16(r1_2, zero);
b1_2 = vmaxq_s16(b1_2, zero);
g1_2 = vmaxq_s16(g1_2, zero);
uint16x4_t r0_16 = vreinterpret_u16_u8(r00_0.val[0]);
uint16x4_t r1_16 = vreinterpret_u16_u8(r00_0.val[1]);
uint16x4_t b0_16 = vreinterpret_u16_u8(b00_0.val[0]);
uint16x4_t b1_16 = vreinterpret_u16_u8(b00_0.val[1]);
uint16x4_t g0_16 = vreinterpret_u16_u8(g00_0.val[0]);
uint16x4_t g1_16 = vreinterpret_u16_u8(g00_0.val[1]);
uint16x4x2_t r00_1 = vtrn_u16(r0_16, r1_16); // 012389 456710
uint16x4x2_t b00_1 = vtrn_u16(b0_16, b1_16);
uint16x4x2_t g00_1 = vtrn_u16(g0_16, g1_16);
r1_1 = vminq_s16(r1_1, max);
b1_1 = vminq_s16(b1_1, max);
g1_1 = vminq_s16(g1_1, max);
r1_2 = vminq_s16(r1_2, max);
b1_2 = vminq_s16(b1_2, max);
g1_2 = vminq_s16(g1_2, max);
uint32x2_t r0_32 = vreinterpret_u32_u16(r00_1.val[0]);
uint32x2_t r1_32 = vreinterpret_u32_u16(r00_1.val[1]);
uint32x2_t b0_32 = vreinterpret_u32_u16(b00_1.val[0]);
uint32x2_t b1_32 = vreinterpret_u32_u16(b00_1.val[1]);
uint32x2_t g0_32 = vreinterpret_u32_u16(g00_1.val[0]);
uint32x2_t g1_32 = vreinterpret_u32_u16(g00_1.val[1]);
uint32x2x2_t r00_2 = vtrn_u32(r0_32, r1_32); // 01234567 8910
uint32x2x2_t b00_2 = vtrn_u32(b0_32, b1_32);
uint32x2x2_t g00_2 = vtrn_u32(g0_32, g1_32);
r00 = vreinterpret_u8_s8(vmovn_s16(r1_1));
b00 = vreinterpret_u8_s8(vmovn_s16(b1_1));
g00 = vreinterpret_u8_s8(vmovn_s16(g1_1));
r01 = vreinterpret_u8_s8(vmovn_s16(r1_2));
b01 = vreinterpret_u8_s8(vmovn_s16(b1_2));
g01 = vreinterpret_u8_s8(vmovn_s16(g1_2));
uint8x8_t r0_8 = vreinterpret_u8_u32(r00_2.val[0]);
uint8x8_t b0_8 = vreinterpret_u8_u32(b00_2.val[0]);
uint8x8_t g0_8 = vreinterpret_u8_u32(g00_2.val[0]);
uint8x8_t r1_8 = vreinterpret_u8_u32(r00_2.val[1]);
uint8x8_t b1_8 = vreinterpret_u8_u32(b00_2.val[1]);
uint8x8_t g1_8 = vreinterpret_u8_u32(g00_2.val[1]);
uint8x8x3_t v_bgr;
v_bgr.val[0] = b0_8;
v_bgr.val[1] = g0_8;
v_bgr.val[2] = r0_8;
r00_0 = vtrn_u8(r00, r01); // 014589 236710
b00_0 = vtrn_u8(b00, b01);
g00_0 = vtrn_u8(g00, g01);
vst3_u8(ptr_bgr1, v_bgr);
r0_16 = vreinterpret_u16_u8(r00_0.val[0]);
r1_16 = vreinterpret_u16_u8(r00_0.val[1]);
b0_16 = vreinterpret_u16_u8(b00_0.val[0]);
b1_16 = vreinterpret_u16_u8(b00_0.val[1]);
g0_16 = vreinterpret_u16_u8(g00_0.val[0]);
g1_16 = vreinterpret_u16_u8(g00_0.val[1]);
ptr_bgr1 += 24;
uint8x8x3_t v_bgr1;
v_bgr1.val[0] = b1_8;
v_bgr1.val[1] = g1_8;
v_bgr1.val[2] = r1_8;
r00_1 = vtrn_u16(r0_16, r1_16); // 012389 456710
b00_1 = vtrn_u16(b0_16, b1_16);
g00_1 = vtrn_u16(g0_16, g1_16);
vst3_u8(ptr_bgr1, v_bgr1);
r0_32 = vreinterpret_u32_u16(r00_1.val[0]);
r1_32 = vreinterpret_u32_u16(r00_1.val[1]);
b0_32 = vreinterpret_u32_u16(b00_1.val[0]);
b1_32 = vreinterpret_u32_u16(b00_1.val[1]);
g0_32 = vreinterpret_u32_u16(g00_1.val[0]);
g1_32 = vreinterpret_u32_u16(g00_1.val[1]);
ptr_bgr1 += 24;
r00_2 = vtrn_u32(r0_32, r1_32); // 01234567 8910
b00_2 = vtrn_u32(b0_32, b1_32);
g00_2 = vtrn_u32(g0_32, g1_32);
ptr_vu += 16;
ptr_y1 += 16;
ptr_y2 += 16;
r0_8 = vreinterpret_u8_u32(r00_2.val[0]);
b0_8 = vreinterpret_u8_u32(b00_2.val[0]);
g0_8 = vreinterpret_u8_u32(g00_2.val[0]);
r1_8 = vreinterpret_u8_u32(r00_2.val[1]);
b1_8 = vreinterpret_u8_u32(b00_2.val[1]);
g1_8 = vreinterpret_u8_u32(g00_2.val[1]);
v_bgr.val[0] = b0_8;
v_bgr.val[1] = g0_8;
v_bgr.val[2] = r0_8;
v_bgr1.val[0] = b1_8;
v_bgr1.val[1] = g1_8;
v_bgr1.val[2] = r1_8;
vst3_u8(ptr_bgr2, v_bgr);
vst3_u8(ptr_bgr2 + 24, v_bgr1);
ptr_bgr2 += 48;
}
// two data
for (; j < srcw; j += 2) {
uint8_t _y0 = ptr_y1[0];
uint8_t _y1 = ptr_y1[1];
uint8_t _v = ptr_vu[1];
uint8_t _u = ptr_vu[0];
uint8_t _y0_1 = ptr_y2[0];
uint8_t _y1_1 = ptr_y2[1];
int ra = floor((179 * (_v - 128)) >> 7);
int ga = floor((44 * (_u - 128) + 91 * (_v - 128)) >> 7);
int ba = floor((227 * (_u - 128)) >> 7);
int r = _y0 + ra;
int g = _y0 - ga;
int b = _y0 + ba;
int r1 = _y1 + ra;
int g1 = _y1 - ga;
int b1 = _y1 + ba;
r = r < 0 ? 0 : (r > 255) ? 255 : r;
g = g < 0 ? 0 : (g > 255) ? 255 : g;
b = b < 0 ? 0 : (b > 255) ? 255 : b;
r1 = r1 < 0 ? 0 : (r1 > 255) ? 255 : r1;
g1 = g1 < 0 ? 0 : (g1 > 255) ? 255 : g1;
b1 = b1 < 0 ? 0 : (b1 > 255) ? 255 : b1;
*ptr_bgr1++ = b;
*ptr_bgr1++ = g;
*ptr_bgr1++ = r;
int r2 = _y0_1 + ra;
int g2 = _y0_1 - ga;
int b2 = _y0_1 + ba;
int r3 = _y1_1 + ra;
int g3 = _y1_1 - ga;
int b3 = _y1_1 + ba;
r2 = r2 < 0 ? 0 : (r2 > 255) ? 255 : r2;
g2 = g2 < 0 ? 0 : (g2 > 255) ? 255 : g2;
b2 = b2 < 0 ? 0 : (b2 > 255) ? 255 : b2;
r3 = r3 < 0 ? 0 : (r3 > 255) ? 255 : r3;
g3 = g3 < 0 ? 0 : (g3 > 255) ? 255 : g3;
b3 = b3 < 0 ? 0 : (b3 > 255) ? 255 : b3;
*ptr_bgr1++ = b1;
*ptr_bgr1++ = g1;
*ptr_bgr1++ = r1;
*ptr_bgr2++ = b2;
*ptr_bgr2++ = g2;
*ptr_bgr2++ = r2;
ptr_y1 += 2;
ptr_y2 += 2;
ptr_vu += 2;
*ptr_bgr2++ = b3;
*ptr_bgr2++ = g3;
*ptr_bgr2++ = r3;
}
}
delete[] zerobuf;
delete[] writebuf;
}
/*
nv21(yvu) to BGR: store hwc dsth * dstw = srch * (srcw)
*/
inline void nv21_to_bgr(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
int y_h = srch;
int wout = srcw * 3;
const uint8_t* y = src;
const uint8_t* vu = src + y_h * srcw;
int16x8_t bias = vdupq_n_s16(128);
int16x8_t ga = vdupq_n_s16(44);
int16x8_t ra = vdupq_n_s16(179);
int16x8_t ba = vdupq_n_s16(227);
int16x8_t gb = vdupq_n_s16(91);
int16x8_t zero = vdupq_n_s16(0);
int16x8_t max = vdupq_n_s16(255);
uint8_t* zerobuf = new uint8_t[srcw];
uint8_t* writebuf = new uint8_t[wout];
memset(zerobuf, 0, sizeof(uint8_t) * srcw);
int i = 0;
#pragma omp parallel for
for (i = 0; i < y_h; i += 2) {
const uint8_t* ptr_y1 = y + i * srcw;
const uint8_t* ptr_y2 = ptr_y1 + srcw;
const uint8_t* ptr_vu = vu + (i / 2) * srcw;
uint8_t* ptr_bgr1 = dst + i * wout;
uint8_t* ptr_bgr2 = ptr_bgr1 + wout;
if (i + 2 > y_h) {
ptr_y2 = zerobuf;
ptr_bgr2 = writebuf;
}
int j = 0;
#ifdef __aarch64__
asm volatile(
"prfm pldl1keep, [%[ptr_y1]] \n"
"prfm pldl1keep, [%[ptr_y1], #64] \n"
"prfm pldl1keep, [%[ptr_y2]] \n"
"prfm pldl1keep, [%[ptr_y2], #64] \n"
"prfm pldl1keep, [%[ptr_vu]] \n"
"prfm pldl1keep, [%[ptr_vu], #64] \n"
:
: [ptr_y1] "r"(ptr_y1), [ptr_y2] "r"(ptr_y2), [ptr_vu] "r"(ptr_vu)
: "memory");
#else
asm volatile(
"pld [%[ptr_y1]] @ preload a, 64byte\n"
"pld [%[ptr_y1], #128] @ preload a, 64byte\n"
"pld [%[ptr_y2]] @ preload a, 64byte\n"
"pld [%[ptr_y2], #128] @ preload a, 64byte\n"
"pld [%[ptr_vu]] @ preload a, 64byte\n"
"pld [%[ptr_vu], #128] @ preload a, 64byte\n"
:
: [ptr_y1] "r"(ptr_y1), [ptr_y2] "r"(ptr_y2), [ptr_vu] "r"(ptr_vu)
: "memory");
#endif
for (; j < srcw - 15; j += 16) {
uint8x8x2_t y1 = vld2_u8(ptr_y1); // d8 = y0y2y4y6...y14 d9 =
// y1y3y5...y15
uint8x8x2_t vu =
vld2_u8(ptr_vu); // d0 = v0v1v2v3v4v5...v7 d1 = u0u1u2...u7
uint8x8x2_t y2 = vld2_u8(ptr_y2);
uint16x8_t v = vmovl_u8(vu.val[0]);
uint16x8_t u = vmovl_u8(vu.val[1]);
int16x8_t v_s = vreinterpretq_s16_u16(v);
int16x8_t u_s = vreinterpretq_s16_u16(u);
int16x8_t v_bias = vsubq_s16(v_s, bias);
int16x8_t u_bias = vsubq_s16(u_s, bias);
// G = Y - 0.34414*(U-128) - 0.71414*(V-128);
int16x8_t g0 = vmulq_s16(ga, u_bias);
// R = Y + 1.402*(V-128);
int16x8_t r0 = vmulq_s16(ra, v_bias);
// B = Y + 1.772*(U-128);
int16x8_t b0 = vmulq_s16(ba, u_bias);
g0 = vmlaq_s16(g0, gb, v_bias);
int16x8_t y1_0_8 = vreinterpretq_s16_u16(vmovl_u8(y1.val[0]));
int16x8_t y1_1_8 = vreinterpretq_s16_u16(vmovl_u8(y1.val[1]));
int16x8_t y2_0_8 = vreinterpretq_s16_u16(vmovl_u8(y2.val[0]));
int16x8_t y2_1_8 = vreinterpretq_s16_u16(vmovl_u8(y2.val[1]));
int16x8_t r0_bias = vshrq_n_s16(r0, 7); // r0 / 128
int16x8_t b0_bias = vshrq_n_s16(b0, 7);
int16x8_t g0_bias = vshrq_n_s16(g0, 7);
int16x8_t r0_1 = vaddq_s16(y1_0_8, r0_bias);
int16x8_t b0_1 = vaddq_s16(y1_0_8, b0_bias);
int16x8_t g0_1 = vsubq_s16(y1_0_8, g0_bias); // g0_1 = y1_0_8 - g0_1
int16x8_t r0_2 = vaddq_s16(y1_1_8, r0_bias);
int16x8_t b0_2 = vaddq_s16(y1_1_8, b0_bias);
int16x8_t g0_2 = vsubq_s16(y1_1_8, g0_bias);
r0_1 = vmaxq_s16(r0_1, zero);
b0_1 = vmaxq_s16(b0_1, zero);
g0_1 = vmaxq_s16(g0_1, zero);
r0_2 = vmaxq_s16(r0_2, zero);
b0_2 = vmaxq_s16(b0_2, zero);
g0_2 = vmaxq_s16(g0_2, zero);
r0_1 = vminq_s16(r0_1, max);
b0_1 = vminq_s16(b0_1, max);
g0_1 = vminq_s16(g0_1, max);
r0_2 = vminq_s16(r0_2, max);
b0_2 = vminq_s16(b0_2, max);
g0_2 = vminq_s16(g0_2, max);
uint8x8_t r00 = vreinterpret_u8_s8(vmovn_s16(r0_1));
uint8x8_t b00 = vreinterpret_u8_s8(vmovn_s16(b0_1));
uint8x8_t g00 = vreinterpret_u8_s8(vmovn_s16(g0_1));
uint8x8_t r01 = vreinterpret_u8_s8(vmovn_s16(r0_2));
uint8x8_t b01 = vreinterpret_u8_s8(vmovn_s16(b0_2));
uint8x8_t g01 = vreinterpret_u8_s8(vmovn_s16(g0_2));
int16x8_t r1_1 = vaddq_s16(y2_0_8, r0_bias);
int16x8_t b1_1 = vaddq_s16(y2_0_8, b0_bias);
int16x8_t g1_1 = vsubq_s16(y2_0_8, g0_bias); // g0_1 = y1_0_8 - g0_1
int16x8_t r1_2 = vaddq_s16(y2_1_8, r0_bias);
int16x8_t b1_2 = vaddq_s16(y2_1_8, b0_bias);
int16x8_t g1_2 = vsubq_s16(y2_1_8, g0_bias);
uint8x8x2_t r00_0 = vtrn_u8(r00, r01); // 014589 236710
uint8x8x2_t b00_0 = vtrn_u8(b00, b01);
uint8x8x2_t g00_0 = vtrn_u8(g00, g01);
r1_1 = vmaxq_s16(r1_1, zero);
b1_1 = vmaxq_s16(b1_1, zero);
g1_1 = vmaxq_s16(g1_1, zero);
r1_2 = vmaxq_s16(r1_2, zero);
b1_2 = vmaxq_s16(b1_2, zero);
g1_2 = vmaxq_s16(g1_2, zero);
uint16x4_t r0_16 = vreinterpret_u16_u8(r00_0.val[0]);
uint16x4_t r1_16 = vreinterpret_u16_u8(r00_0.val[1]);
uint16x4_t b0_16 = vreinterpret_u16_u8(b00_0.val[0]);
uint16x4_t b1_16 = vreinterpret_u16_u8(b00_0.val[1]);
uint16x4_t g0_16 = vreinterpret_u16_u8(g00_0.val[0]);
uint16x4_t g1_16 = vreinterpret_u16_u8(g00_0.val[1]);
uint16x4x2_t r00_1 = vtrn_u16(r0_16, r1_16); // 012389 456710
uint16x4x2_t b00_1 = vtrn_u16(b0_16, b1_16);
uint16x4x2_t g00_1 = vtrn_u16(g0_16, g1_16);
r1_1 = vminq_s16(r1_1, max);
b1_1 = vminq_s16(b1_1, max);
g1_1 = vminq_s16(g1_1, max);
r1_2 = vminq_s16(r1_2, max);
b1_2 = vminq_s16(b1_2, max);
g1_2 = vminq_s16(g1_2, max);
uint32x2_t r0_32 = vreinterpret_u32_u16(r00_1.val[0]);
uint32x2_t r1_32 = vreinterpret_u32_u16(r00_1.val[1]);
uint32x2_t b0_32 = vreinterpret_u32_u16(b00_1.val[0]);
uint32x2_t b1_32 = vreinterpret_u32_u16(b00_1.val[1]);
uint32x2_t g0_32 = vreinterpret_u32_u16(g00_1.val[0]);
uint32x2_t g1_32 = vreinterpret_u32_u16(g00_1.val[1]);
uint32x2x2_t r00_2 = vtrn_u32(r0_32, r1_32); // 01234567 8910
uint32x2x2_t b00_2 = vtrn_u32(b0_32, b1_32);
uint32x2x2_t g00_2 = vtrn_u32(g0_32, g1_32);
r00 = vreinterpret_u8_s8(vmovn_s16(r1_1));
b00 = vreinterpret_u8_s8(vmovn_s16(b1_1));
g00 = vreinterpret_u8_s8(vmovn_s16(g1_1));
r01 = vreinterpret_u8_s8(vmovn_s16(r1_2));
b01 = vreinterpret_u8_s8(vmovn_s16(b1_2));
g01 = vreinterpret_u8_s8(vmovn_s16(g1_2));
uint8x8_t r0_8 = vreinterpret_u8_u32(r00_2.val[0]);
uint8x8_t b0_8 = vreinterpret_u8_u32(b00_2.val[0]);
uint8x8_t g0_8 = vreinterpret_u8_u32(g00_2.val[0]);
uint8x8_t r1_8 = vreinterpret_u8_u32(r00_2.val[1]);
uint8x8_t b1_8 = vreinterpret_u8_u32(b00_2.val[1]);
uint8x8_t g1_8 = vreinterpret_u8_u32(g00_2.val[1]);
uint8x8x3_t v_bgr;
v_bgr.val[0] = b0_8;
v_bgr.val[1] = g0_8;
v_bgr.val[2] = r0_8;
r00_0 = vtrn_u8(r00, r01); // 014589 236710
b00_0 = vtrn_u8(b00, b01);
g00_0 = vtrn_u8(g00, g01);
vst3_u8(ptr_bgr1, v_bgr);
r0_16 = vreinterpret_u16_u8(r00_0.val[0]);
r1_16 = vreinterpret_u16_u8(r00_0.val[1]);
b0_16 = vreinterpret_u16_u8(b00_0.val[0]);
b1_16 = vreinterpret_u16_u8(b00_0.val[1]);
g0_16 = vreinterpret_u16_u8(g00_0.val[0]);
g1_16 = vreinterpret_u16_u8(g00_0.val[1]);
ptr_bgr1 += 24;
uint8x8x3_t v_bgr1;
v_bgr1.val[0] = b1_8;
v_bgr1.val[1] = g1_8;
v_bgr1.val[2] = r1_8;
r00_1 = vtrn_u16(r0_16, r1_16); // 012389 456710
b00_1 = vtrn_u16(b0_16, b1_16);
g00_1 = vtrn_u16(g0_16, g1_16);
vst3_u8(ptr_bgr1, v_bgr1);
r0_32 = vreinterpret_u32_u16(r00_1.val[0]);
r1_32 = vreinterpret_u32_u16(r00_1.val[1]);
b0_32 = vreinterpret_u32_u16(b00_1.val[0]);
b1_32 = vreinterpret_u32_u16(b00_1.val[1]);
g0_32 = vreinterpret_u32_u16(g00_1.val[0]);
g1_32 = vreinterpret_u32_u16(g00_1.val[1]);
ptr_bgr1 += 24;
r00_2 = vtrn_u32(r0_32, r1_32); // 01234567 8910
b00_2 = vtrn_u32(b0_32, b1_32);
g00_2 = vtrn_u32(g0_32, g1_32);
ptr_vu += 16;
ptr_y1 += 16;
ptr_y2 += 16;
r0_8 = vreinterpret_u8_u32(r00_2.val[0]);
b0_8 = vreinterpret_u8_u32(b00_2.val[0]);
g0_8 = vreinterpret_u8_u32(g00_2.val[0]);
r1_8 = vreinterpret_u8_u32(r00_2.val[1]);
b1_8 = vreinterpret_u8_u32(b00_2.val[1]);
g1_8 = vreinterpret_u8_u32(g00_2.val[1]);
v_bgr.val[0] = b0_8;
v_bgr.val[1] = g0_8;
v_bgr.val[2] = r0_8;
v_bgr1.val[0] = b1_8;
v_bgr1.val[1] = g1_8;
v_bgr1.val[2] = r1_8;
vst3_u8(ptr_bgr2, v_bgr);
vst3_u8(ptr_bgr2 + 24, v_bgr1);
ptr_bgr2 += 48;
}
// two data
for (; j < srcw; j += 2) {
uint8_t _y0 = ptr_y1[0];
uint8_t _y1 = ptr_y1[1];
uint8_t _v = ptr_vu[0];
uint8_t _u = ptr_vu[1];
uint8_t _y0_1 = ptr_y2[0];
uint8_t _y1_1 = ptr_y2[1];
int ra = floor((179 * (_v - 128)) >> 7);
int ga = floor((44 * (_u - 128) + 91 * (_v - 128)) >> 7);
int ba = floor((227 * (_u - 128)) >> 7);
int r = _y0 + ra;
int g = _y0 - ga;
int b = _y0 + ba;
int r1 = _y1 + ra;
int g1 = _y1 - ga;
int b1 = _y1 + ba;
r = r < 0 ? 0 : (r > 255) ? 255 : r;
g = g < 0 ? 0 : (g > 255) ? 255 : g;
b = b < 0 ? 0 : (b > 255) ? 255 : b;
r1 = r1 < 0 ? 0 : (r1 > 255) ? 255 : r1;
g1 = g1 < 0 ? 0 : (g1 > 255) ? 255 : g1;
b1 = b1 < 0 ? 0 : (b1 > 255) ? 255 : b1;
*ptr_bgr1++ = b;
*ptr_bgr1++ = g;
*ptr_bgr1++ = r;
int r2 = _y0_1 + ra;
int g2 = _y0_1 - ga;
int b2 = _y0_1 + ba;
int r3 = _y1_1 + ra;
int g3 = _y1_1 - ga;
int b3 = _y1_1 + ba;
r2 = r2 < 0 ? 0 : (r2 > 255) ? 255 : r2;
g2 = g2 < 0 ? 0 : (g2 > 255) ? 255 : g2;
b2 = b2 < 0 ? 0 : (b2 > 255) ? 255 : b2;
r3 = r3 < 0 ? 0 : (r3 > 255) ? 255 : r3;
g3 = g3 < 0 ? 0 : (g3 > 255) ? 255 : g3;
b3 = b3 < 0 ? 0 : (b3 > 255) ? 255 : b3;
*ptr_bgr1++ = b1;
*ptr_bgr1++ = g1;
*ptr_bgr1++ = r1;
*ptr_bgr2++ = b2;
*ptr_bgr2++ = g2;
*ptr_bgr2++ = r2;
ptr_y1 += 2;
ptr_y2 += 2;
ptr_vu += 2;
*ptr_bgr2++ = b3;
*ptr_bgr2++ = g3;
*ptr_bgr2++ = r3;
}
}
delete[] zerobuf;
delete[] writebuf;
}
// nv12(yuv) to BGRA: store hwc dsth * dstw = srch * (srcw) y_w = srcw, y_h =
// srch uv_w = srcw uv_h = 1/2 * srch
inline void nv12_to_bgra(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
int y_h = srch;
  int vu_h = srch / 2;  // note: 1 / 2 * srch is integer division and yields 0
const uint8_t* y = src;
const uint8_t* vu = src + y_h * srcw;
int wout = srcw * 4;
uint8_t* zerobuf = new uint8_t[srcw];
uint8_t* writebuf = new uint8_t[wout];
memset(zerobuf, 0, sizeof(uint8_t) * srcw);
int16x8_t bias = vdupq_n_s16(128);
int16x8_t ga = vdupq_n_s16(44);
int16x8_t ra = vdupq_n_s16(179);
int16x8_t ba = vdupq_n_s16(227);
int16x8_t gb = vdupq_n_s16(91);
int16x8_t zero = vdupq_n_s16(0);
int16x8_t max = vdupq_n_s16(255);
uint8x8_t a_8 = vdup_n_u8(255);
#pragma omp parallel for
for (int i = 0; i < y_h; i += 2) {
const uint8_t* ptr_y1 = y + i * srcw;
const uint8_t* ptr_y2 = ptr_y1 + srcw;
const uint8_t* ptr_vu = vu + (i / 2) * srcw;
uint8_t* ptr_bgr1 = dst + i * wout;
uint8_t* ptr_bgr2 = ptr_bgr1 + wout;
if (i + 2 > y_h) {
ptr_y2 = zerobuf;
ptr_bgr2 = writebuf;
}
int j = 0;
#ifdef __aarch64__
asm volatile(
"prfm pldl1keep, [%[ptr_y1]] \n"
"prfm pldl1keep, [%[ptr_y1], #64] \n"
"prfm pldl1keep, [%[ptr_y2]] \n"
"prfm pldl1keep, [%[ptr_y2], #64] \n"
"prfm pldl1keep, [%[ptr_vu]] \n"
"prfm pldl1keep, [%[ptr_vu], #64] \n"
:
: [ptr_y1] "r"(ptr_y1), [ptr_y2] "r"(ptr_y2), [ptr_vu] "r"(ptr_vu)
: "memory");
#else
asm volatile(
"pld [%[ptr_y1]] @ preload a, 64byte\n"
"pld [%[ptr_y1], #128] @ preload a, 64byte\n"
"pld [%[ptr_y2]] @ preload a, 64byte\n"
"pld [%[ptr_y2], #128] @ preload a, 64byte\n"
"pld [%[ptr_vu]] @ preload a, 64byte\n"
"pld [%[ptr_vu], #128] @ preload a, 64byte\n"
:
: [ptr_y1] "r"(ptr_y1), [ptr_y2] "r"(ptr_y2), [ptr_vu] "r"(ptr_vu)
: "memory");
#endif
for (; j < srcw - 15; j += 16) { for (; j < srcw - 15; j += 16) {
uint8x8x2_t y1 = vld2_u8(ptr_y1); // d8 = y0y2y4y6...y14 d9 = uint8x8x2_t y1 = vld2_u8(ptr_y1); // d8 = y0y2y4y6...y14 d9 =
// y1y3y5...y15 // y1y3y5...y15
...@@ -189,8 +873,8 @@ void nv_to_bgr(const uint8_t* src, ...@@ -189,8 +873,8 @@ void nv_to_bgr(const uint8_t* src,
uint8x8x2_t y2 = vld2_u8(ptr_y2); uint8x8x2_t y2 = vld2_u8(ptr_y2);
uint16x8_t v = vmovl_u8(vu.val[x_num]); uint16x8_t v = vmovl_u8(vu.val[1]);
uint16x8_t u = vmovl_u8(vu.val[y_num]); uint16x8_t u = vmovl_u8(vu.val[0]);
int16x8_t v_s = vreinterpretq_s16_u16(v); int16x8_t v_s = vreinterpretq_s16_u16(v);
int16x8_t u_s = vreinterpretq_s16_u16(u); int16x8_t u_s = vreinterpretq_s16_u16(u);
int16x8_t v_bias = vsubq_s16(v_s, bias); int16x8_t v_bias = vsubq_s16(v_s, bias);
...@@ -317,16 +1001,17 @@ void nv_to_bgr(const uint8_t* src, ...@@ -317,16 +1001,17 @@ void nv_to_bgr(const uint8_t* src,
uint8x8_t b1_8 = vreinterpret_u8_u32(b00_2.val[1]); uint8x8_t b1_8 = vreinterpret_u8_u32(b00_2.val[1]);
uint8x8_t g1_8 = vreinterpret_u8_u32(g00_2.val[1]); uint8x8_t g1_8 = vreinterpret_u8_u32(g00_2.val[1]);
uint8x8x3_t v_bgr; uint8x8x4_t v_bgr;
v_bgr.val[0] = b0_8; v_bgr.val[0] = b0_8;
v_bgr.val[1] = g0_8; v_bgr.val[1] = g0_8;
v_bgr.val[2] = r0_8; v_bgr.val[2] = r0_8;
v_bgr.val[3] = a_8;
r00_0 = vtrn_u8(r00, r01); // 014589 236710 r00_0 = vtrn_u8(r00, r01); // 014589 236710
b00_0 = vtrn_u8(b00, b01); b00_0 = vtrn_u8(b00, b01);
g00_0 = vtrn_u8(g00, g01); g00_0 = vtrn_u8(g00, g01);
vst3_u8(ptr_bgr1, v_bgr); vst4_u8(ptr_bgr1, v_bgr);
r0_16 = vreinterpret_u16_u8(r00_0.val[0]); r0_16 = vreinterpret_u16_u8(r00_0.val[0]);
r1_16 = vreinterpret_u16_u8(r00_0.val[1]); r1_16 = vreinterpret_u16_u8(r00_0.val[1]);
...@@ -337,17 +1022,20 @@ void nv_to_bgr(const uint8_t* src, ...@@ -337,17 +1022,20 @@ void nv_to_bgr(const uint8_t* src,
g0_16 = vreinterpret_u16_u8(g00_0.val[0]); g0_16 = vreinterpret_u16_u8(g00_0.val[0]);
g1_16 = vreinterpret_u16_u8(g00_0.val[1]); g1_16 = vreinterpret_u16_u8(g00_0.val[1]);
ptr_bgr1 += 24; ptr_bgr1 += 32;
uint8x8x3_t v_bgr1; // uint8x8x3_t v_bgr1;
uint8x8x4_t v_bgr1;
v_bgr1.val[0] = b1_8; v_bgr1.val[0] = b1_8;
v_bgr1.val[1] = g1_8; v_bgr1.val[1] = g1_8;
v_bgr1.val[2] = r1_8; v_bgr1.val[2] = r1_8;
v_bgr1.val[3] = a_8;
r00_1 = vtrn_u16(r0_16, r1_16); // 012389 456710 r00_1 = vtrn_u16(r0_16, r1_16); // 012389 456710
b00_1 = vtrn_u16(b0_16, b1_16); b00_1 = vtrn_u16(b0_16, b1_16);
g00_1 = vtrn_u16(g0_16, g1_16); g00_1 = vtrn_u16(g0_16, g1_16);
vst3_u8(ptr_bgr1, v_bgr1); // vst3_u8(ptr_bgr1, v_bgr1);
vst4_u8(ptr_bgr1, v_bgr1);
r0_32 = vreinterpret_u32_u16(r00_1.val[0]); r0_32 = vreinterpret_u32_u16(r00_1.val[0]);
r1_32 = vreinterpret_u32_u16(r00_1.val[1]); r1_32 = vreinterpret_u32_u16(r00_1.val[1]);
...@@ -358,7 +1046,8 @@ void nv_to_bgr(const uint8_t* src, ...@@ -358,7 +1046,8 @@ void nv_to_bgr(const uint8_t* src,
g0_32 = vreinterpret_u32_u16(g00_1.val[0]); g0_32 = vreinterpret_u32_u16(g00_1.val[0]);
g1_32 = vreinterpret_u32_u16(g00_1.val[1]); g1_32 = vreinterpret_u32_u16(g00_1.val[1]);
ptr_bgr1 += 24; // ptr_bgr1 += 24;
ptr_bgr1 += 32;
r00_2 = vtrn_u32(r0_32, r1_32); // 01234567 8910 r00_2 = vtrn_u32(r0_32, r1_32); // 01234567 8910
b00_2 = vtrn_u32(b0_32, b1_32); b00_2 = vtrn_u32(b0_32, b1_32);
...@@ -384,17 +1073,17 @@ void nv_to_bgr(const uint8_t* src, ...@@ -384,17 +1073,17 @@ void nv_to_bgr(const uint8_t* src,
v_bgr1.val[1] = g1_8; v_bgr1.val[1] = g1_8;
v_bgr1.val[2] = r1_8; v_bgr1.val[2] = r1_8;
vst3_u8(ptr_bgr2, v_bgr); vst4_u8(ptr_bgr2, v_bgr);
vst3_u8(ptr_bgr2 + 24, v_bgr1); vst4_u8(ptr_bgr2 + 32, v_bgr1);
ptr_bgr2 += 48; ptr_bgr2 += 64;
} }
// two data // two data
for (; j < srcw; j += 2) { for (; j < srcw; j += 2) {
uint8_t _y0 = ptr_y1[0]; uint8_t _y0 = ptr_y1[0];
uint8_t _y1 = ptr_y1[1]; uint8_t _y1 = ptr_y1[1];
uint8_t _v = ptr_vu[x_num]; uint8_t _v = ptr_vu[1];
uint8_t _u = ptr_vu[y_num]; uint8_t _u = ptr_vu[0];
uint8_t _y0_1 = ptr_y2[0]; uint8_t _y0_1 = ptr_y2[0];
uint8_t _y1_1 = ptr_y2[1]; uint8_t _y1_1 = ptr_y2[1];
...@@ -421,6 +1110,7 @@ void nv_to_bgr(const uint8_t* src, ...@@ -421,6 +1110,7 @@ void nv_to_bgr(const uint8_t* src,
*ptr_bgr1++ = b; *ptr_bgr1++ = b;
*ptr_bgr1++ = g; *ptr_bgr1++ = g;
*ptr_bgr1++ = r; *ptr_bgr1++ = r;
*ptr_bgr1++ = 255;
int r2 = _y0_1 + ra; int r2 = _y0_1 + ra;
int g2 = _y0_1 - ga; int g2 = _y0_1 - ga;
...@@ -441,10 +1131,12 @@ void nv_to_bgr(const uint8_t* src, ...@@ -441,10 +1131,12 @@ void nv_to_bgr(const uint8_t* src,
*ptr_bgr1++ = b1; *ptr_bgr1++ = b1;
*ptr_bgr1++ = g1; *ptr_bgr1++ = g1;
*ptr_bgr1++ = r1; *ptr_bgr1++ = r1;
*ptr_bgr1++ = 255;
*ptr_bgr2++ = b2; *ptr_bgr2++ = b2;
*ptr_bgr2++ = g2; *ptr_bgr2++ = g2;
*ptr_bgr2++ = r2; *ptr_bgr2++ = r2;
*ptr_bgr2++ = 255;
ptr_y1 += 2; ptr_y1 += 2;
ptr_y2 += 2; ptr_y2 += 2;
...@@ -453,20 +1145,16 @@ void nv_to_bgr(const uint8_t* src, ...@@ -453,20 +1145,16 @@ void nv_to_bgr(const uint8_t* src,
*ptr_bgr2++ = b3; *ptr_bgr2++ = b3;
*ptr_bgr2++ = g3; *ptr_bgr2++ = g3;
*ptr_bgr2++ = r3; *ptr_bgr2++ = r3;
*ptr_bgr2++ = 255;
} }
} }
delete[] zerobuf; delete[] zerobuf;
delete[] writebuf; delete[] writebuf;
} }
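For the scalar tail of the NV-to-BGR path above, each output pixel is the luma value plus fixed-point offsets derived from (V-128) and (U-128), clamped to [0, 255] before being stored as B, G, R. The sketch below only illustrates that idea: the exact coefficients behind `ra`, `ga`, `ba` are not visible in this hunk, so the 179/44/91/227 values here are an assumed BT.601-style approximation, not the repository's constants.

```cpp
// Hedged sketch of a scalar YUV pixel -> BGR conversion (BT.601-style integer
// approximation). Coefficient values are illustrative assumptions.
#include <algorithm>
#include <cstdint>

inline uint8_t clamp_u8(int v) { return static_cast<uint8_t>(std::min(std::max(v, 0), 255)); }

inline void yuv_pixel_to_bgr(uint8_t y, uint8_t v, uint8_t u, uint8_t* bgr) {
  int ra = (179 * (v - 128)) >> 7;                  // ~1.402 * (V-128)
  int ga = (44 * (u - 128) + 91 * (v - 128)) >> 7;  // ~0.344*(U-128) + 0.714*(V-128)
  int ba = (227 * (u - 128)) >> 7;                  // ~1.772 * (U-128)
  bgr[0] = clamp_u8(y + ba);  // B
  bgr[1] = clamp_u8(y - ga);  // G
  bgr[2] = clamp_u8(y + ra);  // R
}
```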
// nv12bgra, nv21tobgra
void nv_to_bgra(const uint8_t* src, // nv21(yvu) to BGRA:store hwc dsth * dstw = srch * srcw y_w = srcw, y_h = srch
uint8_t* dst, // uv_w = srcw uv_h = 1/2 * srch
int srcw, inline void nv21_to_bgra(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
int srch,
int x_num,
int y_num) {
// nv21 x = 0, y = 1
// nv12 x = 1, y = 0
int y_h = srch; int y_h = srch;
int vu_h = 1 / 2 * srch; int vu_h = 1 / 2 * srch;
const uint8_t* y = src; const uint8_t* y = src;
...@@ -497,6 +1185,29 @@ void nv_to_bgra(const uint8_t* src, ...@@ -497,6 +1185,29 @@ void nv_to_bgra(const uint8_t* src,
ptr_bgr2 = writebuf; ptr_bgr2 = writebuf;
} }
int j = 0; int j = 0;
#ifdef __aarch64__
asm volatile(
"prfm pldl1keep, [%[ptr_y1]] \n"
"prfm pldl1keep, [%[ptr_y1], #64] \n"
"prfm pldl1keep, [%[ptr_y2]] \n"
"prfm pldl1keep, [%[ptr_y2], #64] \n"
"prfm pldl1keep, [%[ptr_vu]] \n"
"prfm pldl1keep, [%[ptr_vu], #64] \n"
:
: [ptr_y1] "r"(ptr_y1), [ptr_y2] "r"(ptr_y2), [ptr_vu] "r"(ptr_vu)
: "memory");
#else
asm volatile(
"pld [%[ptr_y1]] @ preload a, 64byte\n"
"pld [%[ptr_y1], #128] @ preload a, 64byte\n"
"pld [%[ptr_y2]] @ preload a, 64byte\n"
"pld [%[ptr_y2], #128] @ preload a, 64byte\n"
"pld [%[ptr_vu]] @ preload a, 64byte\n"
"pld [%[ptr_vu], #128] @ preload a, 64byte\n"
:
: [ptr_y1] "r"(ptr_y1), [ptr_y2] "r"(ptr_y2), [ptr_vu] "r"(ptr_vu)
: "memory");
#endif
for (; j < srcw - 15; j += 16) { for (; j < srcw - 15; j += 16) {
uint8x8x2_t y1 = vld2_u8(ptr_y1); // d8 = y0y2y4y6...y14 d9 = uint8x8x2_t y1 = vld2_u8(ptr_y1); // d8 = y0y2y4y6...y14 d9 =
// y1y3y5...y15 // y1y3y5...y15
...@@ -505,8 +1216,8 @@ void nv_to_bgra(const uint8_t* src, ...@@ -505,8 +1216,8 @@ void nv_to_bgra(const uint8_t* src,
uint8x8x2_t y2 = vld2_u8(ptr_y2); uint8x8x2_t y2 = vld2_u8(ptr_y2);
uint16x8_t v = vmovl_u8(vu.val[x_num]); uint16x8_t v = vmovl_u8(vu.val[0]);
uint16x8_t u = vmovl_u8(vu.val[y_num]); uint16x8_t u = vmovl_u8(vu.val[1]);
int16x8_t v_s = vreinterpretq_s16_u16(v); int16x8_t v_s = vreinterpretq_s16_u16(v);
int16x8_t u_s = vreinterpretq_s16_u16(u); int16x8_t u_s = vreinterpretq_s16_u16(u);
int16x8_t v_bias = vsubq_s16(v_s, bias); int16x8_t v_bias = vsubq_s16(v_s, bias);
...@@ -643,10 +1354,6 @@ void nv_to_bgra(const uint8_t* src, ...@@ -643,10 +1354,6 @@ void nv_to_bgra(const uint8_t* src,
b00_0 = vtrn_u8(b00, b01); b00_0 = vtrn_u8(b00, b01);
g00_0 = vtrn_u8(g00, g01); g00_0 = vtrn_u8(g00, g01);
// ptr_bgr3 += 8;
// ptr_bgr1 += 8;
// ptr_bgr2 += 8;
// vst3_u8(ptr_bgr1, v_bgr);
vst4_u8(ptr_bgr1, v_bgr); vst4_u8(ptr_bgr1, v_bgr);
r0_16 = vreinterpret_u16_u8(r00_0.val[0]); r0_16 = vreinterpret_u16_u8(r00_0.val[0]);
...@@ -709,8 +1416,6 @@ void nv_to_bgra(const uint8_t* src, ...@@ -709,8 +1416,6 @@ void nv_to_bgra(const uint8_t* src,
v_bgr1.val[1] = g1_8; v_bgr1.val[1] = g1_8;
v_bgr1.val[2] = r1_8; v_bgr1.val[2] = r1_8;
// vst3_u8(ptr_bgr2, v_bgr);
// vst3_u8(ptr_bgr2 + 24, v_bgr1);
vst4_u8(ptr_bgr2, v_bgr); vst4_u8(ptr_bgr2, v_bgr);
vst4_u8(ptr_bgr2 + 32, v_bgr1); vst4_u8(ptr_bgr2 + 32, v_bgr1);
...@@ -720,8 +1425,8 @@ void nv_to_bgra(const uint8_t* src, ...@@ -720,8 +1425,8 @@ void nv_to_bgra(const uint8_t* src,
for (; j < srcw; j += 2) { for (; j < srcw; j += 2) {
uint8_t _y0 = ptr_y1[0]; uint8_t _y0 = ptr_y1[0];
uint8_t _y1 = ptr_y1[1]; uint8_t _y1 = ptr_y1[1];
uint8_t _v = ptr_vu[x_num]; uint8_t _v = ptr_vu[0];
uint8_t _u = ptr_vu[y_num]; uint8_t _u = ptr_vu[1];
uint8_t _y0_1 = ptr_y2[0]; uint8_t _y0_1 = ptr_y2[0];
uint8_t _y1_1 = ptr_y2[1]; uint8_t _y1_1 = ptr_y2[1];
...@@ -745,9 +1450,6 @@ void nv_to_bgra(const uint8_t* src, ...@@ -745,9 +1450,6 @@ void nv_to_bgra(const uint8_t* src,
g1 = g1 < 0 ? 0 : (g1 > 255) ? 255 : g1; g1 = g1 < 0 ? 0 : (g1 > 255) ? 255 : g1;
b1 = b1 < 0 ? 0 : (b1 > 255) ? 255 : b1; b1 = b1 < 0 ? 0 : (b1 > 255) ? 255 : b1;
// *ptr_bgr1++ = b;
// *ptr_bgr2++ = g;
// *ptr_bgr3++ = r;
*ptr_bgr1++ = b; *ptr_bgr1++ = b;
*ptr_bgr1++ = g; *ptr_bgr1++ = g;
*ptr_bgr1++ = r; *ptr_bgr1++ = r;
...@@ -792,26 +1494,7 @@ void nv_to_bgra(const uint8_t* src, ...@@ -792,26 +1494,7 @@ void nv_to_bgra(const uint8_t* src,
delete[] zerobuf; delete[] zerobuf;
delete[] writebuf; delete[] writebuf;
} }
void nv21_to_bgr(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
nv_to_bgr(src, dst, srcw, srch, 0, 1);
}
// nv12(yuv) to BGR:store hwc dsth * dstw = srch * srcw y_w = srcw, y_h = srch
// uv_w = srcw uv_h = 1/2 * srch
void nv12_to_bgr(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
// exchange vu forward
nv_to_bgr(src, dst, srcw, srch, 1, 0);
}
// nv21(yvu) to BGRA: store hwc dsth * dstw = srch * (srcw) y_w = srcw, y_h =
// srch uv_w = srcw uv_h = 1/2 * srch
void nv21_to_bgra(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
nv_to_bgra(src, dst, srcw, srch, 0, 1);
}
// nv12(yuv) to BGRA:store hwc dsth * dstw = srch * srcw y_w = srcw, y_h = srch
// uv_w = srcw uv_h = 1/2 * srch
void nv12_to_bgra(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
nv_to_bgra(src, dst, srcw, srch, 1, 0);
}
/* /*
Using CV_BGR2GRAY, the conversion formula is Gray = 0.1140*B + 0.5870*G + 0.2989*R Using CV_BGR2GRAY, the conversion formula is Gray = 0.1140*B + 0.5870*G + 0.2989*R
Using CV_RGB2GRAY, the conversion formula is Gray = 0.1140*R + 0.5870*G + 0.2989*B Using CV_RGB2GRAY, the conversion formula is Gray = 0.1140*R + 0.5870*G + 0.2989*B
...@@ -847,7 +1530,6 @@ void hwc3_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) { ...@@ -847,7 +1530,6 @@ void hwc3_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
uint8_t* outr1 = outr0 + srcw; uint8_t* outr1 = outr0 + srcw;
uint8_t* outr2 = outr1 + srcw; uint8_t* outr2 = outr1 + srcw;
uint8_t* outr3 = outr2 + srcw; uint8_t* outr3 = outr2 + srcw;
int cnt = cnt_pro; int cnt = cnt_pro;
if (cnt > 0) { if (cnt > 0) {
#ifdef __aarch64__ #ifdef __aarch64__
......
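The `hwc3_to_hwc1` hunk above notes the conversion Gray = 0.1140*B + 0.5870*G + 0.2989*R (channel order swapped for RGB input). Integer implementations usually realise this with small fixed-point weights whose sum is a power of two; the snippet below is a minimal scalar sketch of that idea with 8-bit weights (29 + 150 + 77 = 256), not the NEON implementation in the diff.

```cpp
// Hedged scalar sketch of BGR -> GRAY using 8-bit fixed-point weights.
// Weights approximate 0.114 / 0.587 / 0.299; names are illustrative.
#include <cstdint>

void bgr_to_gray(const uint8_t* src, uint8_t* dst, int w, int h) {
  for (int i = 0; i < w * h; ++i) {
    int b = src[3 * i + 0];
    int g = src[3 * i + 1];
    int r = src[3 * i + 2];
    dst[i] = static_cast<uint8_t>((29 * b + 150 * g + 77 * r + 128) >> 8);
  }
}
```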
...@@ -153,7 +153,10 @@ void flip_hwc1_x(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { ...@@ -153,7 +153,10 @@ void flip_hwc1_x(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
// 26, 27}" // 26, 27}"
"ld1 {v3.8b}, [%[inptr3]], #8 \n" // v0={30,31,32, 33, 34, 35, "ld1 {v3.8b}, [%[inptr3]], #8 \n" // v0={30,31,32, 33, 34, 35,
// 36, 37}" // 36, 37}"
"prfm pldl1keep, [%[inptr0]] \n"
"prfm pldl1keep, [%[inptr1]] \n"
"prfm pldl1keep, [%[inptr2]] \n"
"prfm pldl1keep, [%[inptr3]] \n"
"st1 {v0.8b}, [%[outptr0]], #8 \n" // 00 10 20 30 04 14 "st1 {v0.8b}, [%[outptr0]], #8 \n" // 00 10 20 30 04 14
// 24 34 // 24 34
"st1 {v1.8b}, [%[outptr1]], #8 \n" // 02 12 22 32 "st1 {v1.8b}, [%[outptr1]], #8 \n" // 02 12 22 32
...@@ -180,6 +183,10 @@ void flip_hwc1_x(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { ...@@ -180,6 +183,10 @@ void flip_hwc1_x(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
"26 27\n" "26 27\n"
"vld1.8 {d12}, [%[inptr3]]! @ zip load r1, d6 = 30 31 32 33 34 35 " "vld1.8 {d12}, [%[inptr3]]! @ zip load r1, d6 = 30 31 32 33 34 35 "
"36 37\n" "36 37\n"
"pld [%[inptr0]] @ preload a, 64byte\n"
"pld [%[inptr1]] @ preload a, 64byte\n"
"pld [%[inptr2]] @ preload a, 64byte\n"
"pld [%[inptr3]] @ preload a, 64byte\n"
"vst1.32 {d0}, [%[outptr0]]! @ write d0(q0,low),r00,r10 20 30\n" "vst1.32 {d0}, [%[outptr0]]! @ write d0(q0,low),r00,r10 20 30\n"
"vst1.32 {d4}, [%[outptr1]]! @ write d4(q0,low),r01,r11 21 31\n" "vst1.32 {d4}, [%[outptr1]]! @ write d4(q0,low),r01,r11 21 31\n"
...@@ -286,7 +293,10 @@ void flip_hwc1_y(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { ...@@ -286,7 +293,10 @@ void flip_hwc1_y(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
// 01 00 // 01 00
"rev64 v7.8b, v3.8b \n" //@ reverse 07 06 05 04 03 02 "rev64 v7.8b, v3.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00 // 01 00
"prfm pldl1keep, [%[inptr0]] \n"
"prfm pldl1keep, [%[inptr1]] \n"
"prfm pldl1keep, [%[inptr2]] \n"
"prfm pldl1keep, [%[inptr3]] \n"
"st1 {v4.8b}, [%[outptr0]] \n" // 00 10 20 30 04 14 24 34 "st1 {v4.8b}, [%[outptr0]] \n" // 00 10 20 30 04 14 24 34
"st1 {v5.8b}, [%[outptr1]] \n" // 02 12 22 32 "st1 {v5.8b}, [%[outptr1]] \n" // 02 12 22 32
"st1 {v6.8b}, [%[outptr2]] \n" // 01 11 21 31 "st1 {v6.8b}, [%[outptr2]] \n" // 01 11 21 31
...@@ -324,7 +334,10 @@ void flip_hwc1_y(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { ...@@ -324,7 +334,10 @@ void flip_hwc1_y(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
"vrev64.8 d9, d8 @ reverse 07 06 05 04 03 02 01 00 \n" "vrev64.8 d9, d8 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d13, d12 @ reverse 07 06 05 04 03 02 01 00 " "vrev64.8 d13, d12 @ reverse 07 06 05 04 03 02 01 00 "
"\n" "\n"
"pld [%[inptr0]] @ preload a, 64byte\n"
"pld [%[inptr1]] @ preload a, 64byte\n"
"pld [%[inptr2]] @ preload a, 64byte\n"
"pld [%[inptr3]] @ preload a, 64byte\n"
"vst1.32 {d1}, [%[outptr0]] @ write d0(q0,low),r00,r10 20 30\n" "vst1.32 {d1}, [%[outptr0]] @ write d0(q0,low),r00,r10 20 30\n"
"vst1.32 {d5}, [%[outptr1]] @ write d4(q0,low),r01,r11 21 31\n" "vst1.32 {d5}, [%[outptr1]] @ write d4(q0,low),r01,r11 21 31\n"
"vst1.32 {d9}, [%[outptr2]] @ write d4(q0,low),r01,r11 21 31\n" "vst1.32 {d9}, [%[outptr2]] @ write d4(q0,low),r01,r11 21 31\n"
...@@ -440,7 +453,10 @@ void flip_hwc1_xy(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { ...@@ -440,7 +453,10 @@ void flip_hwc1_xy(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
// 01 00 // 01 00
"rev64 v7.8b, v3.8b \n" //@ reverse 07 06 05 04 03 02 "rev64 v7.8b, v3.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00 // 01 00
"prfm pldl1keep, [%[inptr0]] \n"
"prfm pldl1keep, [%[inptr1]] \n"
"prfm pldl1keep, [%[inptr2]] \n"
"prfm pldl1keep, [%[inptr3]] \n"
"st1 {v4.8b}, [%[outptr0]] \n" // 00 10 20 30 04 14 24 34 "st1 {v4.8b}, [%[outptr0]] \n" // 00 10 20 30 04 14 24 34
"st1 {v5.8b}, [%[outptr1]] \n" // 02 12 22 32 "st1 {v5.8b}, [%[outptr1]] \n" // 02 12 22 32
"st1 {v6.8b}, [%[outptr2]] \n" // 01 11 21 31 "st1 {v6.8b}, [%[outptr2]] \n" // 01 11 21 31
...@@ -478,7 +494,10 @@ void flip_hwc1_xy(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { ...@@ -478,7 +494,10 @@ void flip_hwc1_xy(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
"vrev64.8 d9, d8 @ reverse 07 06 05 04 03 02 01 00 \n" "vrev64.8 d9, d8 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d13, d12 @ reverse 07 06 05 04 03 02 01 00 " "vrev64.8 d13, d12 @ reverse 07 06 05 04 03 02 01 00 "
"\n" "\n"
"pld [%[inptr0]] @ preload a, 64byte\n"
"pld [%[inptr1]] @ preload a, 64byte\n"
"pld [%[inptr2]] @ preload a, 64byte\n"
"pld [%[inptr3]] @ preload a, 64byte\n"
"vst1.32 {d1}, [%[outptr0]] @ write d0(q0,low),r00,r10 20 30\n" "vst1.32 {d1}, [%[outptr0]] @ write d0(q0,low),r00,r10 20 30\n"
"vst1.32 {d5}, [%[outptr1]] @ write d4(q0,low),r01,r11 21 31\n" "vst1.32 {d5}, [%[outptr1]] @ write d4(q0,low),r01,r11 21 31\n"
"vst1.32 {d9}, [%[outptr2]] @ write d4(q0,low),r01,r11 21 31\n" "vst1.32 {d9}, [%[outptr2]] @ write d4(q0,low),r01,r11 21 31\n"
...@@ -583,7 +602,10 @@ void flip_hwc3_x(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { ...@@ -583,7 +602,10 @@ void flip_hwc3_x(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
"ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]], #24 \n" // v0={30,31,32, "ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]], #24 \n" // v0={30,31,32,
// 33, 34, 35, // 33, 34, 35,
// 36, 37}" // 36, 37}"
"prfm pldl1keep, [%[inptr0]] \n"
"prfm pldl1keep, [%[inptr1]] \n"
"prfm pldl1keep, [%[inptr2]] \n"
"prfm pldl1keep, [%[inptr3]] \n"
"st3 {v0.8b, v1.8b, v2.8b}, [%[outptr0]], #24 \n" // 00 "st3 {v0.8b, v1.8b, v2.8b}, [%[outptr0]], #24 \n" // 00
// 10 // 10
// 20 // 20
...@@ -634,6 +656,10 @@ void flip_hwc3_x(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { ...@@ -634,6 +656,10 @@ void flip_hwc3_x(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
"vld3.8 {d9, d10, d11}, [%[inptr3]]! @ zip load r1, d6 = 30 31 32 " "vld3.8 {d9, d10, d11}, [%[inptr3]]! @ zip load r1, d6 = 30 31 32 "
"33 34 35 36 37\n" "33 34 35 36 37\n"
"pld [%[inptr0]] @ preload a, 64byte\n"
"pld [%[inptr1]] @ preload a, 64byte\n"
"pld [%[inptr2]] @ preload a, 64byte\n"
"pld [%[inptr3]] @ preload a, 64byte\n"
"vst3.8 {d0, d1, d2}, [%[outptr0]]! @ write d0(q0,low),r00,r10 " "vst3.8 {d0, d1, d2}, [%[outptr0]]! @ write d0(q0,low),r00,r10 "
"20 30\n" "20 30\n"
"vst3.8 {d3, d4, d5}, [%[outptr1]]! @ write d4(q0,low),r01,r11 " "vst3.8 {d3, d4, d5}, [%[outptr1]]! @ write d4(q0,low),r01,r11 "
...@@ -748,7 +774,10 @@ void flip_hwc3_y(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { ...@@ -748,7 +774,10 @@ void flip_hwc3_y(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
"ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]], #24 \n" // v0={30,31,32, "ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]], #24 \n" // v0={30,31,32,
// 33, 34, 35, // 33, 34, 35,
// 36, 37}" // 36, 37}"
"prfm pldl1keep, [%[inptr0]] \n"
"prfm pldl1keep, [%[inptr1]] \n"
"prfm pldl1keep, [%[inptr2]] \n"
"prfm pldl1keep, [%[inptr3]] \n"
"rev64 v12.8b, v0.8b \n" //@ reverse 07 06 05 04 03 "rev64 v12.8b, v0.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 b // 02 01 00 b
"rev64 v13.8b, v1.8b \n" //@ reverse 07 06 05 04 03 "rev64 v13.8b, v1.8b \n" //@ reverse 07 06 05 04 03
...@@ -855,7 +884,10 @@ void flip_hwc3_y(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { ...@@ -855,7 +884,10 @@ void flip_hwc3_y(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
"\n" "\n"
"vrev64.8 d23, d11 @ reverse 07 06 05 04 03 02 01 00 " "vrev64.8 d23, d11 @ reverse 07 06 05 04 03 02 01 00 "
"\n" "\n"
"pld [%[inptr0]] @ preload a, 64byte\n"
"pld [%[inptr1]] @ preload a, 64byte\n"
"pld [%[inptr2]] @ preload a, 64byte\n"
"pld [%[inptr3]] @ preload a, 64byte\n"
"vst3.8 {d12, d13, d14}, [%[outptr0]] @ write " "vst3.8 {d12, d13, d14}, [%[outptr0]] @ write "
"d0(q0,low),r00,r10 20 30\n" "d0(q0,low),r00,r10 20 30\n"
"vst3.8 {d15, d16, d17}, [%[outptr1]] @ write " "vst3.8 {d15, d16, d17}, [%[outptr1]] @ write "
...@@ -1027,7 +1059,10 @@ void flip_hwc3_xy(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { ...@@ -1027,7 +1059,10 @@ void flip_hwc3_xy(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
// 02 01 00 // 02 01 00
"rev64 v23.8b, v11.8b \n" //@ reverse 07 06 05 04 03 "rev64 v23.8b, v11.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 // 02 01 00
"prfm pldl1keep, [%[inptr0]] \n"
"prfm pldl1keep, [%[inptr1]] \n"
"prfm pldl1keep, [%[inptr2]] \n"
"prfm pldl1keep, [%[inptr3]] \n"
"st3 {v12.8b, v13.8b, v14.8b}, [%[outptr0]] \n" // 00 10 "st3 {v12.8b, v13.8b, v14.8b}, [%[outptr0]] \n" // 00 10
// 20 30 // 20 30
// 04 14 // 04 14
...@@ -1106,6 +1141,10 @@ void flip_hwc3_xy(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { ...@@ -1106,6 +1141,10 @@ void flip_hwc3_xy(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
"\n" "\n"
"vrev64.8 d23, d11 @ reverse 07 06 05 04 03 02 01 00 " "vrev64.8 d23, d11 @ reverse 07 06 05 04 03 02 01 00 "
"\n" "\n"
"pld [%[inptr0]] @ preload a, 64byte\n"
"pld [%[inptr1]] @ preload a, 64byte\n"
"pld [%[inptr2]] @ preload a, 64byte\n"
"pld [%[inptr3]] @ preload a, 64byte\n"
"vst3.8 {d12, d13, d14}, [%[outptr0]] @ write " "vst3.8 {d12, d13, d14}, [%[outptr0]] @ write "
"d0(q0,low),r00,r10 20 30\n" "d0(q0,low),r00,r10 20 30\n"
...@@ -1262,7 +1301,10 @@ void flip_hwc4_x(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { ...@@ -1262,7 +1301,10 @@ void flip_hwc4_x(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
// 35, // 35,
// 36, // 36,
// 37}" // 37}"
"prfm pldl1keep, [%[inptr0]] \n"
"prfm pldl1keep, [%[inptr1]] \n"
"prfm pldl1keep, [%[inptr2]] \n"
"prfm pldl1keep, [%[inptr3]] \n"
"st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[outptr0]], #32 \n" // 00 10 20 "st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[outptr0]], #32 \n" // 00 10 20
// 30 04 14 // 30 04 14
// 24 34 // 24 34
...@@ -1306,6 +1348,10 @@ void flip_hwc4_x(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { ...@@ -1306,6 +1348,10 @@ void flip_hwc4_x(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
"22 23 24 25 26 27\n" "22 23 24 25 26 27\n"
"vld4.8 {d12, d13, d14, d15}, [%[inptr3]]! @ zip load r1, d6 = 30 " "vld4.8 {d12, d13, d14, d15}, [%[inptr3]]! @ zip load r1, d6 = 30 "
"31 32 33 34 35 36 37\n" "31 32 33 34 35 36 37\n"
"pld [%[inptr0]] @ preload a, 64byte\n"
"pld [%[inptr1]] @ preload a, 64byte\n"
"pld [%[inptr2]] @ preload a, 64byte\n"
"pld [%[inptr3]] @ preload a, 64byte\n"
"vst4.8 {d0, d1, d2, d3}, [%[outptr0]]! @ write " "vst4.8 {d0, d1, d2, d3}, [%[outptr0]]! @ write "
"d0(q0,low),r00,r10 20 30\n" "d0(q0,low),r00,r10 20 30\n"
...@@ -1476,7 +1522,10 @@ void flip_hwc4_y(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { ...@@ -1476,7 +1522,10 @@ void flip_hwc4_y(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
// 02 01 00 // 02 01 00
"rev64 v7.8b, v15.8b \n" //@ reverse 07 06 05 04 03 "rev64 v7.8b, v15.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 // 02 01 00
"prfm pldl1keep, [%[inptr0]] \n"
"prfm pldl1keep, [%[inptr1]] \n"
"prfm pldl1keep, [%[inptr2]] \n"
"prfm pldl1keep, [%[inptr3]] \n"
"st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[outptr0]] \n" // 00 10 20 30 04 14 24 34 "st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[outptr0]] \n" // 00 10 20 30 04 14 24 34
"st4 {v20.8b, v21.8b, v22.8b, v23.8b}, [%[outptr1]] \n" // 02 12 22 32 "st4 {v20.8b, v21.8b, v22.8b, v23.8b}, [%[outptr1]] \n" // 02 12 22 32
"st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[outptr2]] \n" // 01 "st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[outptr2]] \n" // 01
...@@ -1571,6 +1620,10 @@ void flip_hwc4_y(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { ...@@ -1571,6 +1620,10 @@ void flip_hwc4_y(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
"\n" "\n"
"vrev64.8 d7, d15 @ reverse 07 06 05 04 03 02 01 00 " "vrev64.8 d7, d15 @ reverse 07 06 05 04 03 02 01 00 "
"\n" "\n"
"pld [%[inptr0]] @ preload a, 64byte\n"
"pld [%[inptr1]] @ preload a, 64byte\n"
"pld [%[inptr2]] @ preload a, 64byte\n"
"pld [%[inptr3]] @ preload a, 64byte\n"
"vst4.8 {d16, d17, d18, d19}, [%[outptr0]] @ write " "vst4.8 {d16, d17, d18, d19}, [%[outptr0]] @ write "
"d0(q0,low),r00,r10 20 30\n" "d0(q0,low),r00,r10 20 30\n"
...@@ -1770,7 +1823,10 @@ void flip_hwc4_xy(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { ...@@ -1770,7 +1823,10 @@ void flip_hwc4_xy(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
// 02 01 00 // 02 01 00
"rev64 v7.8b, v15.8b \n" //@ reverse 07 06 05 04 03 "rev64 v7.8b, v15.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 // 02 01 00
"prfm pldl1keep, [%[inptr0]] \n"
"prfm pldl1keep, [%[inptr1]] \n"
"prfm pldl1keep, [%[inptr2]] \n"
"prfm pldl1keep, [%[inptr3]] \n"
"st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[outptr0]] \n" // 00 10 20 30 04 14 24 34 "st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[outptr0]] \n" // 00 10 20 30 04 14 24 34
"st4 {v20.8b, v21.8b, v22.8b, v23.8b}, [%[outptr1]] \n" // 02 12 22 32 "st4 {v20.8b, v21.8b, v22.8b, v23.8b}, [%[outptr1]] \n" // 02 12 22 32
"st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[outptr2]] \n" // 01 "st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[outptr2]] \n" // 01
...@@ -1868,6 +1924,10 @@ void flip_hwc4_xy(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { ...@@ -1868,6 +1924,10 @@ void flip_hwc4_xy(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
"vrev64.8 d7, d15 @ reverse 07 06 05 04 03 02 01 00 " "vrev64.8 d7, d15 @ reverse 07 06 05 04 03 02 01 00 "
"\n" "\n"
"pld [%[inptr0]] @ preload a, 64byte\n"
"pld [%[inptr1]] @ preload a, 64byte\n"
"pld [%[inptr2]] @ preload a, 64byte\n"
"pld [%[inptr3]] @ preload a, 64byte\n"
"vst4.8 {d16, d17, d18, d19}, [%[outptr0]] @ write " "vst4.8 {d16, d17, d18, d19}, [%[outptr0]] @ write "
"d0(q0,low),r00,r10 20 30\n" "d0(q0,low),r00,r10 20 30\n"
"vst4.8 {d20, d21, d22, d23}, [%[outptr1]] @ write " "vst4.8 {d20, d21, d22, d23}, [%[outptr1]] @ write "
......
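The flip_hwc* changes above interleave `pld` (armv7) and `prfm pldl1keep` (armv8) hints between the loads and stores so the next cache lines are requested before the following loop iteration needs them. Outside inline assembly the same idea can be expressed with the compiler builtin, as in this minimal sketch; the 64-byte look-ahead distance is an assumption that mirrors the `#64`/`#128` offsets used in the diff.

```cpp
// Hedged sketch: software prefetch while copying a row, analogous to the
// pld/prfm hints added in the flip kernels. Distances are illustrative.
#include <cstdint>
#include <cstring>

void copy_row_with_prefetch(const uint8_t* src, uint8_t* dst, int n) {
  for (int i = 0; i + 64 <= n; i += 64) {
    __builtin_prefetch(src + i + 64);  // hint: fetch the next 64-byte block
    std::memcpy(dst + i, src + i, 64);
  }
  int tail = n & 63;
  if (tail) std::memcpy(dst + n - tail, src + n - tail, tail);
}
```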
...@@ -51,9 +51,44 @@ void ImageResize::choose(const uint8_t* src, ...@@ -51,9 +51,44 @@ void ImageResize::choose(const uint8_t* src,
int dsth) { int dsth) {
resize(src, dst, srcFormat, srcw, srch, dstw, dsth); resize(src, dst, srcFormat, srcw, srch, dstw, dsth);
} }
void resize_one_channel(
const uint8_t* src, int w_in, int h_in, uint8_t* dst, int w_out, int h_out);
void resize_one_channel_uv(
const uint8_t* src, int w_in, int h_in, uint8_t* dst, int w_out, int h_out);
void resize_three_channel( void resize_three_channel(
const uint8_t* src, int w_in, int h_in, uint8_t* dst, int w_out, int h_out); const uint8_t* src, int w_in, int h_in, uint8_t* dst, int w_out, int h_out);
void resize_four_channel(
const uint8_t* src, int w_in, int h_in, uint8_t* dst, int w_out, int h_out);
void nv21_resize(const uint8_t* src,
uint8_t* dst,
int w_in,
int h_in,
int w_out,
int h_out) {
if (w_out == w_in && h_out == h_in) {
memcpy(dst, src, sizeof(uint8_t) * w_in * static_cast<int>(1.5 * h_in));
return;
}
// return;
int y_h = h_in;
int uv_h = h_in / 2;
const uint8_t* y_ptr = src;
const uint8_t* uv_ptr = src + y_h * w_in;
// out
int dst_y_h = h_out;
int dst_uv_h = h_out / 2;
uint8_t* dst_ptr = dst + dst_y_h * w_out;
// y
resize_one_channel(y_ptr, w_in, y_h, dst, w_out, dst_y_h);
// uv
resize_one_channel_uv(uv_ptr, w_in, uv_h, dst_ptr, w_out, dst_uv_h);
}
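`nv21_resize` above treats the input as two planes: a full-resolution Y plane followed by a half-height interleaved VU plane, which is why the fast-path copy moves `1.5 * h_in * w_in` bytes and the UV destination starts at `dst + h_out * w_out`. A minimal sketch of that layout arithmetic (names here are illustrative):

```cpp
// Hedged sketch of NV21/NV12 plane bookkeeping as used by nv21_resize.
#include <cstddef>
#include <cstdint>

struct Nv21Planes {
  const uint8_t* y;   // w * h luma samples
  const uint8_t* vu;  // w * (h / 2) interleaved chroma bytes
  size_t total;       // 1.5 * w * h bytes overall
};

inline Nv21Planes split_nv21(const uint8_t* buf, int w, int h) {
  Nv21Planes p;
  p.y = buf;
  p.vu = buf + static_cast<size_t>(w) * h;
  p.total = static_cast<size_t>(w) * h * 3 / 2;
  return p;
}
```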
void bgr_resize(const uint8_t* src, void bgr_resize(const uint8_t* src,
uint8_t* dst, uint8_t* dst,
int w_in, int w_in,
...@@ -67,36 +102,57 @@ void bgr_resize(const uint8_t* src, ...@@ -67,36 +102,57 @@ void bgr_resize(const uint8_t* src,
// y // y
resize_three_channel(src, w_in * 3, h_in, dst, w_out * 3, h_out); resize_three_channel(src, w_in * 3, h_in, dst, w_out * 3, h_out);
} }
void resize_three_channel(const uint8_t* src,
int w_in, void bgra_resize(const uint8_t* src,
int h_in, uint8_t* dst,
uint8_t* dst, int w_in,
int w_out, int h_in,
int h_out) { int w_out,
int h_out) {
if (w_out == w_in && h_out == h_in) {
memcpy(dst, src, sizeof(uint8_t) * w_in * h_in * 4);
return;
}
// y
resize_four_channel(src, w_in * 4, h_in, dst, w_out * 4, h_out);
}
void resize_one_channel(const uint8_t* src,
int w_in,
int h_in,
uint8_t* dst,
int w_out,
int h_out) {
const int resize_coef_bits = 11; const int resize_coef_bits = 11;
const int resize_coef_scale = 1 << resize_coef_bits; const int resize_coef_scale = 1 << resize_coef_bits;
double scale_x = static_cast<double>(w_in) / w_out; double scale_x = static_cast<double>(w_in) / w_out;
double scale_y = static_cast<double>(h_in) / h_out; double scale_y = static_cast<double>(h_in) / h_out;
int* buf = new int[w_out * 2 + h_out * 2]; int* buf = new int[w_out * 2 + h_out * 2];
int* xofs = buf; // new int[w]; int* xofs = buf; // new int[w];
int* yofs = buf + w_out; // new int[h]; int* yofs = buf + w_out; // new int[h];
int16_t* ialpha = int16_t* ialpha =
reinterpret_cast<int16_t*>(buf + w_out + h_out); // new int16_t[w * 2]; reinterpret_cast<int16_t*>(buf + w_out + h_out); // new short[w * 2];
int16_t* ibeta = int16_t* ibeta =
reinterpret_cast<int16_t*>(buf + w_out * 2 + h_out); // new short[h * 2]; reinterpret_cast<int16_t*>(buf + w_out * 2 + h_out); // new short[h * 2];
float fx = 0.f; float fx = 0.f;
float fy = 0.f; float fy = 0.f;
int sx = 0.f; int sx = 0;
int sy = 0.f; int sy = 0;
#define SATURATE_CAST_SHORT(X) \ #define SATURATE_CAST_SHORT(X) \
(int16_t)::std::min( \ (int16_t)::std::min( \
::std::max(static_cast<int>(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), \ ::std::max(static_cast<int>(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), \
SHRT_MAX); SHRT_MAX);
// #pragma omp parallel for for (int dx = 0; dx < w_out; dx++) {
for (int dx = 0; dx < w_out / 3; dx++) {
fx = static_cast<float>((dx + 0.5) * scale_x - 0.5); fx = static_cast<float>((dx + 0.5) * scale_x - 0.5);
sx = floor(fx); sx = floor(fx);
fx -= sx; fx -= sx;
if (sx < 0) { if (sx < 0) {
sx = 0; sx = 0;
fx = 0.f; fx = 0.f;
...@@ -105,17 +161,20 @@ void resize_three_channel(const uint8_t* src, ...@@ -105,17 +161,20 @@ void resize_three_channel(const uint8_t* src,
sx = w_in - 2; sx = w_in - 2;
fx = 1.f; fx = 1.f;
} }
xofs[dx] = sx * 3;
xofs[dx] = sx;
float a0 = (1.f - fx) * resize_coef_scale; float a0 = (1.f - fx) * resize_coef_scale;
float a1 = fx * resize_coef_scale; float a1 = fx * resize_coef_scale;
ialpha[dx * 2] = SATURATE_CAST_SHORT(a0); ialpha[dx * 2] = SATURATE_CAST_SHORT(a0);
ialpha[dx * 2 + 1] = SATURATE_CAST_SHORT(a1); ialpha[dx * 2 + 1] = SATURATE_CAST_SHORT(a1);
} }
// #pragma omp parallel for
for (int dy = 0; dy < h_out; dy++) { for (int dy = 0; dy < h_out; dy++) {
fy = static_cast<float>((dy + 0.5) * scale_y - 0.5); fy = static_cast<float>((dy + 0.5) * scale_y - 0.5);
sy = floor(fy); sy = floor(fy);
fy -= sy; fy -= sy;
if (sy < 0) { if (sy < 0) {
sy = 0; sy = 0;
fy = 0.f; fy = 0.f;
...@@ -124,9 +183,12 @@ void resize_three_channel(const uint8_t* src, ...@@ -124,9 +183,12 @@ void resize_three_channel(const uint8_t* src,
sy = h_in - 2; sy = h_in - 2;
fy = 1.f; fy = 1.f;
} }
yofs[dy] = sy; yofs[dy] = sy;
float b0 = (1.f - fy) * resize_coef_scale; float b0 = (1.f - fy) * resize_coef_scale;
float b1 = fy * resize_coef_scale; float b1 = fy * resize_coef_scale;
ibeta[dy * 2] = SATURATE_CAST_SHORT(b0); ibeta[dy * 2] = SATURATE_CAST_SHORT(b0);
ibeta[dy * 2 + 1] = SATURATE_CAST_SHORT(b1); ibeta[dy * 2 + 1] = SATURATE_CAST_SHORT(b1);
} }
...@@ -136,9 +198,11 @@ void resize_three_channel(const uint8_t* src, ...@@ -136,9 +198,11 @@ void resize_three_channel(const uint8_t* src,
int16_t* rowsbuf1 = new int16_t[w_out + 1]; int16_t* rowsbuf1 = new int16_t[w_out + 1];
int16_t* rows0 = rowsbuf0; int16_t* rows0 = rowsbuf0;
int16_t* rows1 = rowsbuf1; int16_t* rows1 = rowsbuf1;
int prev_sy1 = -1; int prev_sy1 = -1;
for (int dy = 0; dy < h_out; dy++) { for (int dy = 0; dy < h_out; dy++) {
int sy = yofs[dy]; int sy = yofs[dy];
if (sy == prev_sy1) { if (sy == prev_sy1) {
// hresize one row // hresize one row
int16_t* rows0_old = rows0; int16_t* rows0_old = rows0;
...@@ -147,72 +211,80 @@ void resize_three_channel(const uint8_t* src, ...@@ -147,72 +211,80 @@ void resize_three_channel(const uint8_t* src,
const uint8_t* S1 = src + w_in * (sy + 1); const uint8_t* S1 = src + w_in * (sy + 1);
const int16_t* ialphap = ialpha; const int16_t* ialphap = ialpha;
int16_t* rows1p = rows1; int16_t* rows1p = rows1;
for (int dx = 0; dx < w_out / 3; dx++) { for (int dx = 0; dx < w_out; dx++) {
int sx = xofs[dx]; int sx = xofs[dx];
int16_t a0 = ialphap[0]; int16_t a0 = ialphap[0];
int16_t a1 = ialphap[1]; int16_t a1 = ialphap[1];
const uint8_t* S1p = S1 + sx; const uint8_t* S1p = S1 + sx;
int tmp = dx * 3; rows1p[dx] = (S1p[0] * a0 + S1p[1] * a1) >> 4;
rows1p[tmp] = (S1p[0] * a0 + S1p[3] * a1) >> 4;
rows1p[tmp + 1] = (S1p[1] * a0 + S1p[4] * a1) >> 4;
rows1p[tmp + 2] = (S1p[2] * a0 + S1p[5] * a1) >> 4;
ialphap += 2; ialphap += 2;
} }
} else { } else {
// hresize two rows // hresize two rows
const uint8_t* S0 = src + w_in * (sy); const uint8_t* S0 = src + w_in * (sy);
const uint8_t* S1 = src + w_in * (sy + 1); const uint8_t* S1 = src + w_in * (sy + 1);
const int16_t* ialphap = ialpha; const int16_t* ialphap = ialpha;
int16_t* rows0p = rows0; int16_t* rows0p = rows0;
int16_t* rows1p = rows1; int16_t* rows1p = rows1;
for (int dx = 0; dx < w_out / 3; dx++) { for (int dx = 0; dx < w_out; dx++) {
int sx = xofs[dx]; int sx = xofs[dx];
int16_t a0 = ialphap[0]; int16_t a0 = ialphap[0];
int16_t a1 = ialphap[1]; int16_t a1 = ialphap[1];
const uint8_t* S0p = S0 + sx; const uint8_t* S0p = S0 + sx;
const uint8_t* S1p = S1 + sx; const uint8_t* S1p = S1 + sx;
int tmp = dx * 3; rows0p[dx] = (S0p[0] * a0 + S0p[1] * a1) >> 4;
rows0p[tmp] = (S0p[0] * a0 + S0p[3] * a1) >> 4; rows1p[dx] = (S1p[0] * a0 + S1p[1] * a1) >> 4;
rows1p[tmp] = (S1p[0] * a0 + S1p[3] * a1) >> 4;
rows0p[tmp + 1] = (S0p[1] * a0 + S0p[4] * a1) >> 4;
rows1p[tmp + 1] = (S1p[1] * a0 + S1p[4] * a1) >> 4;
rows0p[tmp + 2] = (S0p[2] * a0 + S0p[5] * a1) >> 4;
rows1p[tmp + 2] = (S1p[2] * a0 + S1p[5] * a1) >> 4;
ialphap += 2; ialphap += 2;
} }
} }
prev_sy1 = sy + 1; prev_sy1 = sy + 1;
// vresize // vresize
int16_t b0 = ibeta[0]; int16_t b0 = ibeta[0];
int16_t b1 = ibeta[1]; int16_t b1 = ibeta[1];
int16_t* rows0p = rows0; int16_t* rows0p = rows0;
int16_t* rows1p = rows1; int16_t* rows1p = rows1;
uint8_t* dp_ptr = dst + w_out * (dy); uint8_t* dp_ptr = dst + w_out * (dy);
int cnt = w_out >> 3; int cnt = w_out >> 3;
int remain = w_out - (cnt << 3); int remain = w_out - (cnt << 3);
int16x4_t _b0 = vdup_n_s16(b0); int16x4_t _b0 = vdup_n_s16(b0);
int16x4_t _b1 = vdup_n_s16(b1); int16x4_t _b1 = vdup_n_s16(b1);
int32x4_t _v2 = vdupq_n_s32(2); int32x4_t _v2 = vdupq_n_s32(2);
for (cnt = w_out >> 3; cnt > 0; cnt--) { for (cnt = w_out >> 3; cnt > 0; cnt--) {
int16x4_t _rows0p_sr4 = vld1_s16(rows0p); int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
int16x4_t _rows1p_sr4 = vld1_s16(rows1p); int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p + 4); int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p + 4);
int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p + 4); int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p + 4);
int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0); int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0);
int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1); int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0); int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1); int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
int32x4_t _acc = _v2; int32x4_t _acc = _v2;
_acc = vsraq_n_s32( _acc = vsraq_n_s32(_acc, _rows0p_sr4_mb0, 16);
_acc, _rows0p_sr4_mb0, 16); // _acc >> 16 + _rows0p_sr4_mb0 >> 16
_acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16); _acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
int32x4_t _acc_1 = _v2; int32x4_t _acc_1 = _v2;
_acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16); _acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
_acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16); _acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
int16x4_t _acc16 = vshrn_n_s32(_acc, 2); // _acc >> 2
int16x4_t _acc16 = vshrn_n_s32(_acc, 2);
int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2); int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
uint8x8_t _dout = vqmovun_s16(vcombine_s16(_acc16, _acc16_1)); uint8x8_t _dout = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));
vst1_u8(dp_ptr, _dout); vst1_u8(dp_ptr, _dout);
dp_ptr += 8; dp_ptr += 8;
rows0p += 8; rows0p += 8;
rows1p += 8; rows1p += 8;
...@@ -226,45 +298,18 @@ void resize_three_channel(const uint8_t* src, ...@@ -226,45 +298,18 @@ void resize_three_channel(const uint8_t* src,
} }
ibeta += 2; ibeta += 2;
} }
delete[] buf; delete[] buf;
delete[] rowsbuf0; delete[] rowsbuf0;
delete[] rowsbuf1; delete[] rowsbuf1;
} }
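All of the resize_* helpers share one fixed-point bilinear scheme: the weights are scaled by 2^11 (`resize_coef_bits = 11`) and saturated to int16 via `SATURATE_CAST_SHORT`, the horizontal pass shifts right by 4 so a row value keeps 7 fractional bits and still fits in int16, and the vertical pass multiplies by the 11-bit row weights, drops 16 bits per term, adds 2 for rounding and drops the last 2 bits. A single-pixel sketch of that arithmetic (names illustrative, not the vectorised repo code):

```cpp
// Hedged sketch of the 11-bit fixed-point bilinear step used by the resize
// helpers, reduced to a single output pixel.
#include <algorithm>
#include <cstdint>

uint8_t bilinear_pixel(uint8_t s00, uint8_t s01, uint8_t s10, uint8_t s11,
                       float fx, float fy) {
  const int scale = 1 << 11;  // resize_coef_bits = 11
  int a0 = static_cast<int>((1.f - fx) * scale), a1 = static_cast<int>(fx * scale);
  int b0 = static_cast<int>((1.f - fy) * scale), b1 = static_cast<int>(fy * scale);
  // horizontal pass: >> 4 keeps 7 fractional bits, so each row fits in int16_t
  int16_t row0 = static_cast<int16_t>((s00 * a0 + s01 * a1) >> 4);
  int16_t row1 = static_cast<int16_t>((s10 * a0 + s11 * a1) >> 4);
  // vertical pass: 7 + 11 = 18 fractional bits; drop 16, round with +2, drop 2
  int acc = ((row0 * b0) >> 16) + ((row1 * b1) >> 16) + 2;
  return static_cast<uint8_t>(std::min(std::max(acc >> 2, 0), 255));
}
```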
void resize_one_channel(
const uint8_t* src, int w_in, int h_in, uint8_t* dst, int w_out, int h_out);
void resize_one_channel_uv(
const uint8_t* src, int w_in, int h_in, uint8_t* dst, int w_out, int h_out);
void nv21_resize(const uint8_t* src,
uint8_t* dst,
int w_in,
int h_in,
int w_out,
int h_out) {
if (w_out == w_in && h_out == h_in) {
memcpy(dst, src, sizeof(uint8_t) * w_in * static_cast<int>(1.5 * h_in));
return;
}
// return;
int y_h = h_in;
int uv_h = h_in / 2;
const uint8_t* y_ptr = src;
const uint8_t* uv_ptr = src + y_h * w_in;
// out
int dst_y_h = h_out;
int dst_uv_h = h_out / 2;
uint8_t* dst_ptr = dst + dst_y_h * w_out;
// y
resize_one_channel(y_ptr, w_in, y_h, dst, w_out, dst_y_h);
// uv
resize_one_channel_uv(uv_ptr, w_in, uv_h, dst_ptr, w_out, dst_uv_h);
}
void resize_one_channel(const uint8_t* src, void resize_one_channel_uv(const uint8_t* src,
int w_in, int w_in,
int h_in, int h_in,
uint8_t* dst, uint8_t* dst,
int w_out, int w_out,
int h_out) { int h_out) {
const int resize_coef_bits = 11; const int resize_coef_bits = 11;
const int resize_coef_scale = 1 << resize_coef_bits; const int resize_coef_scale = 1 << resize_coef_bits;
...@@ -277,20 +322,20 @@ void resize_one_channel(const uint8_t* src, ...@@ -277,20 +322,20 @@ void resize_one_channel(const uint8_t* src,
int* yofs = buf + w_out; // new int[h]; int* yofs = buf + w_out; // new int[h];
int16_t* ialpha = int16_t* ialpha =
reinterpret_cast<int16_t*>(buf + w_out + h_out); // new short[w * 2]; reinterpret_cast<int16_t*>(buf + w_out + h_out); // new int16_t[w * 2];
int16_t* ibeta = int16_t* ibeta = reinterpret_cast<int16_t*>(buf + w_out * 2 +
reinterpret_cast<int16_t*>(buf + w_out * 2 + h_out); // new short[h * 2]; h_out); // new int16_t[h * 2];
float fx = 0.f; float fx = 0.f;
float fy = 0.f; float fy = 0.f;
int sx = 0; int sx = 0.f;
int sy = 0; int sy = 0.f;
#define SATURATE_CAST_SHORT(X) \ #define SATURATE_CAST_SHORT(X) \
(int16_t)::std::min( \ (int16_t)::std::min( \
::std::max(static_cast<int>(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), \ ::std::max(static_cast<int>(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), \
SHRT_MAX); SHRT_MAX);
for (int dx = 0; dx < w_out; dx++) { for (int dx = 0; dx < w_out / 2; dx++) {
fx = static_cast<float>((dx + 0.5) * scale_x - 0.5); fx = static_cast<float>((dx + 0.5) * scale_x - 0.5);
sx = floor(fx); sx = floor(fx);
fx -= sx; fx -= sx;
...@@ -334,6 +379,7 @@ void resize_one_channel(const uint8_t* src, ...@@ -334,6 +379,7 @@ void resize_one_channel(const uint8_t* src,
ibeta[dy * 2] = SATURATE_CAST_SHORT(b0); ibeta[dy * 2] = SATURATE_CAST_SHORT(b0);
ibeta[dy * 2 + 1] = SATURATE_CAST_SHORT(b1); ibeta[dy * 2 + 1] = SATURATE_CAST_SHORT(b1);
} }
#undef SATURATE_CAST_SHORT #undef SATURATE_CAST_SHORT
// loop body // loop body
int16_t* rowsbuf0 = new int16_t[w_out + 1]; int16_t* rowsbuf0 = new int16_t[w_out + 1];
...@@ -344,22 +390,23 @@ void resize_one_channel(const uint8_t* src, ...@@ -344,22 +390,23 @@ void resize_one_channel(const uint8_t* src,
int prev_sy1 = -1; int prev_sy1 = -1;
for (int dy = 0; dy < h_out; dy++) { for (int dy = 0; dy < h_out; dy++) {
int sy = yofs[dy]; int sy = yofs[dy];
if (sy == prev_sy1) { if (sy == prev_sy1) {
// hresize one row // hresize one row
int16_t* rows0_old = rows0; int16_t* rows0_old = rows0;
rows0 = rows1; rows0 = rows1;
rows1 = rows0_old; rows1 = rows0_old;
const uint8_t* S1 = src + w_in * (sy + 1); const uint8_t* S1 = src + w_in * (sy + 1);
const int16_t* ialphap = ialpha; const int16_t* ialphap = ialpha;
int16_t* rows1p = rows1; int16_t* rows1p = rows1;
for (int dx = 0; dx < w_out; dx++) { for (int dx = 0; dx < w_out / 2; dx++) {
int sx = xofs[dx]; int sx = xofs[dx] * 2;
int16_t a0 = ialphap[0]; int16_t a0 = ialphap[0];
int16_t a1 = ialphap[1]; int16_t a1 = ialphap[1];
const uint8_t* S1p = S1 + sx; const uint8_t* S1p = S1 + sx;
rows1p[dx] = (S1p[0] * a0 + S1p[1] * a1) >> 4; int tmp = dx * 2;
rows1p[tmp] = (S1p[0] * a0 + S1p[2] * a1) >> 4;
rows1p[tmp + 1] = (S1p[1] * a0 + S1p[3] * a1) >> 4;
ialphap += 2; ialphap += 2;
} }
...@@ -371,20 +418,22 @@ void resize_one_channel(const uint8_t* src, ...@@ -371,20 +418,22 @@ void resize_one_channel(const uint8_t* src,
const int16_t* ialphap = ialpha; const int16_t* ialphap = ialpha;
int16_t* rows0p = rows0; int16_t* rows0p = rows0;
int16_t* rows1p = rows1; int16_t* rows1p = rows1;
for (int dx = 0; dx < w_out; dx++) { for (int dx = 0; dx < w_out / 2; dx++) {
int sx = xofs[dx]; int sx = xofs[dx] * 2;
int16_t a0 = ialphap[0]; int16_t a0 = ialphap[0];
int16_t a1 = ialphap[1]; int16_t a1 = ialphap[1];
const uint8_t* S0p = S0 + sx; const uint8_t* S0p = S0 + sx;
const uint8_t* S1p = S1 + sx; const uint8_t* S1p = S1 + sx;
rows0p[dx] = (S0p[0] * a0 + S0p[1] * a1) >> 4; int tmp = dx * 2;
rows1p[dx] = (S1p[0] * a0 + S1p[1] * a1) >> 4; rows0p[tmp] = (S0p[0] * a0 + S0p[2] * a1) >> 4;
rows1p[tmp] = (S1p[0] * a0 + S1p[2] * a1) >> 4;
rows0p[tmp + 1] = (S0p[1] * a0 + S0p[3] * a1) >> 4;
rows1p[tmp + 1] = (S1p[1] * a0 + S1p[3] * a1) >> 4;
ialphap += 2; ialphap += 2;
} }
} }
prev_sy1 = sy + 1; prev_sy1 = sy + 1;
// vresize // vresize
...@@ -400,7 +449,6 @@ void resize_one_channel(const uint8_t* src, ...@@ -400,7 +449,6 @@ void resize_one_channel(const uint8_t* src,
int16x4_t _b0 = vdup_n_s16(b0); int16x4_t _b0 = vdup_n_s16(b0);
int16x4_t _b1 = vdup_n_s16(b1); int16x4_t _b1 = vdup_n_s16(b1);
int32x4_t _v2 = vdupq_n_s32(2); int32x4_t _v2 = vdupq_n_s32(2);
for (cnt = w_out >> 3; cnt > 0; cnt--) { for (cnt = w_out >> 3; cnt > 0; cnt--) {
int16x4_t _rows0p_sr4 = vld1_s16(rows0p); int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
int16x4_t _rows1p_sr4 = vld1_s16(rows1p); int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
...@@ -413,14 +461,15 @@ void resize_one_channel(const uint8_t* src, ...@@ -413,14 +461,15 @@ void resize_one_channel(const uint8_t* src,
int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1); int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
int32x4_t _acc = _v2; int32x4_t _acc = _v2;
_acc = vsraq_n_s32(_acc, _rows0p_sr4_mb0, 16); _acc = vsraq_n_s32(
_acc, _rows0p_sr4_mb0, 16); // _acc >> 16 + _rows0p_sr4_mb0 >> 16
_acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16); _acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
int32x4_t _acc_1 = _v2; int32x4_t _acc_1 = _v2;
_acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16); _acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
_acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16); _acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
int16x4_t _acc16 = vshrn_n_s32(_acc, 2); int16x4_t _acc16 = vshrn_n_s32(_acc, 2); // _acc >> 2
int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2); int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
uint8x8_t _dout = vqmovun_s16(vcombine_s16(_acc16, _acc16_1)); uint8x8_t _dout = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));
...@@ -446,42 +495,35 @@ void resize_one_channel(const uint8_t* src, ...@@ -446,42 +495,35 @@ void resize_one_channel(const uint8_t* src,
delete[] rowsbuf1; delete[] rowsbuf1;
} }
void resize_one_channel_uv(const uint8_t* src, void resize_three_channel(const uint8_t* src,
int w_in, int w_in,
int h_in, int h_in,
uint8_t* dst, uint8_t* dst,
int w_out, int w_out,
int h_out) { int h_out) {
const int resize_coef_bits = 11; const int resize_coef_bits = 11;
const int resize_coef_scale = 1 << resize_coef_bits; const int resize_coef_scale = 1 << resize_coef_bits;
double scale_x = static_cast<double>(w_in) / w_out; double scale_x = static_cast<double>(w_in) / w_out;
double scale_y = static_cast<double>(h_in) / h_out; double scale_y = static_cast<double>(h_in) / h_out;
int* buf = new int[w_out * 2 + h_out * 2]; int* buf = new int[w_out * 2 + h_out * 2];
int* xofs = buf; // new int[w]; int* xofs = buf; // new int[w];
int* yofs = buf + w_out; // new int[h]; int* yofs = buf + w_out; // new int[h];
int16_t* ialpha = int16_t* ialpha =
reinterpret_cast<int16_t*>(buf + w_out + h_out); // new int16_t[w * 2]; reinterpret_cast<int16_t*>(buf + w_out + h_out); // new int16_t[w * 2];
int16_t* ibeta = reinterpret_cast<int16_t*>(buf + w_out * 2 + int16_t* ibeta =
h_out); // new int16_t[h * 2]; reinterpret_cast<int16_t*>(buf + w_out * 2 + h_out); // new short[h * 2];
float fx = 0.f; float fx = 0.f;
float fy = 0.f; float fy = 0.f;
int sx = 0.f; int sx = 0.f;
int sy = 0.f; int sy = 0.f;
#define SATURATE_CAST_SHORT(X) \ #define SATURATE_CAST_SHORT(X) \
(int16_t)::std::min( \ (int16_t)::std::min( \
::std::max(static_cast<int>(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), \ ::std::max(static_cast<int>(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), \
SHRT_MAX); SHRT_MAX);
for (int dx = 0; dx < w_out / 2; dx++) { for (int dx = 0; dx < w_out / 3; dx++) {
fx = static_cast<float>((dx + 0.5) * scale_x - 0.5); fx = static_cast<float>((dx + 0.5) * scale_x - 0.5);
sx = floor(fx); sx = floor(fx);
fx -= sx; fx -= sx;
if (sx < 0) { if (sx < 0) {
sx = 0; sx = 0;
fx = 0.f; fx = 0.f;
...@@ -490,12 +532,9 @@ void resize_one_channel_uv(const uint8_t* src, ...@@ -490,12 +532,9 @@ void resize_one_channel_uv(const uint8_t* src,
sx = w_in - 2; sx = w_in - 2;
fx = 1.f; fx = 1.f;
} }
xofs[dx] = sx * 3;
xofs[dx] = sx;
float a0 = (1.f - fx) * resize_coef_scale; float a0 = (1.f - fx) * resize_coef_scale;
float a1 = fx * resize_coef_scale; float a1 = fx * resize_coef_scale;
ialpha[dx * 2] = SATURATE_CAST_SHORT(a0); ialpha[dx * 2] = SATURATE_CAST_SHORT(a0);
ialpha[dx * 2 + 1] = SATURATE_CAST_SHORT(a1); ialpha[dx * 2 + 1] = SATURATE_CAST_SHORT(a1);
} }
...@@ -503,7 +542,6 @@ void resize_one_channel_uv(const uint8_t* src, ...@@ -503,7 +542,6 @@ void resize_one_channel_uv(const uint8_t* src,
fy = static_cast<float>((dy + 0.5) * scale_y - 0.5); fy = static_cast<float>((dy + 0.5) * scale_y - 0.5);
sy = floor(fy); sy = floor(fy);
fy -= sy; fy -= sy;
if (sy < 0) { if (sy < 0) {
sy = 0; sy = 0;
fy = 0.f; fy = 0.f;
...@@ -512,23 +550,18 @@ void resize_one_channel_uv(const uint8_t* src, ...@@ -512,23 +550,18 @@ void resize_one_channel_uv(const uint8_t* src,
sy = h_in - 2; sy = h_in - 2;
fy = 1.f; fy = 1.f;
} }
yofs[dy] = sy; yofs[dy] = sy;
float b0 = (1.f - fy) * resize_coef_scale; float b0 = (1.f - fy) * resize_coef_scale;
float b1 = fy * resize_coef_scale; float b1 = fy * resize_coef_scale;
ibeta[dy * 2] = SATURATE_CAST_SHORT(b0); ibeta[dy * 2] = SATURATE_CAST_SHORT(b0);
ibeta[dy * 2 + 1] = SATURATE_CAST_SHORT(b1); ibeta[dy * 2 + 1] = SATURATE_CAST_SHORT(b1);
} }
#undef SATURATE_CAST_SHORT #undef SATURATE_CAST_SHORT
// loop body // loop body
int16_t* rowsbuf0 = new int16_t[w_out + 1]; int16_t* rowsbuf0 = new int16_t[w_out + 1];
int16_t* rowsbuf1 = new int16_t[w_out + 1]; int16_t* rowsbuf1 = new int16_t[w_out + 1];
int16_t* rows0 = rowsbuf0; int16_t* rows0 = rowsbuf0;
int16_t* rows1 = rowsbuf1; int16_t* rows1 = rowsbuf1;
int prev_sy1 = -1; int prev_sy1 = -1;
for (int dy = 0; dy < h_out; dy++) { for (int dy = 0; dy < h_out; dy++) {
int sy = yofs[dy]; int sy = yofs[dy];
...@@ -538,54 +571,49 @@ void resize_one_channel_uv(const uint8_t* src, ...@@ -538,54 +571,49 @@ void resize_one_channel_uv(const uint8_t* src,
rows0 = rows1; rows0 = rows1;
rows1 = rows0_old; rows1 = rows0_old;
const uint8_t* S1 = src + w_in * (sy + 1); const uint8_t* S1 = src + w_in * (sy + 1);
const int16_t* ialphap = ialpha; const int16_t* ialphap = ialpha;
int16_t* rows1p = rows1; int16_t* rows1p = rows1;
for (int dx = 0; dx < w_out / 2; dx++) { for (int dx = 0; dx < w_out / 3; dx++) {
int sx = xofs[dx] * 2; int sx = xofs[dx];
int16_t a0 = ialphap[0]; int16_t a0 = ialphap[0];
int16_t a1 = ialphap[1]; int16_t a1 = ialphap[1];
const uint8_t* S1p = S1 + sx; const uint8_t* S1p = S1 + sx;
int tmp = dx * 2; int tmp = dx * 3;
rows1p[tmp] = (S1p[0] * a0 + S1p[2] * a1) >> 4; rows1p[tmp] = (S1p[0] * a0 + S1p[3] * a1) >> 4;
rows1p[tmp + 1] = (S1p[1] * a0 + S1p[3] * a1) >> 4; rows1p[tmp + 1] = (S1p[1] * a0 + S1p[4] * a1) >> 4;
rows1p[tmp + 2] = (S1p[2] * a0 + S1p[5] * a1) >> 4;
ialphap += 2; ialphap += 2;
} }
} else { } else {
// hresize two rows // hresize two rows
const uint8_t* S0 = src + w_in * (sy); const uint8_t* S0 = src + w_in * (sy);
const uint8_t* S1 = src + w_in * (sy + 1); const uint8_t* S1 = src + w_in * (sy + 1);
const int16_t* ialphap = ialpha; const int16_t* ialphap = ialpha;
int16_t* rows0p = rows0; int16_t* rows0p = rows0;
int16_t* rows1p = rows1; int16_t* rows1p = rows1;
for (int dx = 0; dx < w_out / 2; dx++) { for (int dx = 0; dx < w_out / 3; dx++) {
int sx = xofs[dx] * 2; int sx = xofs[dx];
int16_t a0 = ialphap[0]; int16_t a0 = ialphap[0];
int16_t a1 = ialphap[1]; int16_t a1 = ialphap[1];
const uint8_t* S0p = S0 + sx; const uint8_t* S0p = S0 + sx;
const uint8_t* S1p = S1 + sx; const uint8_t* S1p = S1 + sx;
int tmp = dx * 2; int tmp = dx * 3;
rows0p[tmp] = (S0p[0] * a0 + S0p[2] * a1) >> 4; rows0p[tmp] = (S0p[0] * a0 + S0p[3] * a1) >> 4;
rows1p[tmp] = (S1p[0] * a0 + S1p[2] * a1) >> 4; rows1p[tmp] = (S1p[0] * a0 + S1p[3] * a1) >> 4;
rows0p[tmp + 1] = (S0p[1] * a0 + S0p[4] * a1) >> 4;
rows0p[tmp + 1] = (S0p[1] * a0 + S0p[3] * a1) >> 4; rows1p[tmp + 1] = (S1p[1] * a0 + S1p[4] * a1) >> 4;
rows1p[tmp + 1] = (S1p[1] * a0 + S1p[3] * a1) >> 4; rows0p[tmp + 2] = (S0p[2] * a0 + S0p[5] * a1) >> 4;
rows1p[tmp + 2] = (S1p[2] * a0 + S1p[5] * a1) >> 4;
ialphap += 2; ialphap += 2;
} }
} }
prev_sy1 = sy + 1; prev_sy1 = sy + 1;
// vresize // vresize
int16_t b0 = ibeta[0]; int16_t b0 = ibeta[0];
int16_t b1 = ibeta[1]; int16_t b1 = ibeta[1];
int16_t* rows0p = rows0; int16_t* rows0p = rows0;
int16_t* rows1p = rows1; int16_t* rows1p = rows1;
uint8_t* dp_ptr = dst + w_out * (dy); uint8_t* dp_ptr = dst + w_out * (dy);
int cnt = w_out >> 3; int cnt = w_out >> 3;
int remain = w_out - (cnt << 3); int remain = w_out - (cnt << 3);
int16x4_t _b0 = vdup_n_s16(b0); int16x4_t _b0 = vdup_n_s16(b0);
...@@ -596,28 +624,21 @@ void resize_one_channel_uv(const uint8_t* src, ...@@ -596,28 +624,21 @@ void resize_one_channel_uv(const uint8_t* src,
int16x4_t _rows1p_sr4 = vld1_s16(rows1p); int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p + 4); int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p + 4);
int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p + 4); int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p + 4);
int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0); int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0);
int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1); int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0); int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1); int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
int32x4_t _acc = _v2; int32x4_t _acc = _v2;
_acc = vsraq_n_s32( _acc = vsraq_n_s32(
_acc, _rows0p_sr4_mb0, 16); // _acc >> 16 + _rows0p_sr4_mb0 >> 16 _acc, _rows0p_sr4_mb0, 16); // _acc >> 16 + _rows0p_sr4_mb0 >> 16
_acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16); _acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
int32x4_t _acc_1 = _v2; int32x4_t _acc_1 = _v2;
_acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16); _acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
_acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16); _acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
int16x4_t _acc16 = vshrn_n_s32(_acc, 2); // _acc >> 2 int16x4_t _acc16 = vshrn_n_s32(_acc, 2); // _acc >> 2
int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2); int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
uint8x8_t _dout = vqmovun_s16(vcombine_s16(_acc16, _acc16_1)); uint8x8_t _dout = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));
vst1_u8(dp_ptr, _dout); vst1_u8(dp_ptr, _dout);
dp_ptr += 8; dp_ptr += 8;
rows0p += 8; rows0p += 8;
rows1p += 8; rows1p += 8;
...@@ -631,7 +652,172 @@ void resize_one_channel_uv(const uint8_t* src, ...@@ -631,7 +652,172 @@ void resize_one_channel_uv(const uint8_t* src,
} }
ibeta += 2; ibeta += 2;
} }
delete[] buf;
delete[] rowsbuf0;
delete[] rowsbuf1;
}
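The one/two/three/four-channel variants above differ only in how the horizontal pass strides: `xofs` stores the left source column, pre-multiplied by the channel count in the three- and four-channel variants (the UV variant multiplies by 2 at the point of use), and each output sample interpolates `S[p + k]` against `S[p + channels + k]` for every channel `k`. A generic, hedged sketch of that inner loop; the repository keeps per-channel specialisations rather than a generic helper like this one:

```cpp
// Hedged sketch of the interleaved, c-channel horizontal resize pass shared by
// resize_one_channel_uv / resize_three_channel / resize_four_channel.
#include <cstdint>

void hresize_row_generic(const uint8_t* S, const int* xofs, const int16_t* ialpha,
                         int16_t* rows, int w_out, int channels) {
  // w_out is already width * channels, as in the repo helpers.
  for (int dx = 0; dx < w_out / channels; ++dx) {
    int sx = xofs[dx];  // left column offset, pre-multiplied by `channels`
    int16_t a0 = ialpha[2 * dx];
    int16_t a1 = ialpha[2 * dx + 1];
    const uint8_t* Sp = S + sx;
    for (int k = 0; k < channels; ++k) {
      rows[dx * channels + k] =
          static_cast<int16_t>((Sp[k] * a0 + Sp[channels + k] * a1) >> 4);
    }
  }
}
```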
void resize_four_channel(const uint8_t* src,
int w_in,
int h_in,
uint8_t* dst,
int w_out,
int h_out) {
const int resize_coef_bits = 11;
const int resize_coef_scale = 1 << resize_coef_bits;
double scale_x = static_cast<double>(w_in) / w_out;
double scale_y = static_cast<double>(h_in) / h_out;
int* buf = new int[w_out * 2 + h_out * 2];
int* xofs = buf; // new int[w];
int* yofs = buf + w_out; // new int[h];
int16_t* ialpha =
reinterpret_cast<int16_t*>(buf + w_out + h_out); // new int16_t[w * 2];
int16_t* ibeta =
reinterpret_cast<int16_t*>(buf + w_out * 2 + h_out); // new short[h * 2];
float fx = 0.f;
float fy = 0.f;
int sx = 0.f;
int sy = 0.f;
#define SATURATE_CAST_SHORT(X) \
(int16_t)::std::min( \
::std::max(static_cast<int>(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), \
SHRT_MAX);
for (int dx = 0; dx < w_out / 4; dx++) {
fx = static_cast<float>((dx + 0.5) * scale_x - 0.5);
sx = floor(fx);
fx -= sx;
if (sx < 0) {
sx = 0;
fx = 0.f;
}
if (sx >= w_in - 1) {
sx = w_in - 2;
fx = 1.f;
}
xofs[dx] = sx * 4;
float a0 = (1.f - fx) * resize_coef_scale;
float a1 = fx * resize_coef_scale;
ialpha[dx * 2] = SATURATE_CAST_SHORT(a0);
ialpha[dx * 2 + 1] = SATURATE_CAST_SHORT(a1);
}
for (int dy = 0; dy < h_out; dy++) {
fy = static_cast<float>((dy + 0.5) * scale_y - 0.5);
sy = floor(fy);
fy -= sy;
if (sy < 0) {
sy = 0;
fy = 0.f;
}
if (sy >= h_in - 1) {
sy = h_in - 2;
fy = 1.f;
}
yofs[dy] = sy;
float b0 = (1.f - fy) * resize_coef_scale;
float b1 = fy * resize_coef_scale;
ibeta[dy * 2] = SATURATE_CAST_SHORT(b0);
ibeta[dy * 2 + 1] = SATURATE_CAST_SHORT(b1);
}
#undef SATURATE_CAST_SHORT
// loop body
int16_t* rowsbuf0 = new int16_t[w_out + 1];
int16_t* rowsbuf1 = new int16_t[w_out + 1];
int16_t* rows0 = rowsbuf0;
int16_t* rows1 = rowsbuf1;
int prev_sy1 = -1;
for (int dy = 0; dy < h_out; dy++) {
int sy = yofs[dy];
if (sy == prev_sy1) {
// hresize one row
int16_t* rows0_old = rows0;
rows0 = rows1;
rows1 = rows0_old;
const uint8_t* S1 = src + w_in * (sy + 1);
const int16_t* ialphap = ialpha;
int16_t* rows1p = rows1;
for (int dx = 0; dx < w_out / 4; dx++) {
int sx = xofs[dx];
int16_t a0 = ialphap[0];
int16_t a1 = ialphap[1];
const uint8_t* S1p = S1 + sx;
int tmp = dx * 4;
rows1p[tmp] = (S1p[0] * a0 + S1p[4] * a1) >> 4;
rows1p[tmp + 1] = (S1p[1] * a0 + S1p[5] * a1) >> 4;
rows1p[tmp + 2] = (S1p[2] * a0 + S1p[6] * a1) >> 4;
rows1p[tmp + 3] = (S1p[3] * a0 + S1p[7] * a1) >> 4;
ialphap += 2;
}
} else {
// hresize two rows
const uint8_t* S0 = src + w_in * (sy);
const uint8_t* S1 = src + w_in * (sy + 1);
const int16_t* ialphap = ialpha;
int16_t* rows0p = rows0;
int16_t* rows1p = rows1;
for (int dx = 0; dx < w_out / 4; dx++) {
int sx = xofs[dx];
int16_t a0 = ialphap[0];
int16_t a1 = ialphap[1];
const uint8_t* S0p = S0 + sx;
const uint8_t* S1p = S1 + sx;
int tmp = dx * 4;
rows0p[tmp] = (S0p[0] * a0 + S0p[4] * a1) >> 4;
rows1p[tmp] = (S1p[0] * a0 + S1p[4] * a1) >> 4;
rows0p[tmp + 1] = (S0p[1] * a0 + S0p[5] * a1) >> 4;
rows1p[tmp + 1] = (S1p[1] * a0 + S1p[5] * a1) >> 4;
rows0p[tmp + 2] = (S0p[2] * a0 + S0p[6] * a1) >> 4;
rows1p[tmp + 2] = (S1p[2] * a0 + S1p[6] * a1) >> 4;
rows0p[tmp + 3] = (S0p[3] * a0 + S0p[7] * a1) >> 4;
rows1p[tmp + 3] = (S1p[3] * a0 + S1p[7] * a1) >> 4;
ialphap += 2;
}
}
prev_sy1 = sy + 1;
// vresize
int16_t b0 = ibeta[0];
int16_t b1 = ibeta[1];
int16_t* rows0p = rows0;
int16_t* rows1p = rows1;
uint8_t* dp_ptr = dst + w_out * (dy);
int cnt = w_out >> 3;
int remain = w_out - (cnt << 3);
int16x4_t _b0 = vdup_n_s16(b0);
int16x4_t _b1 = vdup_n_s16(b1);
int32x4_t _v2 = vdupq_n_s32(2);
for (cnt = w_out >> 3; cnt > 0; cnt--) {
int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p + 4);
int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p + 4);
int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0);
int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
int32x4_t _acc = _v2;
// _acc >> 16 + _rows0p_sr4_mb0 >> 16
_acc = vsraq_n_s32(_acc, _rows0p_sr4_mb0, 16);
_acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
int32x4_t _acc_1 = _v2;
_acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
_acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
// _acc >> 2
int16x4_t _acc16 = vshrn_n_s32(_acc, 2);
int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
uint8x8_t _dout = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));
vst1_u8(dp_ptr, _dout);
dp_ptr += 8;
rows0p += 8;
rows1p += 8;
}
for (; remain; --remain) {
// D[x] = (rows0[x]*b0 + rows1[x]*b1) >> INTER_RESIZE_COEF_BITS;
*dp_ptr++ =
(uint8_t)(((int16_t)((b0 * (int16_t)(*rows0p++)) >> 16) +
(int16_t)((b1 * (int16_t)(*rows1p++)) >> 16) + 2) >>
2);
}
ibeta += 2;
}
delete[] buf; delete[] buf;
delete[] rowsbuf0; delete[] rowsbuf0;
delete[] rowsbuf1; delete[] rowsbuf1;
...@@ -648,6 +834,7 @@ void compute_xy(int srcw, ...@@ -648,6 +834,7 @@ void compute_xy(int srcw,
int* yofs, int* yofs,
int16_t* ialpha, int16_t* ialpha,
int16_t* ibeta); int16_t* ibeta);
// use bilinear method to resize // use bilinear method to resize
void resize(const uint8_t* src, void resize(const uint8_t* src,
uint8_t* dst, uint8_t* dst,
...@@ -682,9 +869,8 @@ void resize(const uint8_t* src, ...@@ -682,9 +869,8 @@ void resize(const uint8_t* src,
bgr_resize(src, dst, srcw, srch, dstw, dsth); bgr_resize(src, dst, srcw, srch, dstw, dsth);
return; return;
} else if (srcFormat == BGRA || srcFormat == RGBA) { } else if (srcFormat == BGRA || srcFormat == RGBA) {
w_in = srcw * 4; bgra_resize(src, dst, srcw, srch, dstw, dsth);
w_out = dstw * 4; return;
num = 4;
} }
double scale_x = static_cast<double>(srcw) / dstw; double scale_x = static_cast<double>(srcw) / dstw;
double scale_y = static_cast<double>(srch) / dsth; double scale_y = static_cast<double>(srch) / dsth;
...@@ -701,23 +887,6 @@ void resize(const uint8_t* src, ...@@ -701,23 +887,6 @@ void resize(const uint8_t* src,
int* xofs1 = nullptr; int* xofs1 = nullptr;
int* yofs1 = nullptr; int* yofs1 = nullptr;
int16_t* ialpha1 = nullptr; int16_t* ialpha1 = nullptr;
if (orih < dsth) { // uv
int tmp = dsth - orih;
xofs1 = new int[dstw];
yofs1 = new int[tmp];
ialpha1 = new int16_t[dstw];
compute_xy(srcw,
srch / 2,
dstw / 2,
tmp,
2,
scale_x,
scale_y,
xofs1,
yofs1,
ialpha1,
ibeta + orih * 2);
}
int cnt = w_out >> 3; int cnt = w_out >> 3;
int remain = w_out % 8; int remain = w_out % 8;
int32x4_t _v2 = vdupq_n_s32(2); int32x4_t _v2 = vdupq_n_s32(2);
...@@ -727,13 +896,6 @@ void resize(const uint8_t* src, ...@@ -727,13 +896,6 @@ void resize(const uint8_t* src,
#pragma omp parallel for #pragma omp parallel for
for (int dy = 0; dy < dsth; dy++) { for (int dy = 0; dy < dsth; dy++) {
int sy = yofs[dy]; int sy = yofs[dy];
if (dy >= orih) {
xofs = xofs1;
yofs = yofs1;
ialpha = ialpha1;
num = 2;
sy = yofs1[dy - orih] + srch;
}
// hresize two rows // hresize two rows
const uint8_t* S0 = src + w_in * (sy); const uint8_t* S0 = src + w_in * (sy);
...@@ -850,11 +1012,6 @@ void resize(const uint8_t* src, ...@@ -850,11 +1012,6 @@ void resize(const uint8_t* src,
} }
ibeta += 2; ibeta += 2;
} }
if (orih < dsth) { // uv
delete[] xofs1;
delete[] yofs1;
delete[] ialpha1;
}
delete[] buf; delete[] buf;
delete[] rowsbuf0; delete[] rowsbuf0;
delete[] rowsbuf1; delete[] rowsbuf1;
......
...@@ -39,7 +39,7 @@ ImagePreprocess::ImagePreprocess(ImageFormat srcFormat, ...@@ -39,7 +39,7 @@ ImagePreprocess::ImagePreprocess(ImageFormat srcFormat,
this->dstFormat_ = dstFormat; this->dstFormat_ = dstFormat;
this->transParam_ = param; this->transParam_ = param;
} }
__attribute__((visibility("default"))) void ImagePreprocess::imageConvert( __attribute__((visibility("default"))) void ImagePreprocess::image_convert(
const uint8_t* src, uint8_t* dst) { const uint8_t* src, uint8_t* dst) {
ImageConvert img_convert; ImageConvert img_convert;
img_convert.choose(src, img_convert.choose(src,
...@@ -50,7 +50,7 @@ __attribute__((visibility("default"))) void ImagePreprocess::imageConvert( ...@@ -50,7 +50,7 @@ __attribute__((visibility("default"))) void ImagePreprocess::imageConvert(
this->transParam_.ih); this->transParam_.ih);
} }
__attribute__((visibility("default"))) void ImagePreprocess::imageConvert( __attribute__((visibility("default"))) void ImagePreprocess::image_convert(
const uint8_t* src, const uint8_t* src,
uint8_t* dst, uint8_t* dst,
ImageFormat srcFormat, ImageFormat srcFormat,
...@@ -64,7 +64,18 @@ __attribute__((visibility("default"))) void ImagePreprocess::imageConvert( ...@@ -64,7 +64,18 @@ __attribute__((visibility("default"))) void ImagePreprocess::imageConvert(
this->transParam_.ih); this->transParam_.ih);
} }
__attribute__((visibility("default"))) void ImagePreprocess::imageResize( __attribute__((visibility("default"))) void ImagePreprocess::image_convert(
const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
ImageFormat dstFormat,
int srcw,
int srch) {
ImageConvert img_convert;
img_convert.choose(src, dst, srcFormat, dstFormat, srcw, srch);
}
__attribute__((visibility("default"))) void ImagePreprocess::image_resize(
const uint8_t* src, const uint8_t* src,
uint8_t* dst, uint8_t* dst,
ImageFormat srcFormat, ImageFormat srcFormat,
...@@ -76,7 +87,7 @@ __attribute__((visibility("default"))) void ImagePreprocess::imageResize( ...@@ -76,7 +87,7 @@ __attribute__((visibility("default"))) void ImagePreprocess::imageResize(
img_resize.choose(src, dst, srcFormat, srcw, srch, dstw, dsth); img_resize.choose(src, dst, srcFormat, srcw, srch, dstw, dsth);
} }
__attribute__((visibility("default"))) void ImagePreprocess::imageResize( __attribute__((visibility("default"))) void ImagePreprocess::image_resize(
const uint8_t* src, uint8_t* dst) { const uint8_t* src, uint8_t* dst) {
int srcw = this->transParam_.iw; int srcw = this->transParam_.iw;
int srch = this->transParam_.ih; int srch = this->transParam_.ih;
...@@ -87,7 +98,7 @@ __attribute__((visibility("default"))) void ImagePreprocess::imageResize( ...@@ -87,7 +98,7 @@ __attribute__((visibility("default"))) void ImagePreprocess::imageResize(
img_resize.choose(src, dst, srcFormat, srcw, srch, dstw, dsth); img_resize.choose(src, dst, srcFormat, srcw, srch, dstw, dsth);
} }
__attribute__((visibility("default"))) void ImagePreprocess::imageRotate( __attribute__((visibility("default"))) void ImagePreprocess::image_rotate(
const uint8_t* src, const uint8_t* src,
uint8_t* dst, uint8_t* dst,
ImageFormat srcFormat, ImageFormat srcFormat,
...@@ -98,7 +109,7 @@ __attribute__((visibility("default"))) void ImagePreprocess::imageRotate( ...@@ -98,7 +109,7 @@ __attribute__((visibility("default"))) void ImagePreprocess::imageRotate(
img_rotate.choose(src, dst, srcFormat, srcw, srch, degree); img_rotate.choose(src, dst, srcFormat, srcw, srch, degree);
} }
__attribute__((visibility("default"))) void ImagePreprocess::imageRotate( __attribute__((visibility("default"))) void ImagePreprocess::image_rotate(
const uint8_t* src, uint8_t* dst) { const uint8_t* src, uint8_t* dst) {
auto srcw = this->transParam_.ow; auto srcw = this->transParam_.ow;
auto srch = this->transParam_.oh; auto srch = this->transParam_.oh;
...@@ -108,7 +119,7 @@ __attribute__((visibility("default"))) void ImagePreprocess::imageRotate( ...@@ -108,7 +119,7 @@ __attribute__((visibility("default"))) void ImagePreprocess::imageRotate(
img_rotate.choose(src, dst, srcFormat, srcw, srch, degree); img_rotate.choose(src, dst, srcFormat, srcw, srch, degree);
} }
__attribute__((visibility("default"))) void ImagePreprocess::imageFlip( __attribute__((visibility("default"))) void ImagePreprocess::image_flip(
const uint8_t* src, const uint8_t* src,
uint8_t* dst, uint8_t* dst,
ImageFormat srcFormat, ImageFormat srcFormat,
...@@ -119,7 +130,7 @@ __attribute__((visibility("default"))) void ImagePreprocess::imageFlip( ...@@ -119,7 +130,7 @@ __attribute__((visibility("default"))) void ImagePreprocess::imageFlip(
img_flip.choose(src, dst, srcFormat, srcw, srch, flip_param); img_flip.choose(src, dst, srcFormat, srcw, srch, flip_param);
} }
__attribute__((visibility("default"))) void ImagePreprocess::imageFlip( __attribute__((visibility("default"))) void ImagePreprocess::image_flip(
const uint8_t* src, uint8_t* dst) { const uint8_t* src, uint8_t* dst) {
auto srcw = this->transParam_.ow; auto srcw = this->transParam_.ow;
auto srch = this->transParam_.oh; auto srch = this->transParam_.oh;
...@@ -129,7 +140,7 @@ __attribute__((visibility("default"))) void ImagePreprocess::imageFlip( ...@@ -129,7 +140,7 @@ __attribute__((visibility("default"))) void ImagePreprocess::imageFlip(
img_flip.choose(src, dst, srcFormat, srcw, srch, flip_param); img_flip.choose(src, dst, srcFormat, srcw, srch, flip_param);
} }
__attribute__((visibility("default"))) void ImagePreprocess::image2Tensor( __attribute__((visibility("default"))) void ImagePreprocess::image_to_tensor(
const uint8_t* src, const uint8_t* src,
Tensor* dstTensor, Tensor* dstTensor,
ImageFormat srcFormat, ImageFormat srcFormat,
...@@ -143,7 +154,7 @@ __attribute__((visibility("default"))) void ImagePreprocess::image2Tensor( ...@@ -143,7 +154,7 @@ __attribute__((visibility("default"))) void ImagePreprocess::image2Tensor(
src, dstTensor, srcFormat, layout, srcw, srch, means, scales); src, dstTensor, srcFormat, layout, srcw, srch, means, scales);
} }
__attribute__((visibility("default"))) void ImagePreprocess::image2Tensor( __attribute__((visibility("default"))) void ImagePreprocess::image_to_tensor(
const uint8_t* src, const uint8_t* src,
Tensor* dstTensor, Tensor* dstTensor,
LayoutType layout, LayoutType layout,
...@@ -160,7 +171,7 @@ __attribute__((visibility("default"))) void ImagePreprocess::image2Tensor( ...@@ -160,7 +171,7 @@ __attribute__((visibility("default"))) void ImagePreprocess::image2Tensor(
scales); scales);
} }
__attribute__((visibility("default"))) void ImagePreprocess::imageCrop( __attribute__((visibility("default"))) void ImagePreprocess::image_crop(
const uint8_t* src, const uint8_t* src,
uint8_t* dst, uint8_t* dst,
ImageFormat srcFormat, ImageFormat srcFormat,
......
...@@ -75,7 +75,8 @@ class ImagePreprocess { ...@@ -75,7 +75,8 @@ class ImagePreprocess {
* param src: input image data * param src: input image data
* param dst: output image data * param dst: output image data
*/ */
void imageConvert(const uint8_t* src, uint8_t* dst); void image_convert(const uint8_t* src, uint8_t* dst);
/* /*
* image color convert * image color convert
* support NV12/NV21_to_BGR(RGB), NV12/NV21_to_BGRA(RGBA), * support NV12/NV21_to_BGR(RGB), NV12/NV21_to_BGRA(RGBA),
...@@ -91,10 +92,35 @@ class ImagePreprocess { ...@@ -91,10 +92,35 @@ class ImagePreprocess {
* param dstFormat: output image image format, support GRAY, BGR(RGB) and * param dstFormat: output image image format, support GRAY, BGR(RGB) and
* BGRA(RGBA) * BGRA(RGBA)
*/ */
void imageConvert(const uint8_t* src, void image_convert(const uint8_t* src,
uint8_t* dst, uint8_t* dst,
ImageFormat srcFormat, ImageFormat srcFormat,
ImageFormat dstFormat); ImageFormat dstFormat);
/*
* image color convert
* support NV12/NV21_to_BGR(RGB), NV12/NV21_to_BGRA(RGBA),
* BGR(RGB)and BGRA(RGBA) transform,
* BGR(RGB)and RGB(BGR) transform,
* BGR(RGB)and RGBA(BGRA) transform,
* BGR(RGB)and GRAY transform,
* BGRA(RGBA) and GRAY transform,
* param src: input image data
* param dst: output image data
* param srcFormat: input image image format support: GRAY, NV12(NV21),
* BGR(RGB) and BGRA(RGBA)
* param dstFormat: output image image format, support GRAY, BGR(RGB) and
* BGRA(RGBA)
* param srcw: input image width
* param srch: input image height
*/
void image_convert(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
ImageFormat dstFormat,
int srcw,
int srch);
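Because all three overloads write into a raw `dst` pointer, the caller is responsible for sizing the output buffer according to `dstFormat`: GRAY is 1 byte per pixel, BGR/RGB 3, BGRA/RGBA 4, and NV12/NV21 3/2 (a full-resolution Y plane plus a half-resolution interleaved UV plane). A small allocation sketch; the helper names are illustrative, not part of the API:

```cpp
// Illustrative destination-buffer sizing for image_convert (not library API).
#include <cstddef>
#include <cstdint>
#include <vector>

inline std::vector<uint8_t> alloc_bgr(int w, int h) {
  return std::vector<uint8_t>(static_cast<size_t>(w) * h * 3);      // BGR/RGB
}
inline std::vector<uint8_t> alloc_nv12(int w, int h) {
  return std::vector<uint8_t>(static_cast<size_t>(w) * h * 3 / 2);  // Y plane + half-res UV
}
```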
/* /*
* image resize, use bilinear method * image resize, use bilinear method
* support image format: 1-channel image (egs: GRAY, 2-channel image (egs: * support image format: 1-channel image (egs: GRAY, 2-channel image (egs:
...@@ -102,7 +128,8 @@ class ImagePreprocess { ...@@ -102,7 +128,8 @@ class ImagePreprocess {
* param src: input image data * param src: input image data
* param dst: output image data * param dst: output image data
*/ */
void imageResize(const uint8_t* src, uint8_t* dst); void image_resize(const uint8_t* src, uint8_t* dst);
/* /*
image resize, use bilinear method image resize, use bilinear method
* support image format: 1-channel image (egs: GRAY, 2-channel image (egs: * support image format: 1-channel image (egs: GRAY, 2-channel image (egs:
...@@ -114,13 +141,13 @@ class ImagePreprocess { ...@@ -114,13 +141,13 @@ class ImagePreprocess {
* param dstw: output image width * param dstw: output image width
* param dsth: output image height * param dsth: output image height
*/ */
void imageResize(const uint8_t* src, void image_resize(const uint8_t* src,
uint8_t* dst, uint8_t* dst,
ImageFormat srcFormat, ImageFormat srcFormat,
int srcw, int srcw,
int srch, int srch,
int dstw, int dstw,
int dsth); int dsth);
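A short usage sketch of the explicit-size overload, resizing a BGR frame down to a typical network input size; the sizes, include path, and enum spellings are illustrative assumptions:

```cpp
// Sketch: bilinear-resize a BGR image to 224x224.
#include <cstdint>
#include "lite/utils/cv/paddle_image_preprocess.h"  // include path assumed

using namespace paddle::lite::utils::cv;

void resize_bgr_to_224(const uint8_t* src, int srcw, int srch, uint8_t* dst) {
  TransParam tp;
  tp.iw = srcw; tp.ih = srch; tp.ow = 224; tp.oh = 224;
  ImagePreprocess pre(ImageFormat::BGR, ImageFormat::BGR, tp);
  // dst must already hold 224 * 224 * 3 bytes.
  pre.image_resize(src, dst, ImageFormat::BGR, srcw, srch, 224, 224);
}
```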
/* /*
* image Rotate * image Rotate
...@@ -129,7 +156,8 @@ class ImagePreprocess { ...@@ -129,7 +156,8 @@ class ImagePreprocess {
* param src: input image data * param src: input image data
* param dst: output image data * param dst: output image data
*/ */
void imageRotate(const uint8_t* src, uint8_t* dst); void image_rotate(const uint8_t* src, uint8_t* dst);
/* /*
* image Rotate * image Rotate
* support 90, 180 and 270 Rotate process * support 90, 180 and 270 Rotate process
...@@ -141,12 +169,13 @@ class ImagePreprocess { ...@@ -141,12 +169,13 @@ class ImagePreprocess {
* param srch: input image height * param srch: input image height
* param degree: Rotate degree, support 90, 180 and 270 * param degree: Rotate degree, support 90, 180 and 270
*/ */
void imageRotate(const uint8_t* src, void image_rotate(const uint8_t* src,
uint8_t* dst, uint8_t* dst,
ImageFormat srcFormat, ImageFormat srcFormat,
int srcw, int srcw,
int srch, int srch,
float degree); float degree);
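For 90° and 270° rotations the output image is `srch` wide and `srcw` tall, while the total byte count stays `srcw * srch * channels`. A hedged sketch (include path and enum spellings assumed):

```cpp
// Sketch: rotate a BGR image by 90 degrees; the result is srch wide, srcw tall.
#include <cstddef>
#include <cstdint>
#include <vector>
#include "lite/utils/cv/paddle_image_preprocess.h"  // include path assumed

using namespace paddle::lite::utils::cv;

std::vector<uint8_t> rotate90_bgr(const uint8_t* src, int srcw, int srch) {
  TransParam tp;
  tp.iw = srcw; tp.ih = srch; tp.ow = srcw; tp.oh = srch;
  ImagePreprocess pre(ImageFormat::BGR, ImageFormat::BGR, tp);
  std::vector<uint8_t> dst(static_cast<size_t>(srcw) * srch * 3);
  pre.image_rotate(src, dst.data(), ImageFormat::BGR, srcw, srch, 90.f);
  return dst;
}
```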
/* /*
* image Flip * image Flip
* support X, Y and XY flip process * support X, Y and XY flip process
...@@ -154,7 +183,8 @@ class ImagePreprocess { ...@@ -154,7 +183,8 @@ class ImagePreprocess {
* param src: input image data * param src: input image data
* param dst: output image data * param dst: output image data
*/ */
void imageFlip(const uint8_t* src, uint8_t* dst); void image_flip(const uint8_t* src, uint8_t* dst);
/* /*
* image Flip * image Flip
* support X, Y and XY flip process * support X, Y and XY flip process
...@@ -166,12 +196,13 @@ class ImagePreprocess { ...@@ -166,12 +196,13 @@ class ImagePreprocess {
* param srch: input image height * param srch: input image height
* param flip_param: flip parameter, support X, Y and XY * param flip_param: flip parameter, support X, Y and XY
*/ */
void imageFlip(const uint8_t* src, void image_flip(const uint8_t* src,
uint8_t* dst, uint8_t* dst,
ImageFormat srcFormat, ImageFormat srcFormat,
int srcw, int srcw,
int srch, int srch,
FlipParam flip_param); FlipParam flip_param);
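A hedged flip sketch; the `FlipParam` enumerator spellings (`X`/`Y`/`XY`) and which axis each one mirrors are assumptions to verify against the kernel:

```cpp
// Sketch: mirror a BGR image. FlipParam::Y is assumed to mean a left-right
// flip; check the enumerator names and axis convention against the kernel.
#include <cstdint>
#include "lite/utils/cv/paddle_image_preprocess.h"  // include path assumed

using namespace paddle::lite::utils::cv;

void mirror_bgr(const uint8_t* src, uint8_t* dst, int w, int h) {
  TransParam tp;
  tp.iw = w; tp.ih = h; tp.ow = w; tp.oh = h;
  ImagePreprocess pre(ImageFormat::BGR, ImageFormat::BGR, tp);
  pre.image_flip(src, dst, ImageFormat::BGR, w, h, FlipParam::Y);
}
```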
/* /*
* change image data to tensor data * change image data to tensor data
* support image format is GRAY, BGR(RGB) and BGRA(RGBA), Data layout is NHWC * support image format is GRAY, BGR(RGB) and BGRA(RGBA), Data layout is NHWC
...@@ -183,11 +214,12 @@ class ImagePreprocess { ...@@ -183,11 +214,12 @@ class ImagePreprocess {
* param means: means of image * param means: means of image
* param scales: scales of image * param scales: scales of image
*/ */
void image2Tensor(const uint8_t* src, void image_to_tensor(const uint8_t* src,
Tensor* dstTensor, Tensor* dstTensor,
LayoutType layout, LayoutType layout,
float* means, float* means,
float* scales); float* scales);
/* /*
* change image data to tensor data * change image data to tensor data
* support image format is GRAY, BGR(RGB) and BGRA(RGBA), Data layout is NHWC * support image format is GRAY, BGR(RGB) and BGRA(RGBA), Data layout is NHWC
...@@ -202,14 +234,14 @@ class ImagePreprocess { ...@@ -202,14 +234,14 @@ class ImagePreprocess {
* param means: means of image * param means: means of image
* param scales: scales of image * param scales: scales of image
*/ */
void image2Tensor(const uint8_t* src, void image_to_tensor(const uint8_t* src,
Tensor* dstTensor, Tensor* dstTensor,
ImageFormat srcFormat, ImageFormat srcFormat,
int srcw, int srcw,
int srch, int srch,
LayoutType layout, LayoutType layout,
float* means, float* means,
float* scales); float* scales);
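A hedged sketch of the tensor conversion; the `LayoutType::kNCHW` spelling, the means/scales values, and the exact normalization the kernel applies (commonly `(pixel - mean) * scale` per channel) are assumptions:

```cpp
// Sketch: convert a 224x224 BGR image into an NCHW float tensor. The
// means/scales values map roughly to [-1, 1] and are purely illustrative.
#include <cstdint>
#include "lite/utils/cv/paddle_image_preprocess.h"  // include path assumed

using namespace paddle::lite::utils::cv;

void bgr_to_tensor(const uint8_t* bgr224, Tensor* out) {
  TransParam tp;
  tp.iw = 224; tp.ih = 224; tp.ow = 224; tp.oh = 224;
  ImagePreprocess pre(ImageFormat::BGR, ImageFormat::BGR, tp);
  float means[3] = {127.5f, 127.5f, 127.5f};
  float scales[3] = {1.f / 127.5f, 1.f / 127.5f, 1.f / 127.5f};
  pre.image_to_tensor(bgr224, out, ImageFormat::BGR, 224, 224,
                      LayoutType::kNCHW, means, scales);  // enumerator spelling assumed
}
```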
/* /*
* image crop process * image crop process
...@@ -217,15 +249,15 @@ class ImagePreprocess { ...@@ -217,15 +249,15 @@ class ImagePreprocess {
* param src: input image data * param src: input image data
* param dst: output image data * param dst: output image data
*/ */
void imageCrop(const uint8_t* src, void image_crop(const uint8_t* src,
uint8_t* dst, uint8_t* dst,
ImageFormat srcFormat, ImageFormat srcFormat,
int srcw, int srcw,
int srch, int srch,
int left_x, int left_x,
int left_y, int left_y,
int dstw, int dstw,
int dsth); int dsth);
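A hedged crop sketch; the crop window is assumed to lie fully inside the source image, and the destination buffer holds `dstw * dsth * channels` bytes:

```cpp
// Sketch: crop a 300x300 BGR window whose top-left corner is (50, 50).
#include <cstddef>
#include <cstdint>
#include <vector>
#include "lite/utils/cv/paddle_image_preprocess.h"  // include path assumed

using namespace paddle::lite::utils::cv;

std::vector<uint8_t> crop_roi(const uint8_t* src, int srcw, int srch) {
  TransParam tp;
  tp.iw = srcw; tp.ih = srch; tp.ow = 300; tp.oh = 300;
  ImagePreprocess pre(ImageFormat::BGR, ImageFormat::BGR, tp);
  std::vector<uint8_t> roi(300 * 300 * 3);  // dstw * dsth * 3 bytes for BGR
  pre.image_crop(src, roi.data(), ImageFormat::BGR, srcw, srch, 50, 50, 300, 300);
  return roi;
}
```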
private: private:
ImageFormat srcFormat_; ImageFormat srcFormat_;
......