提交 b5fe3840 编写于 作者: H HappyAngel 提交者: GitHub

[LITE][ARM] fix cmake error in lite/CMakeLists.txt, missing mkdir cxx in iOS (#2418)

* add cv image process

* fix arm linux build error

* add LITE_WITH_CV define to make cv, test=develop

* fix cv format, and add description in utils/cv

* delete some Meaningless comments, test=develop

* set LITE_WITH_CV=OFF in build.sh, test=develop

* delete cv_enum.h in utils/cv, push the contents of cv_enum.h into paddle_image_preprocess.h, test=develop

* according to reviews to redefine paddle_image_preprocess.h, test=develop

* add detailed note of flipParam, test=develop

* fix format in paddle_image_preprocess.h, test=develop

* fix error when build x86. test=develop

lite_with_X86 does not contain lite_with_cv

* fix cmake error in lite/CMakeLists.txt, missing mkdir cxx, test=develop

* according to review change, test=develop

* change grb to rgb, test=develop
上级 ef5dbd1d
......@@ -160,7 +160,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/include"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/include"
COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/lib"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/utils/cv/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/utils/cv/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/include"
)
add_dependencies(tiny_publish_lib bundle_light_api)
add_dependencies(publish_inference tiny_publish_lib)
......
......@@ -69,240 +69,6 @@ void ImagePreprocess::imageResize(const uint8_t* src,
int dstw,
int dsth) {
resize(src, dst, srcFormat, srcw, srch, dstw, dsth);
/*
int size = srcw * srch;
if (srcw == dstw && srch == dsth) {
if (srcFormat == NV12 || srcFormat == NV21) {
size = srcw * (floor(1.5 * srch));
} else if (srcFormat == BGR || srcFormat == RGB) {
size = 3 * srcw * srch;
} else if (srcFormat == BGRA || srcFormat == RGBA) {
size = 4 * srcw * srch;
}
memcpy(dst, src, sizeof(uint8_t) * size);
return;
}
double scale_x = static_cast<double>(srcw / dstw);
double scale_y = static_cast<double>(srch / dsth);
int* buf = new int[dstw * 2 + dsth * 2];
int* xofs = buf;
int* yofs = buf + dstw;
int16_t* ialpha = reinterpret_cast<int16_t*>(buf + dstw + dsth);
int16_t* ibeta = reinterpret_cast<int16_t*>(buf + 2 * dstw + dsth);
compute_xy(
srcw, srch, dstw, dsth, scale_x, scale_y, xofs, yofs, ialpha, ibeta);
int w_out = dstw;
int w_in = srcw;
int num = 1;
int orih = dsth;
if (srcFormat == GRAY) {
num = 1;
} else if (srcFormat == NV12 || srcFormat == NV21) {
num = 1;
int hout = static_cast<int>(0.5 * dsth);
dsth += hout;
} else if (srcFormat == BGR || srcFormat == RGB) {
w_in = srcw * 3;
w_out = dstw * 3;
num = 3;
} else if (srcFormat == BGRA || srcFormat == RGBA) {
w_in = srcw * 4;
w_out = dstw * 4;
num = 4;
}
int* xofs1 = nullptr;
int* yofs1 = nullptr;
int16_t* ialpha1 = nullptr;
if (orih < dsth) { // uv
int tmp = dsth - orih;
int w = dstw / 2;
xofs1 = new int[w];
yofs1 = new int[tmp];
ialpha1 = new int16_t[srcw];
compute_xy(srcw / 2,
srch / 2,
w,
tmp,
scale_x,
scale_y,
xofs1,
yofs1,
ialpha1,
ibeta + orih);
}
int cnt = w_out >> 3;
int remain = w_out % 8;
int32x4_t _v2 = vdupq_n_s32(2);
#pragma omp parallel for
for (int dy = 0; dy < dsth; dy++) {
int16_t* rowsbuf0 = new int16_t[w_out];
int16_t* rowsbuf1 = new int16_t[w_out];
int sy = yofs[dy];
if (dy >= orih) {
xofs = xofs1;
yofs = yofs1;
ialpha = ialpha1;
}
if (sy < 0) {
memset(rowsbuf0, 0, sizeof(uint16_t) * w_out);
const uint8_t* S1 = src + srcw * (sy + 1);
const int16_t* ialphap = ialpha;
int16_t* rows1p = rowsbuf1;
for (int dx = 0; dx < dstw; dx++) {
int sx = xofs[dx] * num; // num = 4
int16_t a0 = ialphap[0];
int16_t a1 = ialphap[1];
const uint8_t* S1pl = S1 + sx;
const uint8_t* S1pr = S1 + sx + num;
if (sx < 0) {
S1pl = S1;
}
for (int i = 0; i < num; i++) {
if (sx < 0) {
*rows1p++ = ((*S1pl++) * a1) >> 4;
} else {
*rows1p++ = ((*S1pl++) * a0 + (*S1pr++) * a1) >> 4;
}
}
ialphap += 2;
}
} else {
// hresize two rows
const uint8_t* S0 = src + w_in * (sy);
const uint8_t* S1 = src + w_in * (sy + 1);
const int16_t* ialphap = ialpha;
int16_t* rows0p = rowsbuf0;
int16_t* rows1p = rowsbuf1;
for (int dx = 0; dx < dstw; dx++) {
int sx = xofs[dx] * num; // num = 4
int16_t a0 = ialphap[0];
int16_t a1 = ialphap[1];
const uint8_t* S0pl = S0 + sx;
const uint8_t* S0pr = S0 + sx + num;
const uint8_t* S1pl = S1 + sx;
const uint8_t* S1pr = S1 + sx + num;
if (sx < 0) {
S0pl = S0;
S1pl = S1;
}
for (int i = 0; i < num; i++) {
if (sx < 0) {
*rows0p = ((*S0pl++) * a1) >> 4;
*rows1p = ((*S1pl++) * a1) >> 4;
rows0p++;
rows1p++;
} else {
*rows0p++ = ((*S0pl++) * a0 + (*S0pr++) * a1) >> 4;
*rows1p++ = ((*S1pl++) * a0 + (*S1pr++) * a1) >> 4;
}
}
ialphap += 2;
}
}
int ind = dy * 2;
int16_t b0 = ibeta[ind];
int16_t b1 = ibeta[ind + 1];
int16x8_t _b0 = vdupq_n_s16(b0);
int16x8_t _b1 = vdupq_n_s16(b1);
uint8_t* dp_ptr = dst + dy * w_out;
int16_t* rows0p = rowsbuf0;
int16_t* rows1p = rowsbuf1;
int re_cnt = cnt;
if (re_cnt > 0) {
#ifdef __aarch64__
asm volatile(
"1: \n"
"ld1 {v0.8h}, [%[rows0p]], #16 \n"
"ld1 {v1.8h}, [%[rows1p]], #16 \n"
"orr v6.16b, %w[_v2].16b, %w[_v2].16b \n"
"orr v7.16b, %w[_v2].16b, %w[_v2].16b \n"
"smull v2.4s, v0.4h, %w[_b0].4h \n"
"smull2 v4.4s, v0.8h, %w[_b0].8h \n"
"smull v3.4s, v1.4h, %w[_b1].4h \n"
"smull2 v5.4s, v1.8h, %w[_b1].8h \n"
"ssra v6.4s, v2.4s, #16 \n"
"ssra v7.4s, v4.4s, #16 \n"
"ssra v6.4s, v3.4s, #16 \n"
"ssra v7.4s, v5.4s, #16 \n"
"shrn v0.4h, v6.4s, #2 \n"
"shrn2 v0.8h, v7.4s, #2 \n"
"subs %w[cnt], %w[cnt], #1 \n"
"sqxtun v1.8b, v0.8h \n"
"st1 {v1.8b}, [%[dp]], #8 \n"
"bne 1b \n"
: [rows0p] "+r"(rows0p),
[rows1p] "+r"(rows1p),
[cnt] "+r"(re_cnt),
[dp] "+r"(dp_ptr)
: [_b0] "w"(_b0), [_b1] "w"(_b1), [_v2] "w"(_v2)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
#else
asm volatile(
"mov r4, #2 \n"
"vdup.s32 q12, r4 \n"
"0: \n"
"vld1.s16 {d2-d3}, [%[rows0p]]!\n"
"vld1.s16 {d6-d7}, [%[rows1p]]!\n"
"vorr.s32 q10, q12, q12 \n"
"vorr.s32 q11, q12, q12 \n"
"vmull.s16 q0, d2, %[_b0] \n"
"vmull.s16 q1, d3, %[_b0] \n"
"vmull.s16 q2, d6, %[_b1] \n"
"vmull.s16 q3, d7, %[_b1] \n"
"vsra.s32 q10, q0, #16 \n"
"vsra.s32 q11, q1, #16 \n"
"vsra.s32 q10, q2, #16 \n"
"vsra.s32 q11, q3, #16 \n"
"vshrn.s32 d20, q10, #2 \n"
"vshrn.s32 d21, q11, #2 \n"
"subs %[cnt], #1 \n"
"vqmovun.s16 d20, q10 \n"
"vst1.8 {d20}, [%[dp]]! \n"
"bne 0b \n"
: [rows0p] "+r"(rows0p),
[rows1p] "+r"(rows1p),
[cnt] "+r"(re_cnt),
[dp] "+r"(dp_ptr)
: [_b0] "w"(_b0), [_b1] "w"(_b1)
: "cc",
"memory",
"r4",
"q0",
"q1",
"q2",
"q3",
"q8",
"q9",
"q10",
"q11",
"q12");
#endif // __aarch64__
}
for (int i = 0; i < remain; i++) {
// D[x] = (rows0[x]*b0 + rows1[x]*b1) >>
// INTER_RESIZE_COEF_BITS;
*dp_ptr++ =
(uint8_t)(((int16_t)((b0 * (int16_t)(*rows0p++)) >> 16) +
(int16_t)((b1 * (int16_t)(*rows1p++)) >> 16) + 2) >>
2);
}
}
delete[] buf;
*/
}
void ImagePreprocess::imageResize(const uint8_t* src, uint8_t* dst) {
......
......@@ -133,7 +133,7 @@ class ImagePreprocess {
* color format support 1-channel image, 3-channel image and 4-channel image
* param src: input image data
* param dst: output image data
* param srcFormat: input image format, support GRAY, BGR(GRB) and BGRA(RGBA)
* param srcFormat: input image format, support GRAY, BGR(RGB) and BGRA(RGBA)
* param srcw: input image width
* param srch: input image height
* param degree: Rotate degree, support 90, 180 and 270
......@@ -158,7 +158,7 @@ class ImagePreprocess {
* color format support 1-channel image, 3-channel image and 4-channel image
* param src: input image data
* param dst: output image data
* param srcFormat: input image format, support GRAY, BGR(GRB) and BGRA(RGBA)
* param srcFormat: input image format, support GRAY, BGR(RGB) and BGRA(RGBA)
* param srcw: input image width
* param srch: input image height
* param flip_param: flip parameter, support X, Y and XY
......@@ -190,7 +190,7 @@ class ImagePreprocess {
* NCHW
* param src: input image data
* param dstTensor: output tensor data
* param srcFormat: input image format, support BGR(GRB) and BGRA(RGBA)
* param srcFormat: input image format, support BGR(RGB) and BGRA(RGBA)
* param srcw: input image width
* param srch: input image height
* param layout: output tensor layout,support NHWC and NCHW
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册