diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt index 61f07583b2ed920ce7ac0f2d56b2b2e89bb99b42..4e4bc103a8a872e1f0635e71e2e39f4eac1272d1 100644 --- a/lite/CMakeLists.txt +++ b/lite/CMakeLists.txt @@ -160,7 +160,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/include" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/include" COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/lib" - COMMAND cp "${CMAKE_SOURCE_DIR}/lite/utils/cv/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/utils/cv/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/include" ) add_dependencies(tiny_publish_lib bundle_light_api) add_dependencies(publish_inference tiny_publish_lib) diff --git a/lite/utils/cv/paddle_image_preprocess.cc b/lite/utils/cv/paddle_image_preprocess.cc index 0bccfe2804a9ba17473575815bfe4b2e9635f234..f18047556874a82d28c5964a1b5fd2fa8284c814 100644 --- a/lite/utils/cv/paddle_image_preprocess.cc +++ b/lite/utils/cv/paddle_image_preprocess.cc @@ -69,240 +69,6 @@ void ImagePreprocess::imageResize(const uint8_t* src, int dstw, int dsth) { resize(src, dst, srcFormat, srcw, srch, dstw, dsth); - /* - int size = srcw * srch; - if (srcw == dstw && srch == dsth) { - if (srcFormat == NV12 || srcFormat == NV21) { - size = srcw * (floor(1.5 * srch)); - } else if (srcFormat == BGR || srcFormat == RGB) { - size = 3 * srcw * srch; - } else if (srcFormat == BGRA || srcFormat == RGBA) { - size = 4 * srcw * srch; - } - memcpy(dst, src, sizeof(uint8_t) * size); - return; - } - double scale_x = static_cast(srcw / dstw); - double scale_y = static_cast(srch / dsth); - - int* buf = new int[dstw * 2 + dsth * 2]; - - int* xofs = buf; - int* yofs = buf + dstw; - int16_t* ialpha = reinterpret_cast(buf + dstw + dsth); - int16_t* ibeta = reinterpret_cast(buf + 2 * dstw + dsth); - - compute_xy( - srcw, srch, dstw, dsth, scale_x, scale_y, xofs, yofs, ialpha, ibeta); - - int w_out = dstw; - int w_in = srcw; - int num = 1; - int orih = dsth; - if (srcFormat == GRAY) { - num = 1; - } else if (srcFormat == NV12 || srcFormat == NV21) { - num = 1; - int hout = static_cast(0.5 * dsth); - dsth += hout; - } else if (srcFormat == BGR || srcFormat == RGB) { - w_in = srcw * 3; - w_out = dstw * 3; - num = 3; - - } else if (srcFormat == BGRA || srcFormat == RGBA) { - w_in = srcw * 4; - w_out = dstw * 4; - num = 4; - } - - int* xofs1 = nullptr; - int* yofs1 = nullptr; - int16_t* ialpha1 = nullptr; - if (orih < dsth) { // uv - int tmp = dsth - orih; - int w = dstw / 2; - xofs1 = new int[w]; - yofs1 = new int[tmp]; - ialpha1 = new int16_t[srcw]; - compute_xy(srcw / 2, - srch / 2, - w, - tmp, - scale_x, - scale_y, - xofs1, - yofs1, - ialpha1, - ibeta + orih); - } - int cnt = w_out >> 3; - int remain = w_out % 8; - int32x4_t _v2 = vdupq_n_s32(2); - #pragma omp parallel for - for (int dy = 0; dy < dsth; dy++) { - int16_t* rowsbuf0 = new int16_t[w_out]; - int16_t* rowsbuf1 = new int16_t[w_out]; - int sy = yofs[dy]; - if (dy >= orih) { - xofs = xofs1; - yofs = yofs1; - ialpha = ialpha1; - } - if (sy < 0) { - memset(rowsbuf0, 0, sizeof(uint16_t) * w_out); - const uint8_t* S1 = src + srcw * (sy + 1); - const int16_t* ialphap = ialpha; - int16_t* rows1p = rowsbuf1; - for (int dx = 0; dx < dstw; dx++) { - int sx = xofs[dx] * num; // num = 4 - int16_t a0 = ialphap[0]; - int16_t a1 = ialphap[1]; - - const uint8_t* S1pl = S1 + sx; - const uint8_t* S1pr = S1 + sx + num; - if (sx < 0) { - S1pl = S1; - } - for (int i = 0; i < num; i++) { - if (sx < 0) { - *rows1p++ = ((*S1pl++) * a1) >> 4; - } else { - *rows1p++ = ((*S1pl++) * a0 + (*S1pr++) * a1) >> 4; - } - } - ialphap += 2; - } - } else { - // hresize two rows - const uint8_t* S0 = src + w_in * (sy); - const uint8_t* S1 = src + w_in * (sy + 1); - const int16_t* ialphap = ialpha; - int16_t* rows0p = rowsbuf0; - int16_t* rows1p = rowsbuf1; - for (int dx = 0; dx < dstw; dx++) { - int sx = xofs[dx] * num; // num = 4 - int16_t a0 = ialphap[0]; - int16_t a1 = ialphap[1]; - - const uint8_t* S0pl = S0 + sx; - const uint8_t* S0pr = S0 + sx + num; - const uint8_t* S1pl = S1 + sx; - const uint8_t* S1pr = S1 + sx + num; - if (sx < 0) { - S0pl = S0; - S1pl = S1; - } - for (int i = 0; i < num; i++) { - if (sx < 0) { - *rows0p = ((*S0pl++) * a1) >> 4; - *rows1p = ((*S1pl++) * a1) >> 4; - rows0p++; - rows1p++; - } else { - *rows0p++ = ((*S0pl++) * a0 + (*S0pr++) * a1) >> 4; - *rows1p++ = ((*S1pl++) * a0 + (*S1pr++) * a1) >> 4; - } - } - ialphap += 2; - } - } - int ind = dy * 2; - int16_t b0 = ibeta[ind]; - int16_t b1 = ibeta[ind + 1]; - int16x8_t _b0 = vdupq_n_s16(b0); - int16x8_t _b1 = vdupq_n_s16(b1); - uint8_t* dp_ptr = dst + dy * w_out; - int16_t* rows0p = rowsbuf0; - int16_t* rows1p = rowsbuf1; - int re_cnt = cnt; - if (re_cnt > 0) { - #ifdef __aarch64__ - asm volatile( - "1: \n" - "ld1 {v0.8h}, [%[rows0p]], #16 \n" - "ld1 {v1.8h}, [%[rows1p]], #16 \n" - "orr v6.16b, %w[_v2].16b, %w[_v2].16b \n" - "orr v7.16b, %w[_v2].16b, %w[_v2].16b \n" - "smull v2.4s, v0.4h, %w[_b0].4h \n" - "smull2 v4.4s, v0.8h, %w[_b0].8h \n" - "smull v3.4s, v1.4h, %w[_b1].4h \n" - "smull2 v5.4s, v1.8h, %w[_b1].8h \n" - - "ssra v6.4s, v2.4s, #16 \n" - "ssra v7.4s, v4.4s, #16 \n" - "ssra v6.4s, v3.4s, #16 \n" - "ssra v7.4s, v5.4s, #16 \n" - - "shrn v0.4h, v6.4s, #2 \n" - "shrn2 v0.8h, v7.4s, #2 \n" - "subs %w[cnt], %w[cnt], #1 \n" - "sqxtun v1.8b, v0.8h \n" - "st1 {v1.8b}, [%[dp]], #8 \n" - "bne 1b \n" - : [rows0p] "+r"(rows0p), - [rows1p] "+r"(rows1p), - [cnt] "+r"(re_cnt), - [dp] "+r"(dp_ptr) - : [_b0] "w"(_b0), [_b1] "w"(_b1), [_v2] "w"(_v2) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); - #else - asm volatile( - "mov r4, #2 \n" - "vdup.s32 q12, r4 \n" - "0: \n" - "vld1.s16 {d2-d3}, [%[rows0p]]!\n" - "vld1.s16 {d6-d7}, [%[rows1p]]!\n" - "vorr.s32 q10, q12, q12 \n" - "vorr.s32 q11, q12, q12 \n" - - "vmull.s16 q0, d2, %[_b0] \n" - "vmull.s16 q1, d3, %[_b0] \n" - "vmull.s16 q2, d6, %[_b1] \n" - "vmull.s16 q3, d7, %[_b1] \n" - - "vsra.s32 q10, q0, #16 \n" - "vsra.s32 q11, q1, #16 \n" - "vsra.s32 q10, q2, #16 \n" - "vsra.s32 q11, q3, #16 \n" - - "vshrn.s32 d20, q10, #2 \n" - "vshrn.s32 d21, q11, #2 \n" - "subs %[cnt], #1 \n" - "vqmovun.s16 d20, q10 \n" - "vst1.8 {d20}, [%[dp]]! \n" - "bne 0b \n" - : [rows0p] "+r"(rows0p), - [rows1p] "+r"(rows1p), - [cnt] "+r"(re_cnt), - [dp] "+r"(dp_ptr) - : [_b0] "w"(_b0), [_b1] "w"(_b1) - : "cc", - "memory", - "r4", - "q0", - "q1", - "q2", - "q3", - "q8", - "q9", - "q10", - "q11", - "q12"); - - #endif // __aarch64__ - } - for (int i = 0; i < remain; i++) { - // D[x] = (rows0[x]*b0 + rows1[x]*b1) >> - // INTER_RESIZE_COEF_BITS; - *dp_ptr++ = - (uint8_t)(((int16_t)((b0 * (int16_t)(*rows0p++)) >> 16) + - (int16_t)((b1 * (int16_t)(*rows1p++)) >> 16) + 2) >> - 2); - } - } - delete[] buf; - */ } void ImagePreprocess::imageResize(const uint8_t* src, uint8_t* dst) { diff --git a/lite/utils/cv/paddle_image_preprocess.h b/lite/utils/cv/paddle_image_preprocess.h index 11673e19041883bfa6ca7a45f03ca3feca76dd20..5a46a9e48e8202fe29ec9fc7d950ccf15920cc32 100644 --- a/lite/utils/cv/paddle_image_preprocess.h +++ b/lite/utils/cv/paddle_image_preprocess.h @@ -133,7 +133,7 @@ class ImagePreprocess { * color format support 1-channel image, 3-channel image and 4-channel image * param src: input image data * param dst: output image data - * param srcFormat: input image format, support GRAY, BGR(GRB) and BGRA(RGBA) + * param srcFormat: input image format, support GRAY, BGR(RGB) and BGRA(RGBA) * param srcw: input image width * param srch: input image height * param degree: Rotate degree, support 90, 180 and 270 @@ -158,7 +158,7 @@ class ImagePreprocess { * color format support 1-channel image, 3-channel image and 4-channel image * param src: input image data * param dst: output image data - * param srcFormat: input image format, support GRAY, BGR(GRB) and BGRA(RGBA) + * param srcFormat: input image format, support GRAY, BGR(RGB) and BGRA(RGBA) * param srcw: input image width * param srch: input image height * param flip_param: flip parameter, support X, Y and XY @@ -190,7 +190,7 @@ class ImagePreprocess { * NCHW * param src: input image data * param dstTensor: output tensor data - * param srcFormat: input image format, support BGR(GRB) and BGRA(RGBA) + * param srcFormat: input image format, support BGR(RGB) and BGRA(RGBA) * param srcw: input image width * param srch: input image height * param layout: output tensor layout,support NHWC and NCHW