Unverified commit 515f9a6a, authored by H HappyAngel, committed by GitHub

[arm] add cv unit_test (#4250)

add cv_ut. test=develop
add Anakin implementation
add image_profiler test
Parent 339c2e53
@@ -91,14 +91,24 @@ ImagePreprocess::ImagePreprocess(ImageFormat srcFormat, ImageFormat dstFormat, T
// Method 2
void ImagePreprocess::imageCovert(const uint8_t* src,
                                  uint8_t* dst, ImageFormat srcFormat, ImageFormat dstFormat);
// Method 3
void ImagePreprocess::imageCovert(const uint8_t* src,
                                  uint8_t* dst, ImageFormat srcFormat, ImageFormat dstFormat,
                                  int srcw, int srch);
```
+ For the first `imageCovert` interface, the default parameters come from member variables of the `ImagePreprocess` class, so the following member variables must be assigned when the `ImagePreprocess` object is constructed:
    - param srcFormat: the `srcFormat_` member variable of the `ImagePreprocess` class
    - param dstFormat: the `dstFormat_` member variable of the `ImagePreprocess` class
    - param srcw: the `iw` field of the `transParam_` struct member of the `ImagePreprocess` class
    - param srch: the `ih` field of the `transParam_` struct member of the `ImagePreprocess` class
- For the second `imageCovert` interface, the default parameters also come from member variables of the `ImagePreprocess` class, so the following member variables must be assigned when the `ImagePreprocess` object is constructed:
    - param srcw: the `iw` field of the `transParam_` struct member of the `ImagePreprocess` class
    - param srch: the `ih` field of the `transParam_` struct member of the `ImagePreprocess` class
- The third `imageCovert` interface can be used directly, since every parameter is passed explicitly (see the usage sketch below).
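
Below is a minimal usage sketch of the three overloads. It is not part of the original document: the header path, the namespace, and the Method 1 signature (only `src`/`dst`, with everything else taken from the members listed above) are assumptions and should be checked against `paddle_image_preprocess.h`.

```
// Illustrative sketch only; names marked "assumed" are not taken from this diff.
#include <cstdint>
#include "lite/utils/cv/paddle_image_preprocess.h"  // assumed header path

using paddle::lite::utils::cv::ImageFormat;      // assumed namespace
using paddle::lite::utils::cv::ImagePreprocess;
using paddle::lite::utils::cv::TransParam;

void convert_nv12_to_bgr(const uint8_t* nv12, uint8_t* bgr, int w, int h) {
  TransParam tparam;
  tparam.iw = w;  // becomes the default srcw (transParam_.iw)
  tparam.ih = h;  // becomes the default srch (transParam_.ih)
  ImagePreprocess preprocess(ImageFormat::NV12, ImageFormat::BGR, tparam);

  // Method 1 (assumed signature): formats and size all come from the members.
  preprocess.imageCovert(nv12, bgr);
  // Method 2: formats explicit, size still taken from transParam_.iw / transParam_.ih.
  preprocess.imageCovert(nv12, bgr, ImageFormat::NV12, ImageFormat::BGR);
  // Method 3: fully explicit, independent of the members set above.
  preprocess.imageCovert(nv12, bgr, ImageFormat::NV12, ImageFormat::BGR, w, h);
}
```
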
### Resize
The `Resize` function supports the color spaces GRAY, NV12 (NV21), RGB (BGR), and RGBA (BGRA).
......
add_subdirectory(kernels)
add_subdirectory(math)
add_subdirectory(cv)
add_subdirectory(cv/anakin)
add_subdirectory(api)
if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_MLU) AND LITE_WITH_ARM)
lite_cc_test(image_convert_test SRCS image_convert_test.cc DEPS paddle_cv_arm)
lite_cc_test(image_profiler_test SRCS image_profiler_test.cc DEPS paddle_cv_arm anakin_cv_arm)
endif()
if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_MLU) AND LITE_WITH_ARM)
lite_cc_library(anakin_cv_arm SRCS
bgr_resize.cc
bgr_flip_hwc.cc
bgr_rotate_hwc.cc
bgr_to_tensor_hwc.cc
bgra_resize.cc
bgra_flip_hwc.cc
bgra_rotate_hwc.cc
bgra_to_tensor_hwc.cc
cv_utils.cc
nv12_to_bgr.cc
nv12_to_bgra.cc
nv21_to_bgr.cc
nv21_to_bgra.cc
nv21_resize.cc
DEPS paddle_api place)
endif()
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/tests/cv/anakin/cv_utils.h"
void flip_x_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
void flip_y_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
void flip_xy_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
// flip_num == 1: flip along the x axis; flip_num == -1: flip along the y axis; flip_num == 0: flip along both axes
void bgr_flip_hwc(
const uint8_t* src, uint8_t* dst, int w_in, int h_in, int flip_num) {
if (flip_num == 1) { // x
flip_x_hwc(src, dst, w_in, h_in);
}
if (flip_num == -1) { // y
flip_y_hwc(src, dst, w_in, h_in);
}
if (flip_num == 0) { // xy
flip_xy_hwc(src, dst, w_in, h_in);
}
}
/*
bgr1 bgr2 bgr3
bgr4 bgr5 bgr6
bgr7 bgr8 bgr9
flip x:
bgr7 bgr8 bgr9
bgr4 bgr5 bgr6
bgr1 bgr2 bgr3
*/
#ifdef __aarch64__
void flip_x_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
int h = h_in - 1;
int win = w_in * 3;
uint8_t zerobuff[win]; // NOLINT
memset(zerobuff, 0, win * sizeof(uint8_t));
uint8_t zerobuff2[win]; // NOLINT
memset(zerobuff2, 0, win * sizeof(uint8_t));
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * win;
const uint8_t* inptr1 = inptr0 + win;
const uint8_t* inptr2 = inptr1 + win;
const uint8_t* inptr3 = inptr2 + win;
uint8_t* outptr0 = dst + (h - i) * win; // last
uint8_t* outptr1 = outptr0 - win;
uint8_t* outptr2 = outptr1 - win;
uint8_t* outptr3 = outptr2 - win;
asm volatile(
"prfm pldl1keep, [%[ptr0]] \n"
"prfm pldl1keep, [%[ptr1]] \n"
"prfm pldl1keep, [%[ptr2]] \n"
"prfm pldl1keep, [%[ptr3]] \n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
outptr0 = zerobuff2;
case 2:
inptr1 = zerobuff;
outptr1 = zerobuff2;
case 1:
inptr2 = zerobuff;
outptr2 = zerobuff2;
case 0:
inptr3 = zerobuff;
outptr3 = zerobuff2;
default:
break;
}
}
int j = 0;
for (; j < w_in - 7; j += 8) {
asm volatile(
"ld3 {v0.8b, v1.8b, v2.8b}, [%[inptr0]], #24 \n" // v0={00,01,02,
// 03, 04, 05,
// 06, 07}"
"ld3 {v3.8b, v4.8b, v5.8b}, [%[inptr1]], #24 \n" // v0={10,11,12,
// 13, 14, 15,
// 16, 17}"
"ld3 {v6.8b, v7.8b, v8.8b}, [%[inptr2]], #24 \n" // v0={20,21,22,
// 23, 24, 25,
// 26, 27}"
"ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]], #24 \n" // v0={30,31,32,
// 33, 34, 35,
// 36, 37}"
"prfm pldl1keep, [%[inptr0]] \n"
"prfm pldl1keep, [%[inptr1]] \n"
"prfm pldl1keep, [%[inptr2]] \n"
"prfm pldl1keep, [%[inptr3]] \n"
"st3 {v0.8b, v1.8b, v2.8b}, [%[outptr0]], #24 \n" // 00
// 10
// 20
// 30
// 04
// 14
// 24
// 34
"st3 {v3.8b, v4.8b, v5.8b}, [%[outptr1]], #24 \n" // 02
// 12
// 22
// 32
"st3 {v6.8b, v7.8b, v8.8b}, [%[outptr2]], #24 \n" // 01
// 11
// 21
// 31
"st3 {v9.8b, v10.8b, v11.8b}, [%[outptr3]], #24 \n" // 03 13 23 33
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3)
:
: "v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11");
}
for (; j < w_in; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
case 1:
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
case 2:
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
}
}
}
}
#else
void flip_x_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
int win = w_in * 3;
uint8_t zerobuff[win]; // NOLINT
memset(zerobuff, 0, win * sizeof(uint8_t));
uint8_t zerobuff2[win]; // NOLINT
memset(zerobuff2, 0, win * sizeof(uint8_t));
int h = h_in - 1;
// 4*8
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * win;
const uint8_t* inptr1 = inptr0 + win;
const uint8_t* inptr2 = inptr1 + win;
const uint8_t* inptr3 = inptr2 + win;
uint8_t* outptr0 = dst + (h - i) * win; // last
uint8_t* outptr1 = outptr0 - win;
uint8_t* outptr2 = outptr1 - win;
uint8_t* outptr3 = outptr2 - win;
asm volatile(
"pld [%[ptr0]] @ preload a, 64byte\n"
"pld [%[ptr1]] @ preload a, 64byte\n"
"pld [%[ptr2]] @ preload a, 64byte\n"
"pld [%[ptr3]] @ preload a, 64byte\n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
outptr0 = zerobuff2;
case 2:
inptr1 = zerobuff;
outptr1 = zerobuff2;
case 1:
inptr2 = zerobuff;
outptr2 = zerobuff2;
case 0:
inptr3 = zerobuff;
outptr3 = zerobuff2;
default:
break;
}
}
int j = 0;
for (; j < w_in - 7; j += 8) {
asm volatile(
"vld3.8 {d0, d1, d2}, [%[inptr0]]! @ zip load r0, d0 =00 01 02 03 "
"04 05 06 07\n"
"vld3.8 {d3, d4, d5}, [%[inptr1]]! @ zip load r1, d2 =10 11 12 13 "
"14 15 16 17\n"
"vld3.8 {d6, d7, d8}, [%[inptr2]]! @ zip load r1, d4 =20 21 22 23 "
"24 25 26 27\n"
"vld3.8 {d9, d10, d11}, [%[inptr3]]! @ zip load r1, d6 = 30 31 32 "
"33 34 35 36 37\n"
"pld [%[inptr0]] @ preload a, 64byte\n"
"pld [%[inptr1]] @ preload a, 64byte\n"
"pld [%[inptr2]] @ preload a, 64byte\n"
"pld [%[inptr3]] @ preload a, 64byte\n"
"vst3.8 {d0, d1, d2}, [%[outptr0]]! @ write d0(q0,low),r00,r10 "
"20 30\n"
"vst3.8 {d3, d4, d5}, [%[outptr1]]! @ write d4(q0,low),r01,r11 "
"21 31\n"
"vst3.8 {d6, d7, d8}, [%[outptr2]]! @ write d4(q0,low),r01,r11 "
"21 31\n"
"vst3.8 {d9, d10, d11}, [%[outptr3]]! @ write "
"d4(q0,low),r01,r11 21 31\n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3)
:
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
}
for (; j < w_in; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
case 1:
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
case 2:
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
}
}
}
}
#endif
/*
bgr1 bgr2 bgr3
bgr4 bgr5 bgr6
bgr7 bgr8 bgr9
flip:
bgr3 bgr2 bgr1
bgr6 bgr5 bgr4
bgr9 bgr8 bgr7
*/
#ifdef __aarch64__
void flip_y_hwc(const uint8_t* src, uint8_t* dst, int w, int h_in) {
int w_in = w * 3;
uint8_t zerobuff[w_in]; // NOLINT
memset(zerobuff, 0, w_in * sizeof(uint8_t));
uint8_t zerobuff2[w_in]; // NOLINT
memset(zerobuff2, 0, w_in * sizeof(uint8_t));
int stride_w = 24;
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * w_in;
const uint8_t* inptr1 = inptr0 + w_in;
const uint8_t* inptr2 = inptr1 + w_in;
const uint8_t* inptr3 = inptr2 + w_in;
uint8_t* outptr0 = dst + (i + 1) * w_in - stride_w; // last col
uint8_t* outptr1 = outptr0 + w_in;
uint8_t* outptr2 = outptr1 + w_in;
uint8_t* outptr3 = outptr2 + w_in;
asm volatile(
"prfm pldl1keep, [%[ptr0]] \n"
"prfm pldl1keep, [%[ptr1]] \n"
"prfm pldl1keep, [%[ptr2]] \n"
"prfm pldl1keep, [%[ptr3]] \n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
int j = 0;
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
outptr0 = zerobuff2;
case 2:
inptr1 = zerobuff;
outptr1 = zerobuff2;
case 1:
inptr2 = zerobuff;
outptr2 = zerobuff2;
case 0:
inptr3 = zerobuff;
outptr3 = zerobuff2;
default:
break;
}
}
for (; j < w - 7; j += 8) {
asm volatile(
"ld3 {v0.8b, v1.8b, v2.8b}, [%[inptr0]], #24 \n" // v0={00,01,02,
// 03, 04, 05,
// 06, 07}"
"ld3 {v3.8b, v4.8b, v5.8b}, [%[inptr1]], #24 \n" // v0={10,11,12,
// 13, 14, 15,
// 16, 17}"
"ld3 {v6.8b, v7.8b, v8.8b}, [%[inptr2]], #24 \n" // v0={20,21,22,
// 23, 24, 25,
// 26, 27}"
"ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]], #24 \n" // v0={30,31,32,
// 33, 34, 35,
// 36, 37}"
"rev64 v12.8b, v0.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 b
"rev64 v13.8b, v1.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 g
"rev64 v14.8b, v2.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 r
"rev64 v15.8b, v3.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v16.8b, v4.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v17.8b, v5.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v18.8b, v6.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v19.8b, v7.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v20.8b, v8.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v21.8b, v9.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v22.8b, v10.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v23.8b, v11.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"prfm pldl1keep, [%[inptr0]] \n"
"prfm pldl1keep, [%[inptr1]] \n"
"prfm pldl1keep, [%[inptr2]] \n"
"prfm pldl1keep, [%[inptr3]] \n"
"st3 {v12.8b, v13.8b, v14.8b}, [%[outptr0]] \n" // 00 10
// 20 30
// 04 14
// 24 34
"st3 {v15.8b, v16.8b, v17.8b}, [%[outptr1]] \n" // 02 12
// 22 32
"st3 {v18.8b, v19.8b, v20.8b}, [%[outptr2]] \n" // 01 11
// 21 31
"st3 {v21.8b, v22.8b, v23.8b}, [%[outptr3]] \n" // 03 13
// 23 33
"sub %[outptr0], %[outptr0], %[stride_w] \n" //@ ptr - stride_w
"sub %[outptr1], %[outptr1], %[stride_w] \n"
"sub %[outptr2], %[outptr2], %[stride_w] \n"
"sub %[outptr3], %[outptr3], %[stride_w] \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[stride_w] "+r"(stride_w)
:
: "v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23");
}
outptr3 += stride_w - 3;
outptr2 += stride_w - 3;
outptr1 += stride_w - 3;
outptr0 += stride_w - 3;
for (; j < w; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 6;
case 1:
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 6;
case 2:
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 6;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
outptr3 -= 6;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 6;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 6;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 6;
}
}
}
}
#else
void flip_y_hwc(const uint8_t* src, uint8_t* dst, int w, int h_in) {
int w_in = w * 3;
uint8_t zerobuff[w_in]; // NOLINT
memset(zerobuff, 0, w_in * sizeof(uint8_t));
uint8_t zerobuff2[w_in]; // NOLINT
memset(zerobuff2, 0, w_in * sizeof(uint8_t));
int stride_w = 24;
// 4*8
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * w_in;
const uint8_t* inptr1 = inptr0 + w_in;
const uint8_t* inptr2 = inptr1 + w_in;
const uint8_t* inptr3 = inptr2 + w_in;
uint8_t* outptr0 = dst + (i + 1) * w_in - stride_w; // last
uint8_t* outptr1 = outptr0 + w_in;
uint8_t* outptr2 = outptr1 + w_in;
uint8_t* outptr3 = outptr2 + w_in;
asm volatile(
"pld [%[ptr0]] @ preload a, 64byte\n"
"pld [%[ptr1]] @ preload a, 64byte\n"
"pld [%[ptr2]] @ preload a, 64byte\n"
"pld [%[ptr3]] @ preload a, 64byte\n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
outptr0 = zerobuff2;
case 2:
inptr1 = zerobuff;
outptr1 = zerobuff2;
case 1:
inptr2 = zerobuff;
outptr2 = zerobuff2;
case 0:
inptr3 = zerobuff;
outptr3 = zerobuff2;
default:
break;
}
}
int j = 0;
for (; j < w - 7; j += 8) {
asm volatile(
"vld3.8 {d0, d1, d2}, [%[inptr0]]! @ zip load r0, d0 =00 01 02 03 "
"04 05 06 07\n"
"vld3.8 {d3, d4, d5}, [%[inptr1]]! @ zip load r1, d2 =10 11 12 13 "
"14 15 16 17\n"
"vld3.8 {d6, d7, d8}, [%[inptr2]]! @ zip load r1, d4 =20 21 22 23 "
"24 25 26 27\n"
"vld3.8 {d9, d10, d11}, [%[inptr3]]! @ zip load r1, d6 = 30 31 32 "
"33 34 35 36 37\n"
"vrev64.8 d12, d0 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d13, d1 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d14, d2 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d15, d3 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d16, d4 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d17, d5 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d18, d6 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d19, d7 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d20, d8 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d21, d9 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d22, d10 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d23, d11 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"pld [%[inptr0]] @ preload a, 64byte\n"
"pld [%[inptr1]] @ preload a, 64byte\n"
"pld [%[inptr2]] @ preload a, 64byte\n"
"pld [%[inptr3]] @ preload a, 64byte\n"
"vst3.8 {d12, d13, d14}, [%[outptr0]] @ write "
"d0(q0,low),r00,r10 20 30\n"
"vst3.8 {d15, d16, d17}, [%[outptr1]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"vst3.8 {d18, d19, d20}, [%[outptr2]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"vst3.8 {d21, d22, d23}, [%[outptr3]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"sub %[outptr0], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr1], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr2], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr3], %[stride_w] @ ptr - stride_w \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[stride_w] "+r"(stride_w)
:
: "q0",
"q1",
"q2",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12");
}
outptr3 += stride_w - 3;
outptr2 += stride_w - 3;
outptr1 += stride_w - 3;
outptr0 += stride_w - 3;
for (; j < w; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 6;
case 1:
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 6;
case 2:
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 6;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
outptr3 -= 6;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 6;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 6;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 6;
}
}
}
}
#endif
/*
bgr1 bgr2 bgr3
bgr4 bgr5 bgr6
bgr7 bgr8 bgr9
flip:
bgr9 bgr8 bgr7
bgr6 bgr5 bgr4
bgr3 bgr2 bgr1
*/
#ifdef __aarch64__
void flip_xy_hwc(const uint8_t* src, uint8_t* dst, int w, int h_in) {
int stride_w = 24;
int w_in = w * 3;
uint8_t zerobuff[w_in]; // NOLINT
memset(zerobuff, 0, w_in * sizeof(uint8_t));
uint8_t zerobuff2[w_in]; // NOLINT
memset(zerobuff2, 0, w_in * sizeof(uint8_t));
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * w_in;
const uint8_t* inptr1 = inptr0 + w_in;
const uint8_t* inptr2 = inptr1 + w_in;
const uint8_t* inptr3 = inptr2 + w_in;
uint8_t* outptr0 = dst + (h_in - i) * w_in - stride_w; // last col
uint8_t* outptr1 = outptr0 - w_in;
uint8_t* outptr2 = outptr1 - w_in;
uint8_t* outptr3 = outptr2 - w_in;
asm volatile(
"prfm pldl1keep, [%[ptr0]] \n"
"prfm pldl1keep, [%[ptr1]] \n"
"prfm pldl1keep, [%[ptr2]] \n"
"prfm pldl1keep, [%[ptr3]] \n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
outptr0 = zerobuff2;
case 2:
inptr1 = zerobuff;
outptr1 = zerobuff2;
case 1:
inptr2 = zerobuff;
outptr2 = zerobuff2;
case 0:
inptr3 = zerobuff;
outptr3 = zerobuff2;
default:
break;
}
}
int j = 0;
for (; j < w - 7; j += 8) {
asm volatile(
"ld3 {v0.8b, v1.8b, v2.8b}, [%[inptr0]], #24 \n" // v0={00,01,02,
// 03, 04, 05,
// 06, 07}"
"ld3 {v3.8b, v4.8b, v5.8b}, [%[inptr1]], #24 \n" // v0={10,11,12,
// 13, 14, 15,
// 16, 17}"
"ld3 {v6.8b, v7.8b, v8.8b}, [%[inptr2]], #24 \n" // v0={20,21,22,
// 23, 24, 25,
// 26, 27}"
"ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]], #24 \n" // v0={30,31,32,
// 33, 34, 35,
// 36, 37}"
"rev64 v12.8b, v0.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 b
"rev64 v13.8b, v1.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 g
"rev64 v14.8b, v2.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 r
"rev64 v15.8b, v3.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v16.8b, v4.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v17.8b, v5.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v18.8b, v6.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v19.8b, v7.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v20.8b, v8.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v21.8b, v9.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v22.8b, v10.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v23.8b, v11.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"prfm pldl1keep, [%[inptr0]] \n"
"prfm pldl1keep, [%[inptr1]] \n"
"prfm pldl1keep, [%[inptr2]] \n"
"prfm pldl1keep, [%[inptr3]] \n"
"st3 {v12.8b, v13.8b, v14.8b}, [%[outptr0]] \n" // 00 10
// 20 30
// 04 14
// 24 34
"st3 {v15.8b, v16.8b, v17.8b}, [%[outptr1]] \n" // 02 12
// 22 32
"st3 {v18.8b, v19.8b, v20.8b}, [%[outptr2]] \n" // 01 11
// 21 31
"st3 {v21.8b, v22.8b, v23.8b}, [%[outptr3]] \n" // 03 13
// 23 33
"sub %[outptr0], %[outptr0], %[stride_w] \n" //@ ptr - stride_w
"sub %[outptr1], %[outptr1], %[stride_w] \n"
"sub %[outptr2], %[outptr2], %[stride_w] \n"
"sub %[outptr3], %[outptr3], %[stride_w] \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[stride_w] "+r"(stride_w)
:
: "v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23");
}
outptr3 += stride_w - 3;
outptr2 += stride_w - 3;
outptr1 += stride_w - 3;
outptr0 += stride_w - 3;
for (; j < w; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 6;
case 1:
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 6;
case 2:
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 6;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
outptr3 -= 6;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 6;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 6;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 6;
}
}
}
}
#else
void flip_xy_hwc(const uint8_t* src, uint8_t* dst, int w, int h_in) {
int w_in = w * 3;
uint8_t zerobuff[w_in]; // NOLINT
memset(zerobuff, 0, w_in * sizeof(uint8_t));
uint8_t zerobuff2[w_in]; // NOLINT
memset(zerobuff2, 0, w_in * sizeof(uint8_t));
int stride_w = 24;
// 4*8
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * w_in;
const uint8_t* inptr1 = inptr0 + w_in;
const uint8_t* inptr2 = inptr1 + w_in;
const uint8_t* inptr3 = inptr2 + w_in;
uint8_t* outptr0 = dst + (h_in - i) * w_in - stride_w; // last
uint8_t* outptr1 = outptr0 - w_in;
uint8_t* outptr2 = outptr1 - w_in;
uint8_t* outptr3 = outptr2 - w_in;
asm volatile(
"pld [%[ptr0]] @ preload a, 64byte\n"
"pld [%[ptr1]] @ preload a, 64byte\n"
"pld [%[ptr2]] @ preload a, 64byte\n"
"pld [%[ptr3]] @ preload a, 64byte\n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
outptr0 = zerobuff2;
case 2:
inptr1 = zerobuff;
outptr1 = zerobuff2;
case 1:
inptr2 = zerobuff;
outptr2 = zerobuff2;
case 0:
inptr3 = zerobuff;
outptr3 = zerobuff2;
default:
break;
}
}
int j = 0;
for (; j < w - 7; j += 8) {
asm volatile(
"vld3.8 {d0, d1, d2}, [%[inptr0]]! @ zip load r0, d0 =00 01 02 03 "
"04 05 06 07\n"
"vld3.8 {d3, d4, d5}, [%[inptr1]]! @ zip load r1, d2 =10 11 12 13 "
"14 15 16 17\n"
"vld3.8 {d6, d7, d8}, [%[inptr2]]! @ zip load r1, d4 =20 21 22 23 "
"24 25 26 27\n"
"vld3.8 {d9, d10, d11}, [%[inptr3]]! @ zip load r1, d6 = 30 31 32 "
"33 34 35 36 37\n"
"vrev64.8 d12, d0 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d13, d1 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d14, d2 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d15, d3 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d16, d4 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d17, d5 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d18, d6 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d19, d7 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d20, d8 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d21, d9 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d22, d10 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d23, d11 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"pld [%[inptr0]] @ preload a, 64byte\n"
"pld [%[inptr1]] @ preload a, 64byte\n"
"pld [%[inptr2]] @ preload a, 64byte\n"
"pld [%[inptr3]] @ preload a, 64byte\n"
"vst3.8 {d12, d13, d14}, [%[outptr0]] @ write "
"d0(q0,low),r00,r10 20 30\n"
"vst3.8 {d15, d16, d17}, [%[outptr1]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"vst3.8 {d18, d19, d20}, [%[outptr2]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"vst3.8 {d21, d22, d23}, [%[outptr3]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"sub %[outptr0], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr1], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr2], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr3], %[stride_w] @ ptr - stride_w \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[stride_w] "+r"(stride_w)
:
: "q0",
"q1",
"q2",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12");
}
outptr3 += stride_w - 3;
outptr2 += stride_w - 3;
outptr1 += stride_w - 3;
outptr0 += stride_w - 3;
for (; j < w; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 6;
case 1:
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 6;
case 2:
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 6;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
outptr3 -= 6;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 6;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 6;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 6;
}
}
}
}
#endif
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <limits.h>
#include <math.h>
#include "lite/tests/cv/anakin/cv_utils.h"
void resize_three_channel(
const uint8_t* src, int w_in, int h_in, uint8_t* dst, int w_out, int h_out);
void bgr_resize(const uint8_t* src,
uint8_t* dst,
int w_in,
int h_in,
int w_out,
int h_out) {
if (w_out == w_in && h_out == h_in) {
memcpy(dst, src, sizeof(char) * w_in * h_in * 3);
return;
}
// y
resize_three_channel(src, w_in * 3, h_in, dst, w_out * 3, h_out);
}
void resize_three_channel(const uint8_t* src,
int w_in,
int h_in,
uint8_t* dst,
int w_out,
int h_out) {
const int resize_coef_bits = 11;
const int resize_coef_scale = 1 << resize_coef_bits;
double scale_x = static_cast<double>(w_in) / w_out;
double scale_y = static_cast<double>(h_in) / h_out;
int* buf = new int[w_out * 2 + h_out * 2];
int* xofs = buf; // new int[w];
int* yofs = buf + w_out; // new int[h];
int16_t* ialpha =
reinterpret_cast<int16_t*>(buf + w_out + h_out); // new int16_t[w * 2];
int16_t* ibeta =
reinterpret_cast<int16_t*>(buf + w_out * 2 + h_out); // new short[h * 2];
float fx = 0.f;
float fy = 0.f;
int sx = 0;
int sy = 0;
#define SATURATE_CAST_SHORT(X) \
(int16_t)::std::min( \
::std::max(static_cast<int>(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), \
SHRT_MAX);
for (int dx = 0; dx < w_out / 3; dx++) {
fx = static_cast<float>((dx + 0.5) * scale_x - 0.5);
sx = floor(fx);
fx -= sx;
if (sx < 0) {
sx = 0;
fx = 0.f;
}
if (sx >= w_in - 1) {
sx = w_in - 2;
fx = 1.f;
}
xofs[dx] = sx * 3;
float a0 = (1.f - fx) * resize_coef_scale;
float a1 = fx * resize_coef_scale;
ialpha[dx * 2] = SATURATE_CAST_SHORT(a0);
ialpha[dx * 2 + 1] = SATURATE_CAST_SHORT(a1);
}
for (int dy = 0; dy < h_out; dy++) {
fy = static_cast<float>((dy + 0.5) * scale_y - 0.5);
sy = floor(fy);
fy -= sy;
if (sy < 0) {
sy = 0;
fy = 0.f;
}
if (sy >= h_in - 1) {
sy = h_in - 2;
fy = 1.f;
}
yofs[dy] = sy;
float b0 = (1.f - fy) * resize_coef_scale;
float b1 = fy * resize_coef_scale;
ibeta[dy * 2] = SATURATE_CAST_SHORT(b0);
ibeta[dy * 2 + 1] = SATURATE_CAST_SHORT(b1);
}
#undef SATURATE_CAST_SHORT
// loop body
int16_t* rowsbuf0 = new int16_t[w_out + 1];
int16_t* rowsbuf1 = new int16_t[w_out + 1];
int16_t* rows0 = rowsbuf0;
int16_t* rows1 = rowsbuf1;
int prev_sy1 = -1;
for (int dy = 0; dy < h_out; dy++) {
int sy = yofs[dy];
if (sy == prev_sy1) {
// hresize one row
int16_t* rows0_old = rows0;
rows0 = rows1;
rows1 = rows0_old;
const uint8_t* S1 = src + w_in * (sy + 1);
const int16_t* ialphap = ialpha;
int16_t* rows1p = rows1;
for (int dx = 0; dx < w_out / 3; dx++) {
int sx = xofs[dx];
int16_t a0 = ialphap[0];
int16_t a1 = ialphap[1];
const uint8_t* S1p = S1 + sx;
int tmp = dx * 3;
rows1p[tmp] = (S1p[0] * a0 + S1p[3] * a1) >> 4;
rows1p[tmp + 1] = (S1p[1] * a0 + S1p[4] * a1) >> 4;
rows1p[tmp + 2] = (S1p[2] * a0 + S1p[5] * a1) >> 4;
ialphap += 2;
}
} else {
// hresize two rows
const uint8_t* S0 = src + w_in * (sy);
const uint8_t* S1 = src + w_in * (sy + 1);
const int16_t* ialphap = ialpha;
int16_t* rows0p = rows0;
int16_t* rows1p = rows1;
for (int dx = 0; dx < w_out / 3; dx++) {
int sx = xofs[dx];
int16_t a0 = ialphap[0];
int16_t a1 = ialphap[1];
const uint8_t* S0p = S0 + sx;
const uint8_t* S1p = S1 + sx;
int tmp = dx * 3;
rows0p[tmp] = (S0p[0] * a0 + S0p[3] * a1) >> 4;
rows1p[tmp] = (S1p[0] * a0 + S1p[3] * a1) >> 4;
rows0p[tmp + 1] = (S0p[1] * a0 + S0p[4] * a1) >> 4;
rows1p[tmp + 1] = (S1p[1] * a0 + S1p[4] * a1) >> 4;
rows0p[tmp + 2] = (S0p[2] * a0 + S0p[5] * a1) >> 4;
rows1p[tmp + 2] = (S1p[2] * a0 + S1p[5] * a1) >> 4;
ialphap += 2;
}
}
prev_sy1 = sy + 1;
// vresize
int16_t b0 = ibeta[0];
int16_t b1 = ibeta[1];
int16_t* rows0p = rows0;
int16_t* rows1p = rows1;
uint8_t* dp_ptr = dst + w_out * (dy);
int cnt = w_out >> 3;
int remain = w_out - (cnt << 3);
int16x4_t _b0 = vdup_n_s16(b0);
int16x4_t _b1 = vdup_n_s16(b1);
int32x4_t _v2 = vdupq_n_s32(2);
for (cnt = w_out >> 3; cnt > 0; cnt--) {
int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p + 4);
int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p + 4);
int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0);
int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
int32x4_t _acc = _v2;
_acc = vsraq_n_s32(
_acc, _rows0p_sr4_mb0, 16);  // _acc += _rows0p_sr4_mb0 >> 16
_acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
int32x4_t _acc_1 = _v2;
_acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
_acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
int16x4_t _acc16 = vshrn_n_s32(_acc, 2); // _acc >> 2
int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
uint8x8_t _dout = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));
vst1_u8(dp_ptr, _dout);
dp_ptr += 8;
rows0p += 8;
rows1p += 8;
}
for (; remain; --remain) {
// D[x] = (rows0[x]*b0 + rows1[x]*b1) >> INTER_RESIZE_COEF_BITS;
*dp_ptr++ =
(uint8_t)(((int16_t)((b0 * (int16_t)(*rows0p++)) >> 16) +
(int16_t)((b1 * (int16_t)(*rows1p++)) >> 16) + 2) >>
2);
}
ibeta += 2;
}
delete[] buf;
delete[] rowsbuf0;
delete[] rowsbuf1;
}
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/tests/cv/anakin/cv_utils.h"
void rotate90_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
void rotate270_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
void rotate180_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
void bgr_rotate_hwc(
const uint8_t* src, uint8_t* dst, int w_in, int h_in, int angle) {
if (angle == 90) {
rotate90_hwc(src, dst, w_in, h_in);
}
if (angle == 270) {
rotate270_hwc(src, dst, w_in, h_in);
}
if (angle == 180) {
rotate180_hwc(src, dst, w_in, h_in);
}
}
/*
bgr1 bgr2 bgr3
bgr4 bgr5 bgr6
bgr7 bgr8 bgr9
rotate:
bgr7 bgr4 bgr1
bgr8 bgr5 bgr2
bgr9 bgr6 bgr3
*/
#ifdef __aarch64__
void rotate90_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
int w_out = h_in;
int h_out = w_in;
int win = w_in * 3;
int wout = w_out * 3;
int stride_h = 4 * win;
int stride_h_w = 4 * win - 24;
int ww = w_out - 8;
// block 8*8. -- 8*8
int i = 0;
for (i = 0; i < h_in - 7; i += 8) {
const uint8_t* inptr0 = src + i * win;
const uint8_t* inptr1 = inptr0 + win;
const uint8_t* inptr2 = inptr1 + win;
const uint8_t* inptr3 = inptr2 + win;
asm volatile(
"prfm pldl1keep, [%[ptr0]] \n"
"prfm pldl1keep, [%[ptr0], #64] \n"
"prfm pldl1keep, [%[ptr1]] \n"
"prfm pldl1keep, [%[ptr1], #64] \n"
"prfm pldl1keep, [%[ptr2]] \n"
"prfm pldl1keep, [%[ptr2], #64] \n"
"prfm pldl1keep, [%[ptr3]] \n"
"prfm pldl1keep, [%[ptr3], #64] \n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
int j = 0;
for (; j < w_in - 7; j += 8) {
uint8_t* outptr0 = dst + j * wout + (ww - i) * 3;
uint8_t* outptr1 = outptr0 + wout;
uint8_t* outptr2 = outptr1 + wout;
uint8_t* outptr3 = outptr2 + wout;
uint8_t* outptr4 = outptr3 + wout;
uint8_t* outptr5 = outptr4 + wout;
uint8_t* outptr6 = outptr5 + wout;
uint8_t* outptr7 = outptr6 + wout;
asm volatile(
"ld3 {v0.8b, v1.8b, v2.8b}, [%[inptr0]] \n" // v0={00,01,02, 03,
// 04, 05, 06, 07}"
"ld3 {v3.8b, v4.8b, v5.8b}, [%[inptr1]] \n" // v0={10,11,12, 13,
// 14, 15, 16, 17}"
"ld3 {v6.8b, v7.8b, v8.8b}, [%[inptr2]] \n" // v0={20,21,22, 23,
// 24, 25, 26, 27}"
"ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]] \n" // v0={30,31,32,
// 33, 34, 35, 36,
// 37}"
"add %[inptr0], %[inptr0], %[stride_h] \n" // 4 + 4*w_in
"add %[inptr1], %[inptr1], %[stride_h] \n" // 5
"add %[inptr2], %[inptr2], %[stride_h] \n" // 6
"add %[inptr3], %[inptr3], %[stride_h] \n" // 7
// b
"trn1 v12.8b, v0.8b, v3.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v15.8b, v6.8b, v9.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
"trn2 v18.8b, v0.8b, v3.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v21.8b, v6.8b, v9.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
// g
"trn1 v13.8b, v1.8b, v4.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v16.8b, v7.8b, v10.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
"trn2 v19.8b, v1.8b, v4.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v22.8b, v7.8b, v10.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
// r
"trn1 v14.8b, v2.8b, v5.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v17.8b, v8.8b, v11.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
"trn2 v20.8b, v2.8b, v5.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v23.8b, v8.8b, v11.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
// b1
"trn1 v24.4h, v12.4h, v15.4h \n" // v0={00 10 20 30 04 14
// 24 34}
"trn1 v27.4h, v18.4h, v21.4h \n" // v2={01 11 21 31 05 15
// 25 35}
"trn2 v0.4h, v12.4h, v15.4h \n" // v1={02 12 22 32 06 16
// 26 36}
"trn2 v3.4h, v18.4h, v21.4h \n" // v3={03 13 23 33 07 17
// 27 37}
// g1
"trn1 v25.4h, v13.4h, v16.4h \n" // v0={00 10 20 30 04 14
// 24 34}
"trn1 v28.4h, v19.4h, v22.4h \n" // v2={01 11 21 31 05 15
// 25 35}
"trn2 v1.4h, v13.4h, v16.4h \n" // v1={02 12 22 32 06 16
// 26 36}
"trn2 v4.4h, v19.4h, v22.4h \n" // v3={03 13 23 33 07 17
// 27 37}
// r1
"trn1 v26.4h, v14.4h, v17.4h \n" // v0={00 10 20 30 04 14
// 24 34}
"trn1 v29.4h, v20.4h, v23.4h \n" // v2={01 11 21 31 05 15
// 25 35}
"trn2 v2.4h, v14.4h, v17.4h \n" // v1={02 12 22 32 06 16
// 26 36}
"trn2 v5.4h, v20.4h, v23.4h \n" // v3={03 13 23 33 07 17
// 27 37}
"ld3 {v12.8b, v13.8b, v14.8b}, [%[inptr0]] \n" // v0={00,01,02,
// 03, 04, 05, 06,
// 07}"
"ld3 {v15.8b, v16.8b, v17.8b}, [%[inptr1]] \n" // v0={10,11,12,
// 13, 14, 15, 16,
// 17}"
"ld3 {v6.8b, v7.8b, v8.8b}, [%[inptr2]] \n" // v0={20,21,22, 23,
// 24, 25, 26, 27}"
"ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]] \n" // v0={30,31,32,
// 33, 34, 35, 36,
// 37}"
"sub %[inptr0], %[inptr0], %[stride_h_w] \n" // 4 - 4*w_in + 8
"sub %[inptr1], %[inptr1], %[stride_h_w] \n" // 5
"sub %[inptr2], %[inptr2], %[stride_h_w] \n" // 6
"sub %[inptr3], %[inptr3], %[stride_h_w] \n" // 7
// b2
"trn1 v18.8b, v12.8b, v15.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v21.8b, v6.8b, v9.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
// g2
"trn1 v19.8b, v13.8b, v16.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v22.8b, v7.8b, v10.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
// r2
"trn1 v20.8b, v14.8b, v17.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v23.8b, v8.8b, v11.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
"trn2 v12.8b, v12.8b, v15.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v13.8b, v13.8b, v16.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v14.8b, v14.8b, v17.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v15.8b, v6.8b, v9.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
"trn2 v16.8b, v7.8b, v10.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
"trn2 v17.8b, v8.8b, v11.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
// b2
"trn1 v6.4h, v18.4h, v21.4h \n" // v0={00 10 20 30 04 14
// 24 34}
// g2
"trn1 v7.4h, v19.4h, v22.4h \n" // v0={00 10 20 30 04 14
// 24 34}
// r2
"trn1 v8.4h, v20.4h, v23.4h \n" // v0={00 10 20 30 04 14
// 24 34}
// bgr
"trn1 v9.4h, v12.4h, v15.4h \n" // v2={01 11 21 31 05 15
// 25 35}
"trn1 v10.4h, v13.4h, v16.4h \n" // v2={01 11 21 31 05 15
// 25 35}
"trn1 v11.4h, v14.4h, v17.4h \n" // v2={01 11 21 31 05 15
// 25 35}
// bgr
"trn2 v18.4h, v18.4h, v21.4h \n" // v1={02 12 22 32 06 16
// 26 36}
"trn2 v19.4h, v19.4h, v22.4h \n" // v1={02 12 22 32 06 16
// 26 36}
"trn2 v20.4h, v20.4h, v23.4h \n" // v1={02 12 22 32 06 16
// 26 36}
// bgr
"trn2 v21.4h, v12.4h, v15.4h \n" // v3={03 13 23 33 07 17
// 27 37}
"trn2 v22.4h, v13.4h, v16.4h \n" // v3={03 13 23 33 07 17
// 27 37}
"trn2 v23.4h, v14.4h, v17.4h \n" // v3={03 13 23 33 07 17
// 27 37}
// b1 b2
"trn1 v12.2s, v24.2s, v6.2s \n" // v8={00 10 20 30 40 50
// 60 70} b
"trn1 v13.2s, v25.2s, v7.2s \n" // v6={00 10 20 30 40 50
// 60 70} g
"trn1 v14.2s, v26.2s, v8.2s \n" // v6={00 10 20 30 40 50
// 60 70} r
// b1 b2
"trn2 v15.2s, v24.2s, v6.2s \n" // v8={04 14 24 34 44 54
// 64 74} b
"trn2 v16.2s, v25.2s, v7.2s \n" // v6={04 14 24 34 44 54
// 64 74} g
"trn2 v17.2s, v26.2s, v8.2s \n" // v6={04 14 24 34 44 54
// 64 74} r
// b1 b2
"trn1 v6.2s, v27.2s, v9.2s \n" // v8={01 11 20 30 40 50
// 60 70} b
"trn1 v7.2s, v28.2s, v10.2s \n" // v6={01 10 20 30 40 50
// 60 70} g
"trn1 v8.2s, v29.2s, v11.2s \n" // v6={01 10 20 30 40 50
// 60 70} r
"rev64 v12.8b, v12.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 b
"rev64 v13.8b, v13.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 g
"rev64 v14.8b, v14.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 r
"rev64 v15.8b, v15.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 b
"rev64 v16.8b, v16.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 g
"rev64 v17.8b, v17.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 r
// b1 b2
"trn2 v24.2s, v27.2s, v9.2s \n" // v8={05 10 20 30 40 50
// 60 70} b
"trn2 v25.2s, v28.2s, v10.2s \n" // v6={05 10 20 30 40 50
// 60 70} g
"trn2 v26.2s, v29.2s, v11.2s \n" // v6={05 10 20 30 40 50
// 60 70} r
// "st3 {v12.8b, v13.8b, v14.8b}, [%[outptr0]], #24 \n"
// //00 10 20 30 04 14 24 34
// "st3 {v15.8b, v16.8b, v17.8b}, [%[outptr4]], #24 \n"
// //02 12 22 32
"st3 {v12.8b, v13.8b, v14.8b}, [%[outptr0]], #24 \n" // 00 10 20 30 04 14 24 34
"st3 {v15.8b, v16.8b, v17.8b}, [%[outptr4]], #24 \n" // 02 12 22 32
// b1 b2
"trn1 v9.2s, v0.2s, v18.2s \n" // v8={02 11 20 30 40 50
// 60 70} b
"trn1 v10.2s, v1.2s, v19.2s \n" // v6={02 10 20 30 40 50
// 60 70} g
"trn1 v11.2s, v2.2s, v20.2s \n" // v6={02 10 20 30 40 50
// 60 70} r
"trn2 v27.2s, v0.2s, v18.2s \n" // v8={06 11 20 30 40 50
// 60 70} b
"trn2 v28.2s, v1.2s, v19.2s \n" // v6={06 10 20 30 40 50
// 60 70} g
"trn2 v29.2s, v2.2s, v20.2s \n" // v6={06 10 20 30 40 50
// 60 70} r
// b1 b2
"trn1 v0.2s, v3.2s, v21.2s \n" // v8={03 11 20 30 40 50
// 60 70} b
"trn1 v1.2s, v4.2s, v22.2s \n" // v6={03 10 20 30 40 50
// 60 70} g
"trn1 v2.2s, v5.2s, v23.2s \n" // v6={03 10 20 30 40 50
// 60 70} r
"trn2 v18.2s, v3.2s, v21.2s \n" // v8={07 11 20 30 40 50
// 60 70} b
"trn2 v19.2s, v4.2s, v22.2s \n" // v6={07 10 20 30 40 50
// 60 70} g
"trn2 v20.2s, v5.2s, v23.2s \n" // v6={07 10 20 30 40 50
// 60 70} r
"rev64 v6.8b, v6.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00 b
"rev64 v7.8b, v7.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00 g
"rev64 v8.8b, v8.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00 r
"rev64 v24.8b, v24.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 b
"rev64 v25.8b, v25.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 g
"rev64 v26.8b, v26.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 r
"rev64 v9.8b, v9.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00 b
"rev64 v10.8b, v10.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 g
"rev64 v11.8b, v11.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 r
"rev64 v27.8b, v27.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 b
"rev64 v28.8b, v28.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 g
"rev64 v29.8b, v29.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 r
"rev64 v0.8b, v0.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00 b
"rev64 v1.8b, v1.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00 g
"rev64 v2.8b, v2.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00 r
"rev64 v18.8b, v18.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 b
"rev64 v19.8b, v19.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 g
"rev64 v20.8b, v20.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 r
"st3 {v6.8b, v7.8b, v8.8b}, [%[outptr1]], #24 \n" // 00
// 10
// 20
// 30
// 04
// 14
// 24
// 34
"st3 {v24.8b, v25.8b, v26.8b}, [%[outptr5]], #24 \n" // 02 12 22 32
"st3 {v9.8b, v10.8b, v11.8b}, [%[outptr2]], #24 \n" // 00
// 10
// 20
// 30
// 04
// 14
// 24
// 34
"st3 {v27.8b, v28.8b, v29.8b}, [%[outptr6]], #24 \n" // 02 12 22 32
"st3 {v0.8b, v1.8b, v2.8b}, [%[outptr3]], #24 \n" // 00
// 10
// 20
// 30
// 04
// 14
// 24
// 34
"st3 {v18.8b, v19.8b, v20.8b}, [%[outptr7]], #24 \n" // 02 12 22 32
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[outptr4] "+r"(outptr4),
[outptr5] "+r"(outptr5),
[outptr6] "+r"(outptr6),
[outptr7] "+r"(outptr7),
[stride_h] "+r"(stride_h),
[stride_h_w] "+r"(stride_h_w)
:
: "v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30");
}
const uint8_t* inptr4 = inptr3 + win;
const uint8_t* inptr5 = inptr4 + win;
const uint8_t* inptr6 = inptr5 + win;
const uint8_t* inptr7 = inptr6 + win;
for (; j < w_in; j++) {
int tmpx = (ww - i) * 3;
uint8_t* outptr = dst + j * wout + tmpx;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
}
}
for (; i < h_in; i++) {
const uint8_t* inptr0 = src + i * win;
for (int j = 0; j < w_in; j++) {
uint8_t* outptr0 = dst + j * wout + (w_out - 1 - i) * 3;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
}
}
}
#else
void rotate90_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
int w_out = h_in;
int h_out = w_in;
int win = w_in * 3;
int wout = w_out * 3;
int hremain = h_in % 8;
int stride_h = 4 * win;
int stride_h_w = 4 * win - 24;
int ww = w_out - 8;
// block 8*8. -- 8*8
int i = 0;
for (i = 0; i < h_in - 7; i += 8) {
const uint8_t* inptr0 = src + i * win;
const uint8_t* inptr1 = inptr0 + win;
const uint8_t* inptr2 = inptr1 + win;
const uint8_t* inptr3 = inptr2 + win;
const uint8_t* inptr4 = inptr3 + win;
const uint8_t* inptr5 = inptr4 + win;
const uint8_t* inptr6 = inptr5 + win;
const uint8_t* inptr7 = inptr6 + win;
asm volatile(
"pld [%[ptr0]] @ preload a, 64byte\n"
"pld [%[ptr0], #64] @ preload a, 64byte\n"
"pld [%[ptr1]] @ preload a, 64byte\n"
"pld [%[ptr1], #64] @ preload a, 64byte\n"
"pld [%[ptr2]] @ preload a, 64byte\n"
"pld [%[ptr2], #64] @ preload a, 64byte\n"
"pld [%[ptr3]] @ preload a, 64byte\n"
"pld [%[ptr3], #64] @ preload a, 64byte\n"
"pld [%[ptr4]] @ preload a, 64byte\n"
"pld [%[ptr4], #64] @ preload a, 64byte\n"
"pld [%[ptr5]] @ preload a, 64byte\n"
"pld [%[ptr5], #64] @ preload a, 64byte\n"
"pld [%[ptr6]] @ preload a, 64byte\n"
"pld [%[ptr6], #64] @ preload a, 64byte\n"
"pld [%[ptr7]] @ preload a, 64byte\n"
"pld [%[ptr7], #64] @ preload a, 64byte\n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3),
[ptr4] "r"(inptr4),
[ptr5] "r"(inptr5),
[ptr6] "r"(inptr6),
[ptr7] "r"(inptr7)
: "memory");
int j = 0;
for (; j < w_in; j++) {
int tmpx = (ww - i) * 3;
uint8_t* outptr = dst + j * wout + tmpx;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
}
}
ww = w_out - 1;
for (; i < h_in; i++) {
const uint8_t* inptr0 = src + i * win;
for (int j = 0; j < w_in; j++) {
uint8_t* outptr0 = dst + j * wout + (ww - i) * 3;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
}
}
}
#endif
/*
bgr1 bgr2 bgr3
bgr4 bgr5 bgr6
bgr7 bgr8 bgr9
rotate:
bgr3 bgr6 bgr9
bgr2 bgr5 bgr8
bgr1 bgr4 bgr7
*/
// dst = (h_out - 1) * w_out
// Similar to rotate90: write the rotate90 result out in reverse order, or equivalently rotate90 first and then flip along the y axis
#ifdef __aarch64__
void rotate270_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
int w_out = h_in;
int h_out = w_in;
int win = w_in * 3;
int wout = w_out * 3;
int stride_h = 4 * win;
int stride_h_w = 4 * win - 24;
int hout = h_out - 1;
// block 8*8. -- 8*8
int i = 0;
for (; i < h_in - 7; i += 8) {
const uint8_t* inptr0 = src + i * win;
const uint8_t* inptr1 = inptr0 + win;
const uint8_t* inptr2 = inptr1 + win;
const uint8_t* inptr3 = inptr2 + win;
asm volatile(
"prfm pldl1keep, [%[ptr0]] \n"
"prfm pldl1keep, [%[ptr0], #64] \n"
"prfm pldl1keep, [%[ptr1]] \n"
"prfm pldl1keep, [%[ptr1], #64] \n"
"prfm pldl1keep, [%[ptr2]] \n"
"prfm pldl1keep, [%[ptr2], #64] \n"
"prfm pldl1keep, [%[ptr3]] \n"
"prfm pldl1keep, [%[ptr3], #64] \n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
int j = 0;
for (; j < w_in - 7; j += 8) {
uint8_t* outptr0 = dst + (hout - j) * wout + i * 3;
uint8_t* outptr1 = outptr0 - wout;
uint8_t* outptr2 = outptr1 - wout;
uint8_t* outptr3 = outptr2 - wout;
uint8_t* outptr4 = outptr3 - wout;
uint8_t* outptr5 = outptr4 - wout;
uint8_t* outptr6 = outptr5 - wout;
uint8_t* outptr7 = outptr6 - wout;
asm volatile(
"ld3 {v0.8b, v1.8b, v2.8b}, [%[inptr0]] \n" // v0={00,01,02, 03,
// 04, 05, 06, 07}"
"ld3 {v3.8b, v4.8b, v5.8b}, [%[inptr1]] \n" // v0={10,11,12, 13,
// 14, 15, 16, 17}"
"ld3 {v6.8b, v7.8b, v8.8b}, [%[inptr2]] \n" // v0={20,21,22, 23,
// 24, 25, 26, 27}"
"ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]] \n" // v0={30,31,32,
// 33, 34, 35, 36,
// 37}"
"add %[inptr0], %[inptr0], %[stride_h] \n" // 4 + 4*w_in
"add %[inptr1], %[inptr1], %[stride_h] \n" // 5
"add %[inptr2], %[inptr2], %[stride_h] \n" // 6
"add %[inptr3], %[inptr3], %[stride_h] \n" // 7
// b
"trn1 v12.8b, v0.8b, v3.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v15.8b, v6.8b, v9.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
"trn2 v18.8b, v0.8b, v3.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v21.8b, v6.8b, v9.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
// g
"trn1 v13.8b, v1.8b, v4.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v16.8b, v7.8b, v10.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
"trn2 v19.8b, v1.8b, v4.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v22.8b, v7.8b, v10.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
// r
"trn1 v14.8b, v2.8b, v5.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v17.8b, v8.8b, v11.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
"trn2 v20.8b, v2.8b, v5.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v23.8b, v8.8b, v11.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
// b1
"trn1 v24.4h, v12.4h, v15.4h \n" // v0={00 10 20 30 04 14
// 24 34}
"trn1 v27.4h, v18.4h, v21.4h \n" // v2={01 11 21 31 05 15
// 25 35}
"trn2 v0.4h, v12.4h, v15.4h \n" // v1={02 12 22 32 06 16
// 26 36}
"trn2 v3.4h, v18.4h, v21.4h \n" // v3={03 13 23 33 07 17
// 27 37}
// g1
"trn1 v25.4h, v13.4h, v16.4h \n" // v0={00 10 20 30 04 14
// 24 34}
"trn1 v28.4h, v19.4h, v22.4h \n" // v2={01 11 21 31 05 15
// 25 35}
"trn2 v1.4h, v13.4h, v16.4h \n" // v1={02 12 22 32 06 16
// 26 36}
"trn2 v4.4h, v19.4h, v22.4h \n" // v3={03 13 23 33 07 17
// 27 37}
// r1
"trn1 v26.4h, v14.4h, v17.4h \n" // v0={00 10 20 30 04 14
// 24 34}
"trn1 v29.4h, v20.4h, v23.4h \n" // v2={01 11 21 31 05 15
// 25 35}
"trn2 v2.4h, v14.4h, v17.4h \n" // v1={02 12 22 32 06 16
// 26 36}
"trn2 v5.4h, v20.4h, v23.4h \n" // v3={03 13 23 33 07 17
// 27 37}
"ld3 {v12.8b, v13.8b, v14.8b}, [%[inptr0]] \n" // v0={00,01,02,
// 03, 04, 05, 06,
// 07}"
"ld3 {v15.8b, v16.8b, v17.8b}, [%[inptr1]] \n" // v0={10,11,12,
// 13, 14, 15, 16,
// 17}"
"ld3 {v6.8b, v7.8b, v8.8b}, [%[inptr2]] \n" // v0={20,21,22, 23,
// 24, 25, 26, 27}"
"ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]] \n" // v0={30,31,32,
// 33, 34, 35, 36,
// 37}"
"sub %[inptr0], %[inptr0], %[stride_h_w] \n" // 4 - 4*w_in + 8
"sub %[inptr1], %[inptr1], %[stride_h_w] \n" // 5
"sub %[inptr2], %[inptr2], %[stride_h_w] \n" // 6
"sub %[inptr3], %[inptr3], %[stride_h_w] \n" // 7
// b2
"trn1 v18.8b, v12.8b, v15.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v21.8b, v6.8b, v9.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
// g2
"trn1 v19.8b, v13.8b, v16.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v22.8b, v7.8b, v10.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
// r2
"trn1 v20.8b, v14.8b, v17.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v23.8b, v8.8b, v11.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
"trn2 v12.8b, v12.8b, v15.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v13.8b, v13.8b, v16.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v14.8b, v14.8b, v17.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v15.8b, v6.8b, v9.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
"trn2 v16.8b, v7.8b, v10.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
"trn2 v17.8b, v8.8b, v11.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
// b2
"trn1 v6.4h, v18.4h, v21.4h \n" // v0={00 10 20 30 04 14
// 24 34}
// g2
"trn1 v7.4h, v19.4h, v22.4h \n" // v0={00 10 20 30 04 14
// 24 34}
// r2
"trn1 v8.4h, v20.4h, v23.4h \n" // v0={00 10 20 30 04 14
// 24 34}
// bgr
"trn1 v9.4h, v12.4h, v15.4h \n" // v2={01 11 21 31 05 15
// 25 35}
"trn1 v10.4h, v13.4h, v16.4h \n" // v2={01 11 21 31 05 15
// 25 35}
"trn1 v11.4h, v14.4h, v17.4h \n" // v2={01 11 21 31 05 15
// 25 35}
// bgr
"trn2 v18.4h, v18.4h, v21.4h \n" // v1={02 12 22 32 06 16
// 26 36}
"trn2 v19.4h, v19.4h, v22.4h \n" // v1={02 12 22 32 06 16
// 26 36}
"trn2 v20.4h, v20.4h, v23.4h \n" // v1={02 12 22 32 06 16
// 26 36}
// bgr
"trn2 v21.4h, v12.4h, v15.4h \n" // v3={03 13 23 33 07 17
// 27 37}
"trn2 v22.4h, v13.4h, v16.4h \n" // v3={03 13 23 33 07 17
// 27 37}
"trn2 v23.4h, v14.4h, v17.4h \n" // v3={03 13 23 33 07 17
// 27 37}
// b1 b2
"trn1 v12.2s, v24.2s, v6.2s \n" // v8={00 10 20 30 40 50
// 60 70} b
"trn1 v13.2s, v25.2s, v7.2s \n" // v6={00 10 20 30 40 50
// 60 70} g
"trn1 v14.2s, v26.2s, v8.2s \n" // v6={00 10 20 30 40 50
// 60 70} r
// b1 b2
"trn2 v15.2s, v24.2s, v6.2s \n" // v8={04 14 24 34 44 54
// 64 74} b
"trn2 v16.2s, v25.2s, v7.2s \n" // v6={04 14 24 34 44 54
// 64 74} g
"trn2 v17.2s, v26.2s, v8.2s \n" // v6={04 14 24 34 44 54
// 64 74} r
// b1 b2
"trn1 v6.2s, v27.2s, v9.2s \n" // v8={01 11 20 30 40 50
// 60 70} b
"trn1 v7.2s, v28.2s, v10.2s \n" // v6={01 10 20 30 40 50
// 60 70} g
"trn1 v8.2s, v29.2s, v11.2s \n" // v6={01 10 20 30 40 50
// 60 70} r
// b1 b2
"trn2 v24.2s, v27.2s, v9.2s \n" // v8={05 10 20 30 40 50
// 60 70} b
"trn2 v25.2s, v28.2s, v10.2s \n" // v6={05 10 20 30 40 50
// 60 70} g
"trn2 v26.2s, v29.2s, v11.2s \n" // v6={05 10 20 30 40 50
// 60 70} r
"st3 {v12.8b, v13.8b, v14.8b}, [%[outptr0]], #24 \n" // 00 10 20 30 04 14 24 34
"st3 {v15.8b, v16.8b, v17.8b}, [%[outptr4]], #24 \n" // 02 12 22 32
// b1 b2
"trn1 v9.2s, v0.2s, v18.2s \n" // v8={02 11 20 30 40 50
// 60 70} b
"trn1 v10.2s, v1.2s, v19.2s \n" // v6={02 10 20 30 40 50
// 60 70} g
"trn1 v11.2s, v2.2s, v20.2s \n" // v6={02 10 20 30 40 50
// 60 70} r
"trn2 v27.2s, v0.2s, v18.2s \n" // v8={06 11 20 30 40 50
// 60 70} b
"trn2 v28.2s, v1.2s, v19.2s \n" // v6={06 10 20 30 40 50
// 60 70} g
"trn2 v29.2s, v2.2s, v20.2s \n" // v6={06 10 20 30 40 50
// 60 70} r
// b1 b2
"trn1 v0.2s, v3.2s, v21.2s \n" // v8={03 11 20 30 40 50
// 60 70} b
"trn1 v1.2s, v4.2s, v22.2s \n" // v6={03 10 20 30 40 50
// 60 70} g
"trn1 v2.2s, v5.2s, v23.2s \n" // v6={03 10 20 30 40 50
// 60 70} r
"trn2 v18.2s, v3.2s, v21.2s \n" // v8={07 11 20 30 40 50
// 60 70} b
"trn2 v19.2s, v4.2s, v22.2s \n" // v6={07 10 20 30 40 50
// 60 70} g
"trn2 v20.2s, v5.2s, v23.2s \n" // v6={07 10 20 30 40 50
// 60 70} r
"st3 {v6.8b, v7.8b, v8.8b}, [%[outptr1]], #24 \n" // 00
// 10
// 20
// 30
// 04
// 14
// 24
// 34
"st3 {v24.8b, v25.8b, v26.8b}, [%[outptr5]], #24 \n" // 02 12 22 32
"st3 {v9.8b, v10.8b, v11.8b}, [%[outptr2]], #24 \n" // 00
// 10
// 20
// 30
// 04
// 14
// 24
// 34
"st3 {v27.8b, v28.8b, v29.8b}, [%[outptr6]], #24 \n" // 02 12 22 32
"st3 {v0.8b, v1.8b, v2.8b}, [%[outptr3]], #24 \n" // 00
// 10
// 20
// 30
// 04
// 14
// 24
// 34
"st3 {v18.8b, v19.8b, v20.8b}, [%[outptr7]], #24 \n" // 02 12 22 32
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[outptr4] "+r"(outptr4),
[outptr5] "+r"(outptr5),
[outptr6] "+r"(outptr6),
[outptr7] "+r"(outptr7),
[stride_h] "+r"(stride_h),
[stride_h_w] "+r"(stride_h_w)
:
: "v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29");
}
const uint8_t* inptr4 = inptr3 + win;
const uint8_t* inptr5 = inptr4 + win;
const uint8_t* inptr6 = inptr5 + win;
const uint8_t* inptr7 = inptr6 + win;
for (; j < w_in; j++) {
int tmpx = i * 3;
uint8_t* outptr = dst + (hout - j) * wout + tmpx;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
}
}
for (; i < h_in; i++) {
const uint8_t* inptr0 = src + i * win;
for (int j = 0; j < w_in; j++) {
uint8_t* outptr0 = dst + (hout - j) * wout + i * 3;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
}
}
}
#else
void rotate270_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
int w_out = h_in;
int h_out = w_in;
int win = w_in * 3;
int wout = w_out * 3;
int hremain = h_in % 8;
int stride_h = 4 * win;
int stride_h_w = 4 * win - 24;
int hout = h_out - 1;
// block 8*8. -- 8*8
int i = 0;
for (; i < h_in - 7; i += 8) {
const uint8_t* inptr0 = src + i * win;
const uint8_t* inptr1 = inptr0 + win;
const uint8_t* inptr2 = inptr1 + win;
const uint8_t* inptr3 = inptr2 + win;
const uint8_t* inptr4 = inptr3 + win;
const uint8_t* inptr5 = inptr4 + win;
const uint8_t* inptr6 = inptr5 + win;
const uint8_t* inptr7 = inptr6 + win;
asm volatile(
"pld [%[ptr0]] @ preload a, 64byte\n"
"pld [%[ptr0], #64] @ preload a, 64byte\n"
"pld [%[ptr1]] @ preload a, 64byte\n"
"pld [%[ptr1], #64] @ preload a, 64byte\n"
"pld [%[ptr2]] @ preload a, 64byte\n"
"pld [%[ptr2], #64] @ preload a, 64byte\n"
"pld [%[ptr3]] @ preload a, 64byte\n"
"pld [%[ptr3], #64] @ preload a, 64byte\n"
"pld [%[ptr4]] @ preload a, 64byte\n"
"pld [%[ptr4], #64] @ preload a, 64byte\n"
"pld [%[ptr5]] @ preload a, 64byte\n"
"pld [%[ptr5], #64] @ preload a, 64byte\n"
"pld [%[ptr6]] @ preload a, 64byte\n"
"pld [%[ptr6], #64] @ preload a, 64byte\n"
"pld [%[ptr7]] @ preload a, 64byte\n"
"pld [%[ptr7], #64] @ preload a, 64byte\n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3),
[ptr4] "r"(inptr4),
[ptr5] "r"(inptr5),
[ptr6] "r"(inptr6),
[ptr7] "r"(inptr7)
: "memory");
int j = 0;
for (; j < w_in; j++) {
int tmpx = i * 3;
uint8_t* outptr = dst + (hout - j) * wout + tmpx;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
}
}
for (; i < h_in; i++) {
const uint8_t* inptr0 = src + i * win;
for (int j = 0; j < w_in; j++) {
uint8_t* outptr0 = dst + (hout - j) * wout + i * 3;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
}
}
}
#endif
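// The mapping realized by the rotate270_hwc paths above (a sketch for
// reference): with w_out = h_in and h_out = w_in, every pixel moves as
//   dst(h_out - 1 - j, i, c) = src(i, j, c), c in {b, g, r},
// i.e. a 90-degree counter-clockwise rotation of the HWC image.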
/*
bgr1 bgr2 bgr3
bgr4 bgr5 bgr6
bgr7 bgr8 bgr9
rotate:
bgr9 bgr8 bgr7
bgr6 bgr5 bgr4
bgr3 bgr2 bgr1
*/
// flip both x and y (i.e. rotate 180 degrees)
#ifdef __aarch64__
void rotate180_hwc(const uint8_t* src, uint8_t* dst, int w, int h_in) {
int w_in = w * 3;
uint8_t zerobuff[w_in]; // NOLINT
memset(zerobuff, 0, w_in * sizeof(uint8_t));
int stride_w = 24;
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * w_in;
const uint8_t* inptr1 = inptr0 + w_in;
const uint8_t* inptr2 = inptr1 + w_in;
const uint8_t* inptr3 = inptr2 + w_in;
uint8_t* outptr0 = dst + (h_in - i) * w_in - stride_w; // last col
uint8_t* outptr1 = outptr0 - w_in;
uint8_t* outptr2 = outptr1 - w_in;
uint8_t* outptr3 = outptr2 - w_in;
asm volatile(
"prfm pldl1keep, [%[ptr0]] \n"
"prfm pldl1keep, [%[ptr1]] \n"
"prfm pldl1keep, [%[ptr2]] \n"
"prfm pldl1keep, [%[ptr3]] \n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
int j = 0;
for (; j < w - 7; j += 8) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
outptr0 = zerobuff;
case 2:
inptr1 = zerobuff;
outptr1 = zerobuff;
case 1:
inptr2 = zerobuff;
outptr2 = zerobuff;
case 0:
inptr3 = zerobuff;
outptr3 = zerobuff;
default:
break;
}
}
asm volatile(
"ld3 {v0.8b, v1.8b, v2.8b}, [%[inptr0]], #24 \n" // v0={00,01,02,
// 03, 04, 05,
// 06, 07}"
"ld3 {v3.8b, v4.8b, v5.8b}, [%[inptr1]], #24 \n" // v0={10,11,12,
// 13, 14, 15,
// 16, 17}"
"ld3 {v6.8b, v7.8b, v8.8b}, [%[inptr2]], #24 \n" // v0={20,21,22,
// 23, 24, 25,
// 26, 27}"
"ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]], #24 \n" // v0={30,31,32,
// 33, 34, 35,
// 36, 37}"
"rev64 v12.8b, v0.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 b
"rev64 v13.8b, v1.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 g
"rev64 v14.8b, v2.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 r
"rev64 v15.8b, v3.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v16.8b, v4.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v17.8b, v5.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v18.8b, v6.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v19.8b, v7.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v20.8b, v8.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v21.8b, v9.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v22.8b, v10.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v23.8b, v11.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"prfm pldl1keep, [%[inptr0]] \n"
"prfm pldl1keep, [%[inptr1]] \n"
"prfm pldl1keep, [%[inptr2]] \n"
"prfm pldl1keep, [%[inptr3]] \n"
"st3 {v12.8b, v13.8b, v14.8b}, [%[outptr0]] \n" // 00 10
// 20 30
// 04 14
// 24 34
"st3 {v15.8b, v16.8b, v17.8b}, [%[outptr1]] \n" // 02 12
// 22 32
"st3 {v18.8b, v19.8b, v20.8b}, [%[outptr2]] \n" // 01 11
// 21 31
"st3 {v21.8b, v22.8b, v23.8b}, [%[outptr3]] \n" // 03 13
// 23 33
"sub %[outptr0], %[outptr0], %[stride_w] \n" //@ ptr - stride_w
"sub %[outptr1], %[outptr1], %[stride_w] \n"
"sub %[outptr2], %[outptr2], %[stride_w] \n"
"sub %[outptr3], %[outptr3], %[stride_w] \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[stride_w] "+r"(stride_w)
:
: "v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23");
}
outptr3 += stride_w - 3;
outptr2 += stride_w - 3;
outptr1 += stride_w - 3;
outptr0 += stride_w - 3;
for (; j < w; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 6;
case 1:
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 6;
case 2:
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 6;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
outptr3 -= 6;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 6;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 6;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 6;
}
}
}
}
#else
void rotate180_hwc(const uint8_t* src, uint8_t* dst, int w, int h_in) {
int w_in = w * 3;
uint8_t zerobuff[w_in]; // NOLINT
memset(zerobuff, 0, w_in * sizeof(uint8_t));
int stride_w = 24;
// 4*8
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * w_in;
const uint8_t* inptr1 = inptr0 + w_in;
const uint8_t* inptr2 = inptr1 + w_in;
const uint8_t* inptr3 = inptr2 + w_in;
uint8_t* outptr0 = dst + (h_in - i) * w_in - stride_w; // last
uint8_t* outptr1 = outptr0 - w_in;
uint8_t* outptr2 = outptr1 - w_in;
uint8_t* outptr3 = outptr2 - w_in;
asm volatile(
"pld [%[ptr0]] @ preload a, 64byte\n"
"pld [%[ptr1]] @ preload a, 64byte\n"
"pld [%[ptr2]] @ preload a, 64byte\n"
"pld [%[ptr3]] @ preload a, 64byte\n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
int j = 0;
for (; j < w - 7; j += 8) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
outptr0 = zerobuff;
case 2:
inptr1 = zerobuff;
outptr1 = zerobuff;
case 1:
inptr2 = zerobuff;
outptr2 = zerobuff;
case 0:
inptr3 = zerobuff;
outptr3 = zerobuff;
default:
break;
}
}
asm volatile(
"vld3.8 {d0, d1, d2}, [%[inptr0]]! @ zip load r0, d0 =00 01 02 03 "
"04 05 06 07\n"
"vld3.8 {d3, d4, d5}, [%[inptr1]]! @ zip load r1, d2 =10 11 12 13 "
"14 15 16 17\n"
"vld3.8 {d6, d7, d8}, [%[inptr2]]! @ zip load r1, d4 =20 21 22 23 "
"24 25 26 27\n"
"vld3.8 {d9, d10, d11}, [%[inptr3]]! @ zip load r1, d6 = 30 31 32 "
"33 34 35 36 37\n"
"vrev64.8 d12, d0 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d13, d1 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d14, d2 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d15, d3 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d16, d4 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d17, d5 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d18, d6 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d19, d7 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d20, d8 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d21, d9 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d22, d10 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d23, d11 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"pld [%[inptr0]] @ preload a, 64byte\n"
"pld [%[inptr1]] @ preload a, 64byte\n"
"pld [%[inptr2]] @ preload a, 64byte\n"
"pld [%[inptr3]] @ preload a, 64byte\n"
"vst3.8 {d12, d13, d14}, [%[outptr0]] @ write "
"d0(q0,low),r00,r10 20 30\n"
"vst3.8 {d15, d16, d17}, [%[outptr1]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"vst3.8 {d18, d19, d20}, [%[outptr2]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"vst3.8 {d21, d22, d23}, [%[outptr3]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"sub %[outptr0], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr1], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr2], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr3], %[stride_w] @ ptr - stride_w \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[stride_w] "+r"(stride_w)
:
: "q0",
"q1",
"q2",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12");
}
outptr3 += stride_w - 3;
outptr2 += stride_w - 3;
outptr1 += stride_w - 3;
outptr0 += stride_w - 3;
for (; j < w; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 6;
case 1:
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 6;
case 2:
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 6;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
outptr3 -= 6;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 6;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 6;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 6;
}
}
}
}
#endif
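// A minimal scalar sketch of what the rotate180_hwc paths above compute for
// 3-channel HWC data: dst(i, j, c) = src(h_in - 1 - i, w - 1 - j, c).
// Illustrative reference only; the helper name below is not part of the
// original file.
static void rotate180_hwc_ref(const uint8_t* src, uint8_t* dst, int w, int h_in) {
  for (int i = 0; i < h_in; i++) {
    for (int j = 0; j < w; j++) {
      const uint8_t* sp = src + ((h_in - 1 - i) * w + (w - 1 - j)) * 3;
      uint8_t* dp = dst + (i * w + j) * 3;
      dp[0] = sp[0];  // b
      dp[1] = sp[1];  // g
      dp[2] = sp[2];  // r
    }
  }
}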
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/tests/cv/anakin/cv_utils.h"
void bgr_to_tensor_hwc(const uint8_t* bgr,
Tensor& output, // NOLINT
int width,
int height,
float* means,
float* scales) {
int size = width * height;
float* ptr0 = output.mutable_data<float>();
float r_means = means[0];
float g_means = means[1];
float b_means = means[2];
float r_scales = scales[0];
float g_scales = scales[1];
float b_scales = scales[2];
int w = width;
int dim8 = w >> 3;
int remain = w - (dim8 << 3);
float32x4_t vrmean = vdupq_n_f32(r_means);
float32x4_t vgmean = vdupq_n_f32(g_means);
float32x4_t vbmean = vdupq_n_f32(b_means);
float32x4_t vrscale = vdupq_n_f32(r_scales);
float32x4_t vgscale = vdupq_n_f32(g_scales);
float32x4_t vbscale = vdupq_n_f32(b_scales);
for (int i = 0; i < height; i++) {
const uint8_t* ptr_bgr = bgr + i * width * 3;
float* ptr0_b = ptr0 + i * width;
float* ptr1_g = ptr0_b + size;
float* ptr2_r = ptr1_g + size;
for (int j = 0; j < dim8; j++) {
uint8x8x3_t vbgr = vld3_u8(ptr_bgr);
uint8x8_t vb = vbgr.val[0];
uint8x8_t vg = vbgr.val[1];
uint8x8_t vr = vbgr.val[2];
uint16x8_t vb_16 = vmovl_u8(vb);
uint16x8_t vg_16 = vmovl_u8(vg);
uint16x8_t vr_16 = vmovl_u8(vr);
uint32x4_t vb_low_32 = vmovl_u16(vget_low_u16(vb_16));
uint32x4_t vg_low_32 = vmovl_u16(vget_low_u16(vg_16));
uint32x4_t vr_low_32 = vmovl_u16(vget_low_u16(vr_16));
uint32x4_t vb_high_32 = vmovl_u16(vget_high_u16(vb_16));
uint32x4_t vg_high_32 = vmovl_u16(vget_high_u16(vg_16));
uint32x4_t vr_high_32 = vmovl_u16(vget_high_u16(vr_16));
float32x4_t vb_low_f32 = vcvtq_f32_u32(vb_low_32);
float32x4_t vr_low_f32 = vcvtq_f32_u32(vr_low_32);
float32x4_t vg_low_f32 = vcvtq_f32_u32(vg_low_32);
float32x4_t vb_high_f32 = vcvtq_f32_u32(vb_high_32);
float32x4_t vg_high_f32 = vcvtq_f32_u32(vg_high_32);
float32x4_t vr_high_f32 = vcvtq_f32_u32(vr_high_32);
vb_low_f32 = vsubq_f32(vb_low_f32, vbmean);
vg_low_f32 = vsubq_f32(vg_low_f32, vgmean);
vr_low_f32 = vsubq_f32(vr_low_f32, vrmean);
vb_high_f32 = vsubq_f32(vb_high_f32, vbmean);
vg_high_f32 = vsubq_f32(vg_high_f32, vgmean);
vr_high_f32 = vsubq_f32(vr_high_f32, vrmean);
vb_low_f32 = vmulq_f32(vb_low_f32, vbscale);
vg_low_f32 = vmulq_f32(vg_low_f32, vgscale);
vr_low_f32 = vmulq_f32(vr_low_f32, vrscale);
vb_high_f32 = vmulq_f32(vb_high_f32, vbscale);
vg_high_f32 = vmulq_f32(vg_high_f32, vgscale);
vr_high_f32 = vmulq_f32(vr_high_f32, vrscale);
vst1q_f32(ptr0_b, vb_low_f32);
vst1q_f32(ptr1_g, vg_low_f32);
vst1q_f32(ptr2_r, vr_low_f32);
ptr_bgr += 24;
vst1q_f32(ptr0_b + 4, vb_high_f32);
vst1q_f32(ptr1_g + 4, vg_high_f32);
vst1q_f32(ptr2_r + 4, vr_high_f32);
ptr0_b += 8;
ptr1_g += 8;
ptr2_r += 8;
}
for (int j = 0; j < remain; j++) {
*ptr0_b++ = (*ptr_bgr - b_means) * b_scales; // NOLINT
ptr_bgr++;
*ptr1_g++ = (*ptr_bgr - g_means) * g_scales; // NOLINT
ptr_bgr++;
*ptr2_r++ = (*ptr_bgr - r_means) * r_scales; // NOLINT
ptr_bgr++;
}
}
}
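// Hedged usage sketch for bgr_to_tensor_hwc (the Tensor setup below is an
// assumption about how a lite test tensor is typically prepared, not code
// taken from this file):
//   Tensor out;
//   out.Resize({1, 3, height, width});        // planar float output
//   float means[3] = {127.5f, 127.5f, 127.5f};
//   float scales[3] = {1.f / 127.5f, 1.f / 127.5f, 1.f / 127.5f};
//   bgr_to_tensor_hwc(bgr_ptr, out, width, height, means, scales);
// Each output value is (pixel - mean) * scale, with the interleaved BGR rows
// split into three contiguous planes of width * height floats.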
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/tests/cv/anakin/cv_utils.h"
void flip_x_hwc_bgra(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
void flip_y_hwc_bgra(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
void flip_xy_hwc_bgra(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
// x: flip_num = 1 y: flip_num = -1 xy: flip_num = 0;
void bgra_flip_hwc(
const uint8_t* src, uint8_t* dst, int w_in, int h_in, int flip_num) {
if (flip_num == 1) { // x
flip_x_hwc_bgra(src, dst, w_in, h_in);
}
if (flip_num == -1) { // y
flip_y_hwc_bgra(src, dst, w_in, h_in);
}
if (flip_num == 0) { // xy
flip_xy_hwc_bgra(src, dst, w_in, h_in);
}
}
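// Hedged usage sketch of the flip_num convention above (buffers are
// placeholders; dst must hold w_in * h_in * 4 bytes):
//   bgra_flip_hwc(src, dst, w_in, h_in, 1);   // x: flip up/down
//   bgra_flip_hwc(src, dst, w_in, h_in, -1);  // y: flip left/right
//   bgra_flip_hwc(src, dst, w_in, h_in, 0);   // xy: flip both axes (180-degree rotation)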
/*
bgr1 bgr2 bgr3
bgr4 bgr5 bgr6
bgr7 bgr8 bgr9
rotate:
bgr7 bgr8 bgr9
bgr4 bgr5 bgr6
bgr1 bgr2 bgr3
*/
#ifdef __aarch64__
void flip_x_hwc_bgra(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
int h = h_in - 1;
int win = w_in * 4;
uint8_t zerobuff[win]; // NOLINT
memset(zerobuff, 0, win * sizeof(uint8_t));
uint8_t zerobuff2[win]; // NOLINT
memset(zerobuff2, 0, win * sizeof(uint8_t));
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * win;
const uint8_t* inptr1 = inptr0 + win;
const uint8_t* inptr2 = inptr1 + win;
const uint8_t* inptr3 = inptr2 + win;
uint8_t* outptr0 = dst + (h - i) * win; // last
uint8_t* outptr1 = outptr0 - win;
uint8_t* outptr2 = outptr1 - win;
uint8_t* outptr3 = outptr2 - win;
asm volatile(
"prfm pldl1keep, [%[ptr0]] \n"
"prfm pldl1keep, [%[ptr1]] \n"
"prfm pldl1keep, [%[ptr2]] \n"
"prfm pldl1keep, [%[ptr3]] \n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
outptr0 = zerobuff2;
case 2:
inptr1 = zerobuff;
outptr1 = zerobuff2;
case 1:
inptr2 = zerobuff;
outptr2 = zerobuff2;
case 0:
inptr3 = zerobuff;
outptr3 = zerobuff2;
default:
break;
}
}
int j = 0;
for (; j < w_in - 7; j += 8) {
asm volatile(
"ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[inptr0]], #32 \n" // v0={00,01,02,
// 03,
// 04,
// 05,
// 06,
// 07}"
"ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [%[inptr1]], #32 \n" // v0={10,11,12,
// 13,
// 14,
// 15,
// 16,
// 17}"
"ld4 {v8.8b, v9.8b, v10.8b, v11.8b}, [%[inptr2]], #32 \n" // v0={20,21,22,
// 23,
// 24,
// 25,
// 26,
// 27}"
"ld4 {v12.8b, v13.8b, v14.8b, v15.8b}, [%[inptr3]], #32 \n" // v0={30,31,32,
// 33,
// 34,
// 35,
// 36,
// 37}"
"prfm pldl1keep, [%[inptr0]] \n"
"prfm pldl1keep, [%[inptr1]] \n"
"prfm pldl1keep, [%[inptr2]] \n"
"prfm pldl1keep, [%[inptr3]] \n"
"st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[outptr0]], #32 \n" // 00 10 20
// 30 04 14
// 24 34
"st4 {v4.8b, v5.8b, v6.8b, v7.8b}, [%[outptr1]], #32 \n" // 02 12 22 32
"st4 {v8.8b, v9.8b, v10.8b, v11.8b}, [%[outptr2]], #32 \n" // 01 11 21 31
"st4 {v12.8b, v13.8b, v14.8b, v15.8b}, [%[outptr3]], #32 "
" \n" // 03 13 23 33
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3)
:
: "v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15");
}
for (; j < w_in; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
case 1:
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
case 2:
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
}
}
}
}
#else
void flip_x_hwc_bgra(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
// uint8_t zerobuff[8] = {0, 0, 0, 0, 0, 0, 0, 0};
int win = w_in * 4;
uint8_t zerobuff[win]; // NOLINT
memset(zerobuff, 0, win * sizeof(uint8_t));
uint8_t zerobuff2[win]; // NOLINT
memset(zerobuff2, 0, win * sizeof(uint8_t));
int h = h_in - 1;
// 4*8
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * win;
const uint8_t* inptr1 = inptr0 + win;
const uint8_t* inptr2 = inptr1 + win;
const uint8_t* inptr3 = inptr2 + win;
uint8_t* outptr0 = dst + (h - i) * win; // last
uint8_t* outptr1 = outptr0 - win;
uint8_t* outptr2 = outptr1 - win;
uint8_t* outptr3 = outptr2 - win;
asm volatile(
"pld [%[ptr0]] @ preload a, 64byte\n"
"pld [%[ptr1]] @ preload a, 64byte\n"
"pld [%[ptr2]] @ preload a, 64byte\n"
"pld [%[ptr3]] @ preload a, 64byte\n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
outptr0 = zerobuff2;
case 2:
inptr1 = zerobuff;
outptr1 = zerobuff2;
case 1:
inptr2 = zerobuff;
outptr2 = zerobuff2;
case 0:
inptr3 = zerobuff;
outptr3 = zerobuff2;
default:
break;
}
}
int j = 0;
for (; j < w_in - 7; j += 8) {
asm volatile(
"vld4.8 {d0, d1, d2, d3}, [%[inptr0]]! @ zip load r0, d0 =00 01 "
"02 03 04 05 06 07\n"
"vld4.8 {d4, d5, d6, d7}, [%[inptr1]]! @ zip load r1, d2 =10 11 "
"12 13 14 15 16 17\n"
"vld4.8 {d8, d9, d10, d11}, [%[inptr2]]! @ zip load r1, d4 =20 21 "
"22 23 24 25 26 27\n"
"vld4.8 {d12, d13, d14, d15}, [%[inptr3]]! @ zip load r1, d6 = 30 "
"31 32 33 34 35 36 37\n"
"pld [%[inptr0]] @ preload a, 64byte\n"
"pld [%[inptr1]] @ preload a, 64byte\n"
"pld [%[inptr2]] @ preload a, 64byte\n"
"pld [%[inptr3]] @ preload a, 64byte\n"
"vst4.8 {d0, d1, d2, d3}, [%[outptr0]]! @ write "
"d0(q0,low),r00,r10 20 30\n"
"vst4.8 {d4, d5, d6, d7}, [%[outptr1]]! @ write "
"d4(q0,low),r01,r11 21 31\n"
"vst4.8 {d8, d9, d10, d11}, [%[outptr2]]! @ write "
"d4(q0,low),r01,r11 21 31\n"
"vst4.8 {d12, d13, d14, d15}, [%[outptr3]]! @ write "
"d4(q0,low),r01,r11 21 31\n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3)
:
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
}
for (; j < w_in; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
case 1:
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
case 2:
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
}
}
}
}
#endif
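// The mapping realized by the flip_x_hwc_bgra paths above (sketch for
// reference): dst(h_in - 1 - i, j, c) = src(i, j, c), c in {b, g, r, a},
// i.e. rows are exchanged top-to-bottom while pixels inside a row keep order.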
/*
bgr1 bgr2 bgr3
bgr4 bgr5 bgr6
bgr7 bgr8 bgr9
flip:
bgr3 bgr2 bgr1
bgr6 bgr5 bgr4
bgr9 bgr8 bgr7
*/
#ifdef __aarch64__
void flip_y_hwc_bgra(const uint8_t* src, uint8_t* dst, int w, int h_in) {
// uint8_t zerobuff[24] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
// 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int w_in = w * 4;
uint8_t zerobuff[w_in]; // NOLINT
memset(zerobuff, 0, w_in * sizeof(uint8_t));
uint8_t zerobuff2[w_in]; // NOLINT
memset(zerobuff2, 0, w_in * sizeof(uint8_t));
int stride_w = 32;
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * w_in;
const uint8_t* inptr1 = inptr0 + w_in;
const uint8_t* inptr2 = inptr1 + w_in;
const uint8_t* inptr3 = inptr2 + w_in;
uint8_t* outptr0 = dst + (i + 1) * w_in - stride_w; // last col
uint8_t* outptr1 = outptr0 + w_in;
uint8_t* outptr2 = outptr1 + w_in;
uint8_t* outptr3 = outptr2 + w_in;
asm volatile(
"prfm pldl1keep, [%[ptr0]] \n"
"prfm pldl1keep, [%[ptr1]] \n"
"prfm pldl1keep, [%[ptr2]] \n"
"prfm pldl1keep, [%[ptr3]] \n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
outptr0 = zerobuff2;
case 2:
inptr1 = zerobuff;
outptr1 = zerobuff2;
case 1:
inptr2 = zerobuff;
outptr2 = zerobuff2;
case 0:
inptr3 = zerobuff;
outptr3 = zerobuff2;
default:
break;
}
}
int j = 0;
for (; j < w - 7; j += 8) {
asm volatile(
"ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[inptr0]], #32 \n" // v0={00,01,02,
// 03,
// 04,
// 05,
// 06,
// 07}"
"ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [%[inptr1]], #32 \n" // v0={10,11,12,
// 13,
// 14,
// 15,
// 16,
// 17}"
"ld4 {v8.8b, v9.8b, v10.8b, v11.8b}, [%[inptr2]], #32 \n" // v0={20,21,22,
// 23,
// 24,
// 25,
// 26,
// 27}"
"ld4 {v12.8b, v13.8b, v14.8b, v15.8b}, [%[inptr3]], #32 \n" // v0={30,31,32,
// 33,
// 34,
// 35,
// 36,
// 37}"
"rev64 v16.8b, v0.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 b
"rev64 v17.8b, v1.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 g
"rev64 v18.8b, v2.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 r
"rev64 v19.8b, v3.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v20.8b, v4.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v21.8b, v5.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v22.8b, v6.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v23.8b, v7.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v0.8b, v8.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00
"rev64 v1.8b, v9.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00
"rev64 v2.8b, v10.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v3.8b, v11.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v4.8b, v12.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v5.8b, v13.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v6.8b, v14.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v7.8b, v15.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"prfm pldl1keep, [%[inptr0]] \n"
"prfm pldl1keep, [%[inptr1]] \n"
"prfm pldl1keep, [%[inptr2]] \n"
"prfm pldl1keep, [%[inptr3]] \n"
"st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[outptr0]] \n" // 00 10 20 30 04 14 24 34
"st4 {v20.8b, v21.8b, v22.8b, v23.8b}, [%[outptr1]] \n" // 02 12 22 32
"st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[outptr2]] \n" // 01
// 11
// 21
// 31
"st4 {v4.8b, v5.8b, v6.8b, v7.8b}, [%[outptr3]] \n" // 03 13 23 33
"sub %[outptr0], %[outptr0], %[stride_w] \n" //@ ptr - stride_w
"sub %[outptr1], %[outptr1], %[stride_w] \n"
"sub %[outptr2], %[outptr2], %[stride_w] \n"
"sub %[outptr3], %[outptr3], %[stride_w] \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[stride_w] "+r"(stride_w)
:
: "v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23");
}
outptr3 += stride_w - 4;
outptr2 += stride_w - 4;
outptr1 += stride_w - 4;
outptr0 += stride_w - 4;
for (; j < w; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 8;
case 1:
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 8;
case 2:
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 8;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
outptr3 -= 8;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 8;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 8;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 8;
}
}
}
}
#else
void flip_y_hwc_bgra(const uint8_t* src, uint8_t* dst, int w, int h_in) {
int w_in = w * 4;
uint8_t zerobuff[w_in]; // NOLINT
memset(zerobuff, 0, w_in * sizeof(uint8_t));
uint8_t zerobuff2[w_in]; // NOLINT
memset(zerobuff2, 0, w_in * sizeof(uint8_t));
int stride_w = 32;
// 4*8
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * w_in;
const uint8_t* inptr1 = inptr0 + w_in;
const uint8_t* inptr2 = inptr1 + w_in;
const uint8_t* inptr3 = inptr2 + w_in;
uint8_t* outptr0 = dst + (i + 1) * w_in - stride_w; // last
uint8_t* outptr1 = outptr0 + w_in;
uint8_t* outptr2 = outptr1 + w_in;
uint8_t* outptr3 = outptr2 + w_in;
asm volatile(
"pld [%[ptr0]] @ preload a, 64byte\n"
"pld [%[ptr1]] @ preload a, 64byte\n"
"pld [%[ptr2]] @ preload a, 64byte\n"
"pld [%[ptr3]] @ preload a, 64byte\n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
int j = 0;
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
outptr0 = zerobuff2;
case 2:
inptr1 = zerobuff;
outptr1 = zerobuff2;
case 1:
inptr2 = zerobuff;
outptr2 = zerobuff2;
case 0:
inptr3 = zerobuff;
outptr3 = zerobuff2;
default:
break;
}
}
for (; j < w - 7; j += 8) {
asm volatile(
"vld4.8 {d0, d1, d2, d3}, [%[inptr0]]! @ zip load r0, d0 =00 01 "
"02 03 04 05 06 07\n"
"vld4.8 {d4, d5, d6, d7}, [%[inptr1]]! @ zip load r1, d2 =10 11 "
"12 13 14 15 16 17\n"
"vld4.8 {d8, d9, d10, d11}, [%[inptr2]]! @ zip load r1, d4 =20 21 "
"22 23 24 25 26 27\n"
"vld4.8 {d12, d13, d14, d15}, [%[inptr3]]! @ zip load r1, d6 = 30 "
"31 32 33 34 35 36 37\n"
"vrev64.8 d16, d0 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d17, d1 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d18, d2 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d19, d3 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d20, d4 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d21, d5 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d22, d6 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d23, d7 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d0, d8 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d1, d9 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d2, d10 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d3, d11 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d4, d12 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d5, d13 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d6, d14 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d7, d15 @ reverse 07 06 05 04 03 02 01 00 \n"
"pld [%[inptr0]] @ preload a, 64byte\n"
"pld [%[inptr1]] @ preload a, 64byte\n"
"pld [%[inptr2]] @ preload a, 64byte\n"
"pld [%[inptr3]] @ preload a, 64byte\n"
"vst4.8 {d16, d17, d18, d19}, [%[outptr0]] @ write "
"d0(q0,low),r00,r10 20 30\n"
"vst4.8 {d20, d21, d22, d23}, [%[outptr1]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"vst4.8 {d0, d1, d2, d3}, [%[outptr2]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"vst4.8 {d4, d5, d6, d7}, [%[outptr3]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"sub %[outptr0], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr1], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr2], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr3], %[stride_w] @ ptr - stride_w \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[stride_w] "+r"(stride_w)
:
: "q0",
"q1",
"q2",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12");
}
outptr3 += stride_w - 4;
outptr2 += stride_w - 4;
outptr1 += stride_w - 4;
outptr0 += stride_w - 4;
for (; j < w; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 8;
case 1:
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 8;
case 2:
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 8;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
outptr3 -= 8;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 8;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 8;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 8;
}
}
}
}
#endif
/*
bgr1 bgr2 bgr3
bgr4 bgr5 bgr6
bgr7 bgr8 bgr9
flip:
bgr9 bgr8 bgr7
bgr6 bgr5 bgr4
bgr3 bgr2 bgr1
*/
#ifdef __aarch64__
void flip_xy_hwc_bgra(const uint8_t* src, uint8_t* dst, int w, int h_in) {
int stride_w = 32;
int w_in = w * 4;
uint8_t zerobuff[w_in]; // NOLINT
memset(zerobuff, 0, w_in * sizeof(uint8_t));
uint8_t zerobuff2[w_in]; // NOLINT
memset(zerobuff2, 0, w_in * sizeof(uint8_t));
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * w_in;
const uint8_t* inptr1 = inptr0 + w_in;
const uint8_t* inptr2 = inptr1 + w_in;
const uint8_t* inptr3 = inptr2 + w_in;
uint8_t* outptr0 = dst + (h_in - i) * w_in - stride_w; // last col
uint8_t* outptr1 = outptr0 - w_in;
uint8_t* outptr2 = outptr1 - w_in;
uint8_t* outptr3 = outptr2 - w_in;
asm volatile(
"prfm pldl1keep, [%[ptr0]] \n"
"prfm pldl1keep, [%[ptr1]] \n"
"prfm pldl1keep, [%[ptr2]] \n"
"prfm pldl1keep, [%[ptr3]] \n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
outptr0 = zerobuff2;
case 2:
inptr1 = zerobuff;
outptr1 = zerobuff2;
case 1:
inptr2 = zerobuff;
outptr2 = zerobuff2;
case 0:
inptr3 = zerobuff;
outptr3 = zerobuff2;
default:
break;
}
}
int j = 0;
for (; j < w - 7; j += 8) {
asm volatile(
"ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[inptr0]], #32 \n" // v0={00,01,02,
// 03,
// 04,
// 05,
// 06,
// 07}"
"ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [%[inptr1]], #32 \n" // v0={10,11,12,
// 13,
// 14,
// 15,
// 16,
// 17}"
"ld4 {v8.8b, v9.8b, v10.8b, v11.8b}, [%[inptr2]], #32 \n" // v0={20,21,22,
// 23,
// 24,
// 25,
// 26,
// 27}"
"ld4 {v12.8b, v13.8b, v14.8b, v15.8b}, [%[inptr3]], #32 \n" // v0={30,31,32,
// 33,
// 34,
// 35,
// 36,
// 37}"
"rev64 v16.8b, v0.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 b
"rev64 v17.8b, v1.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 g
"rev64 v18.8b, v2.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 r
"rev64 v19.8b, v3.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v20.8b, v4.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v21.8b, v5.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v22.8b, v6.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v23.8b, v7.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v0.8b, v8.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00
"rev64 v1.8b, v9.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00
"rev64 v2.8b, v10.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v3.8b, v11.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v4.8b, v12.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v5.8b, v13.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v6.8b, v14.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v7.8b, v15.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"prfm pldl1keep, [%[inptr0]] \n"
"prfm pldl1keep, [%[inptr1]] \n"
"prfm pldl1keep, [%[inptr2]] \n"
"prfm pldl1keep, [%[inptr3]] \n"
"st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[outptr0]] \n" // 00 10 20 30 04 14 24 34
"st4 {v20.8b, v21.8b, v22.8b, v23.8b}, [%[outptr1]] \n" // 02 12 22 32
"st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[outptr2]] \n" // 01
// 11
// 21
// 31
"st4 {v4.8b, v5.8b, v6.8b, v7.8b}, [%[outptr3]] \n" // 03 13 23 33
"sub %[outptr0], %[outptr0], %[stride_w] \n" //@ ptr - stride_w
"sub %[outptr1], %[outptr1], %[stride_w] \n"
"sub %[outptr2], %[outptr2], %[stride_w] \n"
"sub %[outptr3], %[outptr3], %[stride_w] \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[stride_w] "+r"(stride_w)
:
: "v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23");
}
outptr3 += stride_w - 4;
outptr2 += stride_w - 4;
outptr1 += stride_w - 4;
outptr0 += stride_w - 4;
for (; j < w; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 8;
case 1:
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 8;
case 2:
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 8;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
outptr3 -= 8;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 8;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 8;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 8;
}
}
}
}
#else
void flip_xy_hwc_bgra(const uint8_t* src, uint8_t* dst, int w, int h_in) {
int w_in = w * 4;
uint8_t zerobuff[w_in]; // NOLINT
memset(zerobuff, 0, w_in * sizeof(uint8_t));
uint8_t zerobuff2[w_in]; // NOLINT
memset(zerobuff2, 0, w_in * sizeof(uint8_t));
int stride_w = 32;
// 4*8
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * w_in;
const uint8_t* inptr1 = inptr0 + w_in;
const uint8_t* inptr2 = inptr1 + w_in;
const uint8_t* inptr3 = inptr2 + w_in;
uint8_t* outptr0 = dst + (h_in - i) * w_in - stride_w; // last
uint8_t* outptr1 = outptr0 - w_in;
uint8_t* outptr2 = outptr1 - w_in;
uint8_t* outptr3 = outptr2 - w_in;
asm volatile(
"pld [%[ptr0]] @ preload a, 64byte\n"
"pld [%[ptr1]] @ preload a, 64byte\n"
"pld [%[ptr2]] @ preload a, 64byte\n"
"pld [%[ptr3]] @ preload a, 64byte\n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
outptr0 = zerobuff2;
case 2:
inptr1 = zerobuff;
outptr1 = zerobuff2;
case 1:
inptr2 = zerobuff;
outptr2 = zerobuff2;
case 0:
inptr3 = zerobuff;
outptr3 = zerobuff2;
default:
break;
}
}
int j = 0;
for (; j < w - 7; j += 8) {
asm volatile(
"vld4.8 {d0, d1, d2, d3}, [%[inptr0]]! @ zip load r0, d0 =00 01 "
"02 03 04 05 06 07\n"
"vld4.8 {d4, d5, d6, d7}, [%[inptr1]]! @ zip load r1, d2 =10 11 "
"12 13 14 15 16 17\n"
"vld4.8 {d8, d9, d10, d11}, [%[inptr2]]! @ zip load r1, d4 =20 21 "
"22 23 24 25 26 27\n"
"vld4.8 {d12, d13, d14, d15}, [%[inptr3]]! @ zip load r1, d6 = 30 "
"31 32 33 34 35 36 37\n"
"vrev64.8 d16, d0 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d17, d1 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d18, d2 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d19, d3 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d20, d4 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d21, d5 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d22, d6 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d23, d7 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d0, d8 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d1, d9 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d2, d10 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d3, d11 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d4, d12 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d5, d13 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d6, d14 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d7, d15 @ reverse 07 06 05 04 03 02 01 00 \n"
"pld [%[inptr0]] @ preload a, 64byte\n"
"pld [%[inptr1]] @ preload a, 64byte\n"
"pld [%[inptr2]] @ preload a, 64byte\n"
"pld [%[inptr3]] @ preload a, 64byte\n"
"vst4.8 {d16, d17, d18, d19}, [%[outptr0]] @ write "
"d0(q0,low),r00,r10 20 30\n"
"vst4.8 {d20, d21, d22, d23}, [%[outptr1]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"vst4.8 {d0, d1, d2, d3}, [%[outptr2]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"vst4.8 {d4, d5, d6, d7}, [%[outptr3]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"sub %[outptr0], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr1], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr2], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr3], %[stride_w] @ ptr - stride_w \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[stride_w] "+r"(stride_w)
:
: "q0",
"q1",
"q2",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12");
}
outptr3 += stride_w - 4;
outptr2 += stride_w - 4;
outptr1 += stride_w - 4;
outptr0 += stride_w - 4;
for (; j < w; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 8;
case 1:
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 8;
case 2:
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 8;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
outptr3 -= 8;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 8;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 8;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 8;
}
}
}
}
#endif
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <limits.h>
#include <math.h>
#include "lite/tests/cv/anakin/cv_utils.h"
void resize_four_channel(
const uint8_t* src, int w_in, int h_in, uint8_t* dst, int w_out, int h_out);
void bgra_resize(const uint8_t* src,
uint8_t* dst,
int w_in,
int h_in,
int w_out,
int h_out) {
if (w_out == w_in && h_out == h_in) {
memcpy(dst, src, sizeof(char) * w_in * h_in * 4);
return;
}
// y
resize_four_channel(src, w_in * 4, h_in, dst, w_out * 4, h_out);
}
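// Hedged usage sketch (pointers and sizes are placeholders): shrink a BGRA
// image to half size. Note that the wrapper above forwards byte widths
// (w * 4), so resize_four_channel interpolates the four channels of each
// pixel together.
//   bgra_resize(src_bgra, dst_bgra, w_in, h_in, w_in / 2, h_in / 2);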
void resize_four_channel(const uint8_t* src,
int w_in,
int h_in,
uint8_t* dst,
int w_out,
int h_out) {
const int resize_coef_bits = 11;
const int resize_coef_scale = 1 << resize_coef_bits;
double scale_x = static_cast<double>(w_in) / w_out;
double scale_y = static_cast<double>(h_in) / h_out;
int* buf = new int[w_out * 2 + h_out * 2];
int* xofs = buf; // new int[w];
int* yofs = buf + w_out; // new int[h];
int16_t* ialpha =
reinterpret_cast<int16_t*>(buf + w_out + h_out); // new int16_t[w * 2];
int16_t* ibeta = reinterpret_cast<int16_t*>(buf + w_out * 2 +
h_out); // new int16_t[h * 2];
float fx = 0.f;
float fy = 0.f;
  int sx = 0;
  int sy = 0;
#define SATURATE_CAST_int16_t(X) \
(int16_t)::std::min( \
::std::max(static_cast<int>(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), \
SHRT_MAX);
for (int dx = 0; dx < w_out / 4; dx++) {
fx = static_cast<float>((dx + 0.5) * scale_x - 0.5);
sx = floor(fx);
fx -= sx;
if (sx < 0) {
sx = 0;
fx = 0.f;
}
if (sx >= w_in - 1) {
sx = w_in - 2;
fx = 1.f;
}
xofs[dx] = sx * 4;
float a0 = (1.f - fx) * resize_coef_scale;
float a1 = fx * resize_coef_scale;
ialpha[dx * 2] = SATURATE_CAST_int16_t(a0);
ialpha[dx * 2 + 1] = SATURATE_CAST_int16_t(a1);
}
for (int dy = 0; dy < h_out; dy++) {
fy = static_cast<float>((dy + 0.5) * scale_y - 0.5);
sy = floor(fy);
fy -= sy;
if (sy < 0) {
sy = 0;
fy = 0.f;
}
if (sy >= h_in - 1) {
sy = h_in - 2;
fy = 1.f;
}
yofs[dy] = sy;
float b0 = (1.f - fy) * resize_coef_scale;
float b1 = fy * resize_coef_scale;
ibeta[dy * 2] = SATURATE_CAST_int16_t(b0);
ibeta[dy * 2 + 1] = SATURATE_CAST_int16_t(b1);
}
#undef SATURATE_CAST_int16_t
// loop body
int16_t* rowsbuf0 = new int16_t[w_out + 1];
int16_t* rowsbuf1 = new int16_t[w_out + 1];
int16_t* rows0 = rowsbuf0;
int16_t* rows1 = rowsbuf1;
int prev_sy1 = -1;
for (int dy = 0; dy < h_out; dy++) {
int sy = yofs[dy];
if (sy == prev_sy1) {
// hresize one row
int16_t* rows0_old = rows0;
rows0 = rows1;
rows1 = rows0_old;
const uint8_t* S1 = src + w_in * (sy + 1);
const int16_t* ialphap = ialpha;
int16_t* rows1p = rows1;
for (int dx = 0; dx < w_out / 4; dx++) {
int sx = xofs[dx];
int16_t a0 = ialphap[0];
int16_t a1 = ialphap[1];
const uint8_t* S1p = S1 + sx;
int tmp = dx * 4;
rows1p[tmp] = (S1p[0] * a0 + S1p[4] * a1) >> 4;
rows1p[tmp + 1] = (S1p[1] * a0 + S1p[5] * a1) >> 4;
rows1p[tmp + 2] = (S1p[2] * a0 + S1p[6] * a1) >> 4;
rows1p[tmp + 3] = (S1p[3] * a0 + S1p[7] * a1) >> 4;
ialphap += 2;
}
} else {
// hresize two rows
const uint8_t* S0 = src + w_in * (sy);
const uint8_t* S1 = src + w_in * (sy + 1);
const int16_t* ialphap = ialpha;
int16_t* rows0p = rows0;
int16_t* rows1p = rows1;
for (int dx = 0; dx < w_out / 4; dx++) {
int sx = xofs[dx];
int16_t a0 = ialphap[0];
int16_t a1 = ialphap[1];
const uint8_t* S0p = S0 + sx;
const uint8_t* S1p = S1 + sx;
int tmp = dx * 4;
rows0p[tmp] = (S0p[0] * a0 + S0p[4] * a1) >> 4;
rows1p[tmp] = (S1p[0] * a0 + S1p[4] * a1) >> 4;
rows0p[tmp + 1] = (S0p[1] * a0 + S0p[5] * a1) >> 4;
rows1p[tmp + 1] = (S1p[1] * a0 + S1p[5] * a1) >> 4;
rows0p[tmp + 2] = (S0p[2] * a0 + S0p[6] * a1) >> 4;
rows1p[tmp + 2] = (S1p[2] * a0 + S1p[6] * a1) >> 4;
rows0p[tmp + 3] = (S0p[3] * a0 + S0p[7] * a1) >> 4;
rows1p[tmp + 3] = (S1p[3] * a0 + S1p[7] * a1) >> 4;
ialphap += 2;
}
}
prev_sy1 = sy + 1;
// vresize
int16_t b0 = ibeta[0];
int16_t b1 = ibeta[1];
int16_t* rows0p = rows0;
int16_t* rows1p = rows1;
uint8_t* dp_ptr = dst + w_out * (dy);
int cnt = w_out >> 3;
int remain = w_out - (cnt << 3);
int16x4_t _b0 = vdup_n_s16(b0);
int16x4_t _b1 = vdup_n_s16(b1);
int32x4_t _v2 = vdupq_n_s32(2);
for (cnt = w_out >> 3; cnt > 0; cnt--) {
int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p + 4);
int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p + 4);
int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0);
int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
int32x4_t _acc = _v2;
_acc = vsraq_n_s32(
_acc, _rows0p_sr4_mb0, 16); // _acc >> 16 + _rows0p_sr4_mb0 >> 16
_acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
int32x4_t _acc_1 = _v2;
_acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
_acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
int16x4_t _acc16 = vshrn_n_s32(_acc, 2); // _acc >> 2
int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
uint8x8_t _dout = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));
vst1_u8(dp_ptr, _dout);
dp_ptr += 8;
rows0p += 8;
rows1p += 8;
}
for (; remain; --remain) {
// D[x] = (rows0[x]*b0 + rows1[x]*b1) >> INTER_RESIZE_COEF_BITS;
*dp_ptr++ =
(uint8_t)(((int16_t)((b0 * (int16_t)(*rows0p++)) >> 16) +
(int16_t)((b1 * (int16_t)(*rows1p++)) >> 16) + 2) >>
2);
}
ibeta += 2;
}
delete[] buf;
delete[] rowsbuf0;
delete[] rowsbuf1;
}
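// Sketch of the fixed-point scheme used by resize_four_channel above:
//   a0 + a1 = b0 + b1 = 2^11 (resize_coef_scale)
//   rows[x] = (S0 * a0 + S1 * a1) >> 4     -> horizontally interpolated value
//                                             scaled by 2^7, fits in int16_t
//   dst[x] ~= (rows0[x] * b0 + rows1[x] * b1 + 2 * 2^16) >> 18
// which recovers the bilinear result, since 2^7 * 2^11 = 2^18. The NEON loop
// realizes the last step with two vsraq_n_s32(acc, ..., 16) into an
// accumulator preloaded with 2, followed by vshrn_n_s32(acc, 2); the scalar
// tail does the same arithmetic per element.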
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/tests/cv/anakin/cv_utils.h"
void rotate90_hwc_bgra(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
void rotate270_hwc_bgra(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
void rotate180_hwc_bgra(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
void bgra_rotate_hwc(
const uint8_t* src, uint8_t* dst, int w_in, int h_in, int angle) {
if (angle == 90) {
rotate90_hwc_bgra(src, dst, w_in, h_in);
}
if (angle == 270) {
rotate270_hwc_bgra(src, dst, w_in, h_in);
}
if (angle == 180) {
rotate180_hwc_bgra(src, dst, w_in, h_in);
}
}
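// Hedged usage sketch of the angle convention above (only 90, 180 and 270 are
// handled; dst must hold w_in * h_in * 4 bytes):
//   bgra_rotate_hwc(src, dst, w_in, h_in, 90);   // result is h_in x w_in
//   bgra_rotate_hwc(src, dst, w_in, h_in, 180);  // result stays w_in x h_in
//   bgra_rotate_hwc(src, dst, w_in, h_in, 270);  // result is h_in x w_in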
/*
bgr1 bgr2 bgr3
bgr4 bgr5 bgr6
bgr7 bgr8 bgr9
rotate:
bgr7 bgr4 bgr1
bgr8 bgr5 bgr2
bgr9 bgr6 bgr3
*/
void rotate90_hwc_bgra(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
int w_out = h_in;
int h_out = w_in;
int win = w_in * 4;
int wout = w_out * 4;
int hremain = h_in % 8;
int stride_h = 4 * win;
int stride_h_w = 4 * win - 32;
int ww = w_out - 8;
// block 8*8. -- 8*8
int i = 0;
for (i = 0; i < h_in - 7; i += 8) {
const uint8_t* inptr0 = src + i * win;
const uint8_t* inptr1 = inptr0 + win;
const uint8_t* inptr2 = inptr1 + win;
const uint8_t* inptr3 = inptr2 + win;
const uint8_t* inptr4 = inptr3 + win;
const uint8_t* inptr5 = inptr4 + win;
const uint8_t* inptr6 = inptr5 + win;
const uint8_t* inptr7 = inptr6 + win;
#ifdef __aarch64__
asm volatile(
"prfm pldl1keep, [%[ptr0]] \n"
"prfm pldl1keep, [%[ptr0], #64] \n"
"prfm pldl1keep, [%[ptr1]] \n"
"prfm pldl1keep, [%[ptr1], #64] \n"
"prfm pldl1keep, [%[ptr2]] \n"
"prfm pldl1keep, [%[ptr2], #64] \n"
"prfm pldl1keep, [%[ptr3]] \n"
"prfm pldl1keep, [%[ptr3], #64] \n"
"prfm pldl1keep, [%[ptr4]] \n"
"prfm pldl1keep, [%[ptr4], #64] \n"
"prfm pldl1keep, [%[ptr5]] \n"
"prfm pldl1keep, [%[ptr5], #64] \n"
"prfm pldl1keep, [%[ptr6]] \n"
"prfm pldl1keep, [%[ptr6], #64] \n"
"prfm pldl1keep, [%[ptr7]] \n"
"prfm pldl1keep, [%[ptr7], #64] \n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3),
[ptr4] "r"(inptr4),
[ptr5] "r"(inptr5),
[ptr6] "r"(inptr6),
[ptr7] "r"(inptr7)
: "memory");
#else
asm volatile(
"pld [%[ptr0]] @ preload a, 64byte\n"
"pld [%[ptr0], #64] @ preload a, 64byte\n"
"pld [%[ptr1]] @ preload a, 64byte\n"
"pld [%[ptr1], #64] @ preload a, 64byte\n"
"pld [%[ptr2]] @ preload a, 64byte\n"
"pld [%[ptr2], #64] @ preload a, 64byte\n"
"pld [%[ptr3]] @ preload a, 64byte\n"
"pld [%[ptr3], #64] @ preload a, 64byte\n"
"pld [%[ptr4]] @ preload a, 64byte\n"
"pld [%[ptr4], #64] @ preload a, 64byte\n"
"pld [%[ptr5]] @ preload a, 64byte\n"
"pld [%[ptr5], #64] @ preload a, 64byte\n"
"pld [%[ptr6]] @ preload a, 64byte\n"
"pld [%[ptr6], #64] @ preload a, 64byte\n"
"pld [%[ptr7]] @ preload a, 64byte\n"
"pld [%[ptr7], #64] @ preload a, 64byte\n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3),
[ptr4] "r"(inptr4),
[ptr5] "r"(inptr5),
[ptr6] "r"(inptr6),
[ptr7] "r"(inptr7)
: "memory");
#endif
int j = 0;
for (; j < w_in; j++) {
int tmpx = (ww - i) * 4;
uint8_t* outptr = dst + j * wout + tmpx;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
}
}
ww = w_out - 1;
for (; i < h_in; i++) {
const uint8_t* inptr0 = src + i * win;
for (int j = 0; j < w_in; j++) {
uint8_t* outptr0 = dst + j * wout + (ww - i) * 4;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
}
}
}
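// A minimal scalar sketch of the mapping rotate90_hwc_bgra implements
// (clockwise 90 degrees, 4-channel HWC): dst(j, h_in - 1 - i, c) = src(i, j, c).
// Illustrative reference only; the helper name below is not part of the
// original file.
static void rotate90_hwc_bgra_ref(const uint8_t* src,
                                  uint8_t* dst,
                                  int w_in,
                                  int h_in) {
  int w_out = h_in;  // the rotated image is h_in pixels wide
  for (int i = 0; i < h_in; i++) {
    for (int j = 0; j < w_in; j++) {
      const uint8_t* sp = src + (i * w_in + j) * 4;
      uint8_t* dp = dst + (j * w_out + (h_in - 1 - i)) * 4;
      dp[0] = sp[0];
      dp[1] = sp[1];
      dp[2] = sp[2];
      dp[3] = sp[3];
    }
  }
}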
/*
bgr1 bgr2 bgr3
bgr4 bgr5 bgr6
bgr7 bgr8 bgr9
rotate:
bgr3 bgr6 bgr9
bgr2 bgr5 bgr8
bgr1 bgr4 bgr7
*/
// dst = (h_out - 1) * w_out
// Similar to rotate90, but the result is written out in reverse order;
// equivalently, rotate90 first and then flip along the Y axis.
void rotate270_hwc_bgra(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
int w_out = h_in;
int h_out = w_in;
int win = w_in * 4;
int wout = w_out * 4;
int hremain = h_in % 8;
int stride_h = 4 * win;
int stride_h_w = 4 * win - 32;
int hout = h_out - 1;
// block 8*8. -- 8*8
int i = 0;
for (; i < h_in - 7; i += 8) {
const uint8_t* inptr0 = src + i * win;
const uint8_t* inptr1 = inptr0 + win;
const uint8_t* inptr2 = inptr1 + win;
const uint8_t* inptr3 = inptr2 + win;
const uint8_t* inptr4 = inptr3 + win;
const uint8_t* inptr5 = inptr4 + win;
const uint8_t* inptr6 = inptr5 + win;
const uint8_t* inptr7 = inptr6 + win;
int j = 0;
#ifdef __aarch64__
asm volatile(
"prfm pldl1keep, [%[ptr0]] \n"
"prfm pldl1keep, [%[ptr0], #64] \n"
"prfm pldl1keep, [%[ptr1]] \n"
"prfm pldl1keep, [%[ptr1], #64] \n"
"prfm pldl1keep, [%[ptr2]] \n"
"prfm pldl1keep, [%[ptr2], #64] \n"
"prfm pldl1keep, [%[ptr3]] \n"
"prfm pldl1keep, [%[ptr3], #64] \n"
"prfm pldl1keep, [%[ptr4]] \n"
"prfm pldl1keep, [%[ptr4], #64] \n"
"prfm pldl1keep, [%[ptr5]] \n"
"prfm pldl1keep, [%[ptr5], #64] \n"
"prfm pldl1keep, [%[ptr6]] \n"
"prfm pldl1keep, [%[ptr6], #64] \n"
"prfm pldl1keep, [%[ptr7]] \n"
"prfm pldl1keep, [%[ptr7], #64] \n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3),
[ptr4] "r"(inptr4),
[ptr5] "r"(inptr5),
[ptr6] "r"(inptr6),
[ptr7] "r"(inptr7)
: "memory");
#else
asm volatile(
"pld [%[ptr0]] @ preload a, 64byte\n"
"pld [%[ptr0], #64] @ preload a, 64byte\n"
"pld [%[ptr1]] @ preload a, 64byte\n"
"pld [%[ptr1], #64] @ preload a, 64byte\n"
"pld [%[ptr2]] @ preload a, 64byte\n"
"pld [%[ptr2], #64] @ preload a, 64byte\n"
"pld [%[ptr3]] @ preload a, 64byte\n"
"pld [%[ptr3], #64] @ preload a, 64byte\n"
"pld [%[ptr4]] @ preload a, 64byte\n"
"pld [%[ptr4], #64] @ preload a, 64byte\n"
"pld [%[ptr5]] @ preload a, 64byte\n"
"pld [%[ptr5], #64] @ preload a, 64byte\n"
"pld [%[ptr6]] @ preload a, 64byte\n"
"pld [%[ptr6], #64] @ preload a, 64byte\n"
"pld [%[ptr7]] @ preload a, 64byte\n"
"pld [%[ptr7], #64] @ preload a, 64byte\n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3),
[ptr4] "r"(inptr4),
[ptr5] "r"(inptr5),
[ptr6] "r"(inptr6),
[ptr7] "r"(inptr7)
: "memory");
#endif
for (; j < w_in; j++) {
int tmpx = i * 4;
uint8_t* outptr = dst + (hout - j) * wout + tmpx;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
}
}
for (; i < h_in; i++) {
const uint8_t* inptr0 = src + i * win;
for (int j = 0; j < w_in; j++) {
uint8_t* outptr0 = dst + (hout - j) * wout + i * 4;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
}
}
}
/*
bgr1 bgr2 bgr3
bgr4 bgr5 bgr6
bgr7 bgr8 bgr9
rotate:
bgr9 bgr8 bgr7
bgr6 bgr5 bgr4
bgr3 bgr2 bgr1
*/
// flip both x and y (i.e. rotate 180 degrees)
void rotate180_hwc_bgra(const uint8_t* src, uint8_t* dst, int w, int h_in) {
int w_in = w * 4;
uint8_t zerobuff[w_in]; // NOLINT
memset(zerobuff, 0, w_in * sizeof(uint8_t));
int stride_w = 4;
// 4*8
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * w_in;
const uint8_t* inptr1 = inptr0 + w_in;
const uint8_t* inptr2 = inptr1 + w_in;
const uint8_t* inptr3 = inptr2 + w_in;
uint8_t* outptr0 = dst + (h_in - i) * w_in - stride_w; // last
uint8_t* outptr1 = outptr0 - w_in;
uint8_t* outptr2 = outptr1 - w_in;
uint8_t* outptr3 = outptr2 - w_in;
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
case 2:
inptr1 = zerobuff;
case 1:
inptr2 = zerobuff;
case 0:
inptr3 = zerobuff;
default:
break;
}
}
#ifdef __aarch64__
asm volatile(
"prfm pldl1keep, [%[ptr0]] \n"
"prfm pldl1keep, [%[ptr0], #64] \n"
"prfm pldl1keep, [%[ptr1]] \n"
"prfm pldl1keep, [%[ptr1], #64] \n"
"prfm pldl1keep, [%[ptr2]] \n"
"prfm pldl1keep, [%[ptr2], #64] \n"
"prfm pldl1keep, [%[ptr3]] \n"
"prfm pldl1keep, [%[ptr3], #64] \n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
#else
asm volatile(
"pld [%[ptr0]] @ preload a, 64byte\n"
"pld [%[ptr0], #64] @ preload a, 64byte\n"
"pld [%[ptr1]] @ preload a, 64byte\n"
"pld [%[ptr1], #64] @ preload a, 64byte\n"
"pld [%[ptr2]] @ preload a, 64byte\n"
"pld [%[ptr2], #64] @ preload a, 64byte\n"
"pld [%[ptr3]] @ preload a, 64byte\n"
"pld [%[ptr3], #64] @ preload a, 64byte\n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
#endif
int j = 0;
for (; j < w; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 8;
case 1:
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 8;
case 2:
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 8;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
outptr3 -= 8;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 8;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 8;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 8;
}
}
}
}
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/tests/cv/anakin/cv_utils.h"
void bgra_to_tensor_hwc(const uint8_t* bgr,
Tensor& output, // NOLINT
int width,
int height,
float* means,
float* scales) {
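  // Layout sketch of the loops below (a restatement, no new behaviour): input
  // is HWC BGRA, 4 bytes per pixel; output is planar float, blue plane first,
  // each plane width * height elements long:
  //   b[i * width + j] = (bgr[(i * width + j) * 4 + 0] - means[2]) * scales[2]
  //   g[i * width + j] = (bgr[(i * width + j) * 4 + 1] - means[1]) * scales[1]
  //   r[i * width + j] = (bgr[(i * width + j) * 4 + 2] - means[0]) * scales[0]
  // The alpha byte is read and discarded.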
int size = width * height;
float* ptr0 = output.mutable_data<float>();
float r_means = means[0];
float g_means = means[1];
float b_means = means[2];
float r_scales = scales[0];
float g_scales = scales[1];
float b_scales = scales[2];
int dim8 = width >> 3;
  int remain = width - (dim8 << 3);
float32x4_t vrmean = vdupq_n_f32(r_means);
float32x4_t vgmean = vdupq_n_f32(g_means);
float32x4_t vbmean = vdupq_n_f32(b_means);
float32x4_t vrscale = vdupq_n_f32(r_scales);
float32x4_t vgscale = vdupq_n_f32(g_scales);
float32x4_t vbscale = vdupq_n_f32(b_scales);
for (int i = 0; i < height; i++) {
const uint8_t* ptr_bgr = bgr + i * width * 4;
float* ptr0_b = ptr0 + i * width;
float* ptr1_g = ptr0_b + size;
float* ptr2_r = ptr1_g + size;
for (int j = 0; j < dim8; j++) {
uint8x8x4_t vbgr = vld4_u8(ptr_bgr);
uint8x8_t vb = vbgr.val[0];
uint8x8_t vg = vbgr.val[1];
uint8x8_t vr = vbgr.val[2];
uint16x8_t vb_16 = vmovl_u8(vb);
uint16x8_t vg_16 = vmovl_u8(vg);
uint16x8_t vr_16 = vmovl_u8(vr);
uint32x4_t vb_low_32 = vmovl_u16(vget_low_u16(vb_16));
uint32x4_t vg_low_32 = vmovl_u16(vget_low_u16(vg_16));
uint32x4_t vr_low_32 = vmovl_u16(vget_low_u16(vr_16));
uint32x4_t vb_high_32 = vmovl_u16(vget_high_u16(vb_16));
uint32x4_t vg_high_32 = vmovl_u16(vget_high_u16(vg_16));
uint32x4_t vr_high_32 = vmovl_u16(vget_high_u16(vr_16));
float32x4_t vb_low_f32 = vcvtq_f32_u32(vb_low_32);
float32x4_t vr_low_f32 = vcvtq_f32_u32(vr_low_32);
float32x4_t vg_low_f32 = vcvtq_f32_u32(vg_low_32);
float32x4_t vb_high_f32 = vcvtq_f32_u32(vb_high_32);
float32x4_t vg_high_f32 = vcvtq_f32_u32(vg_high_32);
float32x4_t vr_high_f32 = vcvtq_f32_u32(vr_high_32);
vb_low_f32 = vsubq_f32(vb_low_f32, vbmean);
vg_low_f32 = vsubq_f32(vg_low_f32, vgmean);
vr_low_f32 = vsubq_f32(vr_low_f32, vrmean);
vb_high_f32 = vsubq_f32(vb_high_f32, vbmean);
vg_high_f32 = vsubq_f32(vg_high_f32, vgmean);
vr_high_f32 = vsubq_f32(vr_high_f32, vrmean);
vb_low_f32 = vmulq_f32(vb_low_f32, vbscale);
vg_low_f32 = vmulq_f32(vg_low_f32, vgscale);
vr_low_f32 = vmulq_f32(vr_low_f32, vrscale);
vb_high_f32 = vmulq_f32(vb_high_f32, vbscale);
vg_high_f32 = vmulq_f32(vg_high_f32, vgscale);
vr_high_f32 = vmulq_f32(vr_high_f32, vrscale);
vst1q_f32(ptr0_b, vb_low_f32);
vst1q_f32(ptr1_g, vg_low_f32);
vst1q_f32(ptr2_r, vr_low_f32);
ptr_bgr += 32;
vst1q_f32(ptr0_b + 4, vb_high_f32);
vst1q_f32(ptr1_g + 4, vg_high_f32);
vst1q_f32(ptr2_r + 4, vr_high_f32);
ptr0_b += 8;
ptr1_g += 8;
ptr2_r += 8;
}
for (int j = 0; j < remain; j++) {
*ptr0_b++ = (*ptr_bgr - b_means) * b_scales;
ptr_bgr++;
*ptr1_g++ = (*ptr_bgr - g_means) * g_scales;
ptr_bgr++;
*ptr2_r++ = (*ptr_bgr - r_means) * r_scales;
ptr_bgr++;
ptr_bgr++;
}
}
}
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/tests/cv/anakin/cv_utils.h"
void image_basic_convert(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
ImageFormat dstFormat,
int srcw,
int srch,
int out_size) {
if (srcFormat == dstFormat) {
// copy
memcpy(dst, src, sizeof(uint8_t) * out_size);
return;
} else {
if (srcFormat == ImageFormat::NV12 &&
(dstFormat == ImageFormat::BGR || dstFormat == ImageFormat::RGB)) {
nv12_to_bgr(src, dst, srcw, srch);
} else if (srcFormat == ImageFormat::NV21 &&
(dstFormat == ImageFormat::BGR ||
dstFormat == ImageFormat::RGB)) {
nv21_to_bgr(src, dst, srcw, srch);
} else if (srcFormat == ImageFormat::NV12 &&
(dstFormat == ImageFormat::BGRA ||
dstFormat == ImageFormat::RGBA)) {
nv12_to_bgra(src, dst, srcw, srch);
} else if (srcFormat == ImageFormat::NV21 &&
(dstFormat == ImageFormat::BGRA ||
dstFormat == ImageFormat::RGBA)) {
nv21_to_bgra(src, dst, srcw, srch);
} else {
printf("bais-anakin srcFormat: %d, dstFormat: %d does not support! \n",
srcFormat,
dstFormat);
}
}
}
void image_basic_resize(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
int srcw,
int srch,
int dstw,
int dsth) {
int size = srcw * srch;
if (srcw == dstw && srch == dsth) {
if (srcFormat == ImageFormat::NV12 || srcFormat == ImageFormat::NV21) {
size = srcw * (static_cast<int>(1.5 * srch));
} else if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) {
size = 3 * srcw * srch;
} else if (srcFormat == ImageFormat::BGRA ||
srcFormat == ImageFormat::RGBA) {
size = 4 * srcw * srch;
}
memcpy(dst, src, sizeof(uint8_t) * size);
return;
} else {
if (srcFormat == ImageFormat::NV12 || srcFormat == ImageFormat::NV21) {
nv21_resize(src, dst, srcw, srch, dstw, dsth);
} else if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) {
bgr_resize(src, dst, srcw, srch, dstw, dsth);
} else if (srcFormat == ImageFormat::BGRA ||
srcFormat == ImageFormat::RGBA) {
bgra_resize(src, dst, srcw, srch, dstw, dsth);
} else {
printf("anakin doesn't support this type: %d\n",
static_cast<int>(srcFormat));
}
}
}
void image_basic_flip(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
int srcw,
int srch,
int flip_num) {
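  // Remap the incoming flip code (-1: both axes, 0: x, 1: y) onto the
  // convention used by the hwc kernels (1: x, -1: y, 0: both axes), as
  // documented in cv_utils.h.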
if (flip_num == -1) {
flip_num = 0; // xy
} else if (flip_num == 0) {
flip_num = 1; // x
} else if (flip_num == 1) {
flip_num = -1; // y
}
if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) {
bgr_flip_hwc(src, dst, srcw, srch, flip_num);
} else if (srcFormat == ImageFormat::BGRA || srcFormat == ImageFormat::RGBA) {
bgra_flip_hwc(src, dst, srcw, srch, flip_num);
} else {
printf("anakin doesn't support this type: %d\n",
static_cast<int>(srcFormat));
}
}
void image_basic_rotate(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
int srcw,
int srch,
float rotate_num) {
if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) {
bgr_rotate_hwc(src, dst, srcw, srch, rotate_num);
} else if (srcFormat == ImageFormat::BGRA || srcFormat == ImageFormat::RGBA) {
bgra_rotate_hwc(src, dst, srcw, srch, rotate_num);
} else {
printf("anakin doesn't support this type: %d\n",
static_cast<int>(srcFormat));
}
}
void image_basic_to_tensor(const uint8_t* in_data,
Tensor dst,
ImageFormat srcFormat,
LayoutType layout,
int srcw,
int srch,
float* means,
float* scales) {
if (layout == LayoutType::kNCHW &&
(srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB)) {
bgr_to_tensor_hwc(in_data, dst, srcw, srch, means, scales);
} else if (layout == LayoutType::kNCHW && (srcFormat == ImageFormat::BGRA ||
srcFormat == ImageFormat::RGBA)) {
bgra_to_tensor_hwc(in_data, dst, srcw, srch, means, scales);
} else {
printf("anakin doesn't support this type: %d\n",
static_cast<int>(srcFormat));
}
}
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <arm_neon.h>
#include "lite/core/tensor.h"
#include "lite/utils/cv/paddle_image_preprocess.h"
typedef paddle::lite::utils::cv::ImageFormat ImageFormat;
typedef paddle::lite::utils::cv::FlipParam FlipParam;
typedef paddle::lite::Tensor Tensor;
typedef paddle::lite_api::DataLayoutType LayoutType;
void rotate(const uint8_t* src, uint8_t* dst, int srcw, int srch, int angle);
void bgr_rotate_hwc(
const uint8_t* src, uint8_t* dst, int srcw, int srch, int angle);
void bgra_rotate_hwc(
const uint8_t* src, uint8_t* dst, int srcw, int srch, int angle);
// x: flip_num = 1 y: flip_num = -1 xy: flip_num = 0;
void flip(const uint8_t* src, uint8_t* dst, int srcw, int srch, int flip_num);
// x: flip_num = 1 y: flip_num = -1 xy: flip_num = 0;
void bgr_flip_hwc(
const uint8_t* src, uint8_t* dst, int srcw, int srch, int flip_num);
// x: flip_num = 1 y: flip_num = -1 xy: flip_num = 0;
void bgra_flip_hwc(
const uint8_t* src, uint8_t* dst, int srcw, int srch, int flip_num);
// y_w = srcw, y_h = srch uv_w = srcw uv_h = 1/2 * srch
void nv21_resize(
const uint8_t* src, uint8_t* dst, int srcw, int srch, int dstw, int dsth);
void bgr_resize(
const uint8_t* src, uint8_t* dst, int srcw, int srch, int dstw, int dsth);
void bgra_resize(
const uint8_t* src, uint8_t* dst, int srcw, int srch, int dstw, int dsth);
// nv21(yvu) to BGR: store hwc dsth * dstw = srch * srcw y_w = srcw
// y_h = srch uv_w = srcw uv_h = 1/2 * srch
void nv21_to_bgr(const uint8_t* src, uint8_t* dst, int srcw, int srch);
// nv12(yuv) to BGR:store hwc dsth * dstw = srch * srcw y_w = srcw
// y_h = srch uv_w = srcw uv_h = 1/2 * srch
void nv12_to_bgr(const uint8_t* src, uint8_t* dst, int srcw, int srch);
// nv21(yvu) to BGRA: store hwc dsth * dstw = srch * srcw y_w = srcw
// y_h = srch uv_w = srcw uv_h = 1/2 * srch
void nv21_to_bgra(const uint8_t* src, uint8_t* dst, int srcw, int srch);
// nv12(yuv) to BGRA:store hwc dsth * dstw = srch * srcw y_w = srcw
// y_h = srch uv_w = srcw uv_h = 1/2 * srch
void nv12_to_bgra(const uint8_t* src, uint8_t* dst, int srcw, int srch);
// bgr output.w == width output.h == height/3
void bgr_to_tensor_hcw(const uint8_t* bgr,
Tensor& output, // NOLINT
int width,
int height,
float* means,
float* scales);
// bgr output.w == width / 3 output.h == height
void bgr_to_tensor_hwc(const uint8_t* bgr,
Tensor& output, // NOLINT
int width,
int height,
float* means,
float* scales);
// bgra output.w == width / 4 output.h == height
void bgra_to_tensor_hwc(const uint8_t* bgr,
Tensor& output, // NOLINT
int width,
int height,
float* means,
float* scales);
// yvu y_w = width, y_h = height uv_w = width uv_h = 1/2 * height
void nv21_to_tensor(const uint8_t* nv21,
Tensor& output, // NOLINT
int width,
int height,
float* means,
float* scales);
// yuv y_w = width, y_h = height uv_w = width uv_h = 1/2 * height
void nv12_to_tensor(const uint8_t* nv12,
Tensor& output, // NOLINT
int width,
int height,
float* means,
float* scales);
// clang-format on
void image_basic_convert(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
ImageFormat dstFormat,
int srcw,
int srch,
int out_size);
void image_basic_resize(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
int srcw,
int srch,
int dstw,
int dsth);
void image_basic_flip(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
int srcw,
int srch,
int flip_num);
void image_basic_rotate(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
int srcw,
int srch,
float rotate_num);
void image_basic_to_tensor(const uint8_t* in_data,
Tensor dst,
ImageFormat srcFormat,
LayoutType layout,
int srcw,
int srch,
float* means,
float* scales);
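// A minimal usage sketch of the image_basic_* wrappers above (dimensions and
// buffer names are illustrative assumptions, not taken from the tests):
//   int srcw = 640, srch = 480;
//   std::vector<uint8_t> nv12(srcw * srch * 3 / 2);
//   std::vector<uint8_t> bgr(srcw * srch * 3);
//   std::vector<uint8_t> bgr_half((srcw / 2) * (srch / 2) * 3);
//   image_basic_convert(nv12.data(), bgr.data(), ImageFormat::NV12,
//                       ImageFormat::BGR, srcw, srch, srcw * srch * 3);
//   image_basic_resize(bgr.data(), bgr_half.data(), ImageFormat::BGR,
//                      srcw, srch, srcw / 2, srch / 2);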
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <math.h>
#include "lite/tests/cv/anakin/cv_utils.h"
/*
R = Y + 1.402*(V-128);
G = Y - 0.34414*(U-128) - 0.71414*(V-128);
B = Y + 1.772*(U-128);
float: a*b = ((a << 7)*b )>>7
ra = 1.402 *128 = 179.456 = 179
ga = 0.34414 * 128 = 44.04992 = 44
gb = 0.71414 * 128 = 91.40992 = 91
ba = 1.772 * 128 = 226.816 = 227
*/
// yuv store hwc bgrbgr dsth * dstw = srch * srcw y_w = srcw
// y_h = srch uv_w = srcw uv_h = 1/2 * srch
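// Worked example of the fixed-point math above (pixel values are illustrative
// assumptions): Y = 100, U = 50, V = 200
//   R = 100 + ((179 * (200 - 128)) >> 7) = 100 + 100 = 200
//   G = 100 - ((44 * (50 - 128) + 91 * (200 - 128)) >> 7) = 100 - 24 = 76
//   B = 100 + ((227 * (50 - 128)) >> 7) = 100 - 139, clamped to 0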
void nv12_to_bgr(const unsigned char* src,
unsigned char* dst,
int srcw,
int srch) {
int y_h = srch;
  int vu_h = srch / 2;
const unsigned char* y = src;
const unsigned char* vu = src + y_h * srcw;
int wout = srcw * 3;
int16x8_t bias = vdupq_n_s16(128);
int16x8_t ga = vdupq_n_s16(44);
int16x8_t ra = vdupq_n_s16(179);
int16x8_t ba = vdupq_n_s16(227);
int16x8_t gb = vdupq_n_s16(91);
int16x8_t zero = vdupq_n_s16(0);
int16x8_t max = vdupq_n_s16(255);
uint8_t* zerobuf = new uint8_t[srcw];
uint8_t* writebuf = new uint8_t[wout];
memset(zerobuf, 0, sizeof(uint8_t) * srcw);
for (int i = 0; i < y_h; i += 2) {
const unsigned char* ptr_y1 = y + i * srcw;
const unsigned char* ptr_y2 = ptr_y1 + srcw;
const unsigned char* ptr_vu = vu + (i / 2) * srcw;
unsigned char* ptr_bgr1 = dst + i * wout;
unsigned char* ptr_bgr2 = ptr_bgr1 + wout;
if (i + 2 > y_h) {
ptr_y2 = zerobuf;
ptr_bgr2 = writebuf;
}
// 2*16
#ifdef __aarch64__
asm volatile(
"prfm pldl1keep, [%[ptr_y1]] \n"
"prfm pldl1keep, [%[ptr_y1], #64] \n"
"prfm pldl1keep, [%[ptr_y2]] \n"
"prfm pldl1keep, [%[ptr_y2], #64] \n"
"prfm pldl1keep, [%[ptr_vu]] \n"
"prfm pldl1keep, [%[ptr_vu], #64] \n"
:
: [ptr_y1] "r"(ptr_y1), [ptr_y2] "r"(ptr_y2), [ptr_vu] "r"(ptr_vu)
: "memory");
#else
asm volatile(
"pld [%[ptr_y1]] @ preload a, 64byte\n"
"pld [%[ptr_y1], #128] @ preload a, 64byte\n"
"pld [%[ptr_y2]] @ preload a, 64byte\n"
"pld [%[ptr_y2], #128] @ preload a, 64byte\n"
"pld [%[ptr_vu]] @ preload a, 64byte\n"
"pld [%[ptr_vu], #128] @ preload a, 64byte\n"
:
: [ptr_y1] "r"(ptr_y1), [ptr_y2] "r"(ptr_y2), [ptr_vu] "r"(ptr_vu)
: "memory");
#endif
int j = 0;
for (; j < srcw - 15; j += 16) {
uint8x8x2_t y1 = vld2_u8(ptr_y1); // d8 = y0y2y4y6...y14 d9 =
// y1y3y5...y15
uint8x8x2_t vu =
vld2_u8(ptr_vu); // d0 = v0v1v2v3v4v5...v7 d1 = u0u1u2...u7
uint8x8x2_t y2 = vld2_u8(ptr_y2);
uint16x8_t v = vmovl_u8(vu.val[1]);
uint16x8_t u = vmovl_u8(vu.val[0]);
int16x8_t v_s = vreinterpretq_s16_u16(v);
int16x8_t u_s = vreinterpretq_s16_u16(u);
int16x8_t v_bias = vsubq_s16(v_s, bias);
int16x8_t u_bias = vsubq_s16(u_s, bias);
// G = Y - 0.34414*(U-128) - 0.71414*(V-128);
int16x8_t g0 = vmulq_s16(ga, u_bias);
// R = Y + 1.402*(V-128);
int16x8_t r0 = vmulq_s16(ra, v_bias);
// B = Y + 1.772*(U-128);
int16x8_t b0 = vmulq_s16(ba, u_bias);
g0 = vmlaq_s16(g0, gb, v_bias);
int16x8_t y1_0_8 = vreinterpretq_s16_u16(vmovl_u8(y1.val[0]));
int16x8_t y1_1_8 = vreinterpretq_s16_u16(vmovl_u8(y1.val[1]));
int16x8_t y2_0_8 = vreinterpretq_s16_u16(vmovl_u8(y2.val[0]));
int16x8_t y2_1_8 = vreinterpretq_s16_u16(vmovl_u8(y2.val[1]));
int16x8_t r0_bias = vshrq_n_s16(r0, 7); // r0 / 128
int16x8_t b0_bias = vshrq_n_s16(b0, 7);
int16x8_t g0_bias = vshrq_n_s16(g0, 7);
int16x8_t r0_1 = vaddq_s16(y1_0_8, r0_bias);
int16x8_t b0_1 = vaddq_s16(y1_0_8, b0_bias);
int16x8_t g0_1 = vsubq_s16(y1_0_8, g0_bias); // g0_1 = y1_0_8 - g0_1
int16x8_t r0_2 = vaddq_s16(y1_1_8, r0_bias);
int16x8_t b0_2 = vaddq_s16(y1_1_8, b0_bias);
int16x8_t g0_2 = vsubq_s16(y1_1_8, g0_bias);
r0_1 = vmaxq_s16(r0_1, zero);
b0_1 = vmaxq_s16(b0_1, zero);
g0_1 = vmaxq_s16(g0_1, zero);
r0_2 = vmaxq_s16(r0_2, zero);
b0_2 = vmaxq_s16(b0_2, zero);
g0_2 = vmaxq_s16(g0_2, zero);
r0_1 = vminq_s16(r0_1, max);
b0_1 = vminq_s16(b0_1, max);
g0_1 = vminq_s16(g0_1, max);
r0_2 = vminq_s16(r0_2, max);
b0_2 = vminq_s16(b0_2, max);
g0_2 = vminq_s16(g0_2, max);
uint8x8_t r00 = vreinterpret_u8_s8(vmovn_s16(r0_1));
uint8x8_t b00 = vreinterpret_u8_s8(vmovn_s16(b0_1));
uint8x8_t g00 = vreinterpret_u8_s8(vmovn_s16(g0_1));
uint8x8_t r01 = vreinterpret_u8_s8(vmovn_s16(r0_2));
uint8x8_t b01 = vreinterpret_u8_s8(vmovn_s16(b0_2));
uint8x8_t g01 = vreinterpret_u8_s8(vmovn_s16(g0_2));
int16x8_t r1_1 = vaddq_s16(y2_0_8, r0_bias);
int16x8_t b1_1 = vaddq_s16(y2_0_8, b0_bias);
int16x8_t g1_1 = vsubq_s16(y2_0_8, g0_bias); // g0_1 = y1_0_8 - g0_1
int16x8_t r1_2 = vaddq_s16(y2_1_8, r0_bias);
int16x8_t b1_2 = vaddq_s16(y2_1_8, b0_bias);
int16x8_t g1_2 = vsubq_s16(y2_1_8, g0_bias);
uint8x8x2_t r00_0 = vtrn_u8(r00, r01); // 014589 236710
uint8x8x2_t b00_0 = vtrn_u8(b00, b01);
uint8x8x2_t g00_0 = vtrn_u8(g00, g01);
r1_1 = vmaxq_s16(r1_1, zero);
b1_1 = vmaxq_s16(b1_1, zero);
g1_1 = vmaxq_s16(g1_1, zero);
r1_2 = vmaxq_s16(r1_2, zero);
b1_2 = vmaxq_s16(b1_2, zero);
g1_2 = vmaxq_s16(g1_2, zero);
uint16x4_t r0_16 = vreinterpret_u16_u8(r00_0.val[0]);
uint16x4_t r1_16 = vreinterpret_u16_u8(r00_0.val[1]);
uint16x4_t b0_16 = vreinterpret_u16_u8(b00_0.val[0]);
uint16x4_t b1_16 = vreinterpret_u16_u8(b00_0.val[1]);
uint16x4_t g0_16 = vreinterpret_u16_u8(g00_0.val[0]);
uint16x4_t g1_16 = vreinterpret_u16_u8(g00_0.val[1]);
uint16x4x2_t r00_1 = vtrn_u16(r0_16, r1_16); // 012389 456710
uint16x4x2_t b00_1 = vtrn_u16(b0_16, b1_16);
uint16x4x2_t g00_1 = vtrn_u16(g0_16, g1_16);
r1_1 = vminq_s16(r1_1, max);
b1_1 = vminq_s16(b1_1, max);
g1_1 = vminq_s16(g1_1, max);
r1_2 = vminq_s16(r1_2, max);
b1_2 = vminq_s16(b1_2, max);
g1_2 = vminq_s16(g1_2, max);
uint32x2_t r0_32 = vreinterpret_u32_u16(r00_1.val[0]);
uint32x2_t r1_32 = vreinterpret_u32_u16(r00_1.val[1]);
uint32x2_t b0_32 = vreinterpret_u32_u16(b00_1.val[0]);
uint32x2_t b1_32 = vreinterpret_u32_u16(b00_1.val[1]);
uint32x2_t g0_32 = vreinterpret_u32_u16(g00_1.val[0]);
uint32x2_t g1_32 = vreinterpret_u32_u16(g00_1.val[1]);
uint32x2x2_t r00_2 = vtrn_u32(r0_32, r1_32); // 01234567 8910
uint32x2x2_t b00_2 = vtrn_u32(b0_32, b1_32);
uint32x2x2_t g00_2 = vtrn_u32(g0_32, g1_32);
r00 = vreinterpret_u8_s8(vmovn_s16(r1_1));
b00 = vreinterpret_u8_s8(vmovn_s16(b1_1));
g00 = vreinterpret_u8_s8(vmovn_s16(g1_1));
r01 = vreinterpret_u8_s8(vmovn_s16(r1_2));
b01 = vreinterpret_u8_s8(vmovn_s16(b1_2));
g01 = vreinterpret_u8_s8(vmovn_s16(g1_2));
uint8x8_t r0_8 = vreinterpret_u8_u32(r00_2.val[0]);
uint8x8_t b0_8 = vreinterpret_u8_u32(b00_2.val[0]);
uint8x8_t g0_8 = vreinterpret_u8_u32(g00_2.val[0]);
uint8x8_t r1_8 = vreinterpret_u8_u32(r00_2.val[1]);
uint8x8_t b1_8 = vreinterpret_u8_u32(b00_2.val[1]);
uint8x8_t g1_8 = vreinterpret_u8_u32(g00_2.val[1]);
uint8x8x3_t v_bgr;
v_bgr.val[0] = b0_8;
v_bgr.val[1] = g0_8;
v_bgr.val[2] = r0_8;
r00_0 = vtrn_u8(r00, r01); // 014589 236710
b00_0 = vtrn_u8(b00, b01);
g00_0 = vtrn_u8(g00, g01);
vst3_u8(ptr_bgr1, v_bgr);
r0_16 = vreinterpret_u16_u8(r00_0.val[0]);
r1_16 = vreinterpret_u16_u8(r00_0.val[1]);
b0_16 = vreinterpret_u16_u8(b00_0.val[0]);
b1_16 = vreinterpret_u16_u8(b00_0.val[1]);
g0_16 = vreinterpret_u16_u8(g00_0.val[0]);
g1_16 = vreinterpret_u16_u8(g00_0.val[1]);
ptr_bgr1 += 24;
uint8x8x3_t v_bgr1;
v_bgr1.val[0] = b1_8;
v_bgr1.val[1] = g1_8;
v_bgr1.val[2] = r1_8;
r00_1 = vtrn_u16(r0_16, r1_16); // 012389 456710
b00_1 = vtrn_u16(b0_16, b1_16);
g00_1 = vtrn_u16(g0_16, g1_16);
vst3_u8(ptr_bgr1, v_bgr1);
r0_32 = vreinterpret_u32_u16(r00_1.val[0]);
r1_32 = vreinterpret_u32_u16(r00_1.val[1]);
b0_32 = vreinterpret_u32_u16(b00_1.val[0]);
b1_32 = vreinterpret_u32_u16(b00_1.val[1]);
g0_32 = vreinterpret_u32_u16(g00_1.val[0]);
g1_32 = vreinterpret_u32_u16(g00_1.val[1]);
ptr_bgr1 += 24;
r00_2 = vtrn_u32(r0_32, r1_32); // 01234567 8910
b00_2 = vtrn_u32(b0_32, b1_32);
g00_2 = vtrn_u32(g0_32, g1_32);
ptr_vu += 16;
ptr_y1 += 16;
ptr_y2 += 16;
r0_8 = vreinterpret_u8_u32(r00_2.val[0]);
b0_8 = vreinterpret_u8_u32(b00_2.val[0]);
g0_8 = vreinterpret_u8_u32(g00_2.val[0]);
r1_8 = vreinterpret_u8_u32(r00_2.val[1]);
b1_8 = vreinterpret_u8_u32(b00_2.val[1]);
g1_8 = vreinterpret_u8_u32(g00_2.val[1]);
v_bgr.val[0] = b0_8;
v_bgr.val[1] = g0_8;
v_bgr.val[2] = r0_8;
v_bgr1.val[0] = b1_8;
v_bgr1.val[1] = g1_8;
v_bgr1.val[2] = r1_8;
vst3_u8(ptr_bgr2, v_bgr);
vst3_u8(ptr_bgr2 + 24, v_bgr1);
ptr_bgr2 += 48;
}
// two data
for (; j < srcw; j += 2) {
unsigned char _y0 = ptr_y1[0];
unsigned char _y1 = ptr_y1[1];
unsigned char _v = ptr_vu[1];
unsigned char _u = ptr_vu[0];
unsigned char _y0_1 = ptr_y2[0];
unsigned char _y1_1 = ptr_y2[1];
int ra = floor((179 * (_v - 128)) >> 7);
int ga = floor((44 * (_u - 128) + 91 * (_v - 128)) >> 7);
int ba = floor((227 * (_u - 128)) >> 7);
int r = _y0 + ra;
int g = _y0 - ga;
int b = _y0 + ba;
int r1 = _y1 + ra;
int g1 = _y1 - ga;
int b1 = _y1 + ba;
r = r < 0 ? 0 : (r > 255) ? 255 : r;
g = g < 0 ? 0 : (g > 255) ? 255 : g;
b = b < 0 ? 0 : (b > 255) ? 255 : b;
r1 = r1 < 0 ? 0 : (r1 > 255) ? 255 : r1;
g1 = g1 < 0 ? 0 : (g1 > 255) ? 255 : g1;
b1 = b1 < 0 ? 0 : (b1 > 255) ? 255 : b1;
*ptr_bgr1++ = b;
*ptr_bgr1++ = g;
*ptr_bgr1++ = r;
int r2 = _y0_1 + ra;
int g2 = _y0_1 - ga;
int b2 = _y0_1 + ba;
int r3 = _y1_1 + ra;
int g3 = _y1_1 - ga;
int b3 = _y1_1 + ba;
r2 = r2 < 0 ? 0 : (r2 > 255) ? 255 : r2;
g2 = g2 < 0 ? 0 : (g2 > 255) ? 255 : g2;
b2 = b2 < 0 ? 0 : (b2 > 255) ? 255 : b2;
r3 = r3 < 0 ? 0 : (r3 > 255) ? 255 : r3;
g3 = g3 < 0 ? 0 : (g3 > 255) ? 255 : g3;
b3 = b3 < 0 ? 0 : (b3 > 255) ? 255 : b3;
*ptr_bgr1++ = b1;
*ptr_bgr1++ = g1;
*ptr_bgr1++ = r1;
*ptr_bgr2++ = b2;
*ptr_bgr2++ = g2;
*ptr_bgr2++ = r2;
ptr_y1 += 2;
ptr_y2 += 2;
ptr_vu += 2;
*ptr_bgr2++ = b3;
*ptr_bgr2++ = g3;
*ptr_bgr2++ = r3;
}
  }
  delete[] zerobuf;
  delete[] writebuf;
}
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <math.h>
#include "lite/tests/cv/anakin/cv_utils.h"
/*
R = Y + 1.402*(V-128);
G = Y - 0.34414*(U-128) - 0.71414*(V-128);
B = Y + 1.772*(U-128);
A = 255
float compute a*b = ((a << 7)*b )>>7
ra = 1.402 *128 = 179.456 = 179
ga = 0.34414 * 128 = 44.04992 = 44
gb = 0.71414 * 128 = 91.40992 = 91
ba = 1.772 * 128 = 226.816 = 227
*/
// yuv store hwc bgrabgra dsth * dstw = srch * srcw y_w = srcw
// y_h = srch uv_w = srcw uv_h = 1/2 * srch
void nv12_to_bgra(const unsigned char* src,
unsigned char* dst,
int srcw,
int srch) {
int y_h = srch;
  int vu_h = srch / 2;
const unsigned char* y = src;
const unsigned char* vu = src + y_h * srcw;
int wout = srcw * 4;
int16x8_t bias = vdupq_n_s16(128);
int16x8_t ga = vdupq_n_s16(44);
int16x8_t ra = vdupq_n_s16(179);
int16x8_t ba = vdupq_n_s16(227);
int16x8_t gb = vdupq_n_s16(91);
int16x8_t zero = vdupq_n_s16(0);
int16x8_t max = vdupq_n_s16(255);
uint8x8_t a_8 = vdup_n_u8(255);
for (int i = 0; i < y_h; i += 2) {
const unsigned char* ptr_y1 = y + i * srcw;
const unsigned char* ptr_y2 = ptr_y1 + srcw;
const unsigned char* ptr_vu = vu + (i / 2) * srcw;
unsigned char* ptr_bgr1 = dst + i * wout;
unsigned char* ptr_bgr2 = ptr_bgr1 + wout;
// 2*16
#ifdef __aarch64__
asm volatile(
"prfm pldl1keep, [%[ptr_y1]] \n"
"prfm pldl1keep, [%[ptr_y1], #64] \n"
"prfm pldl1keep, [%[ptr_y2]] \n"
"prfm pldl1keep, [%[ptr_y2], #64] \n"
"prfm pldl1keep, [%[ptr_vu]] \n"
"prfm pldl1keep, [%[ptr_vu], #64] \n"
:
: [ptr_y1] "r"(ptr_y1), [ptr_y2] "r"(ptr_y2), [ptr_vu] "r"(ptr_vu)
: "memory");
#else
asm volatile(
"pld [%[ptr_y1]] @ preload a, 64byte\n"
"pld [%[ptr_y1], #128] @ preload a, 64byte\n"
"pld [%[ptr_y2]] @ preload a, 64byte\n"
"pld [%[ptr_y2], #128] @ preload a, 64byte\n"
"pld [%[ptr_vu]] @ preload a, 64byte\n"
"pld [%[ptr_vu], #128] @ preload a, 64byte\n"
:
: [ptr_y1] "r"(ptr_y1), [ptr_y2] "r"(ptr_y2), [ptr_vu] "r"(ptr_vu)
: "memory");
#endif
int j = 0;
for (; j < srcw - 15; j += 16) {
uint8x8x2_t y1 = vld2_u8(ptr_y1); // d8 = y0y2y4y6...y14 d9 =
// y1y3y5...y15
uint8x8x2_t vu =
vld2_u8(ptr_vu); // d0 = v0v1v2v3v4v5...v7 d1 = u0u1u2...u7
uint8x8x2_t y2 = vld2_u8(ptr_y2);
uint16x8_t v = vmovl_u8(vu.val[1]);
uint16x8_t u = vmovl_u8(vu.val[0]);
int16x8_t v_s = vreinterpretq_s16_u16(v);
int16x8_t u_s = vreinterpretq_s16_u16(u);
int16x8_t v_bias = vsubq_s16(v_s, bias);
int16x8_t u_bias = vsubq_s16(u_s, bias);
// G = Y - 0.34414*(U-128) - 0.71414*(V-128);
int16x8_t g0 = vmulq_s16(ga, u_bias);
// R = Y + 1.402*(V-128);
int16x8_t r0 = vmulq_s16(ra, v_bias);
// B = Y + 1.772*(U-128);
int16x8_t b0 = vmulq_s16(ba, u_bias);
g0 = vmlaq_s16(g0, gb, v_bias);
int16x8_t y1_0_8 = vreinterpretq_s16_u16(vmovl_u8(y1.val[0]));
int16x8_t y1_1_8 = vreinterpretq_s16_u16(vmovl_u8(y1.val[1]));
int16x8_t y2_0_8 = vreinterpretq_s16_u16(vmovl_u8(y2.val[0]));
int16x8_t y2_1_8 = vreinterpretq_s16_u16(vmovl_u8(y2.val[1]));
int16x8_t r0_bias = vshrq_n_s16(r0, 7); // r0 / 128
int16x8_t b0_bias = vshrq_n_s16(b0, 7);
int16x8_t g0_bias = vshrq_n_s16(g0, 7);
int16x8_t r0_1 = vaddq_s16(y1_0_8, r0_bias);
int16x8_t b0_1 = vaddq_s16(y1_0_8, b0_bias);
int16x8_t g0_1 = vsubq_s16(y1_0_8, g0_bias); // g0_1 = y1_0_8 - g0_1
int16x8_t r0_2 = vaddq_s16(y1_1_8, r0_bias);
int16x8_t b0_2 = vaddq_s16(y1_1_8, b0_bias);
int16x8_t g0_2 = vsubq_s16(y1_1_8, g0_bias);
r0_1 = vmaxq_s16(r0_1, zero);
b0_1 = vmaxq_s16(b0_1, zero);
g0_1 = vmaxq_s16(g0_1, zero);
r0_2 = vmaxq_s16(r0_2, zero);
b0_2 = vmaxq_s16(b0_2, zero);
g0_2 = vmaxq_s16(g0_2, zero);
r0_1 = vminq_s16(r0_1, max);
b0_1 = vminq_s16(b0_1, max);
g0_1 = vminq_s16(g0_1, max);
r0_2 = vminq_s16(r0_2, max);
b0_2 = vminq_s16(b0_2, max);
g0_2 = vminq_s16(g0_2, max);
uint8x8_t r00 = vreinterpret_u8_s8(vmovn_s16(r0_1));
uint8x8_t b00 = vreinterpret_u8_s8(vmovn_s16(b0_1));
uint8x8_t g00 = vreinterpret_u8_s8(vmovn_s16(g0_1));
uint8x8_t r01 = vreinterpret_u8_s8(vmovn_s16(r0_2));
uint8x8_t b01 = vreinterpret_u8_s8(vmovn_s16(b0_2));
uint8x8_t g01 = vreinterpret_u8_s8(vmovn_s16(g0_2));
int16x8_t r1_1 = vaddq_s16(y2_0_8, r0_bias);
int16x8_t b1_1 = vaddq_s16(y2_0_8, b0_bias);
int16x8_t g1_1 = vsubq_s16(y2_0_8, g0_bias); // g0_1 = y1_0_8 - g0_1
int16x8_t r1_2 = vaddq_s16(y2_1_8, r0_bias);
int16x8_t b1_2 = vaddq_s16(y2_1_8, b0_bias);
int16x8_t g1_2 = vsubq_s16(y2_1_8, g0_bias);
uint8x8x2_t r00_0 = vtrn_u8(r00, r01); // 014589 236710
uint8x8x2_t b00_0 = vtrn_u8(b00, b01);
uint8x8x2_t g00_0 = vtrn_u8(g00, g01);
r1_1 = vmaxq_s16(r1_1, zero);
b1_1 = vmaxq_s16(b1_1, zero);
g1_1 = vmaxq_s16(g1_1, zero);
r1_2 = vmaxq_s16(r1_2, zero);
b1_2 = vmaxq_s16(b1_2, zero);
g1_2 = vmaxq_s16(g1_2, zero);
uint16x4_t r0_16 = vreinterpret_u16_u8(r00_0.val[0]);
uint16x4_t r1_16 = vreinterpret_u16_u8(r00_0.val[1]);
uint16x4_t b0_16 = vreinterpret_u16_u8(b00_0.val[0]);
uint16x4_t b1_16 = vreinterpret_u16_u8(b00_0.val[1]);
uint16x4_t g0_16 = vreinterpret_u16_u8(g00_0.val[0]);
uint16x4_t g1_16 = vreinterpret_u16_u8(g00_0.val[1]);
uint16x4x2_t r00_1 = vtrn_u16(r0_16, r1_16); // 012389 456710
uint16x4x2_t b00_1 = vtrn_u16(b0_16, b1_16);
uint16x4x2_t g00_1 = vtrn_u16(g0_16, g1_16);
r1_1 = vminq_s16(r1_1, max);
b1_1 = vminq_s16(b1_1, max);
g1_1 = vminq_s16(g1_1, max);
r1_2 = vminq_s16(r1_2, max);
b1_2 = vminq_s16(b1_2, max);
g1_2 = vminq_s16(g1_2, max);
uint32x2_t r0_32 = vreinterpret_u32_u16(r00_1.val[0]);
uint32x2_t r1_32 = vreinterpret_u32_u16(r00_1.val[1]);
uint32x2_t b0_32 = vreinterpret_u32_u16(b00_1.val[0]);
uint32x2_t b1_32 = vreinterpret_u32_u16(b00_1.val[1]);
uint32x2_t g0_32 = vreinterpret_u32_u16(g00_1.val[0]);
uint32x2_t g1_32 = vreinterpret_u32_u16(g00_1.val[1]);
uint32x2x2_t r00_2 = vtrn_u32(r0_32, r1_32); // 01234567 8910
uint32x2x2_t b00_2 = vtrn_u32(b0_32, b1_32);
uint32x2x2_t g00_2 = vtrn_u32(g0_32, g1_32);
r00 = vreinterpret_u8_s8(vmovn_s16(r1_1));
b00 = vreinterpret_u8_s8(vmovn_s16(b1_1));
g00 = vreinterpret_u8_s8(vmovn_s16(g1_1));
r01 = vreinterpret_u8_s8(vmovn_s16(r1_2));
b01 = vreinterpret_u8_s8(vmovn_s16(b1_2));
g01 = vreinterpret_u8_s8(vmovn_s16(g1_2));
uint8x8_t r0_8 = vreinterpret_u8_u32(r00_2.val[0]);
uint8x8_t b0_8 = vreinterpret_u8_u32(b00_2.val[0]);
uint8x8_t g0_8 = vreinterpret_u8_u32(g00_2.val[0]);
uint8x8_t r1_8 = vreinterpret_u8_u32(r00_2.val[1]);
uint8x8_t b1_8 = vreinterpret_u8_u32(b00_2.val[1]);
uint8x8_t g1_8 = vreinterpret_u8_u32(g00_2.val[1]);
uint8x8x4_t v_bgr;
v_bgr.val[0] = b0_8;
v_bgr.val[1] = g0_8;
v_bgr.val[2] = r0_8;
v_bgr.val[3] = a_8;
r00_0 = vtrn_u8(r00, r01); // 014589 236710
b00_0 = vtrn_u8(b00, b01);
g00_0 = vtrn_u8(g00, g01);
vst4_u8(ptr_bgr1, v_bgr);
r0_16 = vreinterpret_u16_u8(r00_0.val[0]);
r1_16 = vreinterpret_u16_u8(r00_0.val[1]);
b0_16 = vreinterpret_u16_u8(b00_0.val[0]);
b1_16 = vreinterpret_u16_u8(b00_0.val[1]);
g0_16 = vreinterpret_u16_u8(g00_0.val[0]);
g1_16 = vreinterpret_u16_u8(g00_0.val[1]);
ptr_bgr1 += 32;
uint8x8x4_t v_bgr1;
v_bgr1.val[0] = b1_8;
v_bgr1.val[1] = g1_8;
v_bgr1.val[2] = r1_8;
v_bgr1.val[3] = a_8;
r00_1 = vtrn_u16(r0_16, r1_16); // 012389 456710
b00_1 = vtrn_u16(b0_16, b1_16);
g00_1 = vtrn_u16(g0_16, g1_16);
vst4_u8(ptr_bgr1, v_bgr1);
r0_32 = vreinterpret_u32_u16(r00_1.val[0]);
r1_32 = vreinterpret_u32_u16(r00_1.val[1]);
b0_32 = vreinterpret_u32_u16(b00_1.val[0]);
b1_32 = vreinterpret_u32_u16(b00_1.val[1]);
g0_32 = vreinterpret_u32_u16(g00_1.val[0]);
g1_32 = vreinterpret_u32_u16(g00_1.val[1]);
ptr_bgr1 += 32;
r00_2 = vtrn_u32(r0_32, r1_32); // 01234567 8910
b00_2 = vtrn_u32(b0_32, b1_32);
g00_2 = vtrn_u32(g0_32, g1_32);
ptr_vu += 16;
ptr_y1 += 16;
ptr_y2 += 16;
r0_8 = vreinterpret_u8_u32(r00_2.val[0]);
b0_8 = vreinterpret_u8_u32(b00_2.val[0]);
g0_8 = vreinterpret_u8_u32(g00_2.val[0]);
r1_8 = vreinterpret_u8_u32(r00_2.val[1]);
b1_8 = vreinterpret_u8_u32(b00_2.val[1]);
g1_8 = vreinterpret_u8_u32(g00_2.val[1]);
v_bgr.val[0] = b0_8;
v_bgr.val[1] = g0_8;
v_bgr.val[2] = r0_8;
v_bgr1.val[0] = b1_8;
v_bgr1.val[1] = g1_8;
v_bgr1.val[2] = r1_8;
vst4_u8(ptr_bgr2, v_bgr);
vst4_u8(ptr_bgr2 + 32, v_bgr1);
ptr_bgr2 += 64;
}
// two data
for (; j < srcw; j += 2) {
unsigned char _y0 = ptr_y1[0];
unsigned char _y1 = ptr_y1[1];
unsigned char _v = ptr_vu[1];
unsigned char _u = ptr_vu[0];
unsigned char _y0_1 = ptr_y2[0];
unsigned char _y1_1 = ptr_y2[1];
int ra = floor((179 * (_v - 128)) >> 7);
int ga = floor((44 * (_u - 128) + 91 * (_v - 128)) >> 7);
int ba = floor((227 * (_u - 128)) >> 7);
int r = _y0 + ra;
int g = _y0 - ga;
int b = _y0 + ba;
int r1 = _y1 + ra;
int g1 = _y1 - ga;
int b1 = _y1 + ba;
r = r < 0 ? 0 : (r > 255) ? 255 : r;
g = g < 0 ? 0 : (g > 255) ? 255 : g;
b = b < 0 ? 0 : (b > 255) ? 255 : b;
r1 = r1 < 0 ? 0 : (r1 > 255) ? 255 : r1;
g1 = g1 < 0 ? 0 : (g1 > 255) ? 255 : g1;
b1 = b1 < 0 ? 0 : (b1 > 255) ? 255 : b1;
*ptr_bgr1++ = b;
*ptr_bgr1++ = g;
*ptr_bgr1++ = r;
*ptr_bgr1++ = 255;
int r2 = _y0_1 + ra;
int g2 = _y0_1 - ga;
int b2 = _y0_1 + ba;
int r3 = _y1_1 + ra;
int g3 = _y1_1 - ga;
int b3 = _y1_1 + ba;
r2 = r2 < 0 ? 0 : (r2 > 255) ? 255 : r2;
g2 = g2 < 0 ? 0 : (g2 > 255) ? 255 : g2;
b2 = b2 < 0 ? 0 : (b2 > 255) ? 255 : b2;
r3 = r3 < 0 ? 0 : (r3 > 255) ? 255 : r3;
g3 = g3 < 0 ? 0 : (g3 > 255) ? 255 : g3;
b3 = b3 < 0 ? 0 : (b3 > 255) ? 255 : b3;
*ptr_bgr1++ = b1;
*ptr_bgr1++ = g1;
*ptr_bgr1++ = r1;
*ptr_bgr1++ = 255;
*ptr_bgr2++ = b2;
*ptr_bgr2++ = g2;
*ptr_bgr2++ = r2;
*ptr_bgr2++ = 255;
ptr_y1 += 2;
ptr_y2 += 2;
ptr_vu += 2;
*ptr_bgr2++ = b3;
*ptr_bgr2++ = g3;
*ptr_bgr2++ = r3;
*ptr_bgr2++ = 255;
}
}
}
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <limits.h>
#include <math.h>
#include "lite/tests/cv/anakin/cv_utils.h"
void resize_one_channel(
const uint8_t* src, int w_in, int h_in, uint8_t* dst, int w_out, int h_out);
void resize_one_channel_uv(
const uint8_t* src, int w_in, int h_in, uint8_t* dst, int w_out, int h_out);
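// Layout note for nv21_resize below (a restatement, no new behaviour): an
// NV21 buffer is a full-size Y plane followed by an interleaved VU plane of
// half the height, so the Y plane is resized as one channel and the VU plane
// as interleaved pairs.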
void nv21_resize(const uint8_t* src,
uint8_t* dst,
int w_in,
int h_in,
int w_out,
int h_out) {
if (w_out == w_in && h_out == h_in) {
printf("nv21_resize equal \n");
memcpy(dst, src, sizeof(uint8_t) * w_in * static_cast<int>(1.5 * h_in));
return;
}
int y_h = h_in;
int uv_h = h_in / 2;
const uint8_t* y_ptr = src;
const uint8_t* uv_ptr = src + y_h * w_in;
// out
int dst_y_h = h_out;
int dst_uv_h = h_out / 2;
uint8_t* dst_ptr = dst + dst_y_h * w_out;
resize_one_channel(y_ptr, w_in, y_h, dst, w_out, dst_y_h);
resize_one_channel_uv(uv_ptr, w_in, uv_h, dst_ptr, w_out, dst_uv_h);
}
void resize_one_channel(const uint8_t* src,
int w_in,
int h_in,
uint8_t* dst,
int w_out,
int h_out) {
const int resize_coef_bits = 11;
const int resize_coef_scale = 1 << resize_coef_bits;
double scale_x = static_cast<double>(w_in) / w_out;
double scale_y = static_cast<double>(h_in) / h_out;
int* buf = new int[w_out * 2 + h_out * 2];
int* xofs = buf; // new int[w];
int* yofs = buf + w_out; // new int[h];
int16_t* ialpha =
reinterpret_cast<int16_t*>(buf + w_out + h_out); // new short[w * 2];
int16_t* ibeta =
reinterpret_cast<int16_t*>(buf + w_out * 2 + h_out); // new short[h * 2];
float fx = 0.f;
float fy = 0.f;
int sx = 0;
int sy = 0;
#define SATURATE_CAST_SHORT(X) \
(int16_t)::std::min( \
::std::max(static_cast<int>(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), \
SHRT_MAX);
for (int dx = 0; dx < w_out; dx++) {
fx = static_cast<float>((dx + 0.5) * scale_x - 0.5);
sx = floor(fx);
fx -= sx;
if (sx < 0) {
sx = 0;
fx = 0.f;
}
if (sx >= w_in - 1) {
sx = w_in - 2;
fx = 1.f;
}
xofs[dx] = sx;
float a0 = (1.f - fx) * resize_coef_scale;
float a1 = fx * resize_coef_scale;
ialpha[dx * 2] = SATURATE_CAST_SHORT(a0);
ialpha[dx * 2 + 1] = SATURATE_CAST_SHORT(a1);
}
for (int dy = 0; dy < h_out; dy++) {
fy = static_cast<float>((dy + 0.5) * scale_y - 0.5);
sy = floor(fy);
fy -= sy;
if (sy < 0) {
sy = 0;
fy = 0.f;
}
if (sy >= h_in - 1) {
sy = h_in - 2;
fy = 1.f;
}
yofs[dy] = sy;
float b0 = (1.f - fy) * resize_coef_scale;
float b1 = fy * resize_coef_scale;
ibeta[dy * 2] = SATURATE_CAST_SHORT(b0);
ibeta[dy * 2 + 1] = SATURATE_CAST_SHORT(b1);
}
#undef SATURATE_CAST_SHORT
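  // Fixed-point sketch of the bilinear interpolation below (a restatement of
  // the shifts used in the code): ialpha/ibeta carry 11 fractional bits, the
  // horizontal pass keeps 7 of them (>> 4), the vertical multiply-accumulate
  // drops 16, and the final "+ 2, >> 2" rounds back to an 8-bit pixel:
  //   rows[dx] = (S[sx] * a0 + S[sx + 1] * a1) >> 4
  //   D[dx] = (((rows0[dx] * b0) >> 16) + ((rows1[dx] * b1) >> 16) + 2) >> 2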
// loop body
int16_t* rowsbuf0 = new int16_t[w_out + 1];
int16_t* rowsbuf1 = new int16_t[w_out + 1];
int16_t* rows0 = rowsbuf0;
int16_t* rows1 = rowsbuf1;
int prev_sy1 = -1;
for (int dy = 0; dy < h_out; dy++) {
int sy = yofs[dy];
if (sy == prev_sy1) {
// hresize one row
int16_t* rows0_old = rows0;
rows0 = rows1;
rows1 = rows0_old;
const uint8_t* S1 = src + w_in * (sy + 1);
const int16_t* ialphap = ialpha;
int16_t* rows1p = rows1;
for (int dx = 0; dx < w_out; dx++) {
int sx = xofs[dx];
int16_t a0 = ialphap[0];
int16_t a1 = ialphap[1];
const uint8_t* S1p = S1 + sx;
rows1p[dx] = (S1p[0] * a0 + S1p[1] * a1) >> 4;
ialphap += 2;
}
} else {
// hresize two rows
const uint8_t* S0 = src + w_in * (sy);
const uint8_t* S1 = src + w_in * (sy + 1);
const int16_t* ialphap = ialpha;
int16_t* rows0p = rows0;
int16_t* rows1p = rows1;
for (int dx = 0; dx < w_out; dx++) {
int sx = xofs[dx];
int16_t a0 = ialphap[0];
int16_t a1 = ialphap[1];
const uint8_t* S0p = S0 + sx;
const uint8_t* S1p = S1 + sx;
rows0p[dx] = (S0p[0] * a0 + S0p[1] * a1) >> 4;
rows1p[dx] = (S1p[0] * a0 + S1p[1] * a1) >> 4;
ialphap += 2;
}
}
prev_sy1 = sy + 1;
// vresize
int16_t b0 = ibeta[0];
int16_t b1 = ibeta[1];
int16_t* rows0p = rows0;
int16_t* rows1p = rows1;
uint8_t* dp_ptr = dst + w_out * (dy);
int cnt = w_out >> 3;
int remain = w_out - (cnt << 3);
int16x4_t _b0 = vdup_n_s16(b0);
int16x4_t _b1 = vdup_n_s16(b1);
int32x4_t _v2 = vdupq_n_s32(2);
// #pragma omp parallel for
#if 1 // __aarch64__
for (cnt = w_out >> 3; cnt > 0; cnt--) {
int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p + 4);
int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p + 4);
int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0);
int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
int32x4_t _acc = _v2;
_acc = vsraq_n_s32(_acc, _rows0p_sr4_mb0, 16);
_acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
int32x4_t _acc_1 = _v2;
_acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
_acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
int16x4_t _acc16 = vshrn_n_s32(_acc, 2);
int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
uint8x8_t _dout = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));
vst1_u8(dp_ptr, _dout);
dp_ptr += 8;
rows0p += 8;
rows1p += 8;
}
#else
// #pragma omp parallel for
if (cnt > 0) {
      asm volatile(
          "mov r4, #2 \n"
          "vdup.s32 q12, r4 \n"
          "0: \n"
          "pld [%[rows0p], #128] \n"
          "pld [%[rows1p], #128] \n"
          "vld1.s16 {d2-d3}, [%[rows0p]]!\n"
          "vld1.s16 {d6-d7}, [%[rows1p]]!\n"
          "pld [%[rows0p], #128] \n"
          "pld [%[rows1p], #128] \n"
          "vmull.s16 q0, d2, %[_b0] \n"
          "vmull.s16 q1, d3, %[_b0] \n"
          "vmull.s16 q2, d6, %[_b1] \n"
          "vmull.s16 q3, d7, %[_b1] \n"
          "vld1.s16 {d2-d3}, [%[rows0p]]!\n"
          "vld1.s16 {d6-d7}, [%[rows1p]]!\n"
"vorr.s32 q10, q12, q12 \n"
"vorr.s32 q11, q12, q12 \n"
"vsra.s32 q10, q0, #16 \n"
"vsra.s32 q11, q1, #16 \n"
"vsra.s32 q10, q2, #16 \n"
"vsra.s32 q11, q3, #16 \n"
"vmull.s16 q0, d2, %[_b0] \n"
"vmull.s16 q1, d3, %[_b0] \n"
"vmull.s16 q2, d6, %[_b1] \n"
"vmull.s16 q3, d7, %[_b1] \n"
"vsra.s32 q10, q0, #16 \n"
"vsra.s32 q11, q1, #16 \n"
"vsra.s32 q10, q2, #16 \n"
"vsra.s32 q11, q3, #16 \n"
"vshrn.s32 d20, q10, #2 \n"
"vshrn.s32 d21, q11, #2 \n"
"vqmovun.s16 d20, q10 \n"
"vst1.8 {d20}, [%[dp]]! \n"
"subs %[cnt], #1 \n"
"bne 0b \n"
"sub %[rows0p], #16 \n"
"sub %[rows1p], #16 \n"
: [rows0p] "+r"(rows0p),
[rows1p] "+r"(rows1p),
[_b0] "+w"(_b0),
[_b1] "+w"(_b1),
[cnt] "+r"(cnt),
[dp] "+r"(dp_ptr)
:
: "r4", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12");
}
#endif // __aarch64__
for (; remain; --remain) {
// D[x] = (rows0[x]*b0 + rows1[x]*b1) >>
// INTER_RESIZE_COEF_BITS;
*dp_ptr++ =
(uint8_t)(((int16_t)((b0 * (int16_t)(*rows0p++)) >> 16) +
(int16_t)((b1 * (int16_t)(*rows1p++)) >> 16) + 2) >>
2);
}
ibeta += 2;
}
delete[] buf;
delete[] rowsbuf0;
delete[] rowsbuf1;
}
void resize_one_channel_uv(const uint8_t* src,
int w_in,
int h_in,
uint8_t* dst,
int w_out,
int h_out) {
const int resize_coef_bits = 11;
const int resize_coef_scale = 1 << resize_coef_bits;
double scale_x = static_cast<double>(w_in) / w_out;
double scale_y = static_cast<double>(h_in) / h_out;
int* buf = new int[w_out * 2 + h_out * 2];
int* xofs = buf; // new int[w];
int* yofs = buf + w_out; // new int[h];
int16_t* ialpha =
reinterpret_cast<int16_t*>(buf + w_out + h_out); // new int16_t[w * 2];
int16_t* ibeta = reinterpret_cast<int16_t*>(buf + w_out * 2 +
h_out); // new int16_t[h * 2];
float fx = 0.f;
float fy = 0.f;
  int sx = 0;
  int sy = 0;
#define SATURATE_CAST_SHORT(X) \
(int16_t)::std::min( \
::std::max(static_cast<int>(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), \
SHRT_MAX);
for (int dx = 0; dx < w_out / 2; dx++) {
fx = static_cast<float>((dx + 0.5) * scale_x - 0.5);
sx = floor(fx);
fx -= sx;
if (sx < 0) {
sx = 0;
fx = 0.f;
}
if (sx >= w_in - 1) {
sx = w_in - 2;
fx = 1.f;
}
xofs[dx] = sx;
float a0 = (1.f - fx) * resize_coef_scale;
float a1 = fx * resize_coef_scale;
ialpha[dx * 2] = SATURATE_CAST_SHORT(a0);
ialpha[dx * 2 + 1] = SATURATE_CAST_SHORT(a1);
}
for (int dy = 0; dy < h_out; dy++) {
fy = static_cast<float>((dy + 0.5) * scale_y - 0.5);
sy = floor(fy);
fy -= sy;
if (sy < 0) {
sy = 0;
fy = 0.f;
}
if (sy >= h_in - 1) {
sy = h_in - 2;
fy = 1.f;
}
yofs[dy] = sy;
float b0 = (1.f - fy) * resize_coef_scale;
float b1 = fy * resize_coef_scale;
ibeta[dy * 2] = SATURATE_CAST_SHORT(b0);
ibeta[dy * 2 + 1] = SATURATE_CAST_SHORT(b1);
}
#undef SATURATE_CAST_SHORT
// loop body
int16_t* rowsbuf0 = new int16_t[w_out + 1];
int16_t* rowsbuf1 = new int16_t[w_out + 1];
int16_t* rows0 = rowsbuf0;
int16_t* rows1 = rowsbuf1;
int prev_sy1 = -1;
for (int dy = 0; dy < h_out; dy++) {
int sy = yofs[dy];
if (sy == prev_sy1) {
// hresize one row
int16_t* rows0_old = rows0;
rows0 = rows1;
rows1 = rows0_old;
const uint8_t* S1 = src + w_in * (sy + 1);
const int16_t* ialphap = ialpha;
int16_t* rows1p = rows1;
for (int dx = 0; dx < w_out / 2; dx++) {
int sx = xofs[dx] * 2;
int16_t a0 = ialphap[0];
int16_t a1 = ialphap[1];
const uint8_t* S1p = S1 + sx;
int tmp = dx * 2;
rows1p[tmp] = (S1p[0] * a0 + S1p[2] * a1) >> 4;
rows1p[tmp + 1] = (S1p[1] * a0 + S1p[3] * a1) >> 4;
ialphap += 2;
}
} else {
// hresize two rows
const uint8_t* S0 = src + w_in * (sy);
const uint8_t* S1 = src + w_in * (sy + 1);
const int16_t* ialphap = ialpha;
int16_t* rows0p = rows0;
int16_t* rows1p = rows1;
for (int dx = 0; dx < w_out / 2; dx++) {
int sx = xofs[dx] * 2;
int16_t a0 = ialphap[0];
int16_t a1 = ialphap[1];
const uint8_t* S0p = S0 + sx;
const uint8_t* S1p = S1 + sx;
int tmp = dx * 2;
rows0p[tmp] = (S0p[0] * a0 + S0p[2] * a1) >> 4;
rows1p[tmp] = (S1p[0] * a0 + S1p[2] * a1) >> 4;
rows0p[tmp + 1] = (S0p[1] * a0 + S0p[3] * a1) >> 4;
rows1p[tmp + 1] = (S1p[1] * a0 + S1p[3] * a1) >> 4;
ialphap += 2;
}
}
prev_sy1 = sy + 1;
// vresize
int16_t b0 = ibeta[0];
int16_t b1 = ibeta[1];
int16_t* rows0p = rows0;
int16_t* rows1p = rows1;
uint8_t* dp_ptr = dst + w_out * (dy);
int cnt = w_out >> 3;
int remain = w_out - (cnt << 3);
int16x4_t _b0 = vdup_n_s16(b0);
int16x4_t _b1 = vdup_n_s16(b1);
int32x4_t _v2 = vdupq_n_s32(2);
for (cnt = w_out >> 3; cnt > 0; cnt--) {
int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p + 4);
int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p + 4);
int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0);
int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
int32x4_t _acc = _v2;
_acc = vsraq_n_s32(
_acc, _rows0p_sr4_mb0, 16); // _acc >> 16 + _rows0p_sr4_mb0 >> 16
_acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
int32x4_t _acc_1 = _v2;
_acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
_acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
int16x4_t _acc16 = vshrn_n_s32(_acc, 2); // _acc >> 2
int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
uint8x8_t _dout = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));
vst1_u8(dp_ptr, _dout);
dp_ptr += 8;
rows0p += 8;
rows1p += 8;
}
for (; remain; --remain) {
// D[x] = (rows0[x]*b0 + rows1[x]*b1) >> INTER_RESIZE_COEF_BITS;
*dp_ptr++ =
(uint8_t)(((int16_t)((b0 * (int16_t)(*rows0p++)) >> 16) +
(int16_t)((b1 * (int16_t)(*rows1p++)) >> 16) + 2) >>
2);
}
ibeta += 2;
}
delete[] buf;
delete[] rowsbuf0;
delete[] rowsbuf1;
}
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <math.h>
#include "lite/tests/cv/anakin/cv_utils.h"
/*
R = Y + 1.402*(V-128);
G = Y - 0.34414*(U-128) - 0.71414*(V-128);
B = Y + 1.772*(U-128);
float compute: a*b = ((a << 7)*b )>>7
ra = 1.402 *128 = 179.456 = 179
ga = 0.34414 * 128 = 44.04992 = 44
gb = 0.71414 * 128 = 91.40992 = 91
ba = 1.772 * 128 = 226.816 = 227
*/
// yvu store hwc bgrbgr dsth * dstw = srch * srcw y_w = srcw
// y_h = srch uv_w = srcw uv_h = 1/2 * srch
void nv21_to_bgr(const unsigned char* src,
unsigned char* dst,
int srcw,
int srch) {
int y_h = srch;
int wout = srcw * 3;
const unsigned char* y = src;
const unsigned char* vu = src + y_h * srcw;
int16x8_t bias = vdupq_n_s16(128);
int16x8_t ga = vdupq_n_s16(44);
int16x8_t ra = vdupq_n_s16(179);
int16x8_t ba = vdupq_n_s16(227);
int16x8_t gb = vdupq_n_s16(91);
int16x8_t zero = vdupq_n_s16(0);
int16x8_t max = vdupq_n_s16(255);
for (int i = 0; i < y_h; i += 2) {
const unsigned char* ptr_y1 = y + i * srcw;
const unsigned char* ptr_y2 = ptr_y1 + srcw;
const unsigned char* ptr_vu = vu + (i / 2) * srcw;
unsigned char* ptr_bgr1 = dst + i * wout;
unsigned char* ptr_bgr2 = ptr_bgr1 + wout;
// 2*16
#ifdef __aarch64__
asm volatile(
"prfm pldl1keep, [%[ptr_y1]] \n"
"prfm pldl1keep, [%[ptr_y1], #64] \n"
"prfm pldl1keep, [%[ptr_y2]] \n"
"prfm pldl1keep, [%[ptr_y2], #64] \n"
"prfm pldl1keep, [%[ptr_vu]] \n"
"prfm pldl1keep, [%[ptr_vu], #64] \n"
:
: [ptr_y1] "r"(ptr_y1), [ptr_y2] "r"(ptr_y2), [ptr_vu] "r"(ptr_vu)
: "memory");
#else
asm volatile(
"pld [%[ptr_y1]] @ preload a, 64byte\n"
"pld [%[ptr_y1], #128] @ preload a, 64byte\n"
"pld [%[ptr_y2]] @ preload a, 64byte\n"
"pld [%[ptr_y2], #128] @ preload a, 64byte\n"
"pld [%[ptr_vu]] @ preload a, 64byte\n"
"pld [%[ptr_vu], #128] @ preload a, 64byte\n"
:
: [ptr_y1] "r"(ptr_y1), [ptr_y2] "r"(ptr_y2), [ptr_vu] "r"(ptr_vu)
: "memory");
#endif
int j = 0;
for (; j < srcw - 15; j += 16) {
uint8x8x2_t y1 = vld2_u8(ptr_y1); // d8 = y0y2y4y6...y14 d9 =
// y1y3y5...y15
uint8x8x2_t vu =
vld2_u8(ptr_vu); // d0 = v0v1v2v3v4v5...v7 d1 = u0u1u2...u7
uint8x8x2_t y2 = vld2_u8(ptr_y2);
uint16x8_t v = vmovl_u8(vu.val[0]);
uint16x8_t u = vmovl_u8(vu.val[1]);
int16x8_t v_s = vreinterpretq_s16_u16(v);
int16x8_t u_s = vreinterpretq_s16_u16(u);
int16x8_t v_bias = vsubq_s16(v_s, bias);
int16x8_t u_bias = vsubq_s16(u_s, bias);
// G = Y - 0.34414*(U-128) - 0.71414*(V-128);
int16x8_t g0 = vmulq_s16(ga, u_bias);
// R = Y + 1.402*(V-128);
int16x8_t r0 = vmulq_s16(ra, v_bias);
// B = Y + 1.772*(U-128);
int16x8_t b0 = vmulq_s16(ba, u_bias);
g0 = vmlaq_s16(g0, gb, v_bias);
int16x8_t y1_0_8 = vreinterpretq_s16_u16(vmovl_u8(y1.val[0]));
int16x8_t y1_1_8 = vreinterpretq_s16_u16(vmovl_u8(y1.val[1]));
int16x8_t y2_0_8 = vreinterpretq_s16_u16(vmovl_u8(y2.val[0]));
int16x8_t y2_1_8 = vreinterpretq_s16_u16(vmovl_u8(y2.val[1]));
int16x8_t r0_bias = vshrq_n_s16(r0, 7); // r0 / 128
int16x8_t b0_bias = vshrq_n_s16(b0, 7);
int16x8_t g0_bias = vshrq_n_s16(g0, 7);
int16x8_t r0_1 = vaddq_s16(y1_0_8, r0_bias);
int16x8_t b0_1 = vaddq_s16(y1_0_8, b0_bias);
int16x8_t g0_1 = vsubq_s16(y1_0_8, g0_bias); // g0_1 = y1_0_8 - g0_1
int16x8_t r0_2 = vaddq_s16(y1_1_8, r0_bias);
int16x8_t b0_2 = vaddq_s16(y1_1_8, b0_bias);
int16x8_t g0_2 = vsubq_s16(y1_1_8, g0_bias);
r0_1 = vmaxq_s16(r0_1, zero);
b0_1 = vmaxq_s16(b0_1, zero);
g0_1 = vmaxq_s16(g0_1, zero);
r0_2 = vmaxq_s16(r0_2, zero);
b0_2 = vmaxq_s16(b0_2, zero);
g0_2 = vmaxq_s16(g0_2, zero);
r0_1 = vminq_s16(r0_1, max);
b0_1 = vminq_s16(b0_1, max);
g0_1 = vminq_s16(g0_1, max);
r0_2 = vminq_s16(r0_2, max);
b0_2 = vminq_s16(b0_2, max);
g0_2 = vminq_s16(g0_2, max);
uint8x8_t r00 = vreinterpret_u8_s8(vmovn_s16(r0_1));
uint8x8_t b00 = vreinterpret_u8_s8(vmovn_s16(b0_1));
uint8x8_t g00 = vreinterpret_u8_s8(vmovn_s16(g0_1));
uint8x8_t r01 = vreinterpret_u8_s8(vmovn_s16(r0_2));
uint8x8_t b01 = vreinterpret_u8_s8(vmovn_s16(b0_2));
uint8x8_t g01 = vreinterpret_u8_s8(vmovn_s16(g0_2));
int16x8_t r1_1 = vaddq_s16(y2_0_8, r0_bias);
int16x8_t b1_1 = vaddq_s16(y2_0_8, b0_bias);
int16x8_t g1_1 = vsubq_s16(y2_0_8, g0_bias); // g0_1 = y1_0_8 - g0_1
int16x8_t r1_2 = vaddq_s16(y2_1_8, r0_bias);
int16x8_t b1_2 = vaddq_s16(y2_1_8, b0_bias);
int16x8_t g1_2 = vsubq_s16(y2_1_8, g0_bias);
uint8x8x2_t r00_0 = vtrn_u8(r00, r01); // 014589 236710
uint8x8x2_t b00_0 = vtrn_u8(b00, b01);
uint8x8x2_t g00_0 = vtrn_u8(g00, g01);
r1_1 = vmaxq_s16(r1_1, zero);
b1_1 = vmaxq_s16(b1_1, zero);
g1_1 = vmaxq_s16(g1_1, zero);
r1_2 = vmaxq_s16(r1_2, zero);
b1_2 = vmaxq_s16(b1_2, zero);
g1_2 = vmaxq_s16(g1_2, zero);
uint16x4_t r0_16 = vreinterpret_u16_u8(r00_0.val[0]);
uint16x4_t r1_16 = vreinterpret_u16_u8(r00_0.val[1]);
uint16x4_t b0_16 = vreinterpret_u16_u8(b00_0.val[0]);
uint16x4_t b1_16 = vreinterpret_u16_u8(b00_0.val[1]);
uint16x4_t g0_16 = vreinterpret_u16_u8(g00_0.val[0]);
uint16x4_t g1_16 = vreinterpret_u16_u8(g00_0.val[1]);
uint16x4x2_t r00_1 = vtrn_u16(r0_16, r1_16); // 012389 456710
uint16x4x2_t b00_1 = vtrn_u16(b0_16, b1_16);
uint16x4x2_t g00_1 = vtrn_u16(g0_16, g1_16);
r1_1 = vminq_s16(r1_1, max);
b1_1 = vminq_s16(b1_1, max);
g1_1 = vminq_s16(g1_1, max);
r1_2 = vminq_s16(r1_2, max);
b1_2 = vminq_s16(b1_2, max);
g1_2 = vminq_s16(g1_2, max);
uint32x2_t r0_32 = vreinterpret_u32_u16(r00_1.val[0]);
uint32x2_t r1_32 = vreinterpret_u32_u16(r00_1.val[1]);
uint32x2_t b0_32 = vreinterpret_u32_u16(b00_1.val[0]);
uint32x2_t b1_32 = vreinterpret_u32_u16(b00_1.val[1]);
uint32x2_t g0_32 = vreinterpret_u32_u16(g00_1.val[0]);
uint32x2_t g1_32 = vreinterpret_u32_u16(g00_1.val[1]);
uint32x2x2_t r00_2 = vtrn_u32(r0_32, r1_32); // 01234567 8910
uint32x2x2_t b00_2 = vtrn_u32(b0_32, b1_32);
uint32x2x2_t g00_2 = vtrn_u32(g0_32, g1_32);
r00 = vreinterpret_u8_s8(vmovn_s16(r1_1));
b00 = vreinterpret_u8_s8(vmovn_s16(b1_1));
g00 = vreinterpret_u8_s8(vmovn_s16(g1_1));
r01 = vreinterpret_u8_s8(vmovn_s16(r1_2));
b01 = vreinterpret_u8_s8(vmovn_s16(b1_2));
g01 = vreinterpret_u8_s8(vmovn_s16(g1_2));
uint8x8_t r0_8 = vreinterpret_u8_u32(r00_2.val[0]);
uint8x8_t b0_8 = vreinterpret_u8_u32(b00_2.val[0]);
uint8x8_t g0_8 = vreinterpret_u8_u32(g00_2.val[0]);
uint8x8_t r1_8 = vreinterpret_u8_u32(r00_2.val[1]);
uint8x8_t b1_8 = vreinterpret_u8_u32(b00_2.val[1]);
uint8x8_t g1_8 = vreinterpret_u8_u32(g00_2.val[1]);
uint8x8x3_t v_bgr;
v_bgr.val[0] = b0_8;
v_bgr.val[1] = g0_8;
v_bgr.val[2] = r0_8;
r00_0 = vtrn_u8(r00, r01); // 014589 236710
b00_0 = vtrn_u8(b00, b01);
g00_0 = vtrn_u8(g00, g01);
vst3_u8(ptr_bgr1, v_bgr);
r0_16 = vreinterpret_u16_u8(r00_0.val[0]);
r1_16 = vreinterpret_u16_u8(r00_0.val[1]);
b0_16 = vreinterpret_u16_u8(b00_0.val[0]);
b1_16 = vreinterpret_u16_u8(b00_0.val[1]);
g0_16 = vreinterpret_u16_u8(g00_0.val[0]);
g1_16 = vreinterpret_u16_u8(g00_0.val[1]);
ptr_bgr1 += 24;
uint8x8x3_t v_bgr1;
v_bgr1.val[0] = b1_8;
v_bgr1.val[1] = g1_8;
v_bgr1.val[2] = r1_8;
r00_1 = vtrn_u16(r0_16, r1_16); // 012389 456710
b00_1 = vtrn_u16(b0_16, b1_16);
g00_1 = vtrn_u16(g0_16, g1_16);
vst3_u8(ptr_bgr1, v_bgr1);
r0_32 = vreinterpret_u32_u16(r00_1.val[0]);
r1_32 = vreinterpret_u32_u16(r00_1.val[1]);
b0_32 = vreinterpret_u32_u16(b00_1.val[0]);
b1_32 = vreinterpret_u32_u16(b00_1.val[1]);
g0_32 = vreinterpret_u32_u16(g00_1.val[0]);
g1_32 = vreinterpret_u32_u16(g00_1.val[1]);
ptr_bgr1 += 24;
r00_2 = vtrn_u32(r0_32, r1_32); // 01234567 8910
b00_2 = vtrn_u32(b0_32, b1_32);
g00_2 = vtrn_u32(g0_32, g1_32);
ptr_vu += 16;
ptr_y1 += 16;
ptr_y2 += 16;
r0_8 = vreinterpret_u8_u32(r00_2.val[0]);
b0_8 = vreinterpret_u8_u32(b00_2.val[0]);
g0_8 = vreinterpret_u8_u32(g00_2.val[0]);
r1_8 = vreinterpret_u8_u32(r00_2.val[1]);
b1_8 = vreinterpret_u8_u32(b00_2.val[1]);
g1_8 = vreinterpret_u8_u32(g00_2.val[1]);
v_bgr.val[0] = b0_8;
v_bgr.val[1] = g0_8;
v_bgr.val[2] = r0_8;
v_bgr1.val[0] = b1_8;
v_bgr1.val[1] = g1_8;
v_bgr1.val[2] = r1_8;
vst3_u8(ptr_bgr2, v_bgr);
vst3_u8(ptr_bgr2 + 24, v_bgr1);
ptr_bgr2 += 48;
}
// two data
for (; j < srcw; j += 2) {
unsigned char _y0 = ptr_y1[0];
unsigned char _y1 = ptr_y1[1];
unsigned char _v = ptr_vu[0];
unsigned char _u = ptr_vu[1];
unsigned char _y0_1 = ptr_y2[0];
unsigned char _y1_1 = ptr_y2[1];
int ra = floor((179 * (_v - 128)) >> 7);
int ga = floor((44 * (_u - 128) + 91 * (_v - 128)) >> 7);
int ba = floor((227 * (_u - 128)) >> 7);
int r = _y0 + ra;
int g = _y0 - ga;
int b = _y0 + ba;
int r1 = _y1 + ra;
int g1 = _y1 - ga;
int b1 = _y1 + ba;
r = r < 0 ? 0 : (r > 255) ? 255 : r;
g = g < 0 ? 0 : (g > 255) ? 255 : g;
b = b < 0 ? 0 : (b > 255) ? 255 : b;
r1 = r1 < 0 ? 0 : (r1 > 255) ? 255 : r1;
g1 = g1 < 0 ? 0 : (g1 > 255) ? 255 : g1;
b1 = b1 < 0 ? 0 : (b1 > 255) ? 255 : b1;
*ptr_bgr1++ = b;
*ptr_bgr1++ = g;
*ptr_bgr1++ = r;
int r2 = _y0_1 + ra;
int g2 = _y0_1 - ga;
int b2 = _y0_1 + ba;
int r3 = _y1_1 + ra;
int g3 = _y1_1 - ga;
int b3 = _y1_1 + ba;
r2 = r2 < 0 ? 0 : (r2 > 255) ? 255 : r2;
g2 = g2 < 0 ? 0 : (g2 > 255) ? 255 : g2;
b2 = b2 < 0 ? 0 : (b2 > 255) ? 255 : b2;
r3 = r3 < 0 ? 0 : (r3 > 255) ? 255 : r3;
g3 = g3 < 0 ? 0 : (g3 > 255) ? 255 : g3;
b3 = b3 < 0 ? 0 : (b3 > 255) ? 255 : b3;
*ptr_bgr1++ = b1;
*ptr_bgr1++ = g1;
*ptr_bgr1++ = r1;
*ptr_bgr2++ = b2;
*ptr_bgr2++ = g2;
*ptr_bgr2++ = r2;
ptr_y1 += 2;
ptr_y2 += 2;
ptr_vu += 2;
*ptr_bgr2++ = b3;
*ptr_bgr2++ = g3;
*ptr_bgr2++ = r3;
}
}
}
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <math.h>
#include "lite/tests/cv/anakin/cv_utils.h"
/*
R = Y + 1.402*(V-128);
G = Y - 0.34414*(U-128) - 0.71414*(V-128);
B = Y + 1.772*(U-128);
A = 255
float-compute: a*b = ((a << 7)*b )>>7
ra = 1.402 *128 = 179.456 = 179
ga = 0.34414 * 128 = 44.04992 = 44
gb = 0.71414 * 128 = 91.40992 = 91
ba = 1.772 * 128 = 226.816 = 227
*/
// yvu store hwc bgrabgra dsth * dstw = srch * srcw y_w = srcw
// y_h = srch uv_w = srcw uv_h = 1/2 * srch
void nv21_to_bgra(const unsigned char* src,
unsigned char* dst,
int srcw,
int srch) {
int y_h = srch;
  int vu_h = srch / 2;
const unsigned char* y = src;
const unsigned char* vu = src + y_h * srcw;
int wout = srcw * 4;
int16x8_t bias = vdupq_n_s16(128);
int16x8_t ga = vdupq_n_s16(44);
int16x8_t ra = vdupq_n_s16(179);
int16x8_t ba = vdupq_n_s16(227);
int16x8_t gb = vdupq_n_s16(91);
int16x8_t zero = vdupq_n_s16(0);
int16x8_t max = vdupq_n_s16(255);
uint8x8_t a_8 = vdup_n_u8(255);
for (int i = 0; i < y_h; i += 2) {
const unsigned char* ptr_y1 = y + i * srcw;
const unsigned char* ptr_y2 = ptr_y1 + srcw;
const unsigned char* ptr_vu = vu + (i / 2) * srcw;
unsigned char* ptr_bgr1 = dst + i * wout;
unsigned char* ptr_bgr2 = ptr_bgr1 + wout;
// 2*16
#ifdef __aarch64__
asm volatile(
"prfm pldl1keep, [%[ptr_y1]] \n"
"prfm pldl1keep, [%[ptr_y1], #64] \n"
"prfm pldl1keep, [%[ptr_y2]] \n"
"prfm pldl1keep, [%[ptr_y2], #64] \n"
"prfm pldl1keep, [%[ptr_vu]] \n"
"prfm pldl1keep, [%[ptr_vu], #64] \n"
:
: [ptr_y1] "r"(ptr_y1), [ptr_y2] "r"(ptr_y2), [ptr_vu] "r"(ptr_vu)
: "memory");
#else
asm volatile(
"pld [%[ptr_y1]] @ preload a, 64byte\n"
"pld [%[ptr_y1], #128] @ preload a, 64byte\n"
"pld [%[ptr_y2]] @ preload a, 64byte\n"
"pld [%[ptr_y2], #128] @ preload a, 64byte\n"
"pld [%[ptr_vu]] @ preload a, 64byte\n"
"pld [%[ptr_vu], #128] @ preload a, 64byte\n"
:
: [ptr_y1] "r"(ptr_y1), [ptr_y2] "r"(ptr_y2), [ptr_vu] "r"(ptr_vu)
: "memory");
#endif
int j = 0;
for (; j < srcw - 15; j += 16) {
uint8x8x2_t y1 = vld2_u8(ptr_y1); // d8 = y0y2y4y6...y14 d9 =
// y1y3y5...y15
uint8x8x2_t vu =
vld2_u8(ptr_vu); // d0 = v0v1v2v3v4v5...v7 d1 = u0u1u2...u7
uint8x8x2_t y2 = vld2_u8(ptr_y2);
uint16x8_t v = vmovl_u8(vu.val[0]);
uint16x8_t u = vmovl_u8(vu.val[1]);
int16x8_t v_s = vreinterpretq_s16_u16(v);
int16x8_t u_s = vreinterpretq_s16_u16(u);
int16x8_t v_bias = vsubq_s16(v_s, bias);
int16x8_t u_bias = vsubq_s16(u_s, bias);
// G = Y - 0.34414*(U-128) - 0.71414*(V-128);
int16x8_t g0 = vmulq_s16(ga, u_bias);
// R = Y + 1.402*(V-128);
int16x8_t r0 = vmulq_s16(ra, v_bias);
// B = Y + 1.772*(U-128);
int16x8_t b0 = vmulq_s16(ba, u_bias);
g0 = vmlaq_s16(g0, gb, v_bias);
int16x8_t y1_0_8 = vreinterpretq_s16_u16(vmovl_u8(y1.val[0]));
int16x8_t y1_1_8 = vreinterpretq_s16_u16(vmovl_u8(y1.val[1]));
int16x8_t y2_0_8 = vreinterpretq_s16_u16(vmovl_u8(y2.val[0]));
int16x8_t y2_1_8 = vreinterpretq_s16_u16(vmovl_u8(y2.val[1]));
int16x8_t r0_bias = vshrq_n_s16(r0, 7); // r0 / 128
int16x8_t b0_bias = vshrq_n_s16(b0, 7);
int16x8_t g0_bias = vshrq_n_s16(g0, 7);
int16x8_t r0_1 = vaddq_s16(y1_0_8, r0_bias);
int16x8_t b0_1 = vaddq_s16(y1_0_8, b0_bias);
int16x8_t g0_1 = vsubq_s16(y1_0_8, g0_bias); // g0_1 = y1_0_8 - g0_1
int16x8_t r0_2 = vaddq_s16(y1_1_8, r0_bias);
int16x8_t b0_2 = vaddq_s16(y1_1_8, b0_bias);
int16x8_t g0_2 = vsubq_s16(y1_1_8, g0_bias);
r0_1 = vmaxq_s16(r0_1, zero);
b0_1 = vmaxq_s16(b0_1, zero);
g0_1 = vmaxq_s16(g0_1, zero);
r0_2 = vmaxq_s16(r0_2, zero);
b0_2 = vmaxq_s16(b0_2, zero);
g0_2 = vmaxq_s16(g0_2, zero);
r0_1 = vminq_s16(r0_1, max);
b0_1 = vminq_s16(b0_1, max);
g0_1 = vminq_s16(g0_1, max);
r0_2 = vminq_s16(r0_2, max);
b0_2 = vminq_s16(b0_2, max);
g0_2 = vminq_s16(g0_2, max);
uint8x8_t r00 = vreinterpret_u8_s8(vmovn_s16(r0_1));
uint8x8_t b00 = vreinterpret_u8_s8(vmovn_s16(b0_1));
uint8x8_t g00 = vreinterpret_u8_s8(vmovn_s16(g0_1));
uint8x8_t r01 = vreinterpret_u8_s8(vmovn_s16(r0_2));
uint8x8_t b01 = vreinterpret_u8_s8(vmovn_s16(b0_2));
uint8x8_t g01 = vreinterpret_u8_s8(vmovn_s16(g0_2));
int16x8_t r1_1 = vaddq_s16(y2_0_8, r0_bias);
int16x8_t b1_1 = vaddq_s16(y2_0_8, b0_bias);
int16x8_t g1_1 = vsubq_s16(y2_0_8, g0_bias); // g0_1 = y1_0_8 - g0_1
int16x8_t r1_2 = vaddq_s16(y2_1_8, r0_bias);
int16x8_t b1_2 = vaddq_s16(y2_1_8, b0_bias);
int16x8_t g1_2 = vsubq_s16(y2_1_8, g0_bias);
uint8x8x2_t r00_0 = vtrn_u8(r00, r01); // 014589 236710
uint8x8x2_t b00_0 = vtrn_u8(b00, b01);
uint8x8x2_t g00_0 = vtrn_u8(g00, g01);
r1_1 = vmaxq_s16(r1_1, zero);
b1_1 = vmaxq_s16(b1_1, zero);
g1_1 = vmaxq_s16(g1_1, zero);
r1_2 = vmaxq_s16(r1_2, zero);
b1_2 = vmaxq_s16(b1_2, zero);
g1_2 = vmaxq_s16(g1_2, zero);
uint16x4_t r0_16 = vreinterpret_u16_u8(r00_0.val[0]);
uint16x4_t r1_16 = vreinterpret_u16_u8(r00_0.val[1]);
uint16x4_t b0_16 = vreinterpret_u16_u8(b00_0.val[0]);
uint16x4_t b1_16 = vreinterpret_u16_u8(b00_0.val[1]);
uint16x4_t g0_16 = vreinterpret_u16_u8(g00_0.val[0]);
uint16x4_t g1_16 = vreinterpret_u16_u8(g00_0.val[1]);
uint16x4x2_t r00_1 = vtrn_u16(r0_16, r1_16); // 012389 456710
uint16x4x2_t b00_1 = vtrn_u16(b0_16, b1_16);
uint16x4x2_t g00_1 = vtrn_u16(g0_16, g1_16);
r1_1 = vminq_s16(r1_1, max);
b1_1 = vminq_s16(b1_1, max);
g1_1 = vminq_s16(g1_1, max);
r1_2 = vminq_s16(r1_2, max);
b1_2 = vminq_s16(b1_2, max);
g1_2 = vminq_s16(g1_2, max);
uint32x2_t r0_32 = vreinterpret_u32_u16(r00_1.val[0]);
uint32x2_t r1_32 = vreinterpret_u32_u16(r00_1.val[1]);
uint32x2_t b0_32 = vreinterpret_u32_u16(b00_1.val[0]);
uint32x2_t b1_32 = vreinterpret_u32_u16(b00_1.val[1]);
uint32x2_t g0_32 = vreinterpret_u32_u16(g00_1.val[0]);
uint32x2_t g1_32 = vreinterpret_u32_u16(g00_1.val[1]);
uint32x2x2_t r00_2 = vtrn_u32(r0_32, r1_32); // 01234567 8910
uint32x2x2_t b00_2 = vtrn_u32(b0_32, b1_32);
uint32x2x2_t g00_2 = vtrn_u32(g0_32, g1_32);
r00 = vreinterpret_u8_s8(vmovn_s16(r1_1));
b00 = vreinterpret_u8_s8(vmovn_s16(b1_1));
g00 = vreinterpret_u8_s8(vmovn_s16(g1_1));
r01 = vreinterpret_u8_s8(vmovn_s16(r1_2));
b01 = vreinterpret_u8_s8(vmovn_s16(b1_2));
g01 = vreinterpret_u8_s8(vmovn_s16(g1_2));
uint8x8_t r0_8 = vreinterpret_u8_u32(r00_2.val[0]);
uint8x8_t b0_8 = vreinterpret_u8_u32(b00_2.val[0]);
uint8x8_t g0_8 = vreinterpret_u8_u32(g00_2.val[0]);
uint8x8_t r1_8 = vreinterpret_u8_u32(r00_2.val[1]);
uint8x8_t b1_8 = vreinterpret_u8_u32(b00_2.val[1]);
uint8x8_t g1_8 = vreinterpret_u8_u32(g00_2.val[1]);
uint8x8x4_t v_bgr;
v_bgr.val[0] = b0_8;
v_bgr.val[1] = g0_8;
v_bgr.val[2] = r0_8;
v_bgr.val[3] = a_8;
r00_0 = vtrn_u8(r00, r01); // 014589 236710
b00_0 = vtrn_u8(b00, b01);
g00_0 = vtrn_u8(g00, g01);
vst4_u8(ptr_bgr1, v_bgr);
r0_16 = vreinterpret_u16_u8(r00_0.val[0]);
r1_16 = vreinterpret_u16_u8(r00_0.val[1]);
b0_16 = vreinterpret_u16_u8(b00_0.val[0]);
b1_16 = vreinterpret_u16_u8(b00_0.val[1]);
g0_16 = vreinterpret_u16_u8(g00_0.val[0]);
g1_16 = vreinterpret_u16_u8(g00_0.val[1]);
ptr_bgr1 += 32;
uint8x8x4_t v_bgr1;
v_bgr1.val[0] = b1_8;
v_bgr1.val[1] = g1_8;
v_bgr1.val[2] = r1_8;
v_bgr1.val[3] = a_8;
r00_1 = vtrn_u16(r0_16, r1_16); // 012389 456710
b00_1 = vtrn_u16(b0_16, b1_16);
g00_1 = vtrn_u16(g0_16, g1_16);
vst4_u8(ptr_bgr1, v_bgr1);
r0_32 = vreinterpret_u32_u16(r00_1.val[0]);
r1_32 = vreinterpret_u32_u16(r00_1.val[1]);
b0_32 = vreinterpret_u32_u16(b00_1.val[0]);
b1_32 = vreinterpret_u32_u16(b00_1.val[1]);
g0_32 = vreinterpret_u32_u16(g00_1.val[0]);
g1_32 = vreinterpret_u32_u16(g00_1.val[1]);
ptr_bgr1 += 32;
r00_2 = vtrn_u32(r0_32, r1_32); // 01234567 8910
b00_2 = vtrn_u32(b0_32, b1_32);
g00_2 = vtrn_u32(g0_32, g1_32);
ptr_vu += 16;
ptr_y1 += 16;
ptr_y2 += 16;
r0_8 = vreinterpret_u8_u32(r00_2.val[0]);
b0_8 = vreinterpret_u8_u32(b00_2.val[0]);
g0_8 = vreinterpret_u8_u32(g00_2.val[0]);
r1_8 = vreinterpret_u8_u32(r00_2.val[1]);
b1_8 = vreinterpret_u8_u32(b00_2.val[1]);
g1_8 = vreinterpret_u8_u32(g00_2.val[1]);
v_bgr.val[0] = b0_8;
v_bgr.val[1] = g0_8;
v_bgr.val[2] = r0_8;
v_bgr1.val[0] = b1_8;
v_bgr1.val[1] = g1_8;
v_bgr1.val[2] = r1_8;
vst4_u8(ptr_bgr2, v_bgr);
vst4_u8(ptr_bgr2 + 32, v_bgr1);
ptr_bgr2 += 64;
}
// two data
for (; j < srcw; j += 2) {
unsigned char _y0 = ptr_y1[0];
unsigned char _y1 = ptr_y1[1];
unsigned char _v = ptr_vu[0];
unsigned char _u = ptr_vu[1];
unsigned char _y0_1 = ptr_y2[0];
unsigned char _y1_1 = ptr_y2[1];
int ra = floor((179 * (_v - 128)) >> 7);
int ga = floor((44 * (_u - 128) + 91 * (_v - 128)) >> 7);
int ba = floor((227 * (_u - 128)) >> 7);
int r = _y0 + ra;
int g = _y0 - ga;
int b = _y0 + ba;
int r1 = _y1 + ra;
int g1 = _y1 - ga;
int b1 = _y1 + ba;
r = r < 0 ? 0 : (r > 255) ? 255 : r;
g = g < 0 ? 0 : (g > 255) ? 255 : g;
b = b < 0 ? 0 : (b > 255) ? 255 : b;
r1 = r1 < 0 ? 0 : (r1 > 255) ? 255 : r1;
g1 = g1 < 0 ? 0 : (g1 > 255) ? 255 : g1;
b1 = b1 < 0 ? 0 : (b1 > 255) ? 255 : b1;
*ptr_bgr1++ = b;
*ptr_bgr1++ = g;
*ptr_bgr1++ = r;
*ptr_bgr1++ = 255;
int r2 = _y0_1 + ra;
int g2 = _y0_1 - ga;
int b2 = _y0_1 + ba;
int r3 = _y1_1 + ra;
int g3 = _y1_1 - ga;
int b3 = _y1_1 + ba;
r2 = r2 < 0 ? 0 : (r2 > 255) ? 255 : r2;
g2 = g2 < 0 ? 0 : (g2 > 255) ? 255 : g2;
b2 = b2 < 0 ? 0 : (b2 > 255) ? 255 : b2;
r3 = r3 < 0 ? 0 : (r3 > 255) ? 255 : r3;
g3 = g3 < 0 ? 0 : (g3 > 255) ? 255 : g3;
b3 = b3 < 0 ? 0 : (b3 > 255) ? 255 : b3;
*ptr_bgr1++ = b1;
*ptr_bgr1++ = g1;
*ptr_bgr1++ = r1;
*ptr_bgr1++ = 255;
*ptr_bgr2++ = b2;
*ptr_bgr2++ = g2;
*ptr_bgr2++ = r2;
*ptr_bgr2++ = 255;
ptr_y1 += 2;
ptr_y2 += 2;
ptr_vu += 2;
*ptr_bgr2++ = b3;
*ptr_bgr2++ = g3;
*ptr_bgr2++ = r3;
*ptr_bgr2++ = 255;
}
}
}
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <math.h>
#include <random>
#include "lite/core/context.h"
#include "lite/core/profile/timer.h"
#include "lite/tests/cv/anakin/cv_utils.h"
#include "lite/tests/utils/tensor_utils.h"
#include "lite/utils/cv/paddle_image_preprocess.h"
#include "time.h" // NOLINT
DEFINE_int32(cluster, 3, "cluster id");
DEFINE_int32(threads, 1, "threads num");
DEFINE_int32(warmup, 0, "warmup times");
DEFINE_int32(repeats, 10, "repeats times");
DEFINE_bool(basic_test, false, "do all tests");
DEFINE_bool(check_result, true, "check the result");
DEFINE_int32(srcFormat, 12, "input image format NV12");
DEFINE_int32(dstFormat, 3, "output image format BGR");
DEFINE_int32(srch, 1920, "input height");
DEFINE_int32(srcw, 1080, "input width");
DEFINE_int32(dsth, 960, "output height");
DEFINE_int32(dstw, 540, "output width");
DEFINE_int32(angle, 90, "rotate angle");
DEFINE_int32(flip_num, 0, "flip x");
DEFINE_int32(layout, 1, "layout nchw");
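// Example invocation (editor's sketch; assumes the built test binary is named
// image_profiler_test -- adjust to whatever name your build produces):
//   ./image_profiler_test --basic_test=true --repeats=20 \
//       --srcFormat=12 --dstFormat=3 --srcw=1080 --srch=1920 \
//       --dstw=540 --dsth=960 --angle=90 --flip_num=0 --layout=1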
typedef paddle::lite::utils::cv::ImageFormat ImageFormat;
typedef paddle::lite::utils::cv::FlipParam FlipParam;
typedef paddle::lite_api::DataLayoutType LayoutType;
typedef paddle::lite::utils::cv::TransParam TransParam;
typedef paddle::lite::utils::cv::ImagePreprocess ImagePreprocess;
typedef paddle::lite_api::Tensor Tensor_api;
typedef paddle::lite::Tensor Tensor;
using paddle::lite::profile::Timer;
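// Fills a uint8 buffer with pseudo-random values in [0, 255]; the seed is a
// fixed local, so every call (and every run) produces the same test data.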
void fill_tensor_host_rand(uint8_t* dio, int64_t size) {
  unsigned int seed = 256;
for (int64_t i = 0; i < size; ++i) {
dio[i] = rand_r(&seed) % 256; // -128;
}
}
void print_int8(uint8_t* ptr, int size, int width) {
for (int i = 0; i < size; i++) {
printf("%d ", *ptr++);
if ((i + 1) % width == 0) {
printf("\n");
}
}
printf("\n");
}
void print_int(int* ptr, int size, int width) {
for (int i = 0; i < size; i++) {
printf("%d ", *ptr++);
if ((i + 1) % width == 0) {
printf("\n");
}
}
printf("\n");
}
void print_fp32(const float* ptr, int size, int width) {
for (int i = 0; i < size; i++) {
printf("%f ", *ptr++);
if ((i + 1) % width == 0) {
printf("\n");
}
}
printf("\n");
}
#ifdef LITE_WITH_ARM
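// Convert benchmark/check: runs the reference image_basic_convert and
// ImagePreprocess::imageConvert on the same random input, times both, and
// verifies the outputs agree.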
void test_convert(const std::vector<int>& cluster_id,
const std::vector<int>& thread_num,
int srcw,
int srch,
int dstw,
int dsth,
ImageFormat srcFormat,
ImageFormat dstFormat,
float rotate,
FlipParam flip,
LayoutType layout,
int test_iter = 10) {
for (auto& cls : cluster_id) {
for (auto& th : thread_num) {
std::unique_ptr<paddle::lite::KernelContext> ctx1(
new paddle::lite::KernelContext);
auto& ctx = ctx1->As<paddle::lite::ARMContext>();
ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(cls), th);
LOG(INFO) << "cluster: " << cls << ", threads: " << th;
int size = 3 * srch * srcw;
if (srcFormat == ImageFormat::NV12 || srcFormat == ImageFormat::NV21) {
size = ceil(1.5 * srch) * srcw;
} else if (srcFormat == ImageFormat::BGRA ||
srcFormat == ImageFormat::RGBA) {
size = 4 * srch * srcw;
} else if (srcFormat == ImageFormat::GRAY) {
size = srch * srcw;
}
uint8_t* src = new uint8_t[size];
fill_tensor_host_rand(src, size);
int out_size = srch * srcw;
if (dstFormat == ImageFormat::NV12 || dstFormat == ImageFormat::NV21) {
out_size = ceil(1.5 * srch) * srcw;
} else if (dstFormat == ImageFormat::BGR ||
dstFormat == ImageFormat::RGB) {
out_size = 3 * srch * srcw;
} else if (dstFormat == ImageFormat::BGRA ||
dstFormat == ImageFormat::RGBA) {
out_size = 4 * srch * srcw;
} else if (dstFormat == ImageFormat::GRAY) {
out_size = srch * srcw;
}
uint8_t* basic_dst = new uint8_t[out_size];
uint8_t* lite_dst = new uint8_t[out_size];
Timer t_basic, t_lite;
LOG(INFO) << "basic Convert compute";
for (int i = 0; i < test_iter; i++) {
t_basic.Start();
image_basic_convert(src,
basic_dst,
(ImageFormat)srcFormat,
(ImageFormat)dstFormat,
srcw,
srch,
out_size);
t_basic.Stop();
}
LOG(INFO) << "image baisc Convert avg time : " << t_basic.LapTimes().Avg()
<< ", min time: " << t_basic.LapTimes().Min()
<< ", max time: " << t_basic.LapTimes().Max();
LOG(INFO) << "lite Convert compute";
TransParam tparam;
tparam.ih = srch;
tparam.iw = srcw;
tparam.oh = srch;
tparam.ow = srcw;
tparam.flip_param = flip;
tparam.rotate_param = rotate;
ImagePreprocess image_preprocess(srcFormat, dstFormat, tparam);
for (int i = 0; i < test_iter; ++i) {
t_lite.Start();
image_preprocess.imageConvert(src, lite_dst);
t_lite.Stop();
}
LOG(INFO) << "image Convert avg time : " << t_lite.LapTimes().Avg()
<< ", min time: " << t_lite.LapTimes().Min()
<< ", max time: " << t_lite.LapTimes().Max();
LOG(INFO) << "basic Convert compute";
double max_ratio = 0;
double max_diff = 0;
const double eps = 1e-6f;
if (FLAGS_check_result) {
LOG(INFO) << "diff, image convert size: " << out_size;
uint8_t* diff_v = new uint8_t[out_size];
for (int i = 0; i < out_size; i++) {
uint8_t a = lite_dst[i];
uint8_t b = basic_dst[i];
uint8_t diff1 = a - b;
uint8_t diff = diff1 > 0 ? diff1 : -diff1;
diff_v[i] = diff;
if (max_diff < diff) {
max_diff = diff;
max_ratio = 2.0 * max_diff / (a + b + eps);
}
}
if (std::abs(max_ratio) >= 1e-5f) {
int width = size / srch;
printf("din: \n");
print_int8(src, size, width);
width = out_size / srch;
printf("saber result: \n");
print_int8(lite_dst, out_size, width);
printf("basic result: \n");
print_int8(basic_dst, out_size, width);
printf("diff result: \n");
print_int8(diff_v, out_size, width);
}
delete[] diff_v;
LOG(INFO) << "compare result, max diff: " << max_diff
<< ", max ratio: " << max_ratio;
bool rst = std::abs(max_ratio) < 1e-5f;
CHECK_EQ(rst, true) << "compute result error";
}
LOG(INFO) << "image convert end";
}
}
}
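// Resize benchmark/check: image_basic_resize vs. ImagePreprocess::imageResize,
// allowing a +/-1 per-pixel difference to absorb float -> int rounding.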
void test_resize(const std::vector<int>& cluster_id,
const std::vector<int>& thread_num,
int srcw,
int srch,
int dstw,
int dsth,
ImageFormat srcFormat,
ImageFormat dstFormat,
float rotate,
FlipParam flip,
LayoutType layout,
int test_iter = 10) {
test_iter = 1;
for (auto& cls : cluster_id) {
for (auto& th : thread_num) {
std::unique_ptr<paddle::lite::KernelContext> ctx1(
new paddle::lite::KernelContext);
auto& ctx = ctx1->As<paddle::lite::ARMContext>();
ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(cls), th);
LOG(INFO) << "cluster: " << cls << ", threads: " << th;
int size = 3 * srch * srcw;
if (srcFormat == ImageFormat::NV12 || srcFormat == ImageFormat::NV21) {
size = ceil(1.5 * srch) * srcw;
} else if (srcFormat == ImageFormat::BGRA ||
srcFormat == ImageFormat::RGBA) {
size = 4 * srch * srcw;
} else if (srcFormat == ImageFormat::GRAY) {
size = srch * srcw;
}
uint8_t* src = new uint8_t[size];
fill_tensor_host_rand(src, size);
int out_size = dsth * dstw;
if (dstFormat == ImageFormat::NV12 || dstFormat == ImageFormat::NV21) {
out_size = ceil(1.5 * dsth) * dstw;
} else if (dstFormat == ImageFormat::BGR ||
dstFormat == ImageFormat::RGB) {
out_size = 3 * dsth * dstw;
} else if (dstFormat == ImageFormat::BGRA ||
dstFormat == ImageFormat::RGBA) {
out_size = 4 * dsth * dstw;
} else if (dstFormat == ImageFormat::GRAY) {
out_size = dsth * dstw;
}
uint8_t* basic_dst = new uint8_t[out_size];
uint8_t* lite_dst = new uint8_t[out_size];
Timer t_rotate;
Timer t_basic, t_lite;
LOG(INFO) << "baisc resize compute";
for (int i = 0; i < test_iter; i++) {
t_basic.Start();
image_basic_resize(
src, basic_dst, (ImageFormat)dstFormat, srcw, srch, dstw, dsth);
t_basic.Stop();
}
LOG(INFO) << "image baisc Resize avg time : " << t_basic.LapTimes().Avg()
<< ", min time: " << t_basic.LapTimes().Min()
<< ", max time: " << t_basic.LapTimes().Max();
LOG(INFO) << "lite resize compute";
TransParam tparam;
tparam.ih = srch;
tparam.iw = srcw;
tparam.oh = dsth;
tparam.ow = dstw;
tparam.flip_param = flip;
tparam.rotate_param = rotate;
ImagePreprocess image_preprocess(srcFormat, dstFormat, tparam);
for (int i = 0; i < test_iter; ++i) {
t_rotate.Start();
image_preprocess.imageResize(src, lite_dst);
t_rotate.Stop();
}
LOG(INFO) << "image Resize avg time : " << t_rotate.LapTimes().Avg()
<< ", min time: " << t_rotate.LapTimes().Min()
<< ", max time: " << t_rotate.LapTimes().Max();
double max_ratio = 0;
double max_diff = 0;
const double eps = 1e-6f;
if (FLAGS_check_result) {
LOG(INFO) << "diff, image Resize size: " << out_size;
int* diff_v = new int[out_size];
for (int i = 0; i < out_size; i++) {
uint8_t a = lite_dst[i];
uint8_t b = basic_dst[i];
          int diff1 = a - b;  // basic resize and saber resize may differ by
                              // {-1, 1} due to float -> int rounding
int diff = 0;
if (diff1 < -1 || diff1 > 1) diff = diff1 < 0 ? -diff1 : diff1;
diff_v[i] = diff;
if (diff > 1 && max_diff < diff) {
max_diff = diff;
printf("i: %d, lite: %d, basic: %d \n", i, a, b);
max_ratio = 2.0 * max_diff / (a + b + eps);
}
}
if (std::abs(max_ratio) >= 1e-5f) {
int width = size / srcw;
printf("din: \n");
print_int8(src, size, width);
width = out_size / dstw;
printf("saber result: \n");
print_int8(lite_dst, out_size, width);
printf("basic result: \n");
print_int8(basic_dst, out_size, width);
printf("diff result: \n");
print_int(diff_v, out_size, width);
}
delete[] diff_v;
LOG(INFO) << "compare result, max diff: " << max_diff
<< ", max ratio: " << max_ratio;
bool rst = std::abs(max_ratio) < 1e-5f;
CHECK_EQ(rst, true) << "compute result error";
}
LOG(INFO) << "image Resize end";
}
}
}
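// Flip benchmark/check: image_basic_flip vs. ImagePreprocess::imageFlip on the
// same input, timed over test_iter runs and compared element-wise.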
void test_flip(const std::vector<int>& cluster_id,
const std::vector<int>& thread_num,
int srcw,
int srch,
int dstw,
int dsth,
ImageFormat srcFormat,
ImageFormat dstFormat,
float rotate,
FlipParam flip,
LayoutType layout,
int test_iter = 10) {
for (auto& cls : cluster_id) {
for (auto& th : thread_num) {
std::unique_ptr<paddle::lite::KernelContext> ctx1(
new paddle::lite::KernelContext);
auto& ctx = ctx1->As<paddle::lite::ARMContext>();
ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(cls), th);
LOG(INFO) << "cluster: " << cls << ", threads: " << th;
int size = 3 * srch * srcw;
if (srcFormat == ImageFormat::NV12 || srcFormat == ImageFormat::NV21) {
size = ceil(1.5 * srch) * srcw;
} else if (srcFormat == ImageFormat::BGRA ||
srcFormat == ImageFormat::RGBA) {
size = 4 * srch * srcw;
} else if (srcFormat == ImageFormat::GRAY) {
size = srch * srcw;
}
uint8_t* src = new uint8_t[size];
fill_tensor_host_rand(src, size);
int out_size = srch * srcw;
if (dstFormat == ImageFormat::NV12 || dstFormat == ImageFormat::NV21) {
out_size = ceil(1.5 * srch) * srcw;
} else if (dstFormat == ImageFormat::BGR ||
dstFormat == ImageFormat::RGB) {
out_size = 3 * srch * srcw;
} else if (dstFormat == ImageFormat::BGRA ||
dstFormat == ImageFormat::RGBA) {
out_size = 4 * srch * srcw;
} else if (dstFormat == ImageFormat::GRAY) {
out_size = srch * srcw;
}
uint8_t* basic_dst = new uint8_t[out_size];
uint8_t* lite_dst = new uint8_t[out_size];
LOG(INFO) << "basic flip compute";
Timer t_basic, t_lite;
for (int i = 0; i < test_iter; i++) {
t_basic.Start();
image_basic_flip(
src, basic_dst, (ImageFormat)dstFormat, srcw, srch, flip);
t_basic.Stop();
}
LOG(INFO) << "image baisc flip avg time : " << t_basic.LapTimes().Avg()
<< ", min time: " << t_basic.LapTimes().Min()
<< ", max time: " << t_basic.LapTimes().Max();
LOG(INFO) << "lite flip compute";
TransParam tparam;
tparam.ih = srch;
tparam.iw = srcw;
tparam.oh = srch;
tparam.ow = srcw;
tparam.flip_param = flip;
tparam.rotate_param = rotate;
ImagePreprocess image_preprocess(srcFormat, dstFormat, tparam);
for (int i = 0; i < test_iter; ++i) {
t_lite.Start();
image_preprocess.imageFlip(src, lite_dst);
t_lite.Stop();
}
LOG(INFO) << "image flip avg time : " << t_lite.LapTimes().Avg()
<< ", min time: " << t_lite.LapTimes().Min()
<< ", max time: " << t_lite.LapTimes().Max();
double max_ratio = 0;
double max_diff = 0;
const double eps = 1e-6f;
if (FLAGS_check_result) {
LOG(INFO) << "diff, image flip size: " << out_size;
uint8_t* diff_v = new uint8_t[out_size];
for (int i = 0; i < out_size; i++) {
uint8_t a = lite_dst[i];
uint8_t b = basic_dst[i];
uint8_t diff1 = a - b;
uint8_t diff = diff1 > 0 ? diff1 : -diff1;
diff_v[i] = diff;
if (max_diff < diff) {
max_diff = diff;
max_ratio = 2.0 * max_diff / (a + b + eps);
}
}
if (std::abs(max_ratio) >= 1e-5f) {
int width = size / srch;
printf("din: \n");
print_int8(src, size, width);
width = out_size / srch;
printf("saber result: \n");
print_int8(lite_dst, out_size, width);
printf("basic result: \n");
print_int8(basic_dst, out_size, width);
printf("diff result: \n");
print_int8(diff_v, out_size, width);
}
delete[] diff_v;
LOG(INFO) << "compare result, max diff: " << max_diff
<< ", max ratio: " << max_ratio;
bool rst = std::abs(max_ratio) < 1e-5f;
CHECK_EQ(rst, true) << "compute result error";
}
LOG(INFO) << "image flip end";
}
}
}
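// Rotate benchmark/check: image_basic_rotate vs. ImagePreprocess::imageRotate
// (90/180/270 degrees), timed and compared element-wise.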
void test_rotate(const std::vector<int>& cluster_id,
const std::vector<int>& thread_num,
int srcw,
int srch,
int dstw,
int dsth,
ImageFormat srcFormat,
ImageFormat dstFormat,
float rotate,
FlipParam flip,
LayoutType layout,
int test_iter = 10) {
for (auto& cls : cluster_id) {
for (auto& th : thread_num) {
std::unique_ptr<paddle::lite::KernelContext> ctx1(
new paddle::lite::KernelContext);
auto& ctx = ctx1->As<paddle::lite::ARMContext>();
ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(cls), th);
LOG(INFO) << "cluster: " << cls << ", threads: " << th;
int size = 3 * srch * srcw;
if (srcFormat == ImageFormat::NV12 || srcFormat == ImageFormat::NV21) {
size = ceil(1.5 * srch) * srcw;
} else if (srcFormat == ImageFormat::BGRA ||
srcFormat == ImageFormat::RGBA) {
size = 4 * srch * srcw;
} else if (srcFormat == ImageFormat::GRAY) {
size = srch * srcw;
}
uint8_t* src = new uint8_t[size];
fill_tensor_host_rand(src, size);
int out_size = srch * srcw;
if (dstFormat == ImageFormat::NV12 || dstFormat == ImageFormat::NV21) {
out_size = ceil(1.5 * srch) * srcw;
} else if (dstFormat == ImageFormat::BGR ||
dstFormat == ImageFormat::RGB) {
out_size = 3 * srch * srcw;
} else if (dstFormat == ImageFormat::BGRA ||
dstFormat == ImageFormat::RGBA) {
out_size = 4 * srch * srcw;
} else if (dstFormat == ImageFormat::GRAY) {
out_size = srch * srcw;
}
uint8_t* basic_dst = new uint8_t[out_size];
uint8_t* lite_dst = new uint8_t[out_size];
LOG(INFO) << "basic rotate compute";
Timer t_basic, t_lite;
for (int i = 0; i < test_iter; i++) {
t_basic.Start();
image_basic_rotate(
src, basic_dst, (ImageFormat)dstFormat, srcw, srch, rotate);
t_basic.Stop();
}
LOG(INFO) << "image baisc rotate avg time : " << t_basic.LapTimes().Avg()
<< ", min time: " << t_basic.LapTimes().Min()
<< ", max time: " << t_basic.LapTimes().Max();
LOG(INFO) << "lite rotate compute";
TransParam tparam;
tparam.ih = srch;
tparam.iw = srcw;
tparam.oh = srch;
tparam.ow = srcw;
tparam.flip_param = flip;
tparam.rotate_param = rotate;
ImagePreprocess image_preprocess(srcFormat, dstFormat, tparam);
for (int i = 0; i < test_iter; ++i) {
t_lite.Start();
image_preprocess.imageRotate(src, lite_dst);
t_lite.Stop();
}
LOG(INFO) << "image rotate avg time : " << t_lite.LapTimes().Avg()
<< ", min time: " << t_lite.LapTimes().Min()
<< ", max time: " << t_lite.LapTimes().Max();
double max_ratio = 0;
double max_diff = 0;
const double eps = 1e-6f;
if (FLAGS_check_result) {
LOG(INFO) << "diff, image rotate size: " << out_size;
uint8_t* diff_v = new uint8_t[out_size];
for (int i = 0; i < out_size; i++) {
uint8_t a = lite_dst[i];
uint8_t b = basic_dst[i];
uint8_t diff1 = a - b;
uint8_t diff = diff1 > 0 ? diff1 : -diff1;
diff_v[i] = diff;
if (max_diff < diff) {
max_diff = diff;
max_ratio = 2.0 * max_diff / (a + b + eps);
}
}
if (std::abs(max_ratio) >= 1e-5f) {
int width = size / srch;
printf("din: \n");
print_int8(src, size, width);
width = out_size / srch;
printf("saber result: \n");
print_int8(lite_dst, out_size, width);
printf("basic result: \n");
print_int8(basic_dst, out_size, width);
printf("diff result: \n");
print_int8(diff_v, out_size, width);
}
delete[] diff_v;
LOG(INFO) << "compare result, max diff: " << max_diff
<< ", max ratio: " << max_ratio;
bool rst = std::abs(max_ratio) < 1e-5f;
CHECK_EQ(rst, true) << "compute result error";
}
LOG(INFO) << "image rotate end";
}
}
}
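// Image-to-tensor benchmark/check: image_basic_to_tensor vs.
// ImagePreprocess::image2Tensor, converting uint8 image data into a float
// tensor with the given means/scales and layout, allowing a +/-1 difference.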
void test_to_tensor(const std::vector<int>& cluster_id,
const std::vector<int>& thread_num,
int srcw,
int srch,
int dstw,
int dsth,
ImageFormat srcFormat,
ImageFormat dstFormat,
float rotate,
FlipParam flip,
LayoutType layout,
int test_iter = 10) {
for (auto& cls : cluster_id) {
for (auto& th : thread_num) {
std::unique_ptr<paddle::lite::KernelContext> ctx1(
new paddle::lite::KernelContext);
auto& ctx = ctx1->As<paddle::lite::ARMContext>();
ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(cls), th);
LOG(INFO) << "cluster: " << cls << ", threads: " << th;
int size = 3 * srch * srcw;
if (srcFormat == ImageFormat::NV12 || srcFormat == ImageFormat::NV21) {
size = ceil(1.5 * srch) * srcw;
} else if (srcFormat == ImageFormat::BGRA ||
srcFormat == ImageFormat::RGBA) {
size = 4 * srch * srcw;
} else if (srcFormat == ImageFormat::GRAY) {
size = srch * srcw;
}
uint8_t* src = new uint8_t[size];
fill_tensor_host_rand(src, size);
int out_size = srch * srcw;
int resize = dstw * dsth;
if (dstFormat == ImageFormat::NV12 || dstFormat == ImageFormat::NV21) {
out_size = ceil(1.5 * srch) * srcw;
resize = ceil(1.5 * dsth) * dstw;
} else if (dstFormat == ImageFormat::BGR ||
dstFormat == ImageFormat::RGB) {
out_size = 3 * srch * srcw;
resize = 3 * dsth * dstw;
} else if (dstFormat == ImageFormat::BGRA ||
dstFormat == ImageFormat::RGBA) {
out_size = 4 * srch * srcw;
resize = 4 * dsth * dstw;
} else if (dstFormat == ImageFormat::GRAY) {
out_size = srch * srcw;
resize = dsth * dstw;
}
// out
std::vector<int64_t> shape_out = {1, 3, dsth, dstw};
Tensor tensor;
Tensor tensor_basic;
tensor.Resize(shape_out);
tensor_basic.Resize(shape_out);
tensor.set_precision(PRECISION(kFloat));
tensor_basic.set_precision(PRECISION(kFloat));
float means[3] = {127.5f, 127.5f, 127.5f};
float scales[3] = {1 / 127.5f, 1 / 127.5f, 1 / 127.5f};
Timer t_basic, t_lite;
LOG(INFO) << "basic to tensor compute: ";
for (int i = 0; i < test_iter; i++) {
t_basic.Start();
image_basic_to_tensor(src,
tensor_basic,
(ImageFormat)dstFormat,
layout,
dstw,
dsth,
means,
scales);
t_basic.Stop();
}
LOG(INFO) << "image baisc to_tensor avg time : "
<< t_basic.LapTimes().Avg()
<< ", min time: " << t_basic.LapTimes().Min()
<< ", max time: " << t_basic.LapTimes().Max();
LOG(INFO) << "lite to_tensor compute";
TransParam tparam;
tparam.ih = srch;
tparam.iw = srcw;
tparam.oh = dsth;
tparam.ow = dstw;
tparam.flip_param = flip;
tparam.rotate_param = rotate;
Tensor_api dst_tensor(&tensor);
dst_tensor.Resize(shape_out);
ImagePreprocess image_preprocess(srcFormat, dstFormat, tparam);
for (int i = 0; i < test_iter; ++i) {
t_lite.Start();
image_preprocess.image2Tensor(src,
&dst_tensor,
(ImageFormat)dstFormat,
dstw,
dsth,
layout,
means,
scales);
t_lite.Stop();
}
LOG(INFO) << "image tensor avg time : " << t_lite.LapTimes().Avg()
<< ", min time: " << t_lite.LapTimes().Min()
<< ", max time: " << t_lite.LapTimes().Max();
double max_ratio = 0;
double max_diff = 0;
const double eps = 1e-6f;
if (FLAGS_check_result) {
max_ratio = 0;
max_diff = 0;
LOG(INFO) << "diff, iamge to tensor size: " << tensor.numel();
const float* ptr_a = tensor.data<float>();
const float* ptr_b = tensor_basic.data<float>();
int ss = tensor.numel();
float* diff_v = new float[ss];
for (int i = 0; i < ss; i++) {
int a = ptr_a[i];
int b = ptr_b[i];
int diff1 = a - b;
int diff = 0;
if (diff1 < -1 || diff1 > 1) diff = diff1 < 0 ? -diff1 : diff1;
diff_v[i] = diff;
if (max_diff < diff) {
max_diff = diff;
max_ratio = 2.0 * max_diff / (a + b + eps);
}
}
if (std::abs(max_ratio) >= 1e-5f) {
int width = resize / srch;
printf("din: \n");
print_int8(src, resize, width);
printf("saber result: \n");
print_fp32(ptr_a, resize, width);
printf("basic result: \n");
print_fp32(ptr_b, resize, width);
printf("diff result: \n");
print_fp32(diff_v, resize, width);
        }
        delete[] diff_v;
        LOG(INFO) << "compare result, max diff: " << max_diff
<< ", max ratio: " << max_ratio;
bool rst = std::abs(max_ratio) < 1e-5f;
CHECK_EQ(rst, true) << "compute result error";
LOG(INFO) << "iamge to tensor end";
}
}
}
}
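// Logs the configuration of one test case (formats, sizes, rotation angle,
// flip mode and layout) and initializes DeviceInfo.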
void print_info(ImageFormat srcFormat,
ImageFormat dstFormat,
int srcw,
int srch,
int dstw,
int dsth,
float rotate_num,
int flip_num,
int layout) {
paddle::lite::DeviceInfo::Init();
LOG(INFO) << " input tensor size, num= " << 1 << ", channel= " << 1
<< ", height= " << srch << ", width= " << srcw
<< ", srcFormat= " << (ImageFormat)srcFormat;
// RGBA = 0, BGRA, RGB, BGR, GRAY, NV21 = 11, NV12,
if (srcFormat == ImageFormat::NV21) {
LOG(INFO) << "srcFormat: NV21";
}
if (srcFormat == ImageFormat::NV12) {
LOG(INFO) << "srcFormat: NV12";
}
if (srcFormat == ImageFormat::GRAY) {
LOG(INFO) << "srcFormat: GRAY";
}
if (srcFormat == ImageFormat::BGRA) {
LOG(INFO) << "srcFormat: BGRA";
}
if (srcFormat == ImageFormat::BGR) {
LOG(INFO) << "srcFormat: BGR";
}
if (srcFormat == ImageFormat::RGBA) {
LOG(INFO) << "srcFormat: RGBA";
}
if (srcFormat == ImageFormat::RGB) {
LOG(INFO) << "srcFormat: RGB";
}
LOG(INFO) << " output tensor size, num=" << 1 << ", channel=" << 1
<< ", height=" << dsth << ", width=" << dstw
<< ", dstFormat= " << (ImageFormat)dstFormat;
if (dstFormat == ImageFormat::NV21) {
LOG(INFO) << "dstFormat: NV21";
}
if (dstFormat == ImageFormat::NV12) {
LOG(INFO) << "dstFormat: NV12";
}
if (dstFormat == ImageFormat::GRAY) {
LOG(INFO) << "dstFormat: GRAY";
}
if (dstFormat == ImageFormat::BGRA) {
LOG(INFO) << "dstFormat: BGRA";
}
if (dstFormat == ImageFormat::BGR) {
LOG(INFO) << "dstFormat: BGR";
}
if (dstFormat == ImageFormat::RGBA) {
LOG(INFO) << "dstFormat: RGBA";
}
if (dstFormat == ImageFormat::RGB) {
LOG(INFO) << "dstFormat: RGB";
}
LOG(INFO) << "Rotate = " << rotate_num;
if (flip_num == -1) {
LOG(INFO) << "Flip XY";
} else if (flip_num == 0) {
LOG(INFO) << "Flip X";
} else if (flip_num == 1) {
LOG(INFO) << "Flip Y";
}
if (layout == 1) {
LOG(INFO) << "Layout NCHW";
} else if (layout == 3) {
LOG(INFO) << "Layout NHWC";
}
}
#if 0
TEST(TestImageConvertRand, test_func_image_convert_preprocess) {
if (FLAGS_basic_test) {
for (auto w : {1, 4, 8, 16, 112, 224, 1092}) {
for (auto h : {1, 4, 16, 112, 224}) {
for (auto rotate : {180}) {
for (auto flip : {0}) {
for (auto srcFormat : {12}) {
for (auto dstFormat : {0, 1, 2, 3}) {
for (auto layout : {1}) {
// RGBA = 0, BGRA, RGB, BGR, GRAY, NV21 = 11, NV12
if ((srcFormat == ImageFormat::RGB ||
srcFormat == ImageFormat::BGR) &&
(dstFormat == ImageFormat::RGBA ||
dstFormat == ImageFormat::BGRA)) {
                    continue;  // anakin does not support this conversion
}
print_info((ImageFormat)srcFormat,
(ImageFormat)dstFormat,
w,
h,
w,
h,
rotate,
flip,
layout);
test_convert({FLAGS_cluster},
{1},
w,
h,
w,
h,
(ImageFormat)srcFormat,
(ImageFormat)dstFormat,
rotate,
(FlipParam)flip,
(LayoutType)layout,
FLAGS_repeats);
}
}
}
}
}
}
}
}
}
#endif
#if 0
TEST(TestImageResizeRand, test_func_image_resize_preprocess) {
if (FLAGS_basic_test) {
for (auto w : {8, 16, 112, 224, 1092}) {
for (auto h : {4, 16, 112, 224}) {
for (auto ww : {8, 32, 112}) {
for (auto hh : {8, 112}) {
for (auto rotate : {180}) {
for (auto flip : {0}) {
for (auto srcFormat : {0, 1, 2, 3, 11, 12}) {
for (auto layout : {1}) {
auto dstFormat = srcFormat;
print_info((ImageFormat)srcFormat,
(ImageFormat)dstFormat,
w,
h,
ww,
hh,
rotate,
flip,
layout);
test_resize({FLAGS_cluster},
{1},
w,
h,
ww,
hh,
(ImageFormat)srcFormat,
(ImageFormat)dstFormat,
rotate,
(FlipParam)flip,
(LayoutType)layout,
FLAGS_repeats);
}
}
}
}
}
}
}
}
}
}
#endif
#if 1
TEST(TestImageFlipRand, test_func_image_flip_preprocess) {
if (FLAGS_basic_test) {
for (auto w : {1, 8, 16, 112, 224, 1092}) {
for (auto h : {1, 16, 112, 224}) {
for (auto rotate : {90}) {
for (auto flip : {-1, 0, 1}) {
for (auto srcFormat : {0, 1, 2, 3}) {
for (auto layout : {1}) {
auto dstFormat = srcFormat;
print_info((ImageFormat)srcFormat,
(ImageFormat)dstFormat,
w,
h,
w,
h,
rotate,
flip,
layout);
test_flip({FLAGS_cluster},
{1},
w,
h,
w,
h,
(ImageFormat)srcFormat,
(ImageFormat)dstFormat,
rotate,
(FlipParam)flip,
(LayoutType)layout,
FLAGS_repeats);
}
}
}
}
}
}
}
}
#endif
#if 1
TEST(TestImageRotateRand, test_func_image_rotate_preprocess) {
if (FLAGS_basic_test) {
for (auto w : {1, 8, 16, 112, 224, 1092}) {
for (auto h : {1, 16, 112, 224}) {
for (auto rotate : {90, 180, 270}) {
for (auto flip : {0}) {
for (auto srcFormat : {0, 1, 2, 3}) {
for (auto layout : {1}) {
auto dstFormat = srcFormat;
print_info((ImageFormat)srcFormat,
(ImageFormat)dstFormat,
w,
h,
w,
h,
rotate,
flip,
layout);
test_rotate({FLAGS_cluster},
{1},
w,
h,
w,
h,
(ImageFormat)srcFormat,
(ImageFormat)dstFormat,
rotate,
(FlipParam)flip,
(LayoutType)layout,
FLAGS_repeats);
}
}
}
}
}
}
}
}
#endif
#if 1
TEST(TestImageToTensorRand, test_func_image_to_tensor_preprocess) {
if (FLAGS_basic_test) {
for (auto w : {1, 8, 16, 112, 224, 1092}) {
for (auto h : {1, 16, 112, 224}) {
for (auto rotate : {90}) {
for (auto flip : {0}) {
for (auto srcFormat : {0, 1, 2, 3}) {
for (auto layout : {1}) {
auto dstFormat = srcFormat;
print_info((ImageFormat)srcFormat,
(ImageFormat)dstFormat,
w,
h,
w,
h,
rotate,
flip,
layout);
test_to_tensor({FLAGS_cluster},
{1},
w,
h,
w,
h,
(ImageFormat)srcFormat,
(ImageFormat)dstFormat,
rotate,
(FlipParam)flip,
(LayoutType)layout,
FLAGS_repeats);
}
}
}
}
}
}
}
}
#endif
#if 1
TEST(TestImageConvertCustom, test_func_image_preprocess_custom) {
LOG(INFO) << "print info";
print_info((ImageFormat)FLAGS_srcFormat,
(ImageFormat)FLAGS_dstFormat,
FLAGS_srcw,
FLAGS_srch,
FLAGS_dstw,
FLAGS_dsth,
FLAGS_angle,
FLAGS_flip_num,
FLAGS_layout);
test_convert({FLAGS_cluster},
{1},
FLAGS_srcw,
FLAGS_srch,
FLAGS_dstw,
FLAGS_dsth,
(ImageFormat)FLAGS_srcFormat,
(ImageFormat)FLAGS_dstFormat,
FLAGS_angle,
(FlipParam)FLAGS_flip_num,
(LayoutType)FLAGS_layout,
FLAGS_repeats);
test_resize({FLAGS_cluster},
{1},
FLAGS_srcw,
FLAGS_srch,
FLAGS_dstw,
FLAGS_dsth,
(ImageFormat)FLAGS_dstFormat,
(ImageFormat)FLAGS_dstFormat,
FLAGS_angle,
(FlipParam)FLAGS_flip_num,
(LayoutType)FLAGS_layout,
FLAGS_repeats);
test_flip({FLAGS_cluster},
{1},
FLAGS_srcw,
FLAGS_srch,
FLAGS_dstw,
FLAGS_dsth,
(ImageFormat)FLAGS_dstFormat,
(ImageFormat)FLAGS_dstFormat,
FLAGS_angle,
(FlipParam)FLAGS_flip_num,
(LayoutType)FLAGS_layout,
FLAGS_repeats);
test_rotate({FLAGS_cluster},
{1},
FLAGS_srcw,
FLAGS_srch,
FLAGS_dstw,
FLAGS_dsth,
(ImageFormat)FLAGS_dstFormat,
(ImageFormat)FLAGS_dstFormat,
FLAGS_angle,
(FlipParam)FLAGS_flip_num,
(LayoutType)FLAGS_layout,
FLAGS_repeats);
test_to_tensor({FLAGS_cluster},
{1},
FLAGS_srcw,
FLAGS_srch,
FLAGS_dstw,
FLAGS_dsth,
(ImageFormat)FLAGS_dstFormat,
(ImageFormat)FLAGS_dstFormat,
FLAGS_angle,
(FlipParam)FLAGS_flip_num,
(LayoutType)FLAGS_layout,
FLAGS_repeats);
}
#endif
#endif
...@@ -131,7 +131,7 @@ void ImageConvert::choose(const uint8_t* src, ...@@ -131,7 +131,7 @@ void ImageConvert::choose(const uint8_t* src,
impl_(src, dst, srcw, srch); impl_(src, dst, srcw, srch);
} }
/* /*
nv21(yvu) to BGR: store hwc dsth * dstw = srch * (srcw) nv12(yuv) to BGR: store hwc dsth * dstw = srch * (srcw)
y_w = srcw, y_h = srch uv_w = srcw uv_h = 1/2 * srch y_w = srcw, y_h = srch uv_w = srcw uv_h = 1/2 * srch
R = Y + 1.402*(V-128); R = Y + 1.402*(V-128);
G = Y - 0.34414*(U-128) - 0.71414*(V-128); G = Y - 0.34414*(U-128) - 0.71414*(V-128);
...@@ -141,16 +141,8 @@ ra = 1.402 *128 = 179.456 = 179 ...@@ -141,16 +141,8 @@ ra = 1.402 *128 = 179.456 = 179
ga = 0.34414 * 128 = 44.04992 = 44 ga = 0.34414 * 128 = 44.04992 = 44
gb = 0.71414 * 128 = 91.40992 = 91 gb = 0.71414 * 128 = 91.40992 = 91
ba = 1.772 * 128 = 226.816 = 227 ba = 1.772 * 128 = 226.816 = 227
nv12bgr, nv21tobgr
*/ */
void nv_to_bgr(const uint8_t* src, inline void nv12_to_bgr(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
uint8_t* dst,
int srcw,
int srch,
int x_num,
int y_num) {
// nv21 x = 0, y = 1
// nv12 x = 1, y = 0
int y_h = srch; int y_h = srch;
int wout = srcw * 3; int wout = srcw * 3;
const uint8_t* y = src; const uint8_t* y = src;
...@@ -181,6 +173,698 @@ void nv_to_bgr(const uint8_t* src, ...@@ -181,6 +173,698 @@ void nv_to_bgr(const uint8_t* src,
ptr_bgr2 = writebuf; ptr_bgr2 = writebuf;
} }
int j = 0; int j = 0;
#ifdef __aarch64__
asm volatile(
"prfm pldl1keep, [%[ptr_y1]] \n"
"prfm pldl1keep, [%[ptr_y1], #64] \n"
"prfm pldl1keep, [%[ptr_y2]] \n"
"prfm pldl1keep, [%[ptr_y2], #64] \n"
"prfm pldl1keep, [%[ptr_vu]] \n"
"prfm pldl1keep, [%[ptr_vu], #64] \n"
:
: [ptr_y1] "r"(ptr_y1), [ptr_y2] "r"(ptr_y2), [ptr_vu] "r"(ptr_vu)
: "memory");
#else
asm volatile(
"pld [%[ptr_y1]] @ preload a, 64byte\n"
"pld [%[ptr_y1], #128] @ preload a, 64byte\n"
"pld [%[ptr_y2]] @ preload a, 64byte\n"
"pld [%[ptr_y2], #128] @ preload a, 64byte\n"
"pld [%[ptr_vu]] @ preload a, 64byte\n"
"pld [%[ptr_vu], #128] @ preload a, 64byte\n"
:
: [ptr_y1] "r"(ptr_y1), [ptr_y2] "r"(ptr_y2), [ptr_vu] "r"(ptr_vu)
: "memory");
#endif
for (; j < srcw - 15; j += 16) {
uint8x8x2_t y1 = vld2_u8(ptr_y1); // d8 = y0y2y4y6...y14 d9 =
// y1y3y5...y15
uint8x8x2_t vu =
vld2_u8(ptr_vu); // d0 = v0v1v2v3v4v5...v7 d1 = u0u1u2...u7
uint8x8x2_t y2 = vld2_u8(ptr_y2);
uint16x8_t v = vmovl_u8(vu.val[1]);
uint16x8_t u = vmovl_u8(vu.val[0]);
int16x8_t v_s = vreinterpretq_s16_u16(v);
int16x8_t u_s = vreinterpretq_s16_u16(u);
int16x8_t v_bias = vsubq_s16(v_s, bias);
int16x8_t u_bias = vsubq_s16(u_s, bias);
// G = Y - 0.34414*(U-128) - 0.71414*(V-128);
int16x8_t g0 = vmulq_s16(ga, u_bias);
// R = Y + 1.402*(V-128);
int16x8_t r0 = vmulq_s16(ra, v_bias);
// B = Y + 1.772*(U-128);
int16x8_t b0 = vmulq_s16(ba, u_bias);
g0 = vmlaq_s16(g0, gb, v_bias);
int16x8_t y1_0_8 = vreinterpretq_s16_u16(vmovl_u8(y1.val[0]));
int16x8_t y1_1_8 = vreinterpretq_s16_u16(vmovl_u8(y1.val[1]));
int16x8_t y2_0_8 = vreinterpretq_s16_u16(vmovl_u8(y2.val[0]));
int16x8_t y2_1_8 = vreinterpretq_s16_u16(vmovl_u8(y2.val[1]));
int16x8_t r0_bias = vshrq_n_s16(r0, 7); // r0 / 128
int16x8_t b0_bias = vshrq_n_s16(b0, 7);
int16x8_t g0_bias = vshrq_n_s16(g0, 7);
int16x8_t r0_1 = vaddq_s16(y1_0_8, r0_bias);
int16x8_t b0_1 = vaddq_s16(y1_0_8, b0_bias);
int16x8_t g0_1 = vsubq_s16(y1_0_8, g0_bias); // g0_1 = y1_0_8 - g0_1
int16x8_t r0_2 = vaddq_s16(y1_1_8, r0_bias);
int16x8_t b0_2 = vaddq_s16(y1_1_8, b0_bias);
int16x8_t g0_2 = vsubq_s16(y1_1_8, g0_bias);
r0_1 = vmaxq_s16(r0_1, zero);
b0_1 = vmaxq_s16(b0_1, zero);
g0_1 = vmaxq_s16(g0_1, zero);
r0_2 = vmaxq_s16(r0_2, zero);
b0_2 = vmaxq_s16(b0_2, zero);
g0_2 = vmaxq_s16(g0_2, zero);
r0_1 = vminq_s16(r0_1, max);
b0_1 = vminq_s16(b0_1, max);
g0_1 = vminq_s16(g0_1, max);
r0_2 = vminq_s16(r0_2, max);
b0_2 = vminq_s16(b0_2, max);
g0_2 = vminq_s16(g0_2, max);
uint8x8_t r00 = vreinterpret_u8_s8(vmovn_s16(r0_1));
uint8x8_t b00 = vreinterpret_u8_s8(vmovn_s16(b0_1));
uint8x8_t g00 = vreinterpret_u8_s8(vmovn_s16(g0_1));
uint8x8_t r01 = vreinterpret_u8_s8(vmovn_s16(r0_2));
uint8x8_t b01 = vreinterpret_u8_s8(vmovn_s16(b0_2));
uint8x8_t g01 = vreinterpret_u8_s8(vmovn_s16(g0_2));
int16x8_t r1_1 = vaddq_s16(y2_0_8, r0_bias);
int16x8_t b1_1 = vaddq_s16(y2_0_8, b0_bias);
int16x8_t g1_1 = vsubq_s16(y2_0_8, g0_bias); // g0_1 = y1_0_8 - g0_1
int16x8_t r1_2 = vaddq_s16(y2_1_8, r0_bias);
int16x8_t b1_2 = vaddq_s16(y2_1_8, b0_bias);
int16x8_t g1_2 = vsubq_s16(y2_1_8, g0_bias);
uint8x8x2_t r00_0 = vtrn_u8(r00, r01); // 014589 236710
uint8x8x2_t b00_0 = vtrn_u8(b00, b01);
uint8x8x2_t g00_0 = vtrn_u8(g00, g01);
r1_1 = vmaxq_s16(r1_1, zero);
b1_1 = vmaxq_s16(b1_1, zero);
g1_1 = vmaxq_s16(g1_1, zero);
r1_2 = vmaxq_s16(r1_2, zero);
b1_2 = vmaxq_s16(b1_2, zero);
g1_2 = vmaxq_s16(g1_2, zero);
uint16x4_t r0_16 = vreinterpret_u16_u8(r00_0.val[0]);
uint16x4_t r1_16 = vreinterpret_u16_u8(r00_0.val[1]);
uint16x4_t b0_16 = vreinterpret_u16_u8(b00_0.val[0]);
uint16x4_t b1_16 = vreinterpret_u16_u8(b00_0.val[1]);
uint16x4_t g0_16 = vreinterpret_u16_u8(g00_0.val[0]);
uint16x4_t g1_16 = vreinterpret_u16_u8(g00_0.val[1]);
uint16x4x2_t r00_1 = vtrn_u16(r0_16, r1_16); // 012389 456710
uint16x4x2_t b00_1 = vtrn_u16(b0_16, b1_16);
uint16x4x2_t g00_1 = vtrn_u16(g0_16, g1_16);
r1_1 = vminq_s16(r1_1, max);
b1_1 = vminq_s16(b1_1, max);
g1_1 = vminq_s16(g1_1, max);
r1_2 = vminq_s16(r1_2, max);
b1_2 = vminq_s16(b1_2, max);
g1_2 = vminq_s16(g1_2, max);
uint32x2_t r0_32 = vreinterpret_u32_u16(r00_1.val[0]);
uint32x2_t r1_32 = vreinterpret_u32_u16(r00_1.val[1]);
uint32x2_t b0_32 = vreinterpret_u32_u16(b00_1.val[0]);
uint32x2_t b1_32 = vreinterpret_u32_u16(b00_1.val[1]);
uint32x2_t g0_32 = vreinterpret_u32_u16(g00_1.val[0]);
uint32x2_t g1_32 = vreinterpret_u32_u16(g00_1.val[1]);
uint32x2x2_t r00_2 = vtrn_u32(r0_32, r1_32); // 01234567 8910
uint32x2x2_t b00_2 = vtrn_u32(b0_32, b1_32);
uint32x2x2_t g00_2 = vtrn_u32(g0_32, g1_32);
r00 = vreinterpret_u8_s8(vmovn_s16(r1_1));
b00 = vreinterpret_u8_s8(vmovn_s16(b1_1));
g00 = vreinterpret_u8_s8(vmovn_s16(g1_1));
r01 = vreinterpret_u8_s8(vmovn_s16(r1_2));
b01 = vreinterpret_u8_s8(vmovn_s16(b1_2));
g01 = vreinterpret_u8_s8(vmovn_s16(g1_2));
uint8x8_t r0_8 = vreinterpret_u8_u32(r00_2.val[0]);
uint8x8_t b0_8 = vreinterpret_u8_u32(b00_2.val[0]);
uint8x8_t g0_8 = vreinterpret_u8_u32(g00_2.val[0]);
uint8x8_t r1_8 = vreinterpret_u8_u32(r00_2.val[1]);
uint8x8_t b1_8 = vreinterpret_u8_u32(b00_2.val[1]);
uint8x8_t g1_8 = vreinterpret_u8_u32(g00_2.val[1]);
uint8x8x3_t v_bgr;
v_bgr.val[0] = b0_8;
v_bgr.val[1] = g0_8;
v_bgr.val[2] = r0_8;
r00_0 = vtrn_u8(r00, r01); // 014589 236710
b00_0 = vtrn_u8(b00, b01);
g00_0 = vtrn_u8(g00, g01);
vst3_u8(ptr_bgr1, v_bgr);
r0_16 = vreinterpret_u16_u8(r00_0.val[0]);
r1_16 = vreinterpret_u16_u8(r00_0.val[1]);
b0_16 = vreinterpret_u16_u8(b00_0.val[0]);
b1_16 = vreinterpret_u16_u8(b00_0.val[1]);
g0_16 = vreinterpret_u16_u8(g00_0.val[0]);
g1_16 = vreinterpret_u16_u8(g00_0.val[1]);
ptr_bgr1 += 24;
uint8x8x3_t v_bgr1;
v_bgr1.val[0] = b1_8;
v_bgr1.val[1] = g1_8;
v_bgr1.val[2] = r1_8;
r00_1 = vtrn_u16(r0_16, r1_16); // 012389 456710
b00_1 = vtrn_u16(b0_16, b1_16);
g00_1 = vtrn_u16(g0_16, g1_16);
vst3_u8(ptr_bgr1, v_bgr1);
r0_32 = vreinterpret_u32_u16(r00_1.val[0]);
r1_32 = vreinterpret_u32_u16(r00_1.val[1]);
b0_32 = vreinterpret_u32_u16(b00_1.val[0]);
b1_32 = vreinterpret_u32_u16(b00_1.val[1]);
g0_32 = vreinterpret_u32_u16(g00_1.val[0]);
g1_32 = vreinterpret_u32_u16(g00_1.val[1]);
ptr_bgr1 += 24;
r00_2 = vtrn_u32(r0_32, r1_32); // 01234567 8910
b00_2 = vtrn_u32(b0_32, b1_32);
g00_2 = vtrn_u32(g0_32, g1_32);
ptr_vu += 16;
ptr_y1 += 16;
ptr_y2 += 16;
r0_8 = vreinterpret_u8_u32(r00_2.val[0]);
b0_8 = vreinterpret_u8_u32(b00_2.val[0]);
g0_8 = vreinterpret_u8_u32(g00_2.val[0]);
r1_8 = vreinterpret_u8_u32(r00_2.val[1]);
b1_8 = vreinterpret_u8_u32(b00_2.val[1]);
g1_8 = vreinterpret_u8_u32(g00_2.val[1]);
v_bgr.val[0] = b0_8;
v_bgr.val[1] = g0_8;
v_bgr.val[2] = r0_8;
v_bgr1.val[0] = b1_8;
v_bgr1.val[1] = g1_8;
v_bgr1.val[2] = r1_8;
vst3_u8(ptr_bgr2, v_bgr);
vst3_u8(ptr_bgr2 + 24, v_bgr1);
ptr_bgr2 += 48;
}
// two data
for (; j < srcw; j += 2) {
uint8_t _y0 = ptr_y1[0];
uint8_t _y1 = ptr_y1[1];
uint8_t _v = ptr_vu[1];
uint8_t _u = ptr_vu[0];
uint8_t _y0_1 = ptr_y2[0];
uint8_t _y1_1 = ptr_y2[1];
int ra = floor((179 * (_v - 128)) >> 7);
int ga = floor((44 * (_u - 128) + 91 * (_v - 128)) >> 7);
int ba = floor((227 * (_u - 128)) >> 7);
int r = _y0 + ra;
int g = _y0 - ga;
int b = _y0 + ba;
int r1 = _y1 + ra;
int g1 = _y1 - ga;
int b1 = _y1 + ba;
r = r < 0 ? 0 : (r > 255) ? 255 : r;
g = g < 0 ? 0 : (g > 255) ? 255 : g;
b = b < 0 ? 0 : (b > 255) ? 255 : b;
r1 = r1 < 0 ? 0 : (r1 > 255) ? 255 : r1;
g1 = g1 < 0 ? 0 : (g1 > 255) ? 255 : g1;
b1 = b1 < 0 ? 0 : (b1 > 255) ? 255 : b1;
*ptr_bgr1++ = b;
*ptr_bgr1++ = g;
*ptr_bgr1++ = r;
int r2 = _y0_1 + ra;
int g2 = _y0_1 - ga;
int b2 = _y0_1 + ba;
int r3 = _y1_1 + ra;
int g3 = _y1_1 - ga;
int b3 = _y1_1 + ba;
r2 = r2 < 0 ? 0 : (r2 > 255) ? 255 : r2;
g2 = g2 < 0 ? 0 : (g2 > 255) ? 255 : g2;
b2 = b2 < 0 ? 0 : (b2 > 255) ? 255 : b2;
r3 = r3 < 0 ? 0 : (r3 > 255) ? 255 : r3;
g3 = g3 < 0 ? 0 : (g3 > 255) ? 255 : g3;
b3 = b3 < 0 ? 0 : (b3 > 255) ? 255 : b3;
*ptr_bgr1++ = b1;
*ptr_bgr1++ = g1;
*ptr_bgr1++ = r1;
*ptr_bgr2++ = b2;
*ptr_bgr2++ = g2;
*ptr_bgr2++ = r2;
ptr_y1 += 2;
ptr_y2 += 2;
ptr_vu += 2;
*ptr_bgr2++ = b3;
*ptr_bgr2++ = g3;
*ptr_bgr2++ = r3;
}
}
delete[] zerobuf;
delete[] writebuf;
}
/*
nv21(yvu) to BGR: store hwc dsth * dstw = srch * (srcw)
*/
inline void nv21_to_bgr(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
int y_h = srch;
int wout = srcw * 3;
const uint8_t* y = src;
const uint8_t* vu = src + y_h * srcw;
int16x8_t bias = vdupq_n_s16(128);
int16x8_t ga = vdupq_n_s16(44);
int16x8_t ra = vdupq_n_s16(179);
int16x8_t ba = vdupq_n_s16(227);
int16x8_t gb = vdupq_n_s16(91);
int16x8_t zero = vdupq_n_s16(0);
int16x8_t max = vdupq_n_s16(255);
uint8_t* zerobuf = new uint8_t[srcw];
uint8_t* writebuf = new uint8_t[wout];
memset(zerobuf, 0, sizeof(uint8_t) * srcw);
int i = 0;
#pragma omp parallel for
for (i = 0; i < y_h; i += 2) {
const uint8_t* ptr_y1 = y + i * srcw;
const uint8_t* ptr_y2 = ptr_y1 + srcw;
const uint8_t* ptr_vu = vu + (i / 2) * srcw;
uint8_t* ptr_bgr1 = dst + i * wout;
uint8_t* ptr_bgr2 = ptr_bgr1 + wout;
if (i + 2 > y_h) {
ptr_y2 = zerobuf;
ptr_bgr2 = writebuf;
}
int j = 0;
#ifdef __aarch64__
asm volatile(
"prfm pldl1keep, [%[ptr_y1]] \n"
"prfm pldl1keep, [%[ptr_y1], #64] \n"
"prfm pldl1keep, [%[ptr_y2]] \n"
"prfm pldl1keep, [%[ptr_y2], #64] \n"
"prfm pldl1keep, [%[ptr_vu]] \n"
"prfm pldl1keep, [%[ptr_vu], #64] \n"
:
: [ptr_y1] "r"(ptr_y1), [ptr_y2] "r"(ptr_y2), [ptr_vu] "r"(ptr_vu)
: "memory");
#else
asm volatile(
"pld [%[ptr_y1]] @ preload a, 64byte\n"
"pld [%[ptr_y1], #128] @ preload a, 64byte\n"
"pld [%[ptr_y2]] @ preload a, 64byte\n"
"pld [%[ptr_y2], #128] @ preload a, 64byte\n"
"pld [%[ptr_vu]] @ preload a, 64byte\n"
"pld [%[ptr_vu], #128] @ preload a, 64byte\n"
:
: [ptr_y1] "r"(ptr_y1), [ptr_y2] "r"(ptr_y2), [ptr_vu] "r"(ptr_vu)
: "memory");
#endif
for (; j < srcw - 15; j += 16) {
uint8x8x2_t y1 = vld2_u8(ptr_y1); // d8 = y0y2y4y6...y14 d9 =
// y1y3y5...y15
uint8x8x2_t vu =
vld2_u8(ptr_vu); // d0 = v0v1v2v3v4v5...v7 d1 = u0u1u2...u7
uint8x8x2_t y2 = vld2_u8(ptr_y2);
uint16x8_t v = vmovl_u8(vu.val[0]);
uint16x8_t u = vmovl_u8(vu.val[1]);
int16x8_t v_s = vreinterpretq_s16_u16(v);
int16x8_t u_s = vreinterpretq_s16_u16(u);
int16x8_t v_bias = vsubq_s16(v_s, bias);
int16x8_t u_bias = vsubq_s16(u_s, bias);
// G = Y - 0.34414*(U-128) - 0.71414*(V-128);
int16x8_t g0 = vmulq_s16(ga, u_bias);
// R = Y + 1.402*(V-128);
int16x8_t r0 = vmulq_s16(ra, v_bias);
// B = Y + 1.772*(U-128);
int16x8_t b0 = vmulq_s16(ba, u_bias);
g0 = vmlaq_s16(g0, gb, v_bias);
int16x8_t y1_0_8 = vreinterpretq_s16_u16(vmovl_u8(y1.val[0]));
int16x8_t y1_1_8 = vreinterpretq_s16_u16(vmovl_u8(y1.val[1]));
int16x8_t y2_0_8 = vreinterpretq_s16_u16(vmovl_u8(y2.val[0]));
int16x8_t y2_1_8 = vreinterpretq_s16_u16(vmovl_u8(y2.val[1]));
int16x8_t r0_bias = vshrq_n_s16(r0, 7); // r0 / 128
int16x8_t b0_bias = vshrq_n_s16(b0, 7);
int16x8_t g0_bias = vshrq_n_s16(g0, 7);
int16x8_t r0_1 = vaddq_s16(y1_0_8, r0_bias);
int16x8_t b0_1 = vaddq_s16(y1_0_8, b0_bias);
int16x8_t g0_1 = vsubq_s16(y1_0_8, g0_bias); // g0_1 = y1_0_8 - g0_1
int16x8_t r0_2 = vaddq_s16(y1_1_8, r0_bias);
int16x8_t b0_2 = vaddq_s16(y1_1_8, b0_bias);
int16x8_t g0_2 = vsubq_s16(y1_1_8, g0_bias);
r0_1 = vmaxq_s16(r0_1, zero);
b0_1 = vmaxq_s16(b0_1, zero);
g0_1 = vmaxq_s16(g0_1, zero);
r0_2 = vmaxq_s16(r0_2, zero);
b0_2 = vmaxq_s16(b0_2, zero);
g0_2 = vmaxq_s16(g0_2, zero);
r0_1 = vminq_s16(r0_1, max);
b0_1 = vminq_s16(b0_1, max);
g0_1 = vminq_s16(g0_1, max);
r0_2 = vminq_s16(r0_2, max);
b0_2 = vminq_s16(b0_2, max);
g0_2 = vminq_s16(g0_2, max);
uint8x8_t r00 = vreinterpret_u8_s8(vmovn_s16(r0_1));
uint8x8_t b00 = vreinterpret_u8_s8(vmovn_s16(b0_1));
uint8x8_t g00 = vreinterpret_u8_s8(vmovn_s16(g0_1));
uint8x8_t r01 = vreinterpret_u8_s8(vmovn_s16(r0_2));
uint8x8_t b01 = vreinterpret_u8_s8(vmovn_s16(b0_2));
uint8x8_t g01 = vreinterpret_u8_s8(vmovn_s16(g0_2));
int16x8_t r1_1 = vaddq_s16(y2_0_8, r0_bias);
int16x8_t b1_1 = vaddq_s16(y2_0_8, b0_bias);
int16x8_t g1_1 = vsubq_s16(y2_0_8, g0_bias); // g0_1 = y1_0_8 - g0_1
int16x8_t r1_2 = vaddq_s16(y2_1_8, r0_bias);
int16x8_t b1_2 = vaddq_s16(y2_1_8, b0_bias);
int16x8_t g1_2 = vsubq_s16(y2_1_8, g0_bias);
uint8x8x2_t r00_0 = vtrn_u8(r00, r01); // 014589 236710
uint8x8x2_t b00_0 = vtrn_u8(b00, b01);
uint8x8x2_t g00_0 = vtrn_u8(g00, g01);
r1_1 = vmaxq_s16(r1_1, zero);
b1_1 = vmaxq_s16(b1_1, zero);
g1_1 = vmaxq_s16(g1_1, zero);
r1_2 = vmaxq_s16(r1_2, zero);
b1_2 = vmaxq_s16(b1_2, zero);
g1_2 = vmaxq_s16(g1_2, zero);
uint16x4_t r0_16 = vreinterpret_u16_u8(r00_0.val[0]);
uint16x4_t r1_16 = vreinterpret_u16_u8(r00_0.val[1]);
uint16x4_t b0_16 = vreinterpret_u16_u8(b00_0.val[0]);
uint16x4_t b1_16 = vreinterpret_u16_u8(b00_0.val[1]);
uint16x4_t g0_16 = vreinterpret_u16_u8(g00_0.val[0]);
uint16x4_t g1_16 = vreinterpret_u16_u8(g00_0.val[1]);
uint16x4x2_t r00_1 = vtrn_u16(r0_16, r1_16); // 012389 456710
uint16x4x2_t b00_1 = vtrn_u16(b0_16, b1_16);
uint16x4x2_t g00_1 = vtrn_u16(g0_16, g1_16);
r1_1 = vminq_s16(r1_1, max);
b1_1 = vminq_s16(b1_1, max);
g1_1 = vminq_s16(g1_1, max);
r1_2 = vminq_s16(r1_2, max);
b1_2 = vminq_s16(b1_2, max);
g1_2 = vminq_s16(g1_2, max);
uint32x2_t r0_32 = vreinterpret_u32_u16(r00_1.val[0]);
uint32x2_t r1_32 = vreinterpret_u32_u16(r00_1.val[1]);
uint32x2_t b0_32 = vreinterpret_u32_u16(b00_1.val[0]);
uint32x2_t b1_32 = vreinterpret_u32_u16(b00_1.val[1]);
uint32x2_t g0_32 = vreinterpret_u32_u16(g00_1.val[0]);
uint32x2_t g1_32 = vreinterpret_u32_u16(g00_1.val[1]);
uint32x2x2_t r00_2 = vtrn_u32(r0_32, r1_32); // 01234567 8910
uint32x2x2_t b00_2 = vtrn_u32(b0_32, b1_32);
uint32x2x2_t g00_2 = vtrn_u32(g0_32, g1_32);
r00 = vreinterpret_u8_s8(vmovn_s16(r1_1));
b00 = vreinterpret_u8_s8(vmovn_s16(b1_1));
g00 = vreinterpret_u8_s8(vmovn_s16(g1_1));
r01 = vreinterpret_u8_s8(vmovn_s16(r1_2));
b01 = vreinterpret_u8_s8(vmovn_s16(b1_2));
g01 = vreinterpret_u8_s8(vmovn_s16(g1_2));
uint8x8_t r0_8 = vreinterpret_u8_u32(r00_2.val[0]);
uint8x8_t b0_8 = vreinterpret_u8_u32(b00_2.val[0]);
uint8x8_t g0_8 = vreinterpret_u8_u32(g00_2.val[0]);
uint8x8_t r1_8 = vreinterpret_u8_u32(r00_2.val[1]);
uint8x8_t b1_8 = vreinterpret_u8_u32(b00_2.val[1]);
uint8x8_t g1_8 = vreinterpret_u8_u32(g00_2.val[1]);
uint8x8x3_t v_bgr;
v_bgr.val[0] = b0_8;
v_bgr.val[1] = g0_8;
v_bgr.val[2] = r0_8;
r00_0 = vtrn_u8(r00, r01); // 014589 236710
b00_0 = vtrn_u8(b00, b01);
g00_0 = vtrn_u8(g00, g01);
vst3_u8(ptr_bgr1, v_bgr);
r0_16 = vreinterpret_u16_u8(r00_0.val[0]);
r1_16 = vreinterpret_u16_u8(r00_0.val[1]);
b0_16 = vreinterpret_u16_u8(b00_0.val[0]);
b1_16 = vreinterpret_u16_u8(b00_0.val[1]);
g0_16 = vreinterpret_u16_u8(g00_0.val[0]);
g1_16 = vreinterpret_u16_u8(g00_0.val[1]);
ptr_bgr1 += 24;
uint8x8x3_t v_bgr1;
v_bgr1.val[0] = b1_8;
v_bgr1.val[1] = g1_8;
v_bgr1.val[2] = r1_8;
r00_1 = vtrn_u16(r0_16, r1_16); // 012389 456710
b00_1 = vtrn_u16(b0_16, b1_16);
g00_1 = vtrn_u16(g0_16, g1_16);
vst3_u8(ptr_bgr1, v_bgr1);
r0_32 = vreinterpret_u32_u16(r00_1.val[0]);
r1_32 = vreinterpret_u32_u16(r00_1.val[1]);
b0_32 = vreinterpret_u32_u16(b00_1.val[0]);
b1_32 = vreinterpret_u32_u16(b00_1.val[1]);
g0_32 = vreinterpret_u32_u16(g00_1.val[0]);
g1_32 = vreinterpret_u32_u16(g00_1.val[1]);
ptr_bgr1 += 24;
r00_2 = vtrn_u32(r0_32, r1_32); // 01234567 8910
b00_2 = vtrn_u32(b0_32, b1_32);
g00_2 = vtrn_u32(g0_32, g1_32);
ptr_vu += 16;
ptr_y1 += 16;
ptr_y2 += 16;
r0_8 = vreinterpret_u8_u32(r00_2.val[0]);
b0_8 = vreinterpret_u8_u32(b00_2.val[0]);
g0_8 = vreinterpret_u8_u32(g00_2.val[0]);
r1_8 = vreinterpret_u8_u32(r00_2.val[1]);
b1_8 = vreinterpret_u8_u32(b00_2.val[1]);
g1_8 = vreinterpret_u8_u32(g00_2.val[1]);
v_bgr.val[0] = b0_8;
v_bgr.val[1] = g0_8;
v_bgr.val[2] = r0_8;
v_bgr1.val[0] = b1_8;
v_bgr1.val[1] = g1_8;
v_bgr1.val[2] = r1_8;
vst3_u8(ptr_bgr2, v_bgr);
vst3_u8(ptr_bgr2 + 24, v_bgr1);
ptr_bgr2 += 48;
}
// two data
for (; j < srcw; j += 2) {
uint8_t _y0 = ptr_y1[0];
uint8_t _y1 = ptr_y1[1];
uint8_t _v = ptr_vu[0];
uint8_t _u = ptr_vu[1];
uint8_t _y0_1 = ptr_y2[0];
uint8_t _y1_1 = ptr_y2[1];
int ra = floor((179 * (_v - 128)) >> 7);
int ga = floor((44 * (_u - 128) + 91 * (_v - 128)) >> 7);
int ba = floor((227 * (_u - 128)) >> 7);
int r = _y0 + ra;
int g = _y0 - ga;
int b = _y0 + ba;
int r1 = _y1 + ra;
int g1 = _y1 - ga;
int b1 = _y1 + ba;
r = r < 0 ? 0 : (r > 255) ? 255 : r;
g = g < 0 ? 0 : (g > 255) ? 255 : g;
b = b < 0 ? 0 : (b > 255) ? 255 : b;
r1 = r1 < 0 ? 0 : (r1 > 255) ? 255 : r1;
g1 = g1 < 0 ? 0 : (g1 > 255) ? 255 : g1;
b1 = b1 < 0 ? 0 : (b1 > 255) ? 255 : b1;
*ptr_bgr1++ = b;
*ptr_bgr1++ = g;
*ptr_bgr1++ = r;
int r2 = _y0_1 + ra;
int g2 = _y0_1 - ga;
int b2 = _y0_1 + ba;
int r3 = _y1_1 + ra;
int g3 = _y1_1 - ga;
int b3 = _y1_1 + ba;
r2 = r2 < 0 ? 0 : (r2 > 255) ? 255 : r2;
g2 = g2 < 0 ? 0 : (g2 > 255) ? 255 : g2;
b2 = b2 < 0 ? 0 : (b2 > 255) ? 255 : b2;
r3 = r3 < 0 ? 0 : (r3 > 255) ? 255 : r3;
g3 = g3 < 0 ? 0 : (g3 > 255) ? 255 : g3;
b3 = b3 < 0 ? 0 : (b3 > 255) ? 255 : b3;
*ptr_bgr1++ = b1;
*ptr_bgr1++ = g1;
*ptr_bgr1++ = r1;
*ptr_bgr2++ = b2;
*ptr_bgr2++ = g2;
*ptr_bgr2++ = r2;
ptr_y1 += 2;
ptr_y2 += 2;
ptr_vu += 2;
*ptr_bgr2++ = b3;
*ptr_bgr2++ = g3;
*ptr_bgr2++ = r3;
}
}
delete[] zerobuf;
delete[] writebuf;
}
// nv12(yuv) to BGRA: store hwc dsth * dstw = srch * (srcw) y_w = srcw, y_h =
// srch uv_w = srcw uv_h = 1/2 * srch
inline void nv12_to_bgra(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
int y_h = srch;
  int vu_h = srch / 2;  // note: 1 / 2 * srch is integer division and yields 0
const uint8_t* y = src;
const uint8_t* vu = src + y_h * srcw;
int wout = srcw * 4;
uint8_t* zerobuf = new uint8_t[srcw];
uint8_t* writebuf = new uint8_t[wout];
memset(zerobuf, 0, sizeof(uint8_t) * srcw);
int16x8_t bias = vdupq_n_s16(128);
int16x8_t ga = vdupq_n_s16(44);
int16x8_t ra = vdupq_n_s16(179);
int16x8_t ba = vdupq_n_s16(227);
int16x8_t gb = vdupq_n_s16(91);
int16x8_t zero = vdupq_n_s16(0);
int16x8_t max = vdupq_n_s16(255);
uint8x8_t a_8 = vdup_n_u8(255);
#pragma omp parallel for
for (int i = 0; i < y_h; i += 2) {
const uint8_t* ptr_y1 = y + i * srcw;
const uint8_t* ptr_y2 = ptr_y1 + srcw;
const uint8_t* ptr_vu = vu + (i / 2) * srcw;
uint8_t* ptr_bgr1 = dst + i * wout;
uint8_t* ptr_bgr2 = ptr_bgr1 + wout;
if (i + 2 > y_h) {
ptr_y2 = zerobuf;
ptr_bgr2 = writebuf;
}
int j = 0;
#ifdef __aarch64__
asm volatile(
"prfm pldl1keep, [%[ptr_y1]] \n"
"prfm pldl1keep, [%[ptr_y1], #64] \n"
"prfm pldl1keep, [%[ptr_y2]] \n"
"prfm pldl1keep, [%[ptr_y2], #64] \n"
"prfm pldl1keep, [%[ptr_vu]] \n"
"prfm pldl1keep, [%[ptr_vu], #64] \n"
:
: [ptr_y1] "r"(ptr_y1), [ptr_y2] "r"(ptr_y2), [ptr_vu] "r"(ptr_vu)
: "memory");
#else
asm volatile(
"pld [%[ptr_y1]] @ preload a, 64byte\n"
"pld [%[ptr_y1], #128] @ preload a, 64byte\n"
"pld [%[ptr_y2]] @ preload a, 64byte\n"
"pld [%[ptr_y2], #128] @ preload a, 64byte\n"
"pld [%[ptr_vu]] @ preload a, 64byte\n"
"pld [%[ptr_vu], #128] @ preload a, 64byte\n"
:
: [ptr_y1] "r"(ptr_y1), [ptr_y2] "r"(ptr_y2), [ptr_vu] "r"(ptr_vu)
: "memory");
#endif
for (; j < srcw - 15; j += 16) { for (; j < srcw - 15; j += 16) {
uint8x8x2_t y1 = vld2_u8(ptr_y1); // d8 = y0y2y4y6...y14 d9 = uint8x8x2_t y1 = vld2_u8(ptr_y1); // d8 = y0y2y4y6...y14 d9 =
// y1y3y5...y15 // y1y3y5...y15
...@@ -189,8 +873,8 @@ void nv_to_bgr(const uint8_t* src, ...@@ -189,8 +873,8 @@ void nv_to_bgr(const uint8_t* src,
uint8x8x2_t y2 = vld2_u8(ptr_y2); uint8x8x2_t y2 = vld2_u8(ptr_y2);
uint16x8_t v = vmovl_u8(vu.val[x_num]); uint16x8_t v = vmovl_u8(vu.val[1]);
uint16x8_t u = vmovl_u8(vu.val[y_num]); uint16x8_t u = vmovl_u8(vu.val[0]);
int16x8_t v_s = vreinterpretq_s16_u16(v); int16x8_t v_s = vreinterpretq_s16_u16(v);
int16x8_t u_s = vreinterpretq_s16_u16(u); int16x8_t u_s = vreinterpretq_s16_u16(u);
int16x8_t v_bias = vsubq_s16(v_s, bias); int16x8_t v_bias = vsubq_s16(v_s, bias);
...@@ -317,16 +1001,17 @@ void nv_to_bgr(const uint8_t* src, ...@@ -317,16 +1001,17 @@ void nv_to_bgr(const uint8_t* src,
uint8x8_t b1_8 = vreinterpret_u8_u32(b00_2.val[1]); uint8x8_t b1_8 = vreinterpret_u8_u32(b00_2.val[1]);
uint8x8_t g1_8 = vreinterpret_u8_u32(g00_2.val[1]); uint8x8_t g1_8 = vreinterpret_u8_u32(g00_2.val[1]);
uint8x8x3_t v_bgr; uint8x8x4_t v_bgr;
v_bgr.val[0] = b0_8; v_bgr.val[0] = b0_8;
v_bgr.val[1] = g0_8; v_bgr.val[1] = g0_8;
v_bgr.val[2] = r0_8; v_bgr.val[2] = r0_8;
v_bgr.val[3] = a_8;
r00_0 = vtrn_u8(r00, r01); // 014589 236710 r00_0 = vtrn_u8(r00, r01); // 014589 236710
b00_0 = vtrn_u8(b00, b01); b00_0 = vtrn_u8(b00, b01);
g00_0 = vtrn_u8(g00, g01); g00_0 = vtrn_u8(g00, g01);
vst3_u8(ptr_bgr1, v_bgr); vst4_u8(ptr_bgr1, v_bgr);
r0_16 = vreinterpret_u16_u8(r00_0.val[0]); r0_16 = vreinterpret_u16_u8(r00_0.val[0]);
r1_16 = vreinterpret_u16_u8(r00_0.val[1]); r1_16 = vreinterpret_u16_u8(r00_0.val[1]);
...@@ -337,17 +1022,20 @@ void nv_to_bgr(const uint8_t* src, ...@@ -337,17 +1022,20 @@ void nv_to_bgr(const uint8_t* src,
g0_16 = vreinterpret_u16_u8(g00_0.val[0]); g0_16 = vreinterpret_u16_u8(g00_0.val[0]);
g1_16 = vreinterpret_u16_u8(g00_0.val[1]); g1_16 = vreinterpret_u16_u8(g00_0.val[1]);
ptr_bgr1 += 24; ptr_bgr1 += 32;
uint8x8x3_t v_bgr1; // uint8x8x3_t v_bgr1;
uint8x8x4_t v_bgr1;
v_bgr1.val[0] = b1_8; v_bgr1.val[0] = b1_8;
v_bgr1.val[1] = g1_8; v_bgr1.val[1] = g1_8;
v_bgr1.val[2] = r1_8; v_bgr1.val[2] = r1_8;
v_bgr1.val[3] = a_8;
r00_1 = vtrn_u16(r0_16, r1_16); // 012389 456710 r00_1 = vtrn_u16(r0_16, r1_16); // 012389 456710
b00_1 = vtrn_u16(b0_16, b1_16); b00_1 = vtrn_u16(b0_16, b1_16);
g00_1 = vtrn_u16(g0_16, g1_16); g00_1 = vtrn_u16(g0_16, g1_16);
vst3_u8(ptr_bgr1, v_bgr1); // vst3_u8(ptr_bgr1, v_bgr1);
vst4_u8(ptr_bgr1, v_bgr1);
r0_32 = vreinterpret_u32_u16(r00_1.val[0]); r0_32 = vreinterpret_u32_u16(r00_1.val[0]);
r1_32 = vreinterpret_u32_u16(r00_1.val[1]); r1_32 = vreinterpret_u32_u16(r00_1.val[1]);
...@@ -358,7 +1046,8 @@ void nv_to_bgr(const uint8_t* src, ...@@ -358,7 +1046,8 @@ void nv_to_bgr(const uint8_t* src,
g0_32 = vreinterpret_u32_u16(g00_1.val[0]); g0_32 = vreinterpret_u32_u16(g00_1.val[0]);
g1_32 = vreinterpret_u32_u16(g00_1.val[1]); g1_32 = vreinterpret_u32_u16(g00_1.val[1]);
ptr_bgr1 += 24; // ptr_bgr1 += 24;
ptr_bgr1 += 32;
r00_2 = vtrn_u32(r0_32, r1_32); // 01234567 8910 r00_2 = vtrn_u32(r0_32, r1_32); // 01234567 8910
b00_2 = vtrn_u32(b0_32, b1_32); b00_2 = vtrn_u32(b0_32, b1_32);
...@@ -384,17 +1073,17 @@ void nv_to_bgr(const uint8_t* src, ...@@ -384,17 +1073,17 @@ void nv_to_bgr(const uint8_t* src,
v_bgr1.val[1] = g1_8; v_bgr1.val[1] = g1_8;
v_bgr1.val[2] = r1_8; v_bgr1.val[2] = r1_8;
vst3_u8(ptr_bgr2, v_bgr); vst4_u8(ptr_bgr2, v_bgr);
vst3_u8(ptr_bgr2 + 24, v_bgr1); vst4_u8(ptr_bgr2 + 32, v_bgr1);
ptr_bgr2 += 48; ptr_bgr2 += 64;
} }
// two data // two data
for (; j < srcw; j += 2) { for (; j < srcw; j += 2) {
uint8_t _y0 = ptr_y1[0]; uint8_t _y0 = ptr_y1[0];
uint8_t _y1 = ptr_y1[1]; uint8_t _y1 = ptr_y1[1];
uint8_t _v = ptr_vu[x_num]; uint8_t _v = ptr_vu[1];
uint8_t _u = ptr_vu[y_num]; uint8_t _u = ptr_vu[0];
uint8_t _y0_1 = ptr_y2[0]; uint8_t _y0_1 = ptr_y2[0];
uint8_t _y1_1 = ptr_y2[1]; uint8_t _y1_1 = ptr_y2[1];
...@@ -421,6 +1110,7 @@ void nv_to_bgr(const uint8_t* src, ...@@ -421,6 +1110,7 @@ void nv_to_bgr(const uint8_t* src,
*ptr_bgr1++ = b; *ptr_bgr1++ = b;
*ptr_bgr1++ = g; *ptr_bgr1++ = g;
*ptr_bgr1++ = r; *ptr_bgr1++ = r;
*ptr_bgr1++ = 255;
int r2 = _y0_1 + ra; int r2 = _y0_1 + ra;
int g2 = _y0_1 - ga; int g2 = _y0_1 - ga;
...@@ -441,10 +1131,12 @@ void nv_to_bgr(const uint8_t* src, ...@@ -441,10 +1131,12 @@ void nv_to_bgr(const uint8_t* src,
*ptr_bgr1++ = b1; *ptr_bgr1++ = b1;
*ptr_bgr1++ = g1; *ptr_bgr1++ = g1;
*ptr_bgr1++ = r1; *ptr_bgr1++ = r1;
*ptr_bgr1++ = 255;
*ptr_bgr2++ = b2; *ptr_bgr2++ = b2;
*ptr_bgr2++ = g2; *ptr_bgr2++ = g2;
*ptr_bgr2++ = r2; *ptr_bgr2++ = r2;
*ptr_bgr2++ = 255;
ptr_y1 += 2; ptr_y1 += 2;
ptr_y2 += 2; ptr_y2 += 2;
...@@ -453,20 +1145,16 @@ void nv_to_bgr(const uint8_t* src, ...@@ -453,20 +1145,16 @@ void nv_to_bgr(const uint8_t* src,
*ptr_bgr2++ = b3; *ptr_bgr2++ = b3;
*ptr_bgr2++ = g3; *ptr_bgr2++ = g3;
*ptr_bgr2++ = r3; *ptr_bgr2++ = r3;
*ptr_bgr2++ = 255;
} }
} }
delete[] zerobuf; delete[] zerobuf;
delete[] writebuf; delete[] writebuf;
} }
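For the scalar tail of the NV-to-BGR path above, each output pixel is the luma value plus fixed-point offsets derived from (V-128) and (U-128), clamped to [0, 255] before being stored as B, G, R. The sketch below only illustrates that idea: the exact coefficients behind `ra`, `ga`, `ba` are not visible in this hunk, so the 179/44/91/227 values here are an assumed BT.601-style approximation, not the repository's constants.

```cpp
// Hedged sketch of a scalar YUV pixel -> BGR conversion (BT.601-style integer
// approximation). Coefficient values are illustrative assumptions.
#include <algorithm>
#include <cstdint>

inline uint8_t clamp_u8(int v) { return static_cast<uint8_t>(std::min(std::max(v, 0), 255)); }

inline void yuv_pixel_to_bgr(uint8_t y, uint8_t v, uint8_t u, uint8_t* bgr) {
  int ra = (179 * (v - 128)) >> 7;                  // ~1.402 * (V-128)
  int ga = (44 * (u - 128) + 91 * (v - 128)) >> 7;  // ~0.344*(U-128) + 0.714*(V-128)
  int ba = (227 * (u - 128)) >> 7;                  // ~1.772 * (U-128)
  bgr[0] = clamp_u8(y + ba);  // B
  bgr[1] = clamp_u8(y - ga);  // G
  bgr[2] = clamp_u8(y + ra);  // R
}
```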
// nv12bgra, nv21tobgra
void nv_to_bgra(const uint8_t* src, // nv21(yvu) to BGRA:store hwc dsth * dstw = srch * srcw y_w = srcw, y_h = srch
uint8_t* dst, // uv_w = srcw uv_h = 1/2 * srch
int srcw, inline void nv21_to_bgra(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
int srch,
int x_num,
int y_num) {
// nv21 x = 0, y = 1
// nv12 x = 1, y = 0
int y_h = srch; int y_h = srch;
int vu_h = 1 / 2 * srch; int vu_h = 1 / 2 * srch;
const uint8_t* y = src; const uint8_t* y = src;
...@@ -497,6 +1185,29 @@ void nv_to_bgra(const uint8_t* src, ...@@ -497,6 +1185,29 @@ void nv_to_bgra(const uint8_t* src,
ptr_bgr2 = writebuf; ptr_bgr2 = writebuf;
} }
int j = 0; int j = 0;
#ifdef __aarch64__
asm volatile(
"prfm pldl1keep, [%[ptr_y1]] \n"
"prfm pldl1keep, [%[ptr_y1], #64] \n"
"prfm pldl1keep, [%[ptr_y2]] \n"
"prfm pldl1keep, [%[ptr_y2], #64] \n"
"prfm pldl1keep, [%[ptr_vu]] \n"
"prfm pldl1keep, [%[ptr_vu], #64] \n"
:
: [ptr_y1] "r"(ptr_y1), [ptr_y2] "r"(ptr_y2), [ptr_vu] "r"(ptr_vu)
: "memory");
#else
asm volatile(
"pld [%[ptr_y1]] @ preload a, 64byte\n"
"pld [%[ptr_y1], #128] @ preload a, 64byte\n"
"pld [%[ptr_y2]] @ preload a, 64byte\n"
"pld [%[ptr_y2], #128] @ preload a, 64byte\n"
"pld [%[ptr_vu]] @ preload a, 64byte\n"
"pld [%[ptr_vu], #128] @ preload a, 64byte\n"
:
: [ptr_y1] "r"(ptr_y1), [ptr_y2] "r"(ptr_y2), [ptr_vu] "r"(ptr_vu)
: "memory");
#endif
for (; j < srcw - 15; j += 16) { for (; j < srcw - 15; j += 16) {
uint8x8x2_t y1 = vld2_u8(ptr_y1); // d8 = y0y2y4y6...y14 d9 = uint8x8x2_t y1 = vld2_u8(ptr_y1); // d8 = y0y2y4y6...y14 d9 =
// y1y3y5...y15 // y1y3y5...y15
...@@ -505,8 +1216,8 @@ void nv_to_bgra(const uint8_t* src, ...@@ -505,8 +1216,8 @@ void nv_to_bgra(const uint8_t* src,
uint8x8x2_t y2 = vld2_u8(ptr_y2); uint8x8x2_t y2 = vld2_u8(ptr_y2);
uint16x8_t v = vmovl_u8(vu.val[x_num]); uint16x8_t v = vmovl_u8(vu.val[0]);
uint16x8_t u = vmovl_u8(vu.val[y_num]); uint16x8_t u = vmovl_u8(vu.val[1]);
int16x8_t v_s = vreinterpretq_s16_u16(v); int16x8_t v_s = vreinterpretq_s16_u16(v);
int16x8_t u_s = vreinterpretq_s16_u16(u); int16x8_t u_s = vreinterpretq_s16_u16(u);
int16x8_t v_bias = vsubq_s16(v_s, bias); int16x8_t v_bias = vsubq_s16(v_s, bias);
...@@ -643,10 +1354,6 @@ void nv_to_bgra(const uint8_t* src, ...@@ -643,10 +1354,6 @@ void nv_to_bgra(const uint8_t* src,
b00_0 = vtrn_u8(b00, b01); b00_0 = vtrn_u8(b00, b01);
g00_0 = vtrn_u8(g00, g01); g00_0 = vtrn_u8(g00, g01);
// ptr_bgr3 += 8;
// ptr_bgr1 += 8;
// ptr_bgr2 += 8;
// vst3_u8(ptr_bgr1, v_bgr);
vst4_u8(ptr_bgr1, v_bgr); vst4_u8(ptr_bgr1, v_bgr);
r0_16 = vreinterpret_u16_u8(r00_0.val[0]); r0_16 = vreinterpret_u16_u8(r00_0.val[0]);
...@@ -709,8 +1416,6 @@ void nv_to_bgra(const uint8_t* src, ...@@ -709,8 +1416,6 @@ void nv_to_bgra(const uint8_t* src,
v_bgr1.val[1] = g1_8; v_bgr1.val[1] = g1_8;
v_bgr1.val[2] = r1_8; v_bgr1.val[2] = r1_8;
// vst3_u8(ptr_bgr2, v_bgr);
// vst3_u8(ptr_bgr2 + 24, v_bgr1);
vst4_u8(ptr_bgr2, v_bgr); vst4_u8(ptr_bgr2, v_bgr);
vst4_u8(ptr_bgr2 + 32, v_bgr1); vst4_u8(ptr_bgr2 + 32, v_bgr1);
...@@ -720,8 +1425,8 @@ void nv_to_bgra(const uint8_t* src, ...@@ -720,8 +1425,8 @@ void nv_to_bgra(const uint8_t* src,
for (; j < srcw; j += 2) { for (; j < srcw; j += 2) {
uint8_t _y0 = ptr_y1[0]; uint8_t _y0 = ptr_y1[0];
uint8_t _y1 = ptr_y1[1]; uint8_t _y1 = ptr_y1[1];
uint8_t _v = ptr_vu[x_num]; uint8_t _v = ptr_vu[0];
uint8_t _u = ptr_vu[y_num]; uint8_t _u = ptr_vu[1];
uint8_t _y0_1 = ptr_y2[0]; uint8_t _y0_1 = ptr_y2[0];
uint8_t _y1_1 = ptr_y2[1]; uint8_t _y1_1 = ptr_y2[1];
...@@ -745,9 +1450,6 @@ void nv_to_bgra(const uint8_t* src, ...@@ -745,9 +1450,6 @@ void nv_to_bgra(const uint8_t* src,
g1 = g1 < 0 ? 0 : (g1 > 255) ? 255 : g1; g1 = g1 < 0 ? 0 : (g1 > 255) ? 255 : g1;
b1 = b1 < 0 ? 0 : (b1 > 255) ? 255 : b1; b1 = b1 < 0 ? 0 : (b1 > 255) ? 255 : b1;
// *ptr_bgr1++ = b;
// *ptr_bgr2++ = g;
// *ptr_bgr3++ = r;
*ptr_bgr1++ = b; *ptr_bgr1++ = b;
*ptr_bgr1++ = g; *ptr_bgr1++ = g;
*ptr_bgr1++ = r; *ptr_bgr1++ = r;
...@@ -792,26 +1494,7 @@ void nv_to_bgra(const uint8_t* src, ...@@ -792,26 +1494,7 @@ void nv_to_bgra(const uint8_t* src,
delete[] zerobuf; delete[] zerobuf;
delete[] writebuf; delete[] writebuf;
} }
void nv21_to_bgr(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
nv_to_bgr(src, dst, srcw, srch, 0, 1);
}
// nv12(yuv) to BGR:store hwc dsth * dstw = srch * srcw y_w = srcw, y_h = srch
// uv_w = srcw uv_h = 1/2 * srch
void nv12_to_bgr(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
// exchange vu forward
nv_to_bgr(src, dst, srcw, srch, 1, 0);
}
// nv21(yvu) to BGRA: store hwc dsth * dstw = srch * (srcw) y_w = srcw, y_h =
// srch uv_w = srcw uv_h = 1/2 * srch
void nv21_to_bgra(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
nv_to_bgra(src, dst, srcw, srch, 0, 1);
}
// nv12(yuv) to BGRA:store hwc dsth * dstw = srch * srcw y_w = srcw, y_h = srch
// uv_w = srcw uv_h = 1/2 * srch
void nv12_to_bgra(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
nv_to_bgra(src, dst, srcw, srch, 1, 0);
}
/* /*
Using CV_BGR2GRAY, the conversion formula is Gray = 0.1140*B + 0.5870*G + 0.2989*R Using CV_BGR2GRAY, the conversion formula is Gray = 0.1140*B + 0.5870*G + 0.2989*R
Using CV_RGB2GRAY, the conversion formula is Gray = 0.1140*R + 0.5870*G + 0.2989*B Using CV_RGB2GRAY, the conversion formula is Gray = 0.1140*R + 0.5870*G + 0.2989*B
...@@ -847,7 +1530,6 @@ void hwc3_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) { ...@@ -847,7 +1530,6 @@ void hwc3_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
uint8_t* outr1 = outr0 + srcw; uint8_t* outr1 = outr0 + srcw;
uint8_t* outr2 = outr1 + srcw; uint8_t* outr2 = outr1 + srcw;
uint8_t* outr3 = outr2 + srcw; uint8_t* outr3 = outr2 + srcw;
int cnt = cnt_pro; int cnt = cnt_pro;
if (cnt > 0) { if (cnt > 0) {
#ifdef __aarch64__ #ifdef __aarch64__
......
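The `hwc3_to_hwc1` hunk above notes the conversion Gray = 0.1140*B + 0.5870*G + 0.2989*R (channel order swapped for RGB input). Integer implementations usually realise this with small fixed-point weights whose sum is a power of two; the snippet below is a minimal scalar sketch of that idea with 8-bit weights (29 + 150 + 77 = 256), not the NEON implementation in the diff.

```cpp
// Hedged scalar sketch of BGR -> GRAY using 8-bit fixed-point weights.
// Weights approximate 0.114 / 0.587 / 0.299; names are illustrative.
#include <cstdint>

void bgr_to_gray(const uint8_t* src, uint8_t* dst, int w, int h) {
  for (int i = 0; i < w * h; ++i) {
    int b = src[3 * i + 0];
    int g = src[3 * i + 1];
    int r = src[3 * i + 2];
    dst[i] = static_cast<uint8_t>((29 * b + 150 * g + 77 * r + 128) >> 8);
  }
}
```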
...@@ -153,7 +153,10 @@ void flip_hwc1_x(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { ...@@ -153,7 +153,10 @@ void flip_hwc1_x(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
// 26, 27}" // 26, 27}"
"ld1 {v3.8b}, [%[inptr3]], #8 \n" // v0={30,31,32, 33, 34, 35, "ld1 {v3.8b}, [%[inptr3]], #8 \n" // v0={30,31,32, 33, 34, 35,
// 36, 37}" // 36, 37}"
"prfm pldl1keep, [%[inptr0]] \n"
"prfm pldl1keep, [%[inptr1]] \n"
"prfm pldl1keep, [%[inptr2]] \n"
"prfm pldl1keep, [%[inptr3]] \n"
"st1 {v0.8b}, [%[outptr0]], #8 \n" // 00 10 20 30 04 14 "st1 {v0.8b}, [%[outptr0]], #8 \n" // 00 10 20 30 04 14
// 24 34 // 24 34
"st1 {v1.8b}, [%[outptr1]], #8 \n" // 02 12 22 32 "st1 {v1.8b}, [%[outptr1]], #8 \n" // 02 12 22 32
...@@ -180,6 +183,10 @@ void flip_hwc1_x(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { ...@@ -180,6 +183,10 @@ void flip_hwc1_x(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
"26 27\n" "26 27\n"
"vld1.8 {d12}, [%[inptr3]]! @ zip load r1, d6 = 30 31 32 33 34 35 " "vld1.8 {d12}, [%[inptr3]]! @ zip load r1, d6 = 30 31 32 33 34 35 "
"36 37\n" "36 37\n"
"pld [%[inptr0]] @ preload a, 64byte\n"
"pld [%[inptr1]] @ preload a, 64byte\n"
"pld [%[inptr2]] @ preload a, 64byte\n"
"pld [%[inptr3]] @ preload a, 64byte\n"
"vst1.32 {d0}, [%[outptr0]]! @ write d0(q0,low),r00,r10 20 30\n" "vst1.32 {d0}, [%[outptr0]]! @ write d0(q0,low),r00,r10 20 30\n"
"vst1.32 {d4}, [%[outptr1]]! @ write d4(q0,low),r01,r11 21 31\n" "vst1.32 {d4}, [%[outptr1]]! @ write d4(q0,low),r01,r11 21 31\n"
...@@ -286,7 +293,10 @@ void flip_hwc1_y(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { ...@@ -286,7 +293,10 @@ void flip_hwc1_y(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
// 01 00 // 01 00
"rev64 v7.8b, v3.8b \n" //@ reverse 07 06 05 04 03 02 "rev64 v7.8b, v3.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00 // 01 00
"prfm pldl1keep, [%[inptr0]] \n"
"prfm pldl1keep, [%[inptr1]] \n"
"prfm pldl1keep, [%[inptr2]] \n"
"prfm pldl1keep, [%[inptr3]] \n"
"st1 {v4.8b}, [%[outptr0]] \n" // 00 10 20 30 04 14 24 34 "st1 {v4.8b}, [%[outptr0]] \n" // 00 10 20 30 04 14 24 34
"st1 {v5.8b}, [%[outptr1]] \n" // 02 12 22 32 "st1 {v5.8b}, [%[outptr1]] \n" // 02 12 22 32
"st1 {v6.8b}, [%[outptr2]] \n" // 01 11 21 31 "st1 {v6.8b}, [%[outptr2]] \n" // 01 11 21 31
...@@ -324,7 +334,10 @@ void flip_hwc1_y(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { ...@@ -324,7 +334,10 @@ void flip_hwc1_y(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
"vrev64.8 d9, d8 @ reverse 07 06 05 04 03 02 01 00 \n" "vrev64.8 d9, d8 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d13, d12 @ reverse 07 06 05 04 03 02 01 00 " "vrev64.8 d13, d12 @ reverse 07 06 05 04 03 02 01 00 "
"\n" "\n"
"pld [%[inptr0]] @ preload a, 64byte\n"
"pld [%[inptr1]] @ preload a, 64byte\n"
"pld [%[inptr2]] @ preload a, 64byte\n"
"pld [%[inptr3]] @ preload a, 64byte\n"
"vst1.32 {d1}, [%[outptr0]] @ write d0(q0,low),r00,r10 20 30\n" "vst1.32 {d1}, [%[outptr0]] @ write d0(q0,low),r00,r10 20 30\n"
"vst1.32 {d5}, [%[outptr1]] @ write d4(q0,low),r01,r11 21 31\n" "vst1.32 {d5}, [%[outptr1]] @ write d4(q0,low),r01,r11 21 31\n"
"vst1.32 {d9}, [%[outptr2]] @ write d4(q0,low),r01,r11 21 31\n" "vst1.32 {d9}, [%[outptr2]] @ write d4(q0,low),r01,r11 21 31\n"
...@@ -440,7 +453,10 @@ void flip_hwc1_xy(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { ...@@ -440,7 +453,10 @@ void flip_hwc1_xy(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
// 01 00 // 01 00
"rev64 v7.8b, v3.8b \n" //@ reverse 07 06 05 04 03 02 "rev64 v7.8b, v3.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00 // 01 00
"prfm pldl1keep, [%[inptr0]] \n"
"prfm pldl1keep, [%[inptr1]] \n"
"prfm pldl1keep, [%[inptr2]] \n"
"prfm pldl1keep, [%[inptr3]] \n"
"st1 {v4.8b}, [%[outptr0]] \n" // 00 10 20 30 04 14 24 34 "st1 {v4.8b}, [%[outptr0]] \n" // 00 10 20 30 04 14 24 34
"st1 {v5.8b}, [%[outptr1]] \n" // 02 12 22 32 "st1 {v5.8b}, [%[outptr1]] \n" // 02 12 22 32
"st1 {v6.8b}, [%[outptr2]] \n" // 01 11 21 31 "st1 {v6.8b}, [%[outptr2]] \n" // 01 11 21 31
...@@ -478,7 +494,10 @@ void flip_hwc1_xy(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { ...@@ -478,7 +494,10 @@ void flip_hwc1_xy(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
"vrev64.8 d9, d8 @ reverse 07 06 05 04 03 02 01 00 \n" "vrev64.8 d9, d8 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d13, d12 @ reverse 07 06 05 04 03 02 01 00 " "vrev64.8 d13, d12 @ reverse 07 06 05 04 03 02 01 00 "
"\n" "\n"
"pld [%[inptr0]] @ preload a, 64byte\n"
"pld [%[inptr1]] @ preload a, 64byte\n"
"pld [%[inptr2]] @ preload a, 64byte\n"
"pld [%[inptr3]] @ preload a, 64byte\n"
"vst1.32 {d1}, [%[outptr0]] @ write d0(q0,low),r00,r10 20 30\n" "vst1.32 {d1}, [%[outptr0]] @ write d0(q0,low),r00,r10 20 30\n"
"vst1.32 {d5}, [%[outptr1]] @ write d4(q0,low),r01,r11 21 31\n" "vst1.32 {d5}, [%[outptr1]] @ write d4(q0,low),r01,r11 21 31\n"
"vst1.32 {d9}, [%[outptr2]] @ write d4(q0,low),r01,r11 21 31\n" "vst1.32 {d9}, [%[outptr2]] @ write d4(q0,low),r01,r11 21 31\n"
...@@ -583,7 +602,10 @@ void flip_hwc3_x(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { ...@@ -583,7 +602,10 @@ void flip_hwc3_x(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
"ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]], #24 \n" // v0={30,31,32, "ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]], #24 \n" // v0={30,31,32,
// 33, 34, 35, // 33, 34, 35,
// 36, 37}" // 36, 37}"
"prfm pldl1keep, [%[inptr0]] \n"
"prfm pldl1keep, [%[inptr1]] \n"
"prfm pldl1keep, [%[inptr2]] \n"
"prfm pldl1keep, [%[inptr3]] \n"
"st3 {v0.8b, v1.8b, v2.8b}, [%[outptr0]], #24 \n" // 00 "st3 {v0.8b, v1.8b, v2.8b}, [%[outptr0]], #24 \n" // 00
// 10 // 10
// 20 // 20
...@@ -634,6 +656,10 @@ void flip_hwc3_x(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { ...@@ -634,6 +656,10 @@ void flip_hwc3_x(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
"vld3.8 {d9, d10, d11}, [%[inptr3]]! @ zip load r1, d6 = 30 31 32 " "vld3.8 {d9, d10, d11}, [%[inptr3]]! @ zip load r1, d6 = 30 31 32 "
"33 34 35 36 37\n" "33 34 35 36 37\n"
"pld [%[inptr0]] @ preload a, 64byte\n"
"pld [%[inptr1]] @ preload a, 64byte\n"
"pld [%[inptr2]] @ preload a, 64byte\n"
"pld [%[inptr3]] @ preload a, 64byte\n"
"vst3.8 {d0, d1, d2}, [%[outptr0]]! @ write d0(q0,low),r00,r10 " "vst3.8 {d0, d1, d2}, [%[outptr0]]! @ write d0(q0,low),r00,r10 "
"20 30\n" "20 30\n"
"vst3.8 {d3, d4, d5}, [%[outptr1]]! @ write d4(q0,low),r01,r11 " "vst3.8 {d3, d4, d5}, [%[outptr1]]! @ write d4(q0,low),r01,r11 "
...@@ -748,7 +774,10 @@ void flip_hwc3_y(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { ...@@ -748,7 +774,10 @@ void flip_hwc3_y(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
"ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]], #24 \n" // v0={30,31,32, "ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]], #24 \n" // v0={30,31,32,
// 33, 34, 35, // 33, 34, 35,
// 36, 37}" // 36, 37}"
"prfm pldl1keep, [%[inptr0]] \n"
"prfm pldl1keep, [%[inptr1]] \n"
"prfm pldl1keep, [%[inptr2]] \n"
"prfm pldl1keep, [%[inptr3]] \n"
"rev64 v12.8b, v0.8b \n" //@ reverse 07 06 05 04 03 "rev64 v12.8b, v0.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 b // 02 01 00 b
"rev64 v13.8b, v1.8b \n" //@ reverse 07 06 05 04 03 "rev64 v13.8b, v1.8b \n" //@ reverse 07 06 05 04 03
...@@ -855,7 +884,10 @@ void flip_hwc3_y(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { ...@@ -855,7 +884,10 @@ void flip_hwc3_y(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
"\n" "\n"
"vrev64.8 d23, d11 @ reverse 07 06 05 04 03 02 01 00 " "vrev64.8 d23, d11 @ reverse 07 06 05 04 03 02 01 00 "
"\n" "\n"
"pld [%[inptr0]] @ preload a, 64byte\n"
"pld [%[inptr1]] @ preload a, 64byte\n"
"pld [%[inptr2]] @ preload a, 64byte\n"
"pld [%[inptr3]] @ preload a, 64byte\n"
"vst3.8 {d12, d13, d14}, [%[outptr0]] @ write " "vst3.8 {d12, d13, d14}, [%[outptr0]] @ write "
"d0(q0,low),r00,r10 20 30\n" "d0(q0,low),r00,r10 20 30\n"
"vst3.8 {d15, d16, d17}, [%[outptr1]] @ write " "vst3.8 {d15, d16, d17}, [%[outptr1]] @ write "
...@@ -1027,7 +1059,10 @@ void flip_hwc3_xy(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { ...@@ -1027,7 +1059,10 @@ void flip_hwc3_xy(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
// 02 01 00 // 02 01 00
"rev64 v23.8b, v11.8b \n" //@ reverse 07 06 05 04 03 "rev64 v23.8b, v11.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 // 02 01 00
"prfm pldl1keep, [%[inptr0]] \n"
"prfm pldl1keep, [%[inptr1]] \n"
"prfm pldl1keep, [%[inptr2]] \n"
"prfm pldl1keep, [%[inptr3]] \n"
"st3 {v12.8b, v13.8b, v14.8b}, [%[outptr0]] \n" // 00 10 "st3 {v12.8b, v13.8b, v14.8b}, [%[outptr0]] \n" // 00 10
// 20 30 // 20 30
// 04 14 // 04 14
...@@ -1106,6 +1141,10 @@ void flip_hwc3_xy(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { ...@@ -1106,6 +1141,10 @@ void flip_hwc3_xy(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
"\n" "\n"
"vrev64.8 d23, d11 @ reverse 07 06 05 04 03 02 01 00 " "vrev64.8 d23, d11 @ reverse 07 06 05 04 03 02 01 00 "
"\n" "\n"
"pld [%[inptr0]] @ preload a, 64byte\n"
"pld [%[inptr1]] @ preload a, 64byte\n"
"pld [%[inptr2]] @ preload a, 64byte\n"
"pld [%[inptr3]] @ preload a, 64byte\n"
"vst3.8 {d12, d13, d14}, [%[outptr0]] @ write " "vst3.8 {d12, d13, d14}, [%[outptr0]] @ write "
"d0(q0,low),r00,r10 20 30\n" "d0(q0,low),r00,r10 20 30\n"
...@@ -1262,7 +1301,10 @@ void flip_hwc4_x(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { ...@@ -1262,7 +1301,10 @@ void flip_hwc4_x(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
// 35, // 35,
// 36, // 36,
// 37}" // 37}"
"prfm pldl1keep, [%[inptr0]] \n"
"prfm pldl1keep, [%[inptr1]] \n"
"prfm pldl1keep, [%[inptr2]] \n"
"prfm pldl1keep, [%[inptr3]] \n"
"st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[outptr0]], #32 \n" // 00 10 20 "st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[outptr0]], #32 \n" // 00 10 20
// 30 04 14 // 30 04 14
// 24 34 // 24 34
...@@ -1306,6 +1348,10 @@ void flip_hwc4_x(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { ...@@ -1306,6 +1348,10 @@ void flip_hwc4_x(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
"22 23 24 25 26 27\n" "22 23 24 25 26 27\n"
"vld4.8 {d12, d13, d14, d15}, [%[inptr3]]! @ zip load r1, d6 = 30 " "vld4.8 {d12, d13, d14, d15}, [%[inptr3]]! @ zip load r1, d6 = 30 "
"31 32 33 34 35 36 37\n" "31 32 33 34 35 36 37\n"
"pld [%[inptr0]] @ preload a, 64byte\n"
"pld [%[inptr1]] @ preload a, 64byte\n"
"pld [%[inptr2]] @ preload a, 64byte\n"
"pld [%[inptr3]] @ preload a, 64byte\n"
"vst4.8 {d0, d1, d2, d3}, [%[outptr0]]! @ write " "vst4.8 {d0, d1, d2, d3}, [%[outptr0]]! @ write "
"d0(q0,low),r00,r10 20 30\n" "d0(q0,low),r00,r10 20 30\n"
...@@ -1476,7 +1522,10 @@ void flip_hwc4_y(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { ...@@ -1476,7 +1522,10 @@ void flip_hwc4_y(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
// 02 01 00 // 02 01 00
"rev64 v7.8b, v15.8b \n" //@ reverse 07 06 05 04 03 "rev64 v7.8b, v15.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 // 02 01 00
"prfm pldl1keep, [%[inptr0]] \n"
"prfm pldl1keep, [%[inptr1]] \n"
"prfm pldl1keep, [%[inptr2]] \n"
"prfm pldl1keep, [%[inptr3]] \n"
"st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[outptr0]] \n" // 00 10 20 30 04 14 24 34 "st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[outptr0]] \n" // 00 10 20 30 04 14 24 34
"st4 {v20.8b, v21.8b, v22.8b, v23.8b}, [%[outptr1]] \n" // 02 12 22 32 "st4 {v20.8b, v21.8b, v22.8b, v23.8b}, [%[outptr1]] \n" // 02 12 22 32
"st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[outptr2]] \n" // 01 "st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[outptr2]] \n" // 01
...@@ -1571,6 +1620,10 @@ void flip_hwc4_y(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { ...@@ -1571,6 +1620,10 @@ void flip_hwc4_y(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
"\n" "\n"
"vrev64.8 d7, d15 @ reverse 07 06 05 04 03 02 01 00 " "vrev64.8 d7, d15 @ reverse 07 06 05 04 03 02 01 00 "
"\n" "\n"
"pld [%[inptr0]] @ preload a, 64byte\n"
"pld [%[inptr1]] @ preload a, 64byte\n"
"pld [%[inptr2]] @ preload a, 64byte\n"
"pld [%[inptr3]] @ preload a, 64byte\n"
"vst4.8 {d16, d17, d18, d19}, [%[outptr0]] @ write " "vst4.8 {d16, d17, d18, d19}, [%[outptr0]] @ write "
"d0(q0,low),r00,r10 20 30\n" "d0(q0,low),r00,r10 20 30\n"
...@@ -1770,7 +1823,10 @@ void flip_hwc4_xy(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { ...@@ -1770,7 +1823,10 @@ void flip_hwc4_xy(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
// 02 01 00 // 02 01 00
"rev64 v7.8b, v15.8b \n" //@ reverse 07 06 05 04 03 "rev64 v7.8b, v15.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 // 02 01 00
"prfm pldl1keep, [%[inptr0]] \n"
"prfm pldl1keep, [%[inptr1]] \n"
"prfm pldl1keep, [%[inptr2]] \n"
"prfm pldl1keep, [%[inptr3]] \n"
"st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[outptr0]] \n" // 00 10 20 30 04 14 24 34 "st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[outptr0]] \n" // 00 10 20 30 04 14 24 34
"st4 {v20.8b, v21.8b, v22.8b, v23.8b}, [%[outptr1]] \n" // 02 12 22 32 "st4 {v20.8b, v21.8b, v22.8b, v23.8b}, [%[outptr1]] \n" // 02 12 22 32
"st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[outptr2]] \n" // 01 "st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[outptr2]] \n" // 01
...@@ -1868,6 +1924,10 @@ void flip_hwc4_xy(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { ...@@ -1868,6 +1924,10 @@ void flip_hwc4_xy(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
"vrev64.8 d7, d15 @ reverse 07 06 05 04 03 02 01 00 " "vrev64.8 d7, d15 @ reverse 07 06 05 04 03 02 01 00 "
"\n" "\n"
"pld [%[inptr0]] @ preload a, 64byte\n"
"pld [%[inptr1]] @ preload a, 64byte\n"
"pld [%[inptr2]] @ preload a, 64byte\n"
"pld [%[inptr3]] @ preload a, 64byte\n"
"vst4.8 {d16, d17, d18, d19}, [%[outptr0]] @ write " "vst4.8 {d16, d17, d18, d19}, [%[outptr0]] @ write "
"d0(q0,low),r00,r10 20 30\n" "d0(q0,low),r00,r10 20 30\n"
"vst4.8 {d20, d21, d22, d23}, [%[outptr1]] @ write " "vst4.8 {d20, d21, d22, d23}, [%[outptr1]] @ write "
......
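The flip_hwc* changes above interleave `pld` (armv7) and `prfm pldl1keep` (armv8) hints between the loads and stores so the next cache lines are requested before the following loop iteration needs them. Outside inline assembly the same idea can be expressed with the compiler builtin, as in this minimal sketch; the 64-byte look-ahead distance is an assumption that mirrors the `#64`/`#128` offsets used in the diff.

```cpp
// Hedged sketch: software prefetch while copying a row, analogous to the
// pld/prfm hints added in the flip kernels. Distances are illustrative.
#include <cstdint>
#include <cstring>

void copy_row_with_prefetch(const uint8_t* src, uint8_t* dst, int n) {
  for (int i = 0; i + 64 <= n; i += 64) {
    __builtin_prefetch(src + i + 64);  // hint: fetch the next 64-byte block
    std::memcpy(dst + i, src + i, 64);
  }
  int tail = n & 63;
  if (tail) std::memcpy(dst + n - tail, src + n - tail, tail);
}
```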
...@@ -51,9 +51,44 @@ void ImageResize::choose(const uint8_t* src, ...@@ -51,9 +51,44 @@ void ImageResize::choose(const uint8_t* src,
int dsth) { int dsth) {
resize(src, dst, srcFormat, srcw, srch, dstw, dsth); resize(src, dst, srcFormat, srcw, srch, dstw, dsth);
} }
void resize_one_channel(
const uint8_t* src, int w_in, int h_in, uint8_t* dst, int w_out, int h_out);
void resize_one_channel_uv(
const uint8_t* src, int w_in, int h_in, uint8_t* dst, int w_out, int h_out);
void resize_three_channel( void resize_three_channel(
const uint8_t* src, int w_in, int h_in, uint8_t* dst, int w_out, int h_out); const uint8_t* src, int w_in, int h_in, uint8_t* dst, int w_out, int h_out);
void resize_four_channel(
const uint8_t* src, int w_in, int h_in, uint8_t* dst, int w_out, int h_out);
void nv21_resize(const uint8_t* src,
uint8_t* dst,
int w_in,
int h_in,
int w_out,
int h_out) {
if (w_out == w_in && h_out == h_in) {
memcpy(dst, src, sizeof(uint8_t) * w_in * static_cast<int>(1.5 * h_in));
return;
}
// return;
int y_h = h_in;
int uv_h = h_in / 2;
const uint8_t* y_ptr = src;
const uint8_t* uv_ptr = src + y_h * w_in;
// out
int dst_y_h = h_out;
int dst_uv_h = h_out / 2;
uint8_t* dst_ptr = dst + dst_y_h * w_out;
// y
resize_one_channel(y_ptr, w_in, y_h, dst, w_out, dst_y_h);
// uv
resize_one_channel_uv(uv_ptr, w_in, uv_h, dst_ptr, w_out, dst_uv_h);
}
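`nv21_resize` above treats the input as two planes: a full-resolution Y plane followed by a half-height interleaved VU plane, which is why the fast-path copy moves `1.5 * h_in * w_in` bytes and the UV destination starts at `dst + h_out * w_out`. A minimal sketch of that layout arithmetic (names here are illustrative):

```cpp
// Hedged sketch of NV21/NV12 plane bookkeeping as used by nv21_resize.
#include <cstddef>
#include <cstdint>

struct Nv21Planes {
  const uint8_t* y;   // w * h luma samples
  const uint8_t* vu;  // w * (h / 2) interleaved chroma bytes
  size_t total;       // 1.5 * w * h bytes overall
};

inline Nv21Planes split_nv21(const uint8_t* buf, int w, int h) {
  Nv21Planes p;
  p.y = buf;
  p.vu = buf + static_cast<size_t>(w) * h;
  p.total = static_cast<size_t>(w) * h * 3 / 2;
  return p;
}
```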
void bgr_resize(const uint8_t* src, void bgr_resize(const uint8_t* src,
uint8_t* dst, uint8_t* dst,
int w_in, int w_in,
...@@ -67,36 +102,57 @@ void bgr_resize(const uint8_t* src, ...@@ -67,36 +102,57 @@ void bgr_resize(const uint8_t* src,
// y // y
resize_three_channel(src, w_in * 3, h_in, dst, w_out * 3, h_out); resize_three_channel(src, w_in * 3, h_in, dst, w_out * 3, h_out);
} }
void resize_three_channel(const uint8_t* src,
int w_in, void bgra_resize(const uint8_t* src,
int h_in, uint8_t* dst,
uint8_t* dst, int w_in,
int w_out, int h_in,
int h_out) { int w_out,
int h_out) {
if (w_out == w_in && h_out == h_in) {
memcpy(dst, src, sizeof(uint8_t) * w_in * h_in * 4);
return;
}
// y
resize_four_channel(src, w_in * 4, h_in, dst, w_out * 4, h_out);
}
void resize_one_channel(const uint8_t* src,
int w_in,
int h_in,
uint8_t* dst,
int w_out,
int h_out) {
const int resize_coef_bits = 11; const int resize_coef_bits = 11;
const int resize_coef_scale = 1 << resize_coef_bits; const int resize_coef_scale = 1 << resize_coef_bits;
double scale_x = static_cast<double>(w_in) / w_out; double scale_x = static_cast<double>(w_in) / w_out;
double scale_y = static_cast<double>(h_in) / h_out; double scale_y = static_cast<double>(h_in) / h_out;
int* buf = new int[w_out * 2 + h_out * 2]; int* buf = new int[w_out * 2 + h_out * 2];
int* xofs = buf; // new int[w]; int* xofs = buf; // new int[w];
int* yofs = buf + w_out; // new int[h]; int* yofs = buf + w_out; // new int[h];
int16_t* ialpha = int16_t* ialpha =
reinterpret_cast<int16_t*>(buf + w_out + h_out); // new int16_t[w * 2]; reinterpret_cast<int16_t*>(buf + w_out + h_out); // new short[w * 2];
int16_t* ibeta = int16_t* ibeta =
reinterpret_cast<int16_t*>(buf + w_out * 2 + h_out); // new short[h * 2]; reinterpret_cast<int16_t*>(buf + w_out * 2 + h_out); // new short[h * 2];
float fx = 0.f; float fx = 0.f;
float fy = 0.f; float fy = 0.f;
int sx = 0.f; int sx = 0;
int sy = 0.f; int sy = 0;
#define SATURATE_CAST_SHORT(X) \ #define SATURATE_CAST_SHORT(X) \
(int16_t)::std::min( \ (int16_t)::std::min( \
::std::max(static_cast<int>(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), \ ::std::max(static_cast<int>(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), \
SHRT_MAX); SHRT_MAX);
// #pragma omp parallel for for (int dx = 0; dx < w_out; dx++) {
for (int dx = 0; dx < w_out / 3; dx++) {
fx = static_cast<float>((dx + 0.5) * scale_x - 0.5); fx = static_cast<float>((dx + 0.5) * scale_x - 0.5);
sx = floor(fx); sx = floor(fx);
fx -= sx; fx -= sx;
if (sx < 0) { if (sx < 0) {
sx = 0; sx = 0;
fx = 0.f; fx = 0.f;
...@@ -105,17 +161,20 @@ void resize_three_channel(const uint8_t* src, ...@@ -105,17 +161,20 @@ void resize_three_channel(const uint8_t* src,
sx = w_in - 2; sx = w_in - 2;
fx = 1.f; fx = 1.f;
} }
xofs[dx] = sx * 3;
xofs[dx] = sx;
float a0 = (1.f - fx) * resize_coef_scale; float a0 = (1.f - fx) * resize_coef_scale;
float a1 = fx * resize_coef_scale; float a1 = fx * resize_coef_scale;
ialpha[dx * 2] = SATURATE_CAST_SHORT(a0); ialpha[dx * 2] = SATURATE_CAST_SHORT(a0);
ialpha[dx * 2 + 1] = SATURATE_CAST_SHORT(a1); ialpha[dx * 2 + 1] = SATURATE_CAST_SHORT(a1);
} }
// #pragma omp parallel for
for (int dy = 0; dy < h_out; dy++) { for (int dy = 0; dy < h_out; dy++) {
fy = static_cast<float>((dy + 0.5) * scale_y - 0.5); fy = static_cast<float>((dy + 0.5) * scale_y - 0.5);
sy = floor(fy); sy = floor(fy);
fy -= sy; fy -= sy;
if (sy < 0) { if (sy < 0) {
sy = 0; sy = 0;
fy = 0.f; fy = 0.f;
...@@ -124,9 +183,12 @@ void resize_three_channel(const uint8_t* src, ...@@ -124,9 +183,12 @@ void resize_three_channel(const uint8_t* src,
sy = h_in - 2; sy = h_in - 2;
fy = 1.f; fy = 1.f;
} }
yofs[dy] = sy; yofs[dy] = sy;
float b0 = (1.f - fy) * resize_coef_scale; float b0 = (1.f - fy) * resize_coef_scale;
float b1 = fy * resize_coef_scale; float b1 = fy * resize_coef_scale;
ibeta[dy * 2] = SATURATE_CAST_SHORT(b0); ibeta[dy * 2] = SATURATE_CAST_SHORT(b0);
ibeta[dy * 2 + 1] = SATURATE_CAST_SHORT(b1); ibeta[dy * 2 + 1] = SATURATE_CAST_SHORT(b1);
} }
...@@ -136,9 +198,11 @@ void resize_three_channel(const uint8_t* src, ...@@ -136,9 +198,11 @@ void resize_three_channel(const uint8_t* src,
int16_t* rowsbuf1 = new int16_t[w_out + 1]; int16_t* rowsbuf1 = new int16_t[w_out + 1];
int16_t* rows0 = rowsbuf0; int16_t* rows0 = rowsbuf0;
int16_t* rows1 = rowsbuf1; int16_t* rows1 = rowsbuf1;
int prev_sy1 = -1; int prev_sy1 = -1;
for (int dy = 0; dy < h_out; dy++) { for (int dy = 0; dy < h_out; dy++) {
int sy = yofs[dy]; int sy = yofs[dy];
if (sy == prev_sy1) { if (sy == prev_sy1) {
// hresize one row // hresize one row
int16_t* rows0_old = rows0; int16_t* rows0_old = rows0;
...@@ -147,72 +211,80 @@ void resize_three_channel(const uint8_t* src, ...@@ -147,72 +211,80 @@ void resize_three_channel(const uint8_t* src,
const uint8_t* S1 = src + w_in * (sy + 1); const uint8_t* S1 = src + w_in * (sy + 1);
const int16_t* ialphap = ialpha; const int16_t* ialphap = ialpha;
int16_t* rows1p = rows1; int16_t* rows1p = rows1;
for (int dx = 0; dx < w_out / 3; dx++) { for (int dx = 0; dx < w_out; dx++) {
int sx = xofs[dx]; int sx = xofs[dx];
int16_t a0 = ialphap[0]; int16_t a0 = ialphap[0];
int16_t a1 = ialphap[1]; int16_t a1 = ialphap[1];
const uint8_t* S1p = S1 + sx; const uint8_t* S1p = S1 + sx;
int tmp = dx * 3; rows1p[dx] = (S1p[0] * a0 + S1p[1] * a1) >> 4;
rows1p[tmp] = (S1p[0] * a0 + S1p[3] * a1) >> 4;
rows1p[tmp + 1] = (S1p[1] * a0 + S1p[4] * a1) >> 4;
rows1p[tmp + 2] = (S1p[2] * a0 + S1p[5] * a1) >> 4;
ialphap += 2; ialphap += 2;
} }
} else { } else {
// hresize two rows // hresize two rows
const uint8_t* S0 = src + w_in * (sy); const uint8_t* S0 = src + w_in * (sy);
const uint8_t* S1 = src + w_in * (sy + 1); const uint8_t* S1 = src + w_in * (sy + 1);
const int16_t* ialphap = ialpha; const int16_t* ialphap = ialpha;
int16_t* rows0p = rows0; int16_t* rows0p = rows0;
int16_t* rows1p = rows1; int16_t* rows1p = rows1;
for (int dx = 0; dx < w_out / 3; dx++) { for (int dx = 0; dx < w_out; dx++) {
int sx = xofs[dx]; int sx = xofs[dx];
int16_t a0 = ialphap[0]; int16_t a0 = ialphap[0];
int16_t a1 = ialphap[1]; int16_t a1 = ialphap[1];
const uint8_t* S0p = S0 + sx; const uint8_t* S0p = S0 + sx;
const uint8_t* S1p = S1 + sx; const uint8_t* S1p = S1 + sx;
int tmp = dx * 3; rows0p[dx] = (S0p[0] * a0 + S0p[1] * a1) >> 4;
rows0p[tmp] = (S0p[0] * a0 + S0p[3] * a1) >> 4; rows1p[dx] = (S1p[0] * a0 + S1p[1] * a1) >> 4;
rows1p[tmp] = (S1p[0] * a0 + S1p[3] * a1) >> 4;
rows0p[tmp + 1] = (S0p[1] * a0 + S0p[4] * a1) >> 4;
rows1p[tmp + 1] = (S1p[1] * a0 + S1p[4] * a1) >> 4;
rows0p[tmp + 2] = (S0p[2] * a0 + S0p[5] * a1) >> 4;
rows1p[tmp + 2] = (S1p[2] * a0 + S1p[5] * a1) >> 4;
ialphap += 2; ialphap += 2;
} }
} }
prev_sy1 = sy + 1; prev_sy1 = sy + 1;
// vresize // vresize
int16_t b0 = ibeta[0]; int16_t b0 = ibeta[0];
int16_t b1 = ibeta[1]; int16_t b1 = ibeta[1];
int16_t* rows0p = rows0; int16_t* rows0p = rows0;
int16_t* rows1p = rows1; int16_t* rows1p = rows1;
uint8_t* dp_ptr = dst + w_out * (dy); uint8_t* dp_ptr = dst + w_out * (dy);
int cnt = w_out >> 3; int cnt = w_out >> 3;
int remain = w_out - (cnt << 3); int remain = w_out - (cnt << 3);
int16x4_t _b0 = vdup_n_s16(b0); int16x4_t _b0 = vdup_n_s16(b0);
int16x4_t _b1 = vdup_n_s16(b1); int16x4_t _b1 = vdup_n_s16(b1);
int32x4_t _v2 = vdupq_n_s32(2); int32x4_t _v2 = vdupq_n_s32(2);
for (cnt = w_out >> 3; cnt > 0; cnt--) { for (cnt = w_out >> 3; cnt > 0; cnt--) {
int16x4_t _rows0p_sr4 = vld1_s16(rows0p); int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
int16x4_t _rows1p_sr4 = vld1_s16(rows1p); int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p + 4); int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p + 4);
int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p + 4); int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p + 4);
int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0); int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0);
int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1); int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0); int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1); int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
int32x4_t _acc = _v2; int32x4_t _acc = _v2;
_acc = vsraq_n_s32( _acc = vsraq_n_s32(_acc, _rows0p_sr4_mb0, 16);
_acc, _rows0p_sr4_mb0, 16); // _acc >> 16 + _rows0p_sr4_mb0 >> 16
_acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16); _acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
int32x4_t _acc_1 = _v2; int32x4_t _acc_1 = _v2;
_acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16); _acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
_acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16); _acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
int16x4_t _acc16 = vshrn_n_s32(_acc, 2); // _acc >> 2
int16x4_t _acc16 = vshrn_n_s32(_acc, 2);
int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2); int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
uint8x8_t _dout = vqmovun_s16(vcombine_s16(_acc16, _acc16_1)); uint8x8_t _dout = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));
vst1_u8(dp_ptr, _dout); vst1_u8(dp_ptr, _dout);
dp_ptr += 8; dp_ptr += 8;
rows0p += 8; rows0p += 8;
rows1p += 8; rows1p += 8;
...@@ -226,45 +298,18 @@ void resize_three_channel(const uint8_t* src, ...@@ -226,45 +298,18 @@ void resize_three_channel(const uint8_t* src,
} }
ibeta += 2; ibeta += 2;
} }
delete[] buf; delete[] buf;
delete[] rowsbuf0; delete[] rowsbuf0;
delete[] rowsbuf1; delete[] rowsbuf1;
} }
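All of the resize_* helpers share one fixed-point bilinear scheme: the weights are scaled by 2^11 (`resize_coef_bits = 11`) and saturated to int16 via `SATURATE_CAST_SHORT`, the horizontal pass shifts right by 4 so a row value keeps 7 fractional bits and still fits in int16, and the vertical pass multiplies by the 11-bit row weights, drops 16 bits per term, adds 2 for rounding and drops the last 2 bits. A single-pixel sketch of that arithmetic (names illustrative, not the vectorised repo code):

```cpp
// Hedged sketch of the 11-bit fixed-point bilinear step used by the resize
// helpers, reduced to a single output pixel.
#include <algorithm>
#include <cstdint>

uint8_t bilinear_pixel(uint8_t s00, uint8_t s01, uint8_t s10, uint8_t s11,
                       float fx, float fy) {
  const int scale = 1 << 11;  // resize_coef_bits = 11
  int a0 = static_cast<int>((1.f - fx) * scale), a1 = static_cast<int>(fx * scale);
  int b0 = static_cast<int>((1.f - fy) * scale), b1 = static_cast<int>(fy * scale);
  // horizontal pass: >> 4 keeps 7 fractional bits, so each row fits in int16_t
  int16_t row0 = static_cast<int16_t>((s00 * a0 + s01 * a1) >> 4);
  int16_t row1 = static_cast<int16_t>((s10 * a0 + s11 * a1) >> 4);
  // vertical pass: 7 + 11 = 18 fractional bits; drop 16, round with +2, drop 2
  int acc = ((row0 * b0) >> 16) + ((row1 * b1) >> 16) + 2;
  return static_cast<uint8_t>(std::min(std::max(acc >> 2, 0), 255));
}
```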
void resize_one_channel(
const uint8_t* src, int w_in, int h_in, uint8_t* dst, int w_out, int h_out);
void resize_one_channel_uv(
const uint8_t* src, int w_in, int h_in, uint8_t* dst, int w_out, int h_out);
void nv21_resize(const uint8_t* src,
uint8_t* dst,
int w_in,
int h_in,
int w_out,
int h_out) {
if (w_out == w_in && h_out == h_in) {
memcpy(dst, src, sizeof(uint8_t) * w_in * static_cast<int>(1.5 * h_in));
return;
}
// return;
int y_h = h_in;
int uv_h = h_in / 2;
const uint8_t* y_ptr = src;
const uint8_t* uv_ptr = src + y_h * w_in;
// out
int dst_y_h = h_out;
int dst_uv_h = h_out / 2;
uint8_t* dst_ptr = dst + dst_y_h * w_out;
// y
resize_one_channel(y_ptr, w_in, y_h, dst, w_out, dst_y_h);
// uv
resize_one_channel_uv(uv_ptr, w_in, uv_h, dst_ptr, w_out, dst_uv_h);
}
void resize_one_channel(const uint8_t* src, void resize_one_channel_uv(const uint8_t* src,
int w_in, int w_in,
int h_in, int h_in,
uint8_t* dst, uint8_t* dst,
int w_out, int w_out,
int h_out) { int h_out) {
const int resize_coef_bits = 11; const int resize_coef_bits = 11;
const int resize_coef_scale = 1 << resize_coef_bits; const int resize_coef_scale = 1 << resize_coef_bits;
...@@ -277,20 +322,20 @@ void resize_one_channel(const uint8_t* src, ...@@ -277,20 +322,20 @@ void resize_one_channel(const uint8_t* src,
int* yofs = buf + w_out; // new int[h]; int* yofs = buf + w_out; // new int[h];
int16_t* ialpha = int16_t* ialpha =
reinterpret_cast<int16_t*>(buf + w_out + h_out); // new short[w * 2]; reinterpret_cast<int16_t*>(buf + w_out + h_out); // new int16_t[w * 2];
int16_t* ibeta = int16_t* ibeta = reinterpret_cast<int16_t*>(buf + w_out * 2 +
reinterpret_cast<int16_t*>(buf + w_out * 2 + h_out); // new short[h * 2]; h_out); // new int16_t[h * 2];
float fx = 0.f; float fx = 0.f;
float fy = 0.f; float fy = 0.f;
int sx = 0; int sx = 0.f;
int sy = 0; int sy = 0.f;
#define SATURATE_CAST_SHORT(X) \ #define SATURATE_CAST_SHORT(X) \
(int16_t)::std::min( \ (int16_t)::std::min( \
::std::max(static_cast<int>(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), \ ::std::max(static_cast<int>(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), \
SHRT_MAX); SHRT_MAX);
for (int dx = 0; dx < w_out; dx++) { for (int dx = 0; dx < w_out / 2; dx++) {
fx = static_cast<float>((dx + 0.5) * scale_x - 0.5); fx = static_cast<float>((dx + 0.5) * scale_x - 0.5);
sx = floor(fx); sx = floor(fx);
fx -= sx; fx -= sx;
...@@ -334,6 +379,7 @@ void resize_one_channel(const uint8_t* src, ...@@ -334,6 +379,7 @@ void resize_one_channel(const uint8_t* src,
ibeta[dy * 2] = SATURATE_CAST_SHORT(b0); ibeta[dy * 2] = SATURATE_CAST_SHORT(b0);
ibeta[dy * 2 + 1] = SATURATE_CAST_SHORT(b1); ibeta[dy * 2 + 1] = SATURATE_CAST_SHORT(b1);
} }
#undef SATURATE_CAST_SHORT #undef SATURATE_CAST_SHORT
// loop body // loop body
int16_t* rowsbuf0 = new int16_t[w_out + 1]; int16_t* rowsbuf0 = new int16_t[w_out + 1];
...@@ -344,22 +390,23 @@ void resize_one_channel(const uint8_t* src, ...@@ -344,22 +390,23 @@ void resize_one_channel(const uint8_t* src,
int prev_sy1 = -1; int prev_sy1 = -1;
for (int dy = 0; dy < h_out; dy++) { for (int dy = 0; dy < h_out; dy++) {
int sy = yofs[dy]; int sy = yofs[dy];
if (sy == prev_sy1) { if (sy == prev_sy1) {
// hresize one row // hresize one row
int16_t* rows0_old = rows0; int16_t* rows0_old = rows0;
rows0 = rows1; rows0 = rows1;
rows1 = rows0_old; rows1 = rows0_old;
const uint8_t* S1 = src + w_in * (sy + 1); const uint8_t* S1 = src + w_in * (sy + 1);
const int16_t* ialphap = ialpha; const int16_t* ialphap = ialpha;
int16_t* rows1p = rows1; int16_t* rows1p = rows1;
for (int dx = 0; dx < w_out; dx++) { for (int dx = 0; dx < w_out / 2; dx++) {
int sx = xofs[dx]; int sx = xofs[dx] * 2;
int16_t a0 = ialphap[0]; int16_t a0 = ialphap[0];
int16_t a1 = ialphap[1]; int16_t a1 = ialphap[1];
const uint8_t* S1p = S1 + sx; const uint8_t* S1p = S1 + sx;
rows1p[dx] = (S1p[0] * a0 + S1p[1] * a1) >> 4; int tmp = dx * 2;
rows1p[tmp] = (S1p[0] * a0 + S1p[2] * a1) >> 4;
rows1p[tmp + 1] = (S1p[1] * a0 + S1p[3] * a1) >> 4;
ialphap += 2; ialphap += 2;
} }
...@@ -371,20 +418,22 @@ void resize_one_channel(const uint8_t* src, ...@@ -371,20 +418,22 @@ void resize_one_channel(const uint8_t* src,
const int16_t* ialphap = ialpha; const int16_t* ialphap = ialpha;
int16_t* rows0p = rows0; int16_t* rows0p = rows0;
int16_t* rows1p = rows1; int16_t* rows1p = rows1;
for (int dx = 0; dx < w_out; dx++) { for (int dx = 0; dx < w_out / 2; dx++) {
int sx = xofs[dx]; int sx = xofs[dx] * 2;
int16_t a0 = ialphap[0]; int16_t a0 = ialphap[0];
int16_t a1 = ialphap[1]; int16_t a1 = ialphap[1];
const uint8_t* S0p = S0 + sx; const uint8_t* S0p = S0 + sx;
const uint8_t* S1p = S1 + sx; const uint8_t* S1p = S1 + sx;
rows0p[dx] = (S0p[0] * a0 + S0p[1] * a1) >> 4; int tmp = dx * 2;
rows1p[dx] = (S1p[0] * a0 + S1p[1] * a1) >> 4; rows0p[tmp] = (S0p[0] * a0 + S0p[2] * a1) >> 4;
rows1p[tmp] = (S1p[0] * a0 + S1p[2] * a1) >> 4;
rows0p[tmp + 1] = (S0p[1] * a0 + S0p[3] * a1) >> 4;
rows1p[tmp + 1] = (S1p[1] * a0 + S1p[3] * a1) >> 4;
ialphap += 2; ialphap += 2;
} }
} }
prev_sy1 = sy + 1; prev_sy1 = sy + 1;
// vresize // vresize
...@@ -400,7 +449,6 @@ void resize_one_channel(const uint8_t* src, ...@@ -400,7 +449,6 @@ void resize_one_channel(const uint8_t* src,
int16x4_t _b0 = vdup_n_s16(b0); int16x4_t _b0 = vdup_n_s16(b0);
int16x4_t _b1 = vdup_n_s16(b1); int16x4_t _b1 = vdup_n_s16(b1);
int32x4_t _v2 = vdupq_n_s32(2); int32x4_t _v2 = vdupq_n_s32(2);
for (cnt = w_out >> 3; cnt > 0; cnt--) { for (cnt = w_out >> 3; cnt > 0; cnt--) {
int16x4_t _rows0p_sr4 = vld1_s16(rows0p); int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
int16x4_t _rows1p_sr4 = vld1_s16(rows1p); int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
...@@ -413,14 +461,15 @@ void resize_one_channel(const uint8_t* src, ...@@ -413,14 +461,15 @@ void resize_one_channel(const uint8_t* src,
int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1); int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
int32x4_t _acc = _v2; int32x4_t _acc = _v2;
_acc = vsraq_n_s32(_acc, _rows0p_sr4_mb0, 16); _acc = vsraq_n_s32(
_acc, _rows0p_sr4_mb0, 16); // _acc >> 16 + _rows0p_sr4_mb0 >> 16
_acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16); _acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
int32x4_t _acc_1 = _v2; int32x4_t _acc_1 = _v2;
_acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16); _acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
_acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16); _acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
int16x4_t _acc16 = vshrn_n_s32(_acc, 2); int16x4_t _acc16 = vshrn_n_s32(_acc, 2); // _acc >> 2
int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2); int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
uint8x8_t _dout = vqmovun_s16(vcombine_s16(_acc16, _acc16_1)); uint8x8_t _dout = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));
...@@ -446,42 +495,35 @@ void resize_one_channel(const uint8_t* src, ...@@ -446,42 +495,35 @@ void resize_one_channel(const uint8_t* src,
delete[] rowsbuf1; delete[] rowsbuf1;
} }
void resize_one_channel_uv(const uint8_t* src, void resize_three_channel(const uint8_t* src,
int w_in, int w_in,
int h_in, int h_in,
uint8_t* dst, uint8_t* dst,
int w_out, int w_out,
int h_out) { int h_out) {
const int resize_coef_bits = 11; const int resize_coef_bits = 11;
const int resize_coef_scale = 1 << resize_coef_bits; const int resize_coef_scale = 1 << resize_coef_bits;
double scale_x = static_cast<double>(w_in) / w_out; double scale_x = static_cast<double>(w_in) / w_out;
double scale_y = static_cast<double>(h_in) / h_out; double scale_y = static_cast<double>(h_in) / h_out;
int* buf = new int[w_out * 2 + h_out * 2]; int* buf = new int[w_out * 2 + h_out * 2];
int* xofs = buf; // new int[w]; int* xofs = buf; // new int[w];
int* yofs = buf + w_out; // new int[h]; int* yofs = buf + w_out; // new int[h];
int16_t* ialpha = int16_t* ialpha =
reinterpret_cast<int16_t*>(buf + w_out + h_out); // new int16_t[w * 2]; reinterpret_cast<int16_t*>(buf + w_out + h_out); // new int16_t[w * 2];
int16_t* ibeta = reinterpret_cast<int16_t*>(buf + w_out * 2 + int16_t* ibeta =
h_out); // new int16_t[h * 2]; reinterpret_cast<int16_t*>(buf + w_out * 2 + h_out); // new short[h * 2];
float fx = 0.f; float fx = 0.f;
float fy = 0.f; float fy = 0.f;
int sx = 0.f; int sx = 0.f;
int sy = 0.f; int sy = 0.f;
#define SATURATE_CAST_SHORT(X) \ #define SATURATE_CAST_SHORT(X) \
(int16_t)::std::min( \ (int16_t)::std::min( \
::std::max(static_cast<int>(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), \ ::std::max(static_cast<int>(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), \
SHRT_MAX); SHRT_MAX);
for (int dx = 0; dx < w_out / 2; dx++) { for (int dx = 0; dx < w_out / 3; dx++) {
fx = static_cast<float>((dx + 0.5) * scale_x - 0.5); fx = static_cast<float>((dx + 0.5) * scale_x - 0.5);
sx = floor(fx); sx = floor(fx);
fx -= sx; fx -= sx;
if (sx < 0) { if (sx < 0) {
sx = 0; sx = 0;
fx = 0.f; fx = 0.f;
...@@ -490,12 +532,9 @@ void resize_one_channel_uv(const uint8_t* src, ...@@ -490,12 +532,9 @@ void resize_one_channel_uv(const uint8_t* src,
sx = w_in - 2; sx = w_in - 2;
fx = 1.f; fx = 1.f;
} }
xofs[dx] = sx * 3;
xofs[dx] = sx;
float a0 = (1.f - fx) * resize_coef_scale; float a0 = (1.f - fx) * resize_coef_scale;
float a1 = fx * resize_coef_scale; float a1 = fx * resize_coef_scale;
ialpha[dx * 2] = SATURATE_CAST_SHORT(a0); ialpha[dx * 2] = SATURATE_CAST_SHORT(a0);
ialpha[dx * 2 + 1] = SATURATE_CAST_SHORT(a1); ialpha[dx * 2 + 1] = SATURATE_CAST_SHORT(a1);
} }
...@@ -503,7 +542,6 @@ void resize_one_channel_uv(const uint8_t* src, ...@@ -503,7 +542,6 @@ void resize_one_channel_uv(const uint8_t* src,
fy = static_cast<float>((dy + 0.5) * scale_y - 0.5); fy = static_cast<float>((dy + 0.5) * scale_y - 0.5);
sy = floor(fy); sy = floor(fy);
fy -= sy; fy -= sy;
if (sy < 0) { if (sy < 0) {
sy = 0; sy = 0;
fy = 0.f; fy = 0.f;
...@@ -512,23 +550,18 @@ void resize_one_channel_uv(const uint8_t* src, ...@@ -512,23 +550,18 @@ void resize_one_channel_uv(const uint8_t* src,
sy = h_in - 2; sy = h_in - 2;
fy = 1.f; fy = 1.f;
} }
yofs[dy] = sy; yofs[dy] = sy;
float b0 = (1.f - fy) * resize_coef_scale; float b0 = (1.f - fy) * resize_coef_scale;
float b1 = fy * resize_coef_scale; float b1 = fy * resize_coef_scale;
ibeta[dy * 2] = SATURATE_CAST_SHORT(b0); ibeta[dy * 2] = SATURATE_CAST_SHORT(b0);
ibeta[dy * 2 + 1] = SATURATE_CAST_SHORT(b1); ibeta[dy * 2 + 1] = SATURATE_CAST_SHORT(b1);
} }
#undef SATURATE_CAST_SHORT #undef SATURATE_CAST_SHORT
// loop body // loop body
int16_t* rowsbuf0 = new int16_t[w_out + 1]; int16_t* rowsbuf0 = new int16_t[w_out + 1];
int16_t* rowsbuf1 = new int16_t[w_out + 1]; int16_t* rowsbuf1 = new int16_t[w_out + 1];
int16_t* rows0 = rowsbuf0; int16_t* rows0 = rowsbuf0;
int16_t* rows1 = rowsbuf1; int16_t* rows1 = rowsbuf1;
int prev_sy1 = -1; int prev_sy1 = -1;
for (int dy = 0; dy < h_out; dy++) { for (int dy = 0; dy < h_out; dy++) {
int sy = yofs[dy]; int sy = yofs[dy];
...@@ -538,54 +571,49 @@ void resize_one_channel_uv(const uint8_t* src, ...@@ -538,54 +571,49 @@ void resize_one_channel_uv(const uint8_t* src,
rows0 = rows1; rows0 = rows1;
rows1 = rows0_old; rows1 = rows0_old;
const uint8_t* S1 = src + w_in * (sy + 1); const uint8_t* S1 = src + w_in * (sy + 1);
const int16_t* ialphap = ialpha; const int16_t* ialphap = ialpha;
int16_t* rows1p = rows1; int16_t* rows1p = rows1;
for (int dx = 0; dx < w_out / 2; dx++) { for (int dx = 0; dx < w_out / 3; dx++) {
int sx = xofs[dx] * 2; int sx = xofs[dx];
int16_t a0 = ialphap[0]; int16_t a0 = ialphap[0];
int16_t a1 = ialphap[1]; int16_t a1 = ialphap[1];
const uint8_t* S1p = S1 + sx; const uint8_t* S1p = S1 + sx;
int tmp = dx * 2; int tmp = dx * 3;
rows1p[tmp] = (S1p[0] * a0 + S1p[2] * a1) >> 4; rows1p[tmp] = (S1p[0] * a0 + S1p[3] * a1) >> 4;
rows1p[tmp + 1] = (S1p[1] * a0 + S1p[3] * a1) >> 4; rows1p[tmp + 1] = (S1p[1] * a0 + S1p[4] * a1) >> 4;
rows1p[tmp + 2] = (S1p[2] * a0 + S1p[5] * a1) >> 4;
ialphap += 2; ialphap += 2;
} }
} else { } else {
// hresize two rows // hresize two rows
const uint8_t* S0 = src + w_in * (sy); const uint8_t* S0 = src + w_in * (sy);
const uint8_t* S1 = src + w_in * (sy + 1); const uint8_t* S1 = src + w_in * (sy + 1);
const int16_t* ialphap = ialpha; const int16_t* ialphap = ialpha;
int16_t* rows0p = rows0; int16_t* rows0p = rows0;
int16_t* rows1p = rows1; int16_t* rows1p = rows1;
for (int dx = 0; dx < w_out / 2; dx++) { for (int dx = 0; dx < w_out / 3; dx++) {
int sx = xofs[dx] * 2; int sx = xofs[dx];
int16_t a0 = ialphap[0]; int16_t a0 = ialphap[0];
int16_t a1 = ialphap[1]; int16_t a1 = ialphap[1];
const uint8_t* S0p = S0 + sx; const uint8_t* S0p = S0 + sx;
const uint8_t* S1p = S1 + sx; const uint8_t* S1p = S1 + sx;
int tmp = dx * 2; int tmp = dx * 3;
rows0p[tmp] = (S0p[0] * a0 + S0p[2] * a1) >> 4; rows0p[tmp] = (S0p[0] * a0 + S0p[3] * a1) >> 4;
rows1p[tmp] = (S1p[0] * a0 + S1p[2] * a1) >> 4; rows1p[tmp] = (S1p[0] * a0 + S1p[3] * a1) >> 4;
rows0p[tmp + 1] = (S0p[1] * a0 + S0p[4] * a1) >> 4;
rows0p[tmp + 1] = (S0p[1] * a0 + S0p[3] * a1) >> 4; rows1p[tmp + 1] = (S1p[1] * a0 + S1p[4] * a1) >> 4;
rows1p[tmp + 1] = (S1p[1] * a0 + S1p[3] * a1) >> 4; rows0p[tmp + 2] = (S0p[2] * a0 + S0p[5] * a1) >> 4;
rows1p[tmp + 2] = (S1p[2] * a0 + S1p[5] * a1) >> 4;
ialphap += 2; ialphap += 2;
} }
} }
prev_sy1 = sy + 1; prev_sy1 = sy + 1;
// vresize // vresize
int16_t b0 = ibeta[0]; int16_t b0 = ibeta[0];
int16_t b1 = ibeta[1]; int16_t b1 = ibeta[1];
int16_t* rows0p = rows0; int16_t* rows0p = rows0;
int16_t* rows1p = rows1; int16_t* rows1p = rows1;
uint8_t* dp_ptr = dst + w_out * (dy); uint8_t* dp_ptr = dst + w_out * (dy);
int cnt = w_out >> 3; int cnt = w_out >> 3;
int remain = w_out - (cnt << 3); int remain = w_out - (cnt << 3);
int16x4_t _b0 = vdup_n_s16(b0); int16x4_t _b0 = vdup_n_s16(b0);
...@@ -596,28 +624,21 @@ void resize_one_channel_uv(const uint8_t* src, ...@@ -596,28 +624,21 @@ void resize_one_channel_uv(const uint8_t* src,
int16x4_t _rows1p_sr4 = vld1_s16(rows1p); int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p + 4); int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p + 4);
int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p + 4); int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p + 4);
int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0); int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0);
int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1); int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0); int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1); int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
int32x4_t _acc = _v2; int32x4_t _acc = _v2;
_acc = vsraq_n_s32( _acc = vsraq_n_s32(
_acc, _rows0p_sr4_mb0, 16); // _acc >> 16 + _rows0p_sr4_mb0 >> 16 _acc, _rows0p_sr4_mb0, 16); // _acc >> 16 + _rows0p_sr4_mb0 >> 16
_acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16); _acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
int32x4_t _acc_1 = _v2; int32x4_t _acc_1 = _v2;
_acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16); _acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
_acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16); _acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
int16x4_t _acc16 = vshrn_n_s32(_acc, 2); // _acc >> 2 int16x4_t _acc16 = vshrn_n_s32(_acc, 2); // _acc >> 2
int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2); int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
uint8x8_t _dout = vqmovun_s16(vcombine_s16(_acc16, _acc16_1)); uint8x8_t _dout = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));
vst1_u8(dp_ptr, _dout); vst1_u8(dp_ptr, _dout);
dp_ptr += 8; dp_ptr += 8;
rows0p += 8; rows0p += 8;
rows1p += 8; rows1p += 8;
...@@ -631,7 +652,172 @@ void resize_one_channel_uv(const uint8_t* src, ...@@ -631,7 +652,172 @@ void resize_one_channel_uv(const uint8_t* src,
} }
ibeta += 2; ibeta += 2;
} }
delete[] buf;
delete[] rowsbuf0;
delete[] rowsbuf1;
}
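The one/two/three/four-channel variants above differ only in how the horizontal pass strides: `xofs` stores the left source column, pre-multiplied by the channel count in the three- and four-channel variants (the UV variant multiplies by 2 at the point of use), and each output sample interpolates `S[p + k]` against `S[p + channels + k]` for every channel `k`. A generic, hedged sketch of that inner loop; the repository keeps per-channel specialisations rather than a generic helper like this one:

```cpp
// Hedged sketch of the interleaved, c-channel horizontal resize pass shared by
// resize_one_channel_uv / resize_three_channel / resize_four_channel.
#include <cstdint>

void hresize_row_generic(const uint8_t* S, const int* xofs, const int16_t* ialpha,
                         int16_t* rows, int w_out, int channels) {
  // w_out is already width * channels, as in the repo helpers.
  for (int dx = 0; dx < w_out / channels; ++dx) {
    int sx = xofs[dx];  // left column offset, pre-multiplied by `channels`
    int16_t a0 = ialpha[2 * dx];
    int16_t a1 = ialpha[2 * dx + 1];
    const uint8_t* Sp = S + sx;
    for (int k = 0; k < channels; ++k) {
      rows[dx * channels + k] =
          static_cast<int16_t>((Sp[k] * a0 + Sp[channels + k] * a1) >> 4);
    }
  }
}
```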
void resize_four_channel(const uint8_t* src,
int w_in,
int h_in,
uint8_t* dst,
int w_out,
int h_out) {
const int resize_coef_bits = 11;
const int resize_coef_scale = 1 << resize_coef_bits;
double scale_x = static_cast<double>(w_in) / w_out;
double scale_y = static_cast<double>(h_in) / h_out;
int* buf = new int[w_out * 2 + h_out * 2];
int* xofs = buf; // new int[w];
int* yofs = buf + w_out; // new int[h];
int16_t* ialpha =
reinterpret_cast<int16_t*>(buf + w_out + h_out); // new int16_t[w * 2];
int16_t* ibeta =
reinterpret_cast<int16_t*>(buf + w_out * 2 + h_out); // new short[h * 2];
float fx = 0.f;
float fy = 0.f;
int sx = 0.f;
int sy = 0.f;
#define SATURATE_CAST_SHORT(X) \
(int16_t)::std::min( \
::std::max(static_cast<int>(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), \
SHRT_MAX);
for (int dx = 0; dx < w_out / 4; dx++) {
fx = static_cast<float>((dx + 0.5) * scale_x - 0.5);
sx = floor(fx);
fx -= sx;
if (sx < 0) {
sx = 0;
fx = 0.f;
}
if (sx >= w_in - 1) {
sx = w_in - 2;
fx = 1.f;
}
xofs[dx] = sx * 4;
float a0 = (1.f - fx) * resize_coef_scale;
float a1 = fx * resize_coef_scale;
ialpha[dx * 2] = SATURATE_CAST_SHORT(a0);
ialpha[dx * 2 + 1] = SATURATE_CAST_SHORT(a1);
}
for (int dy = 0; dy < h_out; dy++) {
fy = static_cast<float>((dy + 0.5) * scale_y - 0.5);
sy = floor(fy);
fy -= sy;
if (sy < 0) {
sy = 0;
fy = 0.f;
}
if (sy >= h_in - 1) {
sy = h_in - 2;
fy = 1.f;
}
yofs[dy] = sy;
float b0 = (1.f - fy) * resize_coef_scale;
float b1 = fy * resize_coef_scale;
ibeta[dy * 2] = SATURATE_CAST_SHORT(b0);
ibeta[dy * 2 + 1] = SATURATE_CAST_SHORT(b1);
}
#undef SATURATE_CAST_SHORT
// loop body
int16_t* rowsbuf0 = new int16_t[w_out + 1];
int16_t* rowsbuf1 = new int16_t[w_out + 1];
int16_t* rows0 = rowsbuf0;
int16_t* rows1 = rowsbuf1;
int prev_sy1 = -1;
for (int dy = 0; dy < h_out; dy++) {
int sy = yofs[dy];
if (sy == prev_sy1) {
// hresize one row
int16_t* rows0_old = rows0;
rows0 = rows1;
rows1 = rows0_old;
const uint8_t* S1 = src + w_in * (sy + 1);
const int16_t* ialphap = ialpha;
int16_t* rows1p = rows1;
for (int dx = 0; dx < w_out / 4; dx++) {
int sx = xofs[dx];
int16_t a0 = ialphap[0];
int16_t a1 = ialphap[1];
const uint8_t* S1p = S1 + sx;
int tmp = dx * 4;
rows1p[tmp] = (S1p[0] * a0 + S1p[4] * a1) >> 4;
rows1p[tmp + 1] = (S1p[1] * a0 + S1p[5] * a1) >> 4;
rows1p[tmp + 2] = (S1p[2] * a0 + S1p[6] * a1) >> 4;
rows1p[tmp + 3] = (S1p[3] * a0 + S1p[7] * a1) >> 4;
ialphap += 2;
}
} else {
// hresize two rows
const uint8_t* S0 = src + w_in * (sy);
const uint8_t* S1 = src + w_in * (sy + 1);
const int16_t* ialphap = ialpha;
int16_t* rows0p = rows0;
int16_t* rows1p = rows1;
for (int dx = 0; dx < w_out / 4; dx++) {
int sx = xofs[dx];
int16_t a0 = ialphap[0];
int16_t a1 = ialphap[1];
const uint8_t* S0p = S0 + sx;
const uint8_t* S1p = S1 + sx;
int tmp = dx * 4;
rows0p[tmp] = (S0p[0] * a0 + S0p[4] * a1) >> 4;
rows1p[tmp] = (S1p[0] * a0 + S1p[4] * a1) >> 4;
rows0p[tmp + 1] = (S0p[1] * a0 + S0p[5] * a1) >> 4;
rows1p[tmp + 1] = (S1p[1] * a0 + S1p[5] * a1) >> 4;
rows0p[tmp + 2] = (S0p[2] * a0 + S0p[6] * a1) >> 4;
rows1p[tmp + 2] = (S1p[2] * a0 + S1p[6] * a1) >> 4;
rows0p[tmp + 3] = (S0p[3] * a0 + S0p[7] * a1) >> 4;
rows1p[tmp + 3] = (S1p[3] * a0 + S1p[7] * a1) >> 4;
ialphap += 2;
}
}
prev_sy1 = sy + 1;
// vresize
int16_t b0 = ibeta[0];
int16_t b1 = ibeta[1];
int16_t* rows0p = rows0;
int16_t* rows1p = rows1;
uint8_t* dp_ptr = dst + w_out * (dy);
int cnt = w_out >> 3;
int remain = w_out - (cnt << 3);
int16x4_t _b0 = vdup_n_s16(b0);
int16x4_t _b1 = vdup_n_s16(b1);
int32x4_t _v2 = vdupq_n_s32(2);
for (cnt = w_out >> 3; cnt > 0; cnt--) {
int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p + 4);
int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p + 4);
int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0);
int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
int32x4_t _acc = _v2;
// _acc >> 16 + _rows0p_sr4_mb0 >> 16
_acc = vsraq_n_s32(_acc, _rows0p_sr4_mb0, 16);
_acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
int32x4_t _acc_1 = _v2;
_acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
_acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
// _acc >> 2
int16x4_t _acc16 = vshrn_n_s32(_acc, 2);
int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
uint8x8_t _dout = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));
vst1_u8(dp_ptr, _dout);
dp_ptr += 8;
rows0p += 8;
rows1p += 8;
}
for (; remain; --remain) {
// D[x] = (rows0[x]*b0 + rows1[x]*b1) >> INTER_RESIZE_COEF_BITS;
*dp_ptr++ =
(uint8_t)(((int16_t)((b0 * (int16_t)(*rows0p++)) >> 16) +
(int16_t)((b1 * (int16_t)(*rows1p++)) >> 16) + 2) >>
2);
}
ibeta += 2;
}
delete[] buf; delete[] buf;
delete[] rowsbuf0; delete[] rowsbuf0;
delete[] rowsbuf1; delete[] rowsbuf1;
...@@ -648,6 +834,7 @@ void compute_xy(int srcw, ...@@ -648,6 +834,7 @@ void compute_xy(int srcw,
int* yofs, int* yofs,
int16_t* ialpha, int16_t* ialpha,
int16_t* ibeta); int16_t* ibeta);
// use bilinear method to resize // use bilinear method to resize
void resize(const uint8_t* src, void resize(const uint8_t* src,
uint8_t* dst, uint8_t* dst,
...@@ -682,9 +869,8 @@ void resize(const uint8_t* src, ...@@ -682,9 +869,8 @@ void resize(const uint8_t* src,
bgr_resize(src, dst, srcw, srch, dstw, dsth); bgr_resize(src, dst, srcw, srch, dstw, dsth);
return; return;
} else if (srcFormat == BGRA || srcFormat == RGBA) { } else if (srcFormat == BGRA || srcFormat == RGBA) {
w_in = srcw * 4; bgra_resize(src, dst, srcw, srch, dstw, dsth);
w_out = dstw * 4; return;
num = 4;
} }
double scale_x = static_cast<double>(srcw) / dstw; double scale_x = static_cast<double>(srcw) / dstw;
double scale_y = static_cast<double>(srch) / dsth; double scale_y = static_cast<double>(srch) / dsth;
...@@ -701,23 +887,6 @@ void resize(const uint8_t* src, ...@@ -701,23 +887,6 @@ void resize(const uint8_t* src,
int* xofs1 = nullptr; int* xofs1 = nullptr;
int* yofs1 = nullptr; int* yofs1 = nullptr;
int16_t* ialpha1 = nullptr; int16_t* ialpha1 = nullptr;
if (orih < dsth) { // uv
int tmp = dsth - orih;
xofs1 = new int[dstw];
yofs1 = new int[tmp];
ialpha1 = new int16_t[dstw];
compute_xy(srcw,
srch / 2,
dstw / 2,
tmp,
2,
scale_x,
scale_y,
xofs1,
yofs1,
ialpha1,
ibeta + orih * 2);
}
int cnt = w_out >> 3; int cnt = w_out >> 3;
int remain = w_out % 8; int remain = w_out % 8;
int32x4_t _v2 = vdupq_n_s32(2); int32x4_t _v2 = vdupq_n_s32(2);
...@@ -727,13 +896,6 @@ void resize(const uint8_t* src, ...@@ -727,13 +896,6 @@ void resize(const uint8_t* src,
#pragma omp parallel for #pragma omp parallel for
for (int dy = 0; dy < dsth; dy++) { for (int dy = 0; dy < dsth; dy++) {
int sy = yofs[dy]; int sy = yofs[dy];
if (dy >= orih) {
xofs = xofs1;
yofs = yofs1;
ialpha = ialpha1;
num = 2;
sy = yofs1[dy - orih] + srch;
}
// hresize two rows // hresize two rows
const uint8_t* S0 = src + w_in * (sy); const uint8_t* S0 = src + w_in * (sy);
...@@ -850,11 +1012,6 @@ void resize(const uint8_t* src, ...@@ -850,11 +1012,6 @@ void resize(const uint8_t* src,
} }
ibeta += 2; ibeta += 2;
} }
if (orih < dsth) { // uv
delete[] xofs1;
delete[] yofs1;
delete[] ialpha1;
}
delete[] buf; delete[] buf;
delete[] rowsbuf0; delete[] rowsbuf0;
delete[] rowsbuf1; delete[] rowsbuf1;
......
...@@ -39,7 +39,7 @@ ImagePreprocess::ImagePreprocess(ImageFormat srcFormat, ...@@ -39,7 +39,7 @@ ImagePreprocess::ImagePreprocess(ImageFormat srcFormat,
this->dstFormat_ = dstFormat; this->dstFormat_ = dstFormat;
this->transParam_ = param; this->transParam_ = param;
} }
__attribute__((visibility("default"))) void ImagePreprocess::imageConvert( __attribute__((visibility("default"))) void ImagePreprocess::image_convert(
const uint8_t* src, uint8_t* dst) { const uint8_t* src, uint8_t* dst) {
ImageConvert img_convert; ImageConvert img_convert;
img_convert.choose(src, img_convert.choose(src,
...@@ -50,7 +50,7 @@ __attribute__((visibility("default"))) void ImagePreprocess::imageConvert( ...@@ -50,7 +50,7 @@ __attribute__((visibility("default"))) void ImagePreprocess::imageConvert(
this->transParam_.ih); this->transParam_.ih);
} }
__attribute__((visibility("default"))) void ImagePreprocess::imageConvert( __attribute__((visibility("default"))) void ImagePreprocess::image_convert(
const uint8_t* src, const uint8_t* src,
uint8_t* dst, uint8_t* dst,
ImageFormat srcFormat, ImageFormat srcFormat,
...@@ -64,7 +64,18 @@ __attribute__((visibility("default"))) void ImagePreprocess::imageConvert( ...@@ -64,7 +64,18 @@ __attribute__((visibility("default"))) void ImagePreprocess::imageConvert(
this->transParam_.ih); this->transParam_.ih);
} }
__attribute__((visibility("default"))) void ImagePreprocess::imageResize( __attribute__((visibility("default"))) void ImagePreprocess::image_convert(
const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
ImageFormat dstFormat,
int srcw,
int srch) {
ImageConvert img_convert;
img_convert.choose(src, dst, srcFormat, dstFormat, srcw, srch);
}
__attribute__((visibility("default"))) void ImagePreprocess::image_resize(
const uint8_t* src, const uint8_t* src,
uint8_t* dst, uint8_t* dst,
ImageFormat srcFormat, ImageFormat srcFormat,
...@@ -76,7 +87,7 @@ __attribute__((visibility("default"))) void ImagePreprocess::imageResize( ...@@ -76,7 +87,7 @@ __attribute__((visibility("default"))) void ImagePreprocess::imageResize(
img_resize.choose(src, dst, srcFormat, srcw, srch, dstw, dsth); img_resize.choose(src, dst, srcFormat, srcw, srch, dstw, dsth);
} }
__attribute__((visibility("default"))) void ImagePreprocess::imageResize( __attribute__((visibility("default"))) void ImagePreprocess::image_resize(
const uint8_t* src, uint8_t* dst) { const uint8_t* src, uint8_t* dst) {
int srcw = this->transParam_.iw; int srcw = this->transParam_.iw;
int srch = this->transParam_.ih; int srch = this->transParam_.ih;
...@@ -87,7 +98,7 @@ __attribute__((visibility("default"))) void ImagePreprocess::imageResize( ...@@ -87,7 +98,7 @@ __attribute__((visibility("default"))) void ImagePreprocess::imageResize(
img_resize.choose(src, dst, srcFormat, srcw, srch, dstw, dsth); img_resize.choose(src, dst, srcFormat, srcw, srch, dstw, dsth);
} }
__attribute__((visibility("default"))) void ImagePreprocess::imageRotate( __attribute__((visibility("default"))) void ImagePreprocess::image_rotate(
const uint8_t* src, const uint8_t* src,
uint8_t* dst, uint8_t* dst,
ImageFormat srcFormat, ImageFormat srcFormat,
...@@ -98,7 +109,7 @@ __attribute__((visibility("default"))) void ImagePreprocess::imageRotate( ...@@ -98,7 +109,7 @@ __attribute__((visibility("default"))) void ImagePreprocess::imageRotate(
img_rotate.choose(src, dst, srcFormat, srcw, srch, degree); img_rotate.choose(src, dst, srcFormat, srcw, srch, degree);
} }
__attribute__((visibility("default"))) void ImagePreprocess::imageRotate( __attribute__((visibility("default"))) void ImagePreprocess::image_rotate(
const uint8_t* src, uint8_t* dst) { const uint8_t* src, uint8_t* dst) {
auto srcw = this->transParam_.ow; auto srcw = this->transParam_.ow;
auto srch = this->transParam_.oh; auto srch = this->transParam_.oh;
...@@ -108,7 +119,7 @@ __attribute__((visibility("default"))) void ImagePreprocess::imageRotate( ...@@ -108,7 +119,7 @@ __attribute__((visibility("default"))) void ImagePreprocess::imageRotate(
img_rotate.choose(src, dst, srcFormat, srcw, srch, degree); img_rotate.choose(src, dst, srcFormat, srcw, srch, degree);
} }
__attribute__((visibility("default"))) void ImagePreprocess::imageFlip( __attribute__((visibility("default"))) void ImagePreprocess::image_flip(
const uint8_t* src, const uint8_t* src,
uint8_t* dst, uint8_t* dst,
ImageFormat srcFormat, ImageFormat srcFormat,
...@@ -119,7 +130,7 @@ __attribute__((visibility("default"))) void ImagePreprocess::imageFlip( ...@@ -119,7 +130,7 @@ __attribute__((visibility("default"))) void ImagePreprocess::imageFlip(
img_flip.choose(src, dst, srcFormat, srcw, srch, flip_param); img_flip.choose(src, dst, srcFormat, srcw, srch, flip_param);
} }
__attribute__((visibility("default"))) void ImagePreprocess::imageFlip( __attribute__((visibility("default"))) void ImagePreprocess::image_flip(
const uint8_t* src, uint8_t* dst) { const uint8_t* src, uint8_t* dst) {
auto srcw = this->transParam_.ow; auto srcw = this->transParam_.ow;
auto srch = this->transParam_.oh; auto srch = this->transParam_.oh;
...@@ -129,7 +140,7 @@ __attribute__((visibility("default"))) void ImagePreprocess::imageFlip( ...@@ -129,7 +140,7 @@ __attribute__((visibility("default"))) void ImagePreprocess::imageFlip(
img_flip.choose(src, dst, srcFormat, srcw, srch, flip_param); img_flip.choose(src, dst, srcFormat, srcw, srch, flip_param);
} }
__attribute__((visibility("default"))) void ImagePreprocess::image2Tensor( __attribute__((visibility("default"))) void ImagePreprocess::image_to_tensor(
const uint8_t* src, const uint8_t* src,
Tensor* dstTensor, Tensor* dstTensor,
ImageFormat srcFormat, ImageFormat srcFormat,
...@@ -143,7 +154,7 @@ __attribute__((visibility("default"))) void ImagePreprocess::image2Tensor( ...@@ -143,7 +154,7 @@ __attribute__((visibility("default"))) void ImagePreprocess::image2Tensor(
src, dstTensor, srcFormat, layout, srcw, srch, means, scales); src, dstTensor, srcFormat, layout, srcw, srch, means, scales);
} }
__attribute__((visibility("default"))) void ImagePreprocess::image2Tensor( __attribute__((visibility("default"))) void ImagePreprocess::image_to_tensor(
const uint8_t* src, const uint8_t* src,
Tensor* dstTensor, Tensor* dstTensor,
LayoutType layout, LayoutType layout,
...@@ -160,7 +171,7 @@ __attribute__((visibility("default"))) void ImagePreprocess::image2Tensor( ...@@ -160,7 +171,7 @@ __attribute__((visibility("default"))) void ImagePreprocess::image2Tensor(
scales); scales);
} }
__attribute__((visibility("default"))) void ImagePreprocess::imageCrop( __attribute__((visibility("default"))) void ImagePreprocess::image_crop(
const uint8_t* src, const uint8_t* src,
uint8_t* dst, uint8_t* dst,
ImageFormat srcFormat, ImageFormat srcFormat,
......
...@@ -75,7 +75,8 @@ class ImagePreprocess { ...@@ -75,7 +75,8 @@ class ImagePreprocess {
* param src: input image data * param src: input image data
* param dst: output image data * param dst: output image data
*/ */
void imageConvert(const uint8_t* src, uint8_t* dst); void image_convert(const uint8_t* src, uint8_t* dst);
/* /*
* image color convert * image color convert
* support NV12/NV21_to_BGR(RGB), NV12/NV21_to_BGRA(RGBA), * support NV12/NV21_to_BGR(RGB), NV12/NV21_to_BGRA(RGBA),
...@@ -91,10 +92,35 @@ class ImagePreprocess { ...@@ -91,10 +92,35 @@ class ImagePreprocess {
* param dstFormat: output image image format, support GRAY, BGR(RGB) and * param dstFormat: output image image format, support GRAY, BGR(RGB) and
* BGRA(RGBA) * BGRA(RGBA)
*/ */
void imageConvert(const uint8_t* src, void image_convert(const uint8_t* src,
uint8_t* dst, uint8_t* dst,
ImageFormat srcFormat, ImageFormat srcFormat,
ImageFormat dstFormat); ImageFormat dstFormat);
/*
* image color convert
* support NV12/NV21_to_BGR(RGB), NV12/NV21_to_BGRA(RGBA),
* BGR(RGB)and BGRA(RGBA) transform,
* BGR(RGB)and RGB(BGR) transform,
* BGR(RGB)and RGBA(BGRA) transform,
* BGR(RGB)and GRAY transform,
* BGRA(RGBA) and GRAY transform,
* param src: input image data
* param dst: output image data
* param srcFormat: input image image format support: GRAY, NV12(NV21),
* BGR(RGB) and BGRA(RGBA)
* param dstFormat: output image image format, support GRAY, BGR(RGB) and
* BGRA(RGBA)
* param srcw: input image width
* param srch: input image height
*/
void image_convert(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
ImageFormat dstFormat,
int srcw,
int srch);
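Because all three overloads write into a raw `dst` pointer, the caller is responsible for sizing the output buffer according to `dstFormat`: GRAY is 1 byte per pixel, BGR/RGB 3, BGRA/RGBA 4, and NV12/NV21 3/2 (a full-resolution Y plane plus a half-resolution interleaved UV plane). A small allocation sketch; the helper names are illustrative, not part of the API:

```cpp
// Illustrative destination-buffer sizing for image_convert (not library API).
#include <cstddef>
#include <cstdint>
#include <vector>

inline std::vector<uint8_t> alloc_bgr(int w, int h) {
  return std::vector<uint8_t>(static_cast<size_t>(w) * h * 3);      // BGR/RGB
}
inline std::vector<uint8_t> alloc_nv12(int w, int h) {
  return std::vector<uint8_t>(static_cast<size_t>(w) * h * 3 / 2);  // Y plane + half-res UV
}
```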
/* /*
* image resize, use bilinear method * image resize, use bilinear method
* support image format: 1-channel image (egs: GRAY, 2-channel image (egs: * support image format: 1-channel image (egs: GRAY, 2-channel image (egs:
...@@ -102,7 +128,8 @@ class ImagePreprocess { ...@@ -102,7 +128,8 @@ class ImagePreprocess {
* param src: input image data * param src: input image data
* param dst: output image data * param dst: output image data
*/ */
void imageResize(const uint8_t* src, uint8_t* dst); void image_resize(const uint8_t* src, uint8_t* dst);
/* /*
image resize, use bilinear method image resize, use bilinear method
* support image format: 1-channel image (egs: GRAY, 2-channel image (egs: * support image format: 1-channel image (egs: GRAY, 2-channel image (egs:
...@@ -114,13 +141,13 @@ class ImagePreprocess { ...@@ -114,13 +141,13 @@ class ImagePreprocess {
* param dstw: output image width * param dstw: output image width
* param dsth: output image height * param dsth: output image height
*/ */
void imageResize(const uint8_t* src, void image_resize(const uint8_t* src,
uint8_t* dst, uint8_t* dst,
ImageFormat srcFormat, ImageFormat srcFormat,
int srcw, int srcw,
int srch, int srch,
int dstw, int dstw,
int dsth); int dsth);
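A short usage sketch of the explicit-size overload, resizing a BGR frame down to a typical network input size; the sizes, include path, and enum spellings are illustrative assumptions:

```cpp
// Sketch: bilinear-resize a BGR image to 224x224.
#include <cstdint>
#include "lite/utils/cv/paddle_image_preprocess.h"  // include path assumed

using namespace paddle::lite::utils::cv;

void resize_bgr_to_224(const uint8_t* src, int srcw, int srch, uint8_t* dst) {
  TransParam tp;
  tp.iw = srcw; tp.ih = srch; tp.ow = 224; tp.oh = 224;
  ImagePreprocess pre(ImageFormat::BGR, ImageFormat::BGR, tp);
  // dst must already hold 224 * 224 * 3 bytes.
  pre.image_resize(src, dst, ImageFormat::BGR, srcw, srch, 224, 224);
}
```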
/* /*
* image Rotate * image Rotate
...@@ -129,7 +156,8 @@ class ImagePreprocess { ...@@ -129,7 +156,8 @@ class ImagePreprocess {
* param src: input image data * param src: input image data
* param dst: output image data * param dst: output image data
*/ */
void imageRotate(const uint8_t* src, uint8_t* dst); void image_rotate(const uint8_t* src, uint8_t* dst);
/* /*
* image Rotate * image Rotate
* support 90, 180 and 270 Rotate process * support 90, 180 and 270 Rotate process
...@@ -141,12 +169,13 @@ class ImagePreprocess { ...@@ -141,12 +169,13 @@ class ImagePreprocess {
* param srch: input image height * param srch: input image height
* param degree: Rotate degree, support 90, 180 and 270 * param degree: Rotate degree, support 90, 180 and 270
*/ */
void imageRotate(const uint8_t* src, void image_rotate(const uint8_t* src,
uint8_t* dst, uint8_t* dst,
ImageFormat srcFormat, ImageFormat srcFormat,
int srcw, int srcw,
int srch, int srch,
float degree); float degree);
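For 90° and 270° rotations the output image is `srch` wide and `srcw` tall, while the total byte count stays `srcw * srch * channels`. A hedged sketch (include path and enum spellings assumed):

```cpp
// Sketch: rotate a BGR image by 90 degrees; the result is srch wide, srcw tall.
#include <cstddef>
#include <cstdint>
#include <vector>
#include "lite/utils/cv/paddle_image_preprocess.h"  // include path assumed

using namespace paddle::lite::utils::cv;

std::vector<uint8_t> rotate90_bgr(const uint8_t* src, int srcw, int srch) {
  TransParam tp;
  tp.iw = srcw; tp.ih = srch; tp.ow = srcw; tp.oh = srch;
  ImagePreprocess pre(ImageFormat::BGR, ImageFormat::BGR, tp);
  std::vector<uint8_t> dst(static_cast<size_t>(srcw) * srch * 3);
  pre.image_rotate(src, dst.data(), ImageFormat::BGR, srcw, srch, 90.f);
  return dst;
}
```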
/* /*
* image Flip * image Flip
* support X, Y and XY flip process * support X, Y and XY flip process
...@@ -154,7 +183,8 @@ class ImagePreprocess { ...@@ -154,7 +183,8 @@ class ImagePreprocess {
* param src: input image data * param src: input image data
* param dst: output image data * param dst: output image data
*/ */
void imageFlip(const uint8_t* src, uint8_t* dst); void image_flip(const uint8_t* src, uint8_t* dst);
/* /*
* image Flip * image Flip
* support X, Y and XY flip process * support X, Y and XY flip process
...@@ -166,12 +196,13 @@ class ImagePreprocess { ...@@ -166,12 +196,13 @@ class ImagePreprocess {
* param srch: input image height * param srch: input image height
* param flip_param: flip parameter, support X, Y and XY * param flip_param: flip parameter, support X, Y and XY
*/ */
void imageFlip(const uint8_t* src, void image_flip(const uint8_t* src,
uint8_t* dst, uint8_t* dst,
ImageFormat srcFormat, ImageFormat srcFormat,
int srcw, int srcw,
int srch, int srch,
FlipParam flip_param); FlipParam flip_param);
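A hedged flip sketch; the `FlipParam` enumerator spellings (`X`/`Y`/`XY`) and which axis each one mirrors are assumptions to verify against the kernel:

```cpp
// Sketch: mirror a BGR image. FlipParam::Y is assumed to mean a left-right
// flip; check the enumerator names and axis convention against the kernel.
#include <cstdint>
#include "lite/utils/cv/paddle_image_preprocess.h"  // include path assumed

using namespace paddle::lite::utils::cv;

void mirror_bgr(const uint8_t* src, uint8_t* dst, int w, int h) {
  TransParam tp;
  tp.iw = w; tp.ih = h; tp.ow = w; tp.oh = h;
  ImagePreprocess pre(ImageFormat::BGR, ImageFormat::BGR, tp);
  pre.image_flip(src, dst, ImageFormat::BGR, w, h, FlipParam::Y);
}
```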
/* /*
* change image data to tensor data * change image data to tensor data
* support image format is GRAY, BGR(RGB) and BGRA(RGBA), Data layout is NHWC * support image format is GRAY, BGR(RGB) and BGRA(RGBA), Data layout is NHWC
...@@ -183,11 +214,12 @@ class ImagePreprocess { ...@@ -183,11 +214,12 @@ class ImagePreprocess {
* param means: means of image * param means: means of image
* param scales: scales of image * param scales: scales of image
*/ */
void image2Tensor(const uint8_t* src, void image_to_tensor(const uint8_t* src,
Tensor* dstTensor, Tensor* dstTensor,
LayoutType layout, LayoutType layout,
float* means, float* means,
float* scales); float* scales);
/* /*
* change image data to tensor data * change image data to tensor data
* support image format is GRAY, BGR(RGB) and BGRA(RGBA), Data layout is NHWC * support image format is GRAY, BGR(RGB) and BGRA(RGBA), Data layout is NHWC
...@@ -202,14 +234,14 @@ class ImagePreprocess { ...@@ -202,14 +234,14 @@ class ImagePreprocess {
* param means: means of image * param means: means of image
* param scales: scales of image * param scales: scales of image
*/ */
void image2Tensor(const uint8_t* src, void image_to_tensor(const uint8_t* src,
Tensor* dstTensor, Tensor* dstTensor,
ImageFormat srcFormat, ImageFormat srcFormat,
int srcw, int srcw,
int srch, int srch,
LayoutType layout, LayoutType layout,
float* means, float* means,
float* scales); float* scales);
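A hedged sketch of the tensor conversion; the `LayoutType::kNCHW` spelling, the means/scales values, and the exact normalization the kernel applies (commonly `(pixel - mean) * scale` per channel) are assumptions:

```cpp
// Sketch: convert a 224x224 BGR image into an NCHW float tensor. The
// means/scales values map roughly to [-1, 1] and are purely illustrative.
#include <cstdint>
#include "lite/utils/cv/paddle_image_preprocess.h"  // include path assumed

using namespace paddle::lite::utils::cv;

void bgr_to_tensor(const uint8_t* bgr224, Tensor* out) {
  TransParam tp;
  tp.iw = 224; tp.ih = 224; tp.ow = 224; tp.oh = 224;
  ImagePreprocess pre(ImageFormat::BGR, ImageFormat::BGR, tp);
  float means[3] = {127.5f, 127.5f, 127.5f};
  float scales[3] = {1.f / 127.5f, 1.f / 127.5f, 1.f / 127.5f};
  pre.image_to_tensor(bgr224, out, ImageFormat::BGR, 224, 224,
                      LayoutType::kNCHW, means, scales);  // enumerator spelling assumed
}
```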
/* /*
* image crop process * image crop process
...@@ -217,15 +249,15 @@ class ImagePreprocess { ...@@ -217,15 +249,15 @@ class ImagePreprocess {
* param src: input image data * param src: input image data
* param dst: output image data * param dst: output image data
*/ */
void imageCrop(const uint8_t* src, void image_crop(const uint8_t* src,
uint8_t* dst, uint8_t* dst,
ImageFormat srcFormat, ImageFormat srcFormat,
int srcw, int srcw,
int srch, int srch,
int left_x, int left_x,
int left_y, int left_y,
int dstw, int dstw,
int dsth); int dsth);
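A hedged crop sketch; the crop window is assumed to lie fully inside the source image, and the destination buffer holds `dstw * dsth * channels` bytes:

```cpp
// Sketch: crop a 300x300 BGR window whose top-left corner is (50, 50).
#include <cstddef>
#include <cstdint>
#include <vector>
#include "lite/utils/cv/paddle_image_preprocess.h"  // include path assumed

using namespace paddle::lite::utils::cv;

std::vector<uint8_t> crop_roi(const uint8_t* src, int srcw, int srch) {
  TransParam tp;
  tp.iw = srcw; tp.ih = srch; tp.ow = 300; tp.oh = 300;
  ImagePreprocess pre(ImageFormat::BGR, ImageFormat::BGR, tp);
  std::vector<uint8_t> roi(300 * 300 * 3);  // dstw * dsth * 3 bytes for BGR
  pre.image_crop(src, roi.data(), ImageFormat::BGR, srcw, srch, 50, 50, 300, 300);
  return roi;
}
```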
private: private:
ImageFormat srcFormat_; ImageFormat srcFormat_;
......