Unverified · Commit 515f9a6a authored by HappyAngel, committed by GitHub

[arm] add cv unit_test (#4250)

add cv_ut. test=develop
add Anakin implementation
add image_profiler test
Parent 339c2e53
@@ -91,14 +91,24 @@ ImagePreprocess::ImagePreprocess(ImageFormat srcFormat, ImageFormat dstFormat, T
// Method 2
void ImagePreprocess::imageCovert(const uint8_t* src,
uint8_t* dst, ImageFormat srcFormat, ImageFormat dstFormat);
// Method 3
void ImagePreprocess::imageCovert(const uint8_t* src,
uint8_t* dst, ImageFormat srcFormat, ImageFormat dstFormat,
int srcw, int srch);
```
- For the first `imageCovert` interface, the default parameters come from member variables of the `ImagePreprocess` class, so the following members must be set when the `ImagePreprocess` object is constructed:
    - param srcFormat: the `srcFormat_` member of the `ImagePreprocess` class
    - param dstFormat: the `dstFormat_` member of the `ImagePreprocess` class
    - param srcw: the `iw` field of the `transParam_` member struct of the `ImagePreprocess` class
    - param srch: the `ih` field of the `transParam_` member struct of the `ImagePreprocess` class
- For the second `imageCovert` interface, the default size parameters likewise come from member variables, so the following members must be set when the object is constructed:
    - param srcw: the `iw` field of the `transParam_` member struct of the `ImagePreprocess` class
    - param srch: the `ih` field of the `transParam_` member struct of the `ImagePreprocess` class
- The third `imageCovert` interface can be used directly; a usage sketch of all three overloads follows.
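A minimal usage sketch of the three overloads (hypothetical sizes; the header path, the `TransParam` field names beyond `iw`/`ih`, and the two-argument form of the first overload are assumptions based on the list above, not taken from this diff):

```cpp
#include "lite/utils/cv/paddle_image_preprocess.h"  // assumed header location

using namespace paddle::lite::utils::cv;

void convert_demo(const uint8_t* src, uint8_t* dst) {
  TransParam tparam;
  tparam.iw = 1920;  // becomes the default srcw
  tparam.ih = 1080;  // becomes the default srch
  ImagePreprocess preprocess(ImageFormat::NV12, ImageFormat::BGR, tparam);

  // Method 1: formats and sizes all come from the members set above.
  preprocess.imageCovert(src, dst);
  // Method 2: formats passed explicitly; srcw/srch still read from transParam_.
  preprocess.imageCovert(src, dst, ImageFormat::NV12, ImageFormat::BGR);
  // Method 3: fully explicit, independent of the member state.
  preprocess.imageCovert(
      src, dst, ImageFormat::NV12, ImageFormat::BGR, 1920, 1080);
}
```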
### Resize
`Resize` supports the following color spaces: GRAY, NV12(NV21), RGB(BGR), and RGBA(BGRA).
add_subdirectory(kernels)
add_subdirectory(math)
add_subdirectory(cv)
add_subdirectory(cv/anakin)
add_subdirectory(api)
if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_MLU) AND LITE_WITH_ARM)
lite_cc_test(image_convert_test SRCS image_convert_test.cc DEPS paddle_cv_arm)
lite_cc_test(image_profiler_test SRCS image_profiler_test.cc DEPS paddle_cv_arm anakin_cv_arm)
endif()
if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_MLU) AND LITE_WITH_ARM)
lite_cc_library(anakin_cv_arm SRCS
bgr_resize.cc
bgr_flip_hwc.cc
bgr_rotate_hwc.cc
bgr_to_tensor_hwc.cc
bgra_resize.cc
bgra_flip_hwc.cc
bgra_rotate_hwc.cc
bgra_to_tensor_hwc.cc
cv_utils.cc
nv12_to_bgr.cc
nv12_to_bgra.cc
nv21_to_bgr.cc
nv21_to_bgra.cc
nv21_resize.cc
DEPS paddle_api place)
endif()
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <limits.h>
#include <math.h>
#include "lite/tests/cv/anakin/cv_utils.h"
void resize_three_channel(
const uint8_t* src, int w_in, int h_in, uint8_t* dst, int w_out, int h_out);
void bgr_resize(const uint8_t* src,
uint8_t* dst,
int w_in,
int h_in,
int w_out,
int h_out) {
if (w_out == w_in && h_out == h_in) {
memcpy(dst, src, sizeof(char) * w_in * h_in * 3);
return;
}
// treat each interleaved BGR row as a single channel of width w * 3
resize_three_channel(src, w_in * 3, h_in, dst, w_out * 3, h_out);
}
void resize_three_channel(const uint8_t* src,
int w_in,
int h_in,
uint8_t* dst,
int w_out,
int h_out) {
const int resize_coef_bits = 11;
const int resize_coef_scale = 1 << resize_coef_bits;
double scale_x = static_cast<double>(w_in) / w_out;
double scale_y = static_cast<double>(h_in) / h_out;
int* buf = new int[w_out * 2 + h_out * 2];
int* xofs = buf; // new int[w];
int* yofs = buf + w_out; // new int[h];
int16_t* ialpha =
reinterpret_cast<int16_t*>(buf + w_out + h_out); // new int16_t[w * 2];
int16_t* ibeta =
reinterpret_cast<int16_t*>(buf + w_out * 2 + h_out); // new short[h * 2];
float fx = 0.f;
float fy = 0.f;
  int sx = 0;
  int sy = 0;
#define SATURATE_CAST_SHORT(X) \
(int16_t)::std::min( \
::std::max(static_cast<int>(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), \
SHRT_MAX);
for (int dx = 0; dx < w_out / 3; dx++) {
fx = static_cast<float>((dx + 0.5) * scale_x - 0.5);
sx = floor(fx);
fx -= sx;
if (sx < 0) {
sx = 0;
fx = 0.f;
}
if (sx >= w_in - 1) {
sx = w_in - 2;
fx = 1.f;
}
xofs[dx] = sx * 3;
float a0 = (1.f - fx) * resize_coef_scale;
float a1 = fx * resize_coef_scale;
ialpha[dx * 2] = SATURATE_CAST_SHORT(a0);
ialpha[dx * 2 + 1] = SATURATE_CAST_SHORT(a1);
}
for (int dy = 0; dy < h_out; dy++) {
fy = static_cast<float>((dy + 0.5) * scale_y - 0.5);
sy = floor(fy);
fy -= sy;
if (sy < 0) {
sy = 0;
fy = 0.f;
}
if (sy >= h_in - 1) {
sy = h_in - 2;
fy = 1.f;
}
yofs[dy] = sy;
float b0 = (1.f - fy) * resize_coef_scale;
float b1 = fy * resize_coef_scale;
ibeta[dy * 2] = SATURATE_CAST_SHORT(b0);
ibeta[dy * 2 + 1] = SATURATE_CAST_SHORT(b1);
}
#undef SATURATE_CAST_SHORT
// loop body
int16_t* rowsbuf0 = new int16_t[w_out + 1];
int16_t* rowsbuf1 = new int16_t[w_out + 1];
int16_t* rows0 = rowsbuf0;
int16_t* rows1 = rowsbuf1;
int prev_sy1 = -1;
for (int dy = 0; dy < h_out; dy++) {
int sy = yofs[dy];
if (sy == prev_sy1) {
// hresize one row
int16_t* rows0_old = rows0;
rows0 = rows1;
rows1 = rows0_old;
const uint8_t* S1 = src + w_in * (sy + 1);
const int16_t* ialphap = ialpha;
int16_t* rows1p = rows1;
for (int dx = 0; dx < w_out / 3; dx++) {
int sx = xofs[dx];
int16_t a0 = ialphap[0];
int16_t a1 = ialphap[1];
const uint8_t* S1p = S1 + sx;
int tmp = dx * 3;
rows1p[tmp] = (S1p[0] * a0 + S1p[3] * a1) >> 4;
rows1p[tmp + 1] = (S1p[1] * a0 + S1p[4] * a1) >> 4;
rows1p[tmp + 2] = (S1p[2] * a0 + S1p[5] * a1) >> 4;
ialphap += 2;
}
} else {
// hresize two rows
const uint8_t* S0 = src + w_in * (sy);
const uint8_t* S1 = src + w_in * (sy + 1);
const int16_t* ialphap = ialpha;
int16_t* rows0p = rows0;
int16_t* rows1p = rows1;
for (int dx = 0; dx < w_out / 3; dx++) {
int sx = xofs[dx];
int16_t a0 = ialphap[0];
int16_t a1 = ialphap[1];
const uint8_t* S0p = S0 + sx;
const uint8_t* S1p = S1 + sx;
int tmp = dx * 3;
rows0p[tmp] = (S0p[0] * a0 + S0p[3] * a1) >> 4;
rows1p[tmp] = (S1p[0] * a0 + S1p[3] * a1) >> 4;
rows0p[tmp + 1] = (S0p[1] * a0 + S0p[4] * a1) >> 4;
rows1p[tmp + 1] = (S1p[1] * a0 + S1p[4] * a1) >> 4;
rows0p[tmp + 2] = (S0p[2] * a0 + S0p[5] * a1) >> 4;
rows1p[tmp + 2] = (S1p[2] * a0 + S1p[5] * a1) >> 4;
ialphap += 2;
}
}
prev_sy1 = sy + 1;
// vresize
int16_t b0 = ibeta[0];
int16_t b1 = ibeta[1];
int16_t* rows0p = rows0;
int16_t* rows1p = rows1;
uint8_t* dp_ptr = dst + w_out * (dy);
int cnt = w_out >> 3;
int remain = w_out - (cnt << 3);
int16x4_t _b0 = vdup_n_s16(b0);
int16x4_t _b1 = vdup_n_s16(b1);
int32x4_t _v2 = vdupq_n_s32(2);
for (cnt = w_out >> 3; cnt > 0; cnt--) {
int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p + 4);
int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p + 4);
int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0);
int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
int32x4_t _acc = _v2;
_acc = vsraq_n_s32(
_acc, _rows0p_sr4_mb0, 16); // _acc >> 16 + _rows0p_sr4_mb0 >> 16
_acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
int32x4_t _acc_1 = _v2;
_acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
_acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
int16x4_t _acc16 = vshrn_n_s32(_acc, 2); // _acc >> 2
int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
uint8x8_t _dout = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));
vst1_u8(dp_ptr, _dout);
dp_ptr += 8;
rows0p += 8;
rows1p += 8;
}
for (; remain; --remain) {
// D[x] = (rows0[x]*b0 + rows1[x]*b1) >> INTER_RESIZE_COEF_BITS;
*dp_ptr++ =
(uint8_t)(((int16_t)((b0 * (int16_t)(*rows0p++)) >> 16) +
(int16_t)((b1 * (int16_t)(*rows1p++)) >> 16) + 2) >>
2);
}
ibeta += 2;
}
delete[] buf;
delete[] rowsbuf0;
delete[] rowsbuf1;
}
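// ---------------------------------------------------------------------------
// Sanity-check sketch (standalone illustration, not part of this commit):
// the 11-bit coefficients sum to 2^11; the horizontal pass drops 4 bits,
// holding the row buffers at scale 2^7; vsraq_n_s32(..., 16) drops 16 more,
// leaving scale 2^2; the final "+2, >>2" rounds the last two fractional bits.
#ifdef CV_UTILS_RESIZE_DEMO  // hypothetical guard so the sketch never builds
#include <cstdio>
int main() {
  const int coef_scale = 1 << 11;  // matches resize_coef_scale above
  float fx = 0.25f, fy = 0.75f;    // fractional source offsets
  int16_t a0 = (1.f - fx) * coef_scale, a1 = fx * coef_scale;
  int16_t b0 = (1.f - fy) * coef_scale, b1 = fy * coef_scale;
  uint8_t p00 = 10, p01 = 50, p10 = 30, p11 = 90;  // 2x2 source neighborhood
  int16_t r0 = (p00 * a0 + p01 * a1) >> 4;  // horizontal pass, scale 2^7
  int16_t r1 = (p10 * a0 + p11 * a1) >> 4;
  uint8_t out = (uint8_t)((((r0 * b0) >> 16) + ((r1 * b1) >> 16) + 2) >> 2);
  // Float bilinear gives 38.75; the fixed-point path rounds to 39.
  printf("fixed-point: %d\n", out);
  return 0;
}
#endif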
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/tests/cv/anakin/cv_utils.h"
void bgr_to_tensor_hwc(const uint8_t* bgr,
Tensor& output, // NOLINT
int width,
int height,
float* means,
float* scales) {
int size = width * height;
float* ptr0 = output.mutable_data<float>();
float r_means = means[0];
float g_means = means[1];
float b_means = means[2];
float r_scales = scales[0];
float g_scales = scales[1];
float b_scales = scales[2];
int w = width;
int dim8 = w >> 3;
int remain = w - (dim8 << 3);
float32x4_t vrmean = vdupq_n_f32(r_means);
float32x4_t vgmean = vdupq_n_f32(g_means);
float32x4_t vbmean = vdupq_n_f32(b_means);
float32x4_t vrscale = vdupq_n_f32(r_scales);
float32x4_t vgscale = vdupq_n_f32(g_scales);
float32x4_t vbscale = vdupq_n_f32(b_scales);
for (int i = 0; i < height; i++) {
const uint8_t* ptr_bgr = bgr + i * width * 3;
float* ptr0_b = ptr0 + i * width;
float* ptr1_g = ptr0_b + size;
float* ptr2_r = ptr1_g + size;
for (int j = 0; j < dim8; j++) {
uint8x8x3_t vbgr = vld3_u8(ptr_bgr);
uint8x8_t vb = vbgr.val[0];
uint8x8_t vg = vbgr.val[1];
uint8x8_t vr = vbgr.val[2];
uint16x8_t vb_16 = vmovl_u8(vb);
uint16x8_t vg_16 = vmovl_u8(vg);
uint16x8_t vr_16 = vmovl_u8(vr);
uint32x4_t vb_low_32 = vmovl_u16(vget_low_u16(vb_16));
uint32x4_t vg_low_32 = vmovl_u16(vget_low_u16(vg_16));
uint32x4_t vr_low_32 = vmovl_u16(vget_low_u16(vr_16));
uint32x4_t vb_high_32 = vmovl_u16(vget_high_u16(vb_16));
uint32x4_t vg_high_32 = vmovl_u16(vget_high_u16(vg_16));
uint32x4_t vr_high_32 = vmovl_u16(vget_high_u16(vr_16));
float32x4_t vb_low_f32 = vcvtq_f32_u32(vb_low_32);
float32x4_t vr_low_f32 = vcvtq_f32_u32(vr_low_32);
float32x4_t vg_low_f32 = vcvtq_f32_u32(vg_low_32);
float32x4_t vb_high_f32 = vcvtq_f32_u32(vb_high_32);
float32x4_t vg_high_f32 = vcvtq_f32_u32(vg_high_32);
float32x4_t vr_high_f32 = vcvtq_f32_u32(vr_high_32);
vb_low_f32 = vsubq_f32(vb_low_f32, vbmean);
vg_low_f32 = vsubq_f32(vg_low_f32, vgmean);
vr_low_f32 = vsubq_f32(vr_low_f32, vrmean);
vb_high_f32 = vsubq_f32(vb_high_f32, vbmean);
vg_high_f32 = vsubq_f32(vg_high_f32, vgmean);
vr_high_f32 = vsubq_f32(vr_high_f32, vrmean);
vb_low_f32 = vmulq_f32(vb_low_f32, vbscale);
vg_low_f32 = vmulq_f32(vg_low_f32, vgscale);
vr_low_f32 = vmulq_f32(vr_low_f32, vrscale);
vb_high_f32 = vmulq_f32(vb_high_f32, vbscale);
vg_high_f32 = vmulq_f32(vg_high_f32, vgscale);
vr_high_f32 = vmulq_f32(vr_high_f32, vrscale);
vst1q_f32(ptr0_b, vb_low_f32);
vst1q_f32(ptr1_g, vg_low_f32);
vst1q_f32(ptr2_r, vr_low_f32);
ptr_bgr += 24;
vst1q_f32(ptr0_b + 4, vb_high_f32);
vst1q_f32(ptr1_g + 4, vg_high_f32);
vst1q_f32(ptr2_r + 4, vr_high_f32);
ptr0_b += 8;
ptr1_g += 8;
ptr2_r += 8;
}
for (int j = 0; j < remain; j++) {
*ptr0_b++ = (*ptr_bgr - b_means) * b_scales; // NOLINT
ptr_bgr++;
*ptr1_g++ = (*ptr_bgr - g_means) * g_scales; // NOLINT
ptr_bgr++;
*ptr2_r++ = (*ptr_bgr - r_means) * r_scales; // NOLINT
ptr_bgr++;
}
}
}
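// Scalar reference for the kernel above — a sketch for unit-test comparison,
// not used by the library. Contract: output is planar CHW float with the B
// plane first (matching ptr0_b/ptr1_g/ptr2_r), and means/scales are ordered
// R, G, B. `out` must hold 3 * width * height floats.
void bgr_to_tensor_hwc_ref(const uint8_t* bgr,
                           float* out,
                           int width,
                           int height,
                           const float* means,
                           const float* scales) {
  int size = width * height;
  for (int i = 0; i < height; i++) {
    for (int j = 0; j < width; j++) {
      const uint8_t* p = bgr + (i * width + j) * 3;
      out[0 * size + i * width + j] = (p[0] - means[2]) * scales[2];  // B
      out[1 * size + i * width + j] = (p[1] - means[1]) * scales[1];  // G
      out[2 * size + i * width + j] = (p[2] - means[0]) * scales[0];  // R
    }
  }
}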
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <limits.h>
#include <math.h>
#include "lite/tests/cv/anakin/cv_utils.h"
void resize_four_channel(
const uint8_t* src, int w_in, int h_in, uint8_t* dst, int w_out, int h_out);
void bgra_resize(const uint8_t* src,
uint8_t* dst,
int w_in,
int h_in,
int w_out,
int h_out) {
if (w_out == w_in && h_out == h_in) {
memcpy(dst, src, sizeof(char) * w_in * h_in * 4);
return;
}
// treat each interleaved BGRA row as a single channel of width w * 4
resize_four_channel(src, w_in * 4, h_in, dst, w_out * 4, h_out);
}
void resize_four_channel(const uint8_t* src,
int w_in,
int h_in,
uint8_t* dst,
int w_out,
int h_out) {
const int resize_coef_bits = 11;
const int resize_coef_scale = 1 << resize_coef_bits;
double scale_x = static_cast<double>(w_in) / w_out;
double scale_y = static_cast<double>(h_in) / h_out;
int* buf = new int[w_out * 2 + h_out * 2];
int* xofs = buf; // new int[w];
int* yofs = buf + w_out; // new int[h];
int16_t* ialpha =
reinterpret_cast<int16_t*>(buf + w_out + h_out); // new int16_t[w * 2];
int16_t* ibeta = reinterpret_cast<int16_t*>(buf + w_out * 2 +
h_out); // new int16_t[h * 2];
float fx = 0.f;
float fy = 0.f;
  int sx = 0;
  int sy = 0;
#define SATURATE_CAST_int16_t(X) \
(int16_t)::std::min( \
::std::max(static_cast<int>(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), \
SHRT_MAX);
for (int dx = 0; dx < w_out / 4; dx++) {
fx = static_cast<float>((dx + 0.5) * scale_x - 0.5);
sx = floor(fx);
fx -= sx;
if (sx < 0) {
sx = 0;
fx = 0.f;
}
if (sx >= w_in - 1) {
sx = w_in - 2;
fx = 1.f;
}
xofs[dx] = sx * 4;
float a0 = (1.f - fx) * resize_coef_scale;
float a1 = fx * resize_coef_scale;
ialpha[dx * 2] = SATURATE_CAST_int16_t(a0);
ialpha[dx * 2 + 1] = SATURATE_CAST_int16_t(a1);
}
for (int dy = 0; dy < h_out; dy++) {
fy = static_cast<float>((dy + 0.5) * scale_y - 0.5);
sy = floor(fy);
fy -= sy;
if (sy < 0) {
sy = 0;
fy = 0.f;
}
if (sy >= h_in - 1) {
sy = h_in - 2;
fy = 1.f;
}
yofs[dy] = sy;
float b0 = (1.f - fy) * resize_coef_scale;
float b1 = fy * resize_coef_scale;
ibeta[dy * 2] = SATURATE_CAST_int16_t(b0);
ibeta[dy * 2 + 1] = SATURATE_CAST_int16_t(b1);
}
#undef SATURATE_CAST_int16_t
// loop body
int16_t* rowsbuf0 = new int16_t[w_out + 1];
int16_t* rowsbuf1 = new int16_t[w_out + 1];
int16_t* rows0 = rowsbuf0;
int16_t* rows1 = rowsbuf1;
int prev_sy1 = -1;
for (int dy = 0; dy < h_out; dy++) {
int sy = yofs[dy];
if (sy == prev_sy1) {
// hresize one row
int16_t* rows0_old = rows0;
rows0 = rows1;
rows1 = rows0_old;
const uint8_t* S1 = src + w_in * (sy + 1);
const int16_t* ialphap = ialpha;
int16_t* rows1p = rows1;
for (int dx = 0; dx < w_out / 4; dx++) {
int sx = xofs[dx];
int16_t a0 = ialphap[0];
int16_t a1 = ialphap[1];
const uint8_t* S1p = S1 + sx;
int tmp = dx * 4;
rows1p[tmp] = (S1p[0] * a0 + S1p[4] * a1) >> 4;
rows1p[tmp + 1] = (S1p[1] * a0 + S1p[5] * a1) >> 4;
rows1p[tmp + 2] = (S1p[2] * a0 + S1p[6] * a1) >> 4;
rows1p[tmp + 3] = (S1p[3] * a0 + S1p[7] * a1) >> 4;
ialphap += 2;
}
} else {
// hresize two rows
const uint8_t* S0 = src + w_in * (sy);
const uint8_t* S1 = src + w_in * (sy + 1);
const int16_t* ialphap = ialpha;
int16_t* rows0p = rows0;
int16_t* rows1p = rows1;
for (int dx = 0; dx < w_out / 4; dx++) {
int sx = xofs[dx];
int16_t a0 = ialphap[0];
int16_t a1 = ialphap[1];
const uint8_t* S0p = S0 + sx;
const uint8_t* S1p = S1 + sx;
int tmp = dx * 4;
rows0p[tmp] = (S0p[0] * a0 + S0p[4] * a1) >> 4;
rows1p[tmp] = (S1p[0] * a0 + S1p[4] * a1) >> 4;
rows0p[tmp + 1] = (S0p[1] * a0 + S0p[5] * a1) >> 4;
rows1p[tmp + 1] = (S1p[1] * a0 + S1p[5] * a1) >> 4;
rows0p[tmp + 2] = (S0p[2] * a0 + S0p[6] * a1) >> 4;
rows1p[tmp + 2] = (S1p[2] * a0 + S1p[6] * a1) >> 4;
rows0p[tmp + 3] = (S0p[3] * a0 + S0p[7] * a1) >> 4;
rows1p[tmp + 3] = (S1p[3] * a0 + S1p[7] * a1) >> 4;
ialphap += 2;
}
}
prev_sy1 = sy + 1;
// vresize
int16_t b0 = ibeta[0];
int16_t b1 = ibeta[1];
int16_t* rows0p = rows0;
int16_t* rows1p = rows1;
uint8_t* dp_ptr = dst + w_out * (dy);
int cnt = w_out >> 3;
int remain = w_out - (cnt << 3);
int16x4_t _b0 = vdup_n_s16(b0);
int16x4_t _b1 = vdup_n_s16(b1);
int32x4_t _v2 = vdupq_n_s32(2);
for (cnt = w_out >> 3; cnt > 0; cnt--) {
int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p + 4);
int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p + 4);
int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0);
int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
int32x4_t _acc = _v2;
_acc = vsraq_n_s32(
_acc, _rows0p_sr4_mb0, 16); // _acc >> 16 + _rows0p_sr4_mb0 >> 16
_acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
int32x4_t _acc_1 = _v2;
_acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
_acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
int16x4_t _acc16 = vshrn_n_s32(_acc, 2); // _acc >> 2
int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
uint8x8_t _dout = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));
vst1_u8(dp_ptr, _dout);
dp_ptr += 8;
rows0p += 8;
rows1p += 8;
}
for (; remain; --remain) {
// D[x] = (rows0[x]*b0 + rows1[x]*b1) >> INTER_RESIZE_COEF_BITS;
*dp_ptr++ =
(uint8_t)(((int16_t)((b0 * (int16_t)(*rows0p++)) >> 16) +
(int16_t)((b1 * (int16_t)(*rows1p++)) >> 16) + 2) >>
2);
}
ibeta += 2;
}
delete[] buf;
delete[] rowsbuf0;
delete[] rowsbuf1;
}
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/tests/cv/anakin/cv_utils.h"
void rotate90_hwc_bgra(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
void rotate270_hwc_bgra(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
void rotate180_hwc_bgra(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
void bgra_rotate_hwc(
const uint8_t* src, uint8_t* dst, int w_in, int h_in, int angle) {
if (angle == 90) {
rotate90_hwc_bgra(src, dst, w_in, h_in);
}
if (angle == 270) {
rotate270_hwc_bgra(src, dst, w_in, h_in);
}
if (angle == 180) {
rotate180_hwc_bgra(src, dst, w_in, h_in);
}
}
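// Generic scalar reference for the three specialized kernels below — a sketch
// of the index mapping they implement, for any interleaved channel count,
// usable as a unit-test oracle (my own cross-check, not code from this commit).
void rotate_hwc_ref(const uint8_t* src,
                    uint8_t* dst,
                    int w_in,
                    int h_in,
                    int ch,
                    int angle) {
  for (int i = 0; i < h_in; i++) {
    for (int j = 0; j < w_in; j++) {
      int r, c, w_out;
      if (angle == 90) {  // clockwise: dst(j, h_in - 1 - i)
        r = j;
        c = h_in - 1 - i;
        w_out = h_in;
      } else if (angle == 270) {  // counter-clockwise: dst(w_in - 1 - j, i)
        r = w_in - 1 - j;
        c = i;
        w_out = h_in;
      } else {  // 180: dst(h_in - 1 - i, w_in - 1 - j)
        r = h_in - 1 - i;
        c = w_in - 1 - j;
        w_out = w_in;
      }
      for (int k = 0; k < ch; k++) {
        dst[(r * w_out + c) * ch + k] = src[(i * w_in + j) * ch + k];
      }
    }
  }
}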
/*
bgr1 bgr2 bgr3
bgr4 bgr5 bgr6
bgr7 bgr8 bgr9
rotate:
bgr7 bgr4 bgr1
bgr8 bgr5 bgr2
bgr9 bgr6 bgr3
*/
void rotate90_hwc_bgra(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
int w_out = h_in;
int h_out = w_in;
int win = w_in * 4;
int wout = w_out * 4;
int hremain = h_in % 8;
int stride_h = 4 * win;
int stride_h_w = 4 * win - 32;
int ww = w_out - 8;
// block 8*8. -- 8*8
int i = 0;
for (i = 0; i < h_in - 7; i += 8) {
const uint8_t* inptr0 = src + i * win;
const uint8_t* inptr1 = inptr0 + win;
const uint8_t* inptr2 = inptr1 + win;
const uint8_t* inptr3 = inptr2 + win;
const uint8_t* inptr4 = inptr3 + win;
const uint8_t* inptr5 = inptr4 + win;
const uint8_t* inptr6 = inptr5 + win;
const uint8_t* inptr7 = inptr6 + win;
#ifdef __aarch64__
asm volatile(
"prfm pldl1keep, [%[ptr0]] \n"
"prfm pldl1keep, [%[ptr0], #64] \n"
"prfm pldl1keep, [%[ptr1]] \n"
"prfm pldl1keep, [%[ptr1], #64] \n"
"prfm pldl1keep, [%[ptr2]] \n"
"prfm pldl1keep, [%[ptr2], #64] \n"
"prfm pldl1keep, [%[ptr3]] \n"
"prfm pldl1keep, [%[ptr3], #64] \n"
"prfm pldl1keep, [%[ptr4]] \n"
"prfm pldl1keep, [%[ptr4], #64] \n"
"prfm pldl1keep, [%[ptr5]] \n"
"prfm pldl1keep, [%[ptr5], #64] \n"
"prfm pldl1keep, [%[ptr6]] \n"
"prfm pldl1keep, [%[ptr6], #64] \n"
"prfm pldl1keep, [%[ptr7]] \n"
"prfm pldl1keep, [%[ptr7], #64] \n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3),
[ptr4] "r"(inptr4),
[ptr5] "r"(inptr5),
[ptr6] "r"(inptr6),
[ptr7] "r"(inptr7)
: "memory");
#else
asm volatile(
"pld [%[ptr0]] @ preload a, 64byte\n"
"pld [%[ptr0], #64] @ preload a, 64byte\n"
"pld [%[ptr1]] @ preload a, 64byte\n"
"pld [%[ptr1], #64] @ preload a, 64byte\n"
"pld [%[ptr2]] @ preload a, 64byte\n"
"pld [%[ptr2], #64] @ preload a, 64byte\n"
"pld [%[ptr3]] @ preload a, 64byte\n"
"pld [%[ptr3], #64] @ preload a, 64byte\n"
"pld [%[ptr4]] @ preload a, 64byte\n"
"pld [%[ptr4], #64] @ preload a, 64byte\n"
"pld [%[ptr5]] @ preload a, 64byte\n"
"pld [%[ptr5], #64] @ preload a, 64byte\n"
"pld [%[ptr6]] @ preload a, 64byte\n"
"pld [%[ptr6], #64] @ preload a, 64byte\n"
"pld [%[ptr7]] @ preload a, 64byte\n"
"pld [%[ptr7], #64] @ preload a, 64byte\n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3),
[ptr4] "r"(inptr4),
[ptr5] "r"(inptr5),
[ptr6] "r"(inptr6),
[ptr7] "r"(inptr7)
: "memory");
#endif
int j = 0;
for (; j < w_in; j++) {
int tmpx = (ww - i) * 4;
uint8_t* outptr = dst + j * wout + tmpx;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
}
}
ww = w_out - 1;
for (; i < h_in; i++) {
const uint8_t* inptr0 = src + i * win;
for (int j = 0; j < w_in; j++) {
uint8_t* outptr0 = dst + j * wout + (ww - i) * 4;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
}
}
}
/*
bgr1 bgr2 bgr3
bgr4 bgr5 bgr6
bgr7 bgr8 bgr9
rotate:
bgr3 bgr6 bgr9
bgr2 bgr5 bgr8
bgr1 bgr4 bgr7
*/
// dst = (h_out - 1) * w_out
// Similar to rotate90, but the results are written out in reverse order;
// equivalently, rotate90 first and then flip along the Y axis.
void rotate270_hwc_bgra(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
int w_out = h_in;
int h_out = w_in;
int win = w_in * 4;
int wout = w_out * 4;
int hremain = h_in % 8;
int stride_h = 4 * win;
int stride_h_w = 4 * win - 32;
int hout = h_out - 1;
// block 8*8. -- 8*8
int i = 0;
for (; i < h_in - 7; i += 8) {
const uint8_t* inptr0 = src + i * win;
const uint8_t* inptr1 = inptr0 + win;
const uint8_t* inptr2 = inptr1 + win;
const uint8_t* inptr3 = inptr2 + win;
const uint8_t* inptr4 = inptr3 + win;
const uint8_t* inptr5 = inptr4 + win;
const uint8_t* inptr6 = inptr5 + win;
const uint8_t* inptr7 = inptr6 + win;
int j = 0;
#ifdef __aarch64__
asm volatile(
"prfm pldl1keep, [%[ptr0]] \n"
"prfm pldl1keep, [%[ptr0], #64] \n"
"prfm pldl1keep, [%[ptr1]] \n"
"prfm pldl1keep, [%[ptr1], #64] \n"
"prfm pldl1keep, [%[ptr2]] \n"
"prfm pldl1keep, [%[ptr2], #64] \n"
"prfm pldl1keep, [%[ptr3]] \n"
"prfm pldl1keep, [%[ptr3], #64] \n"
"prfm pldl1keep, [%[ptr4]] \n"
"prfm pldl1keep, [%[ptr4], #64] \n"
"prfm pldl1keep, [%[ptr5]] \n"
"prfm pldl1keep, [%[ptr5], #64] \n"
"prfm pldl1keep, [%[ptr6]] \n"
"prfm pldl1keep, [%[ptr6], #64] \n"
"prfm pldl1keep, [%[ptr7]] \n"
"prfm pldl1keep, [%[ptr7], #64] \n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3),
[ptr4] "r"(inptr4),
[ptr5] "r"(inptr5),
[ptr6] "r"(inptr6),
[ptr7] "r"(inptr7)
: "memory");
#else
asm volatile(
"pld [%[ptr0]] @ preload a, 64byte\n"
"pld [%[ptr0], #64] @ preload a, 64byte\n"
"pld [%[ptr1]] @ preload a, 64byte\n"
"pld [%[ptr1], #64] @ preload a, 64byte\n"
"pld [%[ptr2]] @ preload a, 64byte\n"
"pld [%[ptr2], #64] @ preload a, 64byte\n"
"pld [%[ptr3]] @ preload a, 64byte\n"
"pld [%[ptr3], #64] @ preload a, 64byte\n"
"pld [%[ptr4]] @ preload a, 64byte\n"
"pld [%[ptr4], #64] @ preload a, 64byte\n"
"pld [%[ptr5]] @ preload a, 64byte\n"
"pld [%[ptr5], #64] @ preload a, 64byte\n"
"pld [%[ptr6]] @ preload a, 64byte\n"
"pld [%[ptr6], #64] @ preload a, 64byte\n"
"pld [%[ptr7]] @ preload a, 64byte\n"
"pld [%[ptr7], #64] @ preload a, 64byte\n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3),
[ptr4] "r"(inptr4),
[ptr5] "r"(inptr5),
[ptr6] "r"(inptr6),
[ptr7] "r"(inptr7)
: "memory");
#endif
for (; j < w_in; j++) {
int tmpx = i * 4;
uint8_t* outptr = dst + (hout - j) * wout + tmpx;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
}
}
for (; i < h_in; i++) {
const uint8_t* inptr0 = src + i * win;
for (int j = 0; j < w_in; j++) {
uint8_t* outptr0 = dst + (hout - j) * wout + i * 4;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
}
}
}
/*
bgr1 bgr2 bgr3
bgr4 bgr5 bgr6
bgr7 bgr8 bgr9
rotate:
bgr9 bgr8 bgr7
bgr6 bgr5 bgr4
bgr3 bgr2 bgr1
*/
// rotate 180: equivalent to flipping along both x and y
void rotate180_hwc_bgra(const uint8_t* src, uint8_t* dst, int w, int h_in) {
int w_in = w * 4;
uint8_t zerobuff[w_in]; // NOLINT
memset(zerobuff, 0, w_in * sizeof(uint8_t));
int stride_w = 4;
// 4*8
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * w_in;
const uint8_t* inptr1 = inptr0 + w_in;
const uint8_t* inptr2 = inptr1 + w_in;
const uint8_t* inptr3 = inptr2 + w_in;
uint8_t* outptr0 = dst + (h_in - i) * w_in - stride_w; // last
uint8_t* outptr1 = outptr0 - w_in;
uint8_t* outptr2 = outptr1 - w_in;
uint8_t* outptr3 = outptr2 - w_in;
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
case 2:
inptr1 = zerobuff;
case 1:
inptr2 = zerobuff;
case 0:
inptr3 = zerobuff;
default:
break;
}
}
#ifdef __aarch64__
asm volatile(
"prfm pldl1keep, [%[ptr0]] \n"
"prfm pldl1keep, [%[ptr0], #64] \n"
"prfm pldl1keep, [%[ptr1]] \n"
"prfm pldl1keep, [%[ptr1], #64] \n"
"prfm pldl1keep, [%[ptr2]] \n"
"prfm pldl1keep, [%[ptr2], #64] \n"
"prfm pldl1keep, [%[ptr3]] \n"
"prfm pldl1keep, [%[ptr3], #64] \n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
#else
asm volatile(
"pld [%[ptr0]] @ preload a, 64byte\n"
"pld [%[ptr0], #64] @ preload a, 64byte\n"
"pld [%[ptr1]] @ preload a, 64byte\n"
"pld [%[ptr1], #64] @ preload a, 64byte\n"
"pld [%[ptr2]] @ preload a, 64byte\n"
"pld [%[ptr2], #64] @ preload a, 64byte\n"
"pld [%[ptr3]] @ preload a, 64byte\n"
"pld [%[ptr3], #64] @ preload a, 64byte\n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
#endif
int j = 0;
for (; j < w; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 8;
case 1:
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 8;
case 2:
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 8;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
outptr3 -= 8;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 8;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 8;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 8;
}
}
}
}
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/tests/cv/anakin/cv_utils.h"
void bgra_to_tensor_hwc(const uint8_t* bgr,
Tensor& output, // NOLINT
int width,
int height,
float* means,
float* scales) {
int size = width * height;
float* ptr0 = output.mutable_data<float>();
float r_means = means[0];
float g_means = means[1];
float b_means = means[2];
float r_scales = scales[0];
float g_scales = scales[1];
float b_scales = scales[2];
int dim8 = width >> 3;
  int remain = width - (dim8 << 3);
float32x4_t vrmean = vdupq_n_f32(r_means);
float32x4_t vgmean = vdupq_n_f32(g_means);
float32x4_t vbmean = vdupq_n_f32(b_means);
float32x4_t vrscale = vdupq_n_f32(r_scales);
float32x4_t vgscale = vdupq_n_f32(g_scales);
float32x4_t vbscale = vdupq_n_f32(b_scales);
for (int i = 0; i < height; i++) {
const uint8_t* ptr_bgr = bgr + i * width * 4;
float* ptr0_b = ptr0 + i * width;
float* ptr1_g = ptr0_b + size;
float* ptr2_r = ptr1_g + size;
for (int j = 0; j < dim8; j++) {
uint8x8x4_t vbgr = vld4_u8(ptr_bgr);
uint8x8_t vb = vbgr.val[0];
uint8x8_t vg = vbgr.val[1];
uint8x8_t vr = vbgr.val[2];
uint16x8_t vb_16 = vmovl_u8(vb);
uint16x8_t vg_16 = vmovl_u8(vg);
uint16x8_t vr_16 = vmovl_u8(vr);
uint32x4_t vb_low_32 = vmovl_u16(vget_low_u16(vb_16));
uint32x4_t vg_low_32 = vmovl_u16(vget_low_u16(vg_16));
uint32x4_t vr_low_32 = vmovl_u16(vget_low_u16(vr_16));
uint32x4_t vb_high_32 = vmovl_u16(vget_high_u16(vb_16));
uint32x4_t vg_high_32 = vmovl_u16(vget_high_u16(vg_16));
uint32x4_t vr_high_32 = vmovl_u16(vget_high_u16(vr_16));
float32x4_t vb_low_f32 = vcvtq_f32_u32(vb_low_32);
float32x4_t vr_low_f32 = vcvtq_f32_u32(vr_low_32);
float32x4_t vg_low_f32 = vcvtq_f32_u32(vg_low_32);
float32x4_t vb_high_f32 = vcvtq_f32_u32(vb_high_32);
float32x4_t vg_high_f32 = vcvtq_f32_u32(vg_high_32);
float32x4_t vr_high_f32 = vcvtq_f32_u32(vr_high_32);
vb_low_f32 = vsubq_f32(vb_low_f32, vbmean);
vg_low_f32 = vsubq_f32(vg_low_f32, vgmean);
vr_low_f32 = vsubq_f32(vr_low_f32, vrmean);
vb_high_f32 = vsubq_f32(vb_high_f32, vbmean);
vg_high_f32 = vsubq_f32(vg_high_f32, vgmean);
vr_high_f32 = vsubq_f32(vr_high_f32, vrmean);
vb_low_f32 = vmulq_f32(vb_low_f32, vbscale);
vg_low_f32 = vmulq_f32(vg_low_f32, vgscale);
vr_low_f32 = vmulq_f32(vr_low_f32, vrscale);
vb_high_f32 = vmulq_f32(vb_high_f32, vbscale);
vg_high_f32 = vmulq_f32(vg_high_f32, vgscale);
vr_high_f32 = vmulq_f32(vr_high_f32, vrscale);
vst1q_f32(ptr0_b, vb_low_f32);
vst1q_f32(ptr1_g, vg_low_f32);
vst1q_f32(ptr2_r, vr_low_f32);
ptr_bgr += 32;
vst1q_f32(ptr0_b + 4, vb_high_f32);
vst1q_f32(ptr1_g + 4, vg_high_f32);
vst1q_f32(ptr2_r + 4, vr_high_f32);
ptr0_b += 8;
ptr1_g += 8;
ptr2_r += 8;
}
for (int j = 0; j < remain; j++) {
*ptr0_b++ = (*ptr_bgr - b_means) * b_scales;
ptr_bgr++;
*ptr1_g++ = (*ptr_bgr - g_means) * g_scales;
ptr_bgr++;
*ptr2_r++ = (*ptr_bgr - r_means) * r_scales;
ptr_bgr++;
ptr_bgr++;
}
}
}
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/tests/cv/anakin/cv_utils.h"
void image_basic_convert(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
ImageFormat dstFormat,
int srcw,
int srch,
int out_size) {
if (srcFormat == dstFormat) {
// copy
memcpy(dst, src, sizeof(uint8_t) * out_size);
return;
} else {
if (srcFormat == ImageFormat::NV12 &&
(dstFormat == ImageFormat::BGR || dstFormat == ImageFormat::RGB)) {
nv12_to_bgr(src, dst, srcw, srch);
} else if (srcFormat == ImageFormat::NV21 &&
(dstFormat == ImageFormat::BGR ||
dstFormat == ImageFormat::RGB)) {
nv21_to_bgr(src, dst, srcw, srch);
} else if (srcFormat == ImageFormat::NV12 &&
(dstFormat == ImageFormat::BGRA ||
dstFormat == ImageFormat::RGBA)) {
nv12_to_bgra(src, dst, srcw, srch);
} else if (srcFormat == ImageFormat::NV21 &&
(dstFormat == ImageFormat::BGRA ||
dstFormat == ImageFormat::RGBA)) {
nv21_to_bgra(src, dst, srcw, srch);
} else {
printf("bais-anakin srcFormat: %d, dstFormat: %d does not support! \n",
srcFormat,
dstFormat);
}
}
}
void image_basic_resize(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
int srcw,
int srch,
int dstw,
int dsth) {
int size = srcw * srch;
if (srcw == dstw && srch == dsth) {
if (srcFormat == ImageFormat::NV12 || srcFormat == ImageFormat::NV21) {
size = srcw * (static_cast<int>(1.5 * srch));
} else if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) {
size = 3 * srcw * srch;
} else if (srcFormat == ImageFormat::BGRA ||
srcFormat == ImageFormat::RGBA) {
size = 4 * srcw * srch;
}
memcpy(dst, src, sizeof(uint8_t) * size);
return;
} else {
if (srcFormat == ImageFormat::NV12 || srcFormat == ImageFormat::NV21) {
nv21_resize(src, dst, srcw, srch, dstw, dsth);
} else if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) {
bgr_resize(src, dst, srcw, srch, dstw, dsth);
} else if (srcFormat == ImageFormat::BGRA ||
srcFormat == ImageFormat::RGBA) {
bgra_resize(src, dst, srcw, srch, dstw, dsth);
} else {
printf("anakin doesn't support this type: %d\n",
static_cast<int>(srcFormat));
}
}
}
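// Sketch: bytes per image for the formats handled here, mirroring the size
// arithmetic in image_basic_resize above (assumes even srch for NV12/NV21).
// Illustration only — not a library API.
static int image_bytes(ImageFormat fmt, int w, int h) {
  switch (fmt) {
    case ImageFormat::NV12:
    case ImageFormat::NV21:
      return w * h * 3 / 2;  // full-size Y plane plus half-size interleaved UV
    case ImageFormat::BGR:
    case ImageFormat::RGB:
      return w * h * 3;
    case ImageFormat::BGRA:
    case ImageFormat::RGBA:
      return w * h * 4;
    default:
      return 0;
  }
}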
void image_basic_flip(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
int srcw,
int srch,
int flip_num) {
if (flip_num == -1) {
flip_num = 0; // xy
} else if (flip_num == 0) {
flip_num = 1; // x
} else if (flip_num == 1) {
flip_num = -1; // y
}
if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) {
bgr_flip_hwc(src, dst, srcw, srch, flip_num);
} else if (srcFormat == ImageFormat::BGRA || srcFormat == ImageFormat::RGBA) {
bgra_flip_hwc(src, dst, srcw, srch, flip_num);
} else {
printf("anakin doesn't support this type: %d\n",
static_cast<int>(srcFormat));
}
}
void image_basic_rotate(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
int srcw,
int srch,
float rotate_num) {
if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) {
bgr_rotate_hwc(src, dst, srcw, srch, rotate_num);
} else if (srcFormat == ImageFormat::BGRA || srcFormat == ImageFormat::RGBA) {
bgra_rotate_hwc(src, dst, srcw, srch, rotate_num);
} else {
printf("anakin doesn't support this type: %d\n",
static_cast<int>(srcFormat));
}
}
void image_basic_to_tensor(const uint8_t* in_data,
Tensor dst,
ImageFormat srcFormat,
LayoutType layout,
int srcw,
int srch,
float* means,
float* scales) {
if (layout == LayoutType::kNCHW &&
(srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB)) {
bgr_to_tensor_hwc(in_data, dst, srcw, srch, means, scales);
} else if (layout == LayoutType::kNCHW && (srcFormat == ImageFormat::BGRA ||
srcFormat == ImageFormat::RGBA)) {
bgra_to_tensor_hwc(in_data, dst, srcw, srch, means, scales);
} else {
printf("anakin doesn't support this type: %d\n",
static_cast<int>(srcFormat));
}
}
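#include <vector>
// Hedged sketch of how the image_basic_* helpers chain together — roughly the
// flow a test such as image_profiler_test would exercise. Buffer sizing and
// the by-value Tensor handoff are assumptions for illustration only.
void basic_pipeline_demo(const uint8_t* nv12,
                         int srcw,
                         int srch,
                         int dstw,
                         int dsth,
                         Tensor out,
                         float* means,
                         float* scales) {
  std::vector<uint8_t> bgr(srcw * srch * 3);
  std::vector<uint8_t> resized(dstw * dsth * 3);
  // NV12 -> BGR at the source resolution.
  image_basic_convert(nv12, bgr.data(), ImageFormat::NV12, ImageFormat::BGR,
                      srcw, srch, srcw * srch * 3);
  // Bilinear resize to the target resolution.
  image_basic_resize(bgr.data(), resized.data(), ImageFormat::BGR,
                     srcw, srch, dstw, dsth);
  // Normalize and repack HWC -> CHW into the output tensor.
  image_basic_to_tensor(resized.data(), out, ImageFormat::BGR,
                        LayoutType::kNCHW, dstw, dsth, means, scales);
}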