diff --git a/lite/tests/cv/cv_basic.h b/lite/tests/cv/cv_basic.h
index 45ae3fb616527f404984bba7b2366a2f66c09a96..5e867487e2e5f75411aae7204dcacd0dd791ee98 100644
--- a/lite/tests/cv/cv_basic.h
+++ b/lite/tests/cv/cv_basic.h
@@ -538,10 +538,10 @@ void image_resize_basic(const uint8_t* in_data,
   int* yofs1 = nullptr;
   if (orih < dsth) {
     int tmp = dsth - orih;
-    ialpha1 = new float[srcw];
-    xofs1 = new int[dstw / 2];
+    ialpha1 = new float[dstw];
+    xofs1 = new int[dstw];
     yofs1 = new int[tmp];
-    compute_xy(srcw / 2,
+    compute_xy(srcw,
                srch / 2,
                dstw / 2,
                tmp,
@@ -565,7 +565,7 @@ void image_resize_basic(const uint8_t* in_data,
       ialpha = ialpha1;
       xofs = xofs1;
       yofs = yofs1;
-      y_in_start = yofs[dy - orih];
+      y_in_start = yofs[dy - orih] + srch;
     }
     int y_in_end = y_in_start + 1;
     if (y_in_start < 0) {
diff --git a/lite/utils/cv/image_resize.cc b/lite/utils/cv/image_resize.cc
index 3e67c9386f963ab31f6b200a6badde93e431c482..2fc884a0fa4fd359c150115a20bdf751094b4687 100644
--- a/lite/utils/cv/image_resize.cc
+++ b/lite/utils/cv/image_resize.cc
@@ -47,6 +47,592 @@ void ImageResize::choose(const uint8_t* src,
                          int dsth) {
   resize(src, dst, srcFormat, srcw, srch, dstw, dsth);
 }
+void resize_three_channel(
+    const uint8_t* src, int w_in, int h_in, uint8_t* dst, int w_out, int h_out);
+
+void bgr_resize(const uint8_t* src,
+                uint8_t* dst,
+                int w_in,
+                int h_in,
+                int w_out,
+                int h_out) {
+  if (w_out == w_in && h_out == h_in) {
+    memcpy(dst, src, sizeof(uint8_t) * w_in * h_in * 3);
+    return;
+  }
+  // y
+  resize_three_channel(src, w_in * 3, h_in, dst, w_out * 3, h_out);
+}
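+// Fixed-point bilinear resize of interleaved three-channel (BGR/RGB) data;
+// w_in/w_out are row widths in bytes (3 * pixel width). Interpolation
+// weights are precomputed in Q11 (resize_coef_scale = 2048): the horizontal
+// pass builds int16 rows as (a0 * p0 + a1 * p1) >> 4, and the vertical pass
+// applies b0/b1, accumulates the products shifted right by 16, adds 2 and
+// shifts right by 2, so the 22 coefficient bits are fully removed
+// (4 + 16 + 2 = 22) with rounding.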
+void resize_three_channel(const uint8_t* src,
+                          int w_in,
+                          int h_in,
+                          uint8_t* dst,
+                          int w_out,
+                          int h_out) {
+  const int resize_coef_bits = 11;
+  const int resize_coef_scale = 1 << resize_coef_bits;
+  double scale_x = static_cast<double>(w_in) / w_out;
+  double scale_y = static_cast<double>(h_in) / h_out;
+  int* buf = new int[w_out * 2 + h_out * 2];
+  int* xofs = buf;          // new int[w];
+  int* yofs = buf + w_out;  // new int[h];
+  int16_t* ialpha =
+      reinterpret_cast<int16_t*>(buf + w_out + h_out);  // new int16_t[w * 2];
+  int16_t* ibeta =
+      reinterpret_cast<int16_t*>(buf + w_out * 2 + h_out);  // new short[h * 2];
+  float fx = 0.f;
+  float fy = 0.f;
+  int sx = 0;
+  int sy = 0;
+#define SATURATE_CAST_SHORT(X)                                               \
+  (int16_t)::std::min(                                                       \
+      ::std::max(static_cast<int>(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), \
+      SHRT_MAX);
+  // #pragma omp parallel for
+  for (int dx = 0; dx < w_out / 3; dx++) {
+    fx = static_cast<float>((dx + 0.5) * scale_x - 0.5);
+    sx = floor(fx);
+    fx -= sx;
+    if (sx < 0) {
+      sx = 0;
+      fx = 0.f;
+    }
+    if (sx >= w_in - 1) {
+      sx = w_in - 2;
+      fx = 1.f;
+    }
+    xofs[dx] = sx * 3;
+    float a0 = (1.f - fx) * resize_coef_scale;
+    float a1 = fx * resize_coef_scale;
+    ialpha[dx * 2] = SATURATE_CAST_SHORT(a0);
+    ialpha[dx * 2 + 1] = SATURATE_CAST_SHORT(a1);
+  }
+  // #pragma omp parallel for
+  for (int dy = 0; dy < h_out; dy++) {
+    fy = static_cast<float>((dy + 0.5) * scale_y - 0.5);
+    sy = floor(fy);
+    fy -= sy;
+    if (sy < 0) {
+      sy = 0;
+      fy = 0.f;
+    }
+    if (sy >= h_in - 1) {
+      sy = h_in - 2;
+      fy = 1.f;
+    }
+    yofs[dy] = sy;
+    float b0 = (1.f - fy) * resize_coef_scale;
+    float b1 = fy * resize_coef_scale;
+    ibeta[dy * 2] = SATURATE_CAST_SHORT(b0);
+    ibeta[dy * 2 + 1] = SATURATE_CAST_SHORT(b1);
+  }
+#undef SATURATE_CAST_SHORT
+  // loop body
+  int16_t* rowsbuf0 = new int16_t[w_out + 1];
+  int16_t* rowsbuf1 = new int16_t[w_out + 1];
+  int16_t* rows0 = rowsbuf0;
+  int16_t* rows1 = rowsbuf1;
+  int prev_sy1 = -1;
+  for (int dy = 0; dy < h_out; dy++) {
+    int sy = yofs[dy];
+    if (sy == prev_sy1) {
+      // hresize one row
+      int16_t* rows0_old = rows0;
+      rows0 = rows1;
+      rows1 = rows0_old;
+      const uint8_t* S1 = src + w_in * (sy + 1);
+      const int16_t* ialphap = ialpha;
+      int16_t* rows1p = rows1;
+      for (int dx = 0; dx < w_out / 3; dx++) {
+        int sx = xofs[dx];
+        int16_t a0 = ialphap[0];
+        int16_t a1 = ialphap[1];
+        const uint8_t* S1p = S1 + sx;
+        int tmp = dx * 3;
+        rows1p[tmp] = (S1p[0] * a0 + S1p[3] * a1) >> 4;
+        rows1p[tmp + 1] = (S1p[1] * a0 + S1p[4] * a1) >> 4;
+        rows1p[tmp + 2] = (S1p[2] * a0 + S1p[5] * a1) >> 4;
+        ialphap += 2;
+      }
+    } else {
+      // hresize two rows
+      const uint8_t* S0 = src + w_in * (sy);
+      const uint8_t* S1 = src + w_in * (sy + 1);
+      const int16_t* ialphap = ialpha;
+      int16_t* rows0p = rows0;
+      int16_t* rows1p = rows1;
+      for (int dx = 0; dx < w_out / 3; dx++) {
+        int sx = xofs[dx];
+        int16_t a0 = ialphap[0];
+        int16_t a1 = ialphap[1];
+        const uint8_t* S0p = S0 + sx;
+        const uint8_t* S1p = S1 + sx;
+        int tmp = dx * 3;
+        rows0p[tmp] = (S0p[0] * a0 + S0p[3] * a1) >> 4;
+        rows1p[tmp] = (S1p[0] * a0 + S1p[3] * a1) >> 4;
+        rows0p[tmp + 1] = (S0p[1] * a0 + S0p[4] * a1) >> 4;
+        rows1p[tmp + 1] = (S1p[1] * a0 + S1p[4] * a1) >> 4;
+        rows0p[tmp + 2] = (S0p[2] * a0 + S0p[5] * a1) >> 4;
+        rows1p[tmp + 2] = (S1p[2] * a0 + S1p[5] * a1) >> 4;
+        ialphap += 2;
+      }
+    }
+    prev_sy1 = sy + 1;
+    // vresize
+    int16_t b0 = ibeta[0];
+    int16_t b1 = ibeta[1];
+    int16_t* rows0p = rows0;
+    int16_t* rows1p = rows1;
+    uint8_t* dp_ptr = dst + w_out * (dy);
+    int cnt = w_out >> 3;
+    int remain = w_out - (cnt << 3);
+    int16x4_t _b0 = vdup_n_s16(b0);
+    int16x4_t _b1 = vdup_n_s16(b1);
+    int32x4_t _v2 = vdupq_n_s32(2);
+    for (cnt = w_out >> 3; cnt > 0; cnt--) {
+      int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
+      int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
+      int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p + 4);
+      int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p + 4);
+      int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0);
+      int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
+      int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
+      int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
+      int32x4_t _acc = _v2;
+      _acc = vsraq_n_s32(
+          _acc, _rows0p_sr4_mb0, 16);  // _acc >> 16 + _rows0p_sr4_mb0 >> 16
+      _acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
+      int32x4_t _acc_1 = _v2;
+      _acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
+      _acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
+      int16x4_t _acc16 = vshrn_n_s32(_acc, 2);  // _acc >> 2
+      int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
+      uint8x8_t _dout = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));
+      vst1_u8(dp_ptr, _dout);
+      dp_ptr += 8;
+      rows0p += 8;
+      rows1p += 8;
+    }
+    for (; remain; --remain) {
+      // D[x] = (rows0[x]*b0 + rows1[x]*b1) >> INTER_RESIZE_COEF_BITS;
+      *dp_ptr++ =
+          (uint8_t)(((int16_t)((b0 * (int16_t)(*rows0p++)) >> 16) +
+                     (int16_t)((b1 * (int16_t)(*rows1p++)) >> 16) + 2) >>
+                    2);
+    }
+    ibeta += 2;
+  }
+  delete[] buf;
+  delete[] rowsbuf0;
+  delete[] rowsbuf1;
+}
+void resize_one_channel(
+    const uint8_t* src, int w_in, int h_in, uint8_t* dst, int w_out, int h_out);
+void resize_one_channel_uv(
+    const uint8_t* src, int w_in, int h_in, uint8_t* dst, int w_out, int h_out);
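+// NV12/NV21 buffers store a full-resolution Y plane (h_in rows of w_in
+// bytes) followed by a half-height plane of interleaved UV (or VU) byte
+// pairs, 1.5 * h_in rows in total. The two planes are resized independently:
+// the Y plane with resize_one_channel and the chroma plane with
+// resize_one_channel_uv.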
+void nv21_resize(const uint8_t* src,
+                 uint8_t* dst,
+                 int w_in,
+                 int h_in,
+                 int w_out,
+                 int h_out) {
+  if (w_out == w_in && h_out == h_in) {
+    memcpy(dst, src, sizeof(uint8_t) * w_in * static_cast<int>(1.5 * h_in));
+    return;
+  }
+  // return;
+  int y_h = h_in;
+  int uv_h = h_in / 2;
+  const uint8_t* y_ptr = src;
+  const uint8_t* uv_ptr = src + y_h * w_in;
+  // out
+  int dst_y_h = h_out;
+  int dst_uv_h = h_out / 2;
+  uint8_t* dst_ptr = dst + dst_y_h * w_out;
+  // y
+  resize_one_channel(y_ptr, w_in, y_h, dst, w_out, dst_y_h);
+  // uv
+  resize_one_channel_uv(uv_ptr, w_in, uv_h, dst_ptr, w_out, dst_uv_h);
+}
+
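+// Single-plane bilinear resize. Two int16 row buffers hold the horizontally
+// filtered source rows; when the new top source row equals the previous
+// pair's bottom row (sy == prev_sy1), the buffers are swapped and only the
+// new bottom row is filtered ("hresize one row" vs. "hresize two rows").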
+void resize_one_channel(const uint8_t* src,
+                        int w_in,
+                        int h_in,
+                        uint8_t* dst,
+                        int w_out,
+                        int h_out) {
+  const int resize_coef_bits = 11;
+  const int resize_coef_scale = 1 << resize_coef_bits;
+
+  double scale_x = static_cast<double>(w_in) / w_out;
+  double scale_y = static_cast<double>(h_in) / h_out;
+
+  int* buf = new int[w_out * 2 + h_out * 2];
+
+  int* xofs = buf;          // new int[w];
+  int* yofs = buf + w_out;  // new int[h];
+
+  int16_t* ialpha =
+      reinterpret_cast<int16_t*>(buf + w_out + h_out);  // new short[w * 2];
+  int16_t* ibeta =
+      reinterpret_cast<int16_t*>(buf + w_out * 2 + h_out);  // new short[h * 2];
+
+  float fx = 0.f;
+  float fy = 0.f;
+  int sx = 0;
+  int sy = 0;
+
+#define SATURATE_CAST_SHORT(X)                                               \
+  (int16_t)::std::min(                                                       \
+      ::std::max(static_cast<int>(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), \
+      SHRT_MAX);
+  for (int dx = 0; dx < w_out; dx++) {
+    fx = static_cast<float>((dx + 0.5) * scale_x - 0.5);
+    sx = floor(fx);
+    fx -= sx;
+
+    if (sx < 0) {
+      sx = 0;
+      fx = 0.f;
+    }
+    if (sx >= w_in - 1) {
+      sx = w_in - 2;
+      fx = 1.f;
+    }
+
+    xofs[dx] = sx;
+
+    float a0 = (1.f - fx) * resize_coef_scale;
+    float a1 = fx * resize_coef_scale;
+
+    ialpha[dx * 2] = SATURATE_CAST_SHORT(a0);
+    ialpha[dx * 2 + 1] = SATURATE_CAST_SHORT(a1);
+  }
+  for (int dy = 0; dy < h_out; dy++) {
+    fy = static_cast<float>((dy + 0.5) * scale_y - 0.5);
+    sy = floor(fy);
+    fy -= sy;
+
+    if (sy < 0) {
+      sy = 0;
+      fy = 0.f;
+    }
+    if (sy >= h_in - 1) {
+      sy = h_in - 2;
+      fy = 1.f;
+    }
+
+    yofs[dy] = sy;
+
+    float b0 = (1.f - fy) * resize_coef_scale;
+    float b1 = fy * resize_coef_scale;
+
+    ibeta[dy * 2] = SATURATE_CAST_SHORT(b0);
+    ibeta[dy * 2 + 1] = SATURATE_CAST_SHORT(b1);
+  }
+#undef SATURATE_CAST_SHORT
+  // loop body
+  int16_t* rowsbuf0 = new int16_t[w_out + 1];
+  int16_t* rowsbuf1 = new int16_t[w_out + 1];
+  int16_t* rows0 = rowsbuf0;
+  int16_t* rows1 = rowsbuf1;
+
+  int prev_sy1 = -1;
+  for (int dy = 0; dy < h_out; dy++) {
+    int sy = yofs[dy];
+
+    if (sy == prev_sy1) {
+      // hresize one row
+      int16_t* rows0_old = rows0;
+      rows0 = rows1;
+      rows1 = rows0_old;
+      const uint8_t* S1 = src + w_in * (sy + 1);
+      const int16_t* ialphap = ialpha;
+      int16_t* rows1p = rows1;
+      for (int dx = 0; dx < w_out; dx++) {
+        int sx = xofs[dx];
+        int16_t a0 = ialphap[0];
+        int16_t a1 = ialphap[1];
+
+        const uint8_t* S1p = S1 + sx;
+        rows1p[dx] = (S1p[0] * a0 + S1p[1] * a1) >> 4;
+
+        ialphap += 2;
+      }
+    } else {
+      // hresize two rows
+      const uint8_t* S0 = src + w_in * (sy);
+      const uint8_t* S1 = src + w_in * (sy + 1);
+
+      const int16_t* ialphap = ialpha;
+      int16_t* rows0p = rows0;
+      int16_t* rows1p = rows1;
+      for (int dx = 0; dx < w_out; dx++) {
+        int sx = xofs[dx];
+        int16_t a0 = ialphap[0];
+        int16_t a1 = ialphap[1];
+
+        const uint8_t* S0p = S0 + sx;
+        const uint8_t* S1p = S1 + sx;
+        rows0p[dx] = (S0p[0] * a0 + S0p[1] * a1) >> 4;
+        rows1p[dx] = (S1p[0] * a0 + S1p[1] * a1) >> 4;
+
+        ialphap += 2;
+      }
+    }
+
+    prev_sy1 = sy + 1;
+
+    // vresize
+    int16_t b0 = ibeta[0];
+    int16_t b1 = ibeta[1];
+
+    int16_t* rows0p = rows0;
+    int16_t* rows1p = rows1;
+    uint8_t* dp_ptr = dst + w_out * (dy);
+
+    int cnt = w_out >> 3;
+    int remain = w_out - (cnt << 3);
+    int16x4_t _b0 = vdup_n_s16(b0);
+    int16x4_t _b1 = vdup_n_s16(b1);
+    int32x4_t _v2 = vdupq_n_s32(2);
+
+    for (cnt = w_out >> 3; cnt > 0; cnt--) {
+      int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
+      int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
+      int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p + 4);
+      int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p + 4);
+
+      int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0);
+      int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
+      int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
+      int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
+
+      int32x4_t _acc = _v2;
+      _acc = vsraq_n_s32(_acc, _rows0p_sr4_mb0, 16);
+      _acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
+
+      int32x4_t _acc_1 = _v2;
+      _acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
+      _acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
+
+      int16x4_t _acc16 = vshrn_n_s32(_acc, 2);
+      int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
+
+      uint8x8_t _dout = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));
+
+      vst1_u8(dp_ptr, _dout);
+
+      dp_ptr += 8;
+      rows0p += 8;
+      rows1p += 8;
+    }
+    for (; remain; --remain) {
+      // D[x] = (rows0[x]*b0 + rows1[x]*b1) >> INTER_RESIZE_COEF_BITS;
+      *dp_ptr++ =
+          (uint8_t)(((int16_t)((b0 * (int16_t)(*rows0p++)) >> 16) +
+                     (int16_t)((b1 * (int16_t)(*rows1p++)) >> 16) + 2) >>
+                    2);
+    }
+    ibeta += 2;
+  }
+
+  delete[] buf;
+  delete[] rowsbuf0;
+  delete[] rowsbuf1;
+}
+
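+// Variant of resize_one_channel for the interleaved UV plane: horizontal
+// coefficients are computed once per UV pair (w_out / 2 entries), xofs[dx]
+// is doubled to step over the two-byte pairs, and both chroma samples of a
+// pair reuse the same a0/a1 weights.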
+void resize_one_channel_uv(const uint8_t* src,
+                           int w_in,
+                           int h_in,
+                           uint8_t* dst,
+                           int w_out,
+                           int h_out) {
+  const int resize_coef_bits = 11;
+  const int resize_coef_scale = 1 << resize_coef_bits;
+
+  double scale_x = static_cast<double>(w_in) / w_out;
+  double scale_y = static_cast<double>(h_in) / h_out;
+
+  int* buf = new int[w_out * 2 + h_out * 2];
+
+  int* xofs = buf;          // new int[w];
+  int* yofs = buf + w_out;  // new int[h];
+
+  int16_t* ialpha =
+      reinterpret_cast<int16_t*>(buf + w_out + h_out);  // new int16_t[w * 2];
+  int16_t* ibeta = reinterpret_cast<int16_t*>(buf + w_out * 2 +
+                                              h_out);  // new int16_t[h * 2];
+
+  float fx = 0.f;
+  float fy = 0.f;
+  int sx = 0;
+  int sy = 0;
+
+#define SATURATE_CAST_SHORT(X)                                               \
+  (int16_t)::std::min(                                                       \
+      ::std::max(static_cast<int>(X + (X >= 0.f ? 0.5f : -0.5f)), SHRT_MIN), \
+      SHRT_MAX);
+  for (int dx = 0; dx < w_out / 2; dx++) {
+    fx = static_cast<float>((dx + 0.5) * scale_x - 0.5);
+    sx = floor(fx);
+    fx -= sx;
+
+    if (sx < 0) {
+      sx = 0;
+      fx = 0.f;
+    }
+    if (sx >= w_in - 1) {
+      sx = w_in - 2;
+      fx = 1.f;
+    }
+
+    xofs[dx] = sx;
+
+    float a0 = (1.f - fx) * resize_coef_scale;
+    float a1 = fx * resize_coef_scale;
+
+    ialpha[dx * 2] = SATURATE_CAST_SHORT(a0);
+    ialpha[dx * 2 + 1] = SATURATE_CAST_SHORT(a1);
+  }
+  for (int dy = 0; dy < h_out; dy++) {
+    fy = static_cast<float>((dy + 0.5) * scale_y - 0.5);
+    sy = floor(fy);
+    fy -= sy;
+
+    if (sy < 0) {
+      sy = 0;
+      fy = 0.f;
+    }
+    if (sy >= h_in - 1) {
+      sy = h_in - 2;
+      fy = 1.f;
+    }
+
+    yofs[dy] = sy;
+
+    float b0 = (1.f - fy) * resize_coef_scale;
+    float b1 = fy * resize_coef_scale;
+
+    ibeta[dy * 2] = SATURATE_CAST_SHORT(b0);
+    ibeta[dy * 2 + 1] = SATURATE_CAST_SHORT(b1);
+  }
+
+#undef SATURATE_CAST_SHORT
+  // loop body
+  int16_t* rowsbuf0 = new int16_t[w_out + 1];
+  int16_t* rowsbuf1 = new int16_t[w_out + 1];
+  int16_t* rows0 = rowsbuf0;
+  int16_t* rows1 = rowsbuf1;
+
+  int prev_sy1 = -1;
+  for (int dy = 0; dy < h_out; dy++) {
+    int sy = yofs[dy];
+    if (sy == prev_sy1) {
+      // hresize one row
+      int16_t* rows0_old = rows0;
+      rows0 = rows1;
+      rows1 = rows0_old;
+      const uint8_t* S1 = src + w_in * (sy + 1);
+
+      const int16_t* ialphap = ialpha;
+      int16_t* rows1p = rows1;
+      for (int dx = 0; dx < w_out / 2; dx++) {
+        int sx = xofs[dx] * 2;
+        int16_t a0 = ialphap[0];
+        int16_t a1 = ialphap[1];
+        const uint8_t* S1p = S1 + sx;
+        int tmp = dx * 2;
+        rows1p[tmp] = (S1p[0] * a0 + S1p[2] * a1) >> 4;
+        rows1p[tmp + 1] = (S1p[1] * a0 + S1p[3] * a1) >> 4;
+
+        ialphap += 2;
+      }
+    } else {
+      // hresize two rows
+      const uint8_t* S0 = src + w_in * (sy);
+      const uint8_t* S1 = src + w_in * (sy + 1);
+
+      const int16_t* ialphap = ialpha;
+      int16_t* rows0p = rows0;
+      int16_t* rows1p = rows1;
+      for (int dx = 0; dx < w_out / 2; dx++) {
+        int sx = xofs[dx] * 2;
+        int16_t a0 = ialphap[0];
+        int16_t a1 = ialphap[1];
+
+        const uint8_t* S0p = S0 + sx;
+        const uint8_t* S1p = S1 + sx;
+        int tmp = dx * 2;
+        rows0p[tmp] = (S0p[0] * a0 + S0p[2] * a1) >> 4;
+        rows1p[tmp] = (S1p[0] * a0 + S1p[2] * a1) >> 4;
+
+        rows0p[tmp + 1] = (S0p[1] * a0 + S0p[3] * a1) >> 4;
+        rows1p[tmp + 1] = (S1p[1] * a0 + S1p[3] * a1) >> 4;
+        ialphap += 2;
+      }
+    }
+    prev_sy1 = sy + 1;
+
+    // vresize
+    int16_t b0 = ibeta[0];
+    int16_t b1 = ibeta[1];
+
+    int16_t* rows0p = rows0;
+    int16_t* rows1p = rows1;
+    uint8_t* dp_ptr = dst + w_out * (dy);
+
+    int cnt = w_out >> 3;
+    int remain = w_out - (cnt << 3);
+    int16x4_t _b0 = vdup_n_s16(b0);
+    int16x4_t _b1 = vdup_n_s16(b1);
+    int32x4_t _v2 = vdupq_n_s32(2);
+    for (cnt = w_out >> 3; cnt > 0; cnt--) {
+      int16x4_t _rows0p_sr4 = vld1_s16(rows0p);
+      int16x4_t _rows1p_sr4 = vld1_s16(rows1p);
+      int16x4_t _rows0p_1_sr4 = vld1_s16(rows0p + 4);
+      int16x4_t _rows1p_1_sr4 = vld1_s16(rows1p + 4);
+
+      int32x4_t _rows0p_sr4_mb0 = vmull_s16(_rows0p_sr4, _b0);
+      int32x4_t _rows1p_sr4_mb1 = vmull_s16(_rows1p_sr4, _b1);
+      int32x4_t _rows0p_1_sr4_mb0 = vmull_s16(_rows0p_1_sr4, _b0);
+      int32x4_t _rows1p_1_sr4_mb1 = vmull_s16(_rows1p_1_sr4, _b1);
+
+      int32x4_t _acc = _v2;
+      _acc = vsraq_n_s32(
+          _acc, _rows0p_sr4_mb0, 16);  // _acc >> 16 + _rows0p_sr4_mb0 >> 16
+      _acc = vsraq_n_s32(_acc, _rows1p_sr4_mb1, 16);
+
+      int32x4_t _acc_1 = _v2;
+      _acc_1 = vsraq_n_s32(_acc_1, _rows0p_1_sr4_mb0, 16);
+      _acc_1 = vsraq_n_s32(_acc_1, _rows1p_1_sr4_mb1, 16);
+
+      int16x4_t _acc16 = vshrn_n_s32(_acc, 2);  // _acc >> 2
+      int16x4_t _acc16_1 = vshrn_n_s32(_acc_1, 2);
+
+      uint8x8_t _dout = vqmovun_s16(vcombine_s16(_acc16, _acc16_1));
+
+      vst1_u8(dp_ptr, _dout);
+
+      dp_ptr += 8;
+      rows0p += 8;
+      rows1p += 8;
+    }
+    for (; remain; --remain) {
+      // D[x] = (rows0[x]*b0 + rows1[x]*b1) >> INTER_RESIZE_COEF_BITS;
+      *dp_ptr++ =
+          (uint8_t)(((int16_t)((b0 * (int16_t)(*rows0p++)) >> 16) +
+                     (int16_t)((b1 * (int16_t)(*rows1p++)) >> 16) + 2) >>
+                    2);
+    }
+    ibeta += 2;
+  }
+
+  delete[] buf;
+  delete[] rowsbuf0;
+  delete[] rowsbuf1;
+}
+
 void compute_xy(int srcw,
                 int srch,
                 int dstw,
@@ -95,10 +681,14 @@ void resize(const uint8_t* src,
   if (srcFormat == GRAY) {
     num = 1;
   } else if (srcFormat == NV12 || srcFormat == NV21) {
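+    // Route NV12/NV21 (and, below, BGR/RGB) to the specialized resize paths
+    // and return early; the generic path that follows no longer runs for
+    // these formats.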
+    nv21_resize(src, dst, srcw, srch, dstw, dsth);
+    return;
     num = 1;
     int hout = static_cast<int>(0.5 * dsth);
     dsth += hout;
   } else if (srcFormat == BGR || srcFormat == RGB) {
+    bgr_resize(src, dst, srcw, srch, dstw, dsth);
+    return;
     w_in = srcw * 3;
     w_out = dstw * 3;
     num = 3;
@@ -117,13 +707,12 @@ void resize(const uint8_t* src,
   int16_t* ialpha1 = nullptr;
   if (orih < dsth) {  // uv
     int tmp = dsth - orih;
-    int w = dstw / 2;
-    xofs1 = new int[w];
+    xofs1 = new int[dstw];
     yofs1 = new int[tmp];
-    ialpha1 = new int16_t[srcw];
-    compute_xy(srcw / 2,
+    ialpha1 = new int16_t[dstw];
+    compute_xy(srcw,
               srch / 2,
-              w,
+              dstw / 2,
              tmp,
              2,
              scale_x,
@@ -139,15 +728,15 @@ void resize(const uint8_t* src,
   int prev_sy1 = -1;
 #pragma omp parallel for
   for (int dy = 0; dy < dsth; dy++) {
-    int16_t* rowsbuf0 = new int16_t[w_out];
-    int16_t* rowsbuf1 = new int16_t[w_out];
+    int16_t* rowsbuf0 = new int16_t[w_out + 1];
+    int16_t* rowsbuf1 = new int16_t[w_out + 1];
     int sy = yofs[dy];
     if (dy >= orih) {
       xofs = xofs1;
      yofs = yofs1;
      ialpha = ialpha1;
      num = 2;
-      sy = yofs1[dy - orih];
+      sy = yofs1[dy - orih] + srch;
    }
    // hresize two rows