diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt
index 61f07583b2ed920ce7ac0f2d56b2b2e89bb99b42..4e4bc103a8a872e1f0635e71e2e39f4eac1272d1 100644
--- a/lite/CMakeLists.txt
+++ b/lite/CMakeLists.txt
@@ -160,7 +160,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
                     COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/include"
                     COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/include"
                     COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/lib"
-                    COMMAND cp "${CMAKE_SOURCE_DIR}/lite/utils/cv/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
+                    COMMAND cp "${CMAKE_SOURCE_DIR}/lite/utils/cv/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/include"
                     )
             add_dependencies(tiny_publish_lib bundle_light_api)
             add_dependencies(publish_inference tiny_publish_lib)
diff --git a/lite/utils/cv/paddle_image_preprocess.cc b/lite/utils/cv/paddle_image_preprocess.cc
index 0bccfe2804a9ba17473575815bfe4b2e9635f234..f18047556874a82d28c5964a1b5fd2fa8284c814 100644
--- a/lite/utils/cv/paddle_image_preprocess.cc
+++ b/lite/utils/cv/paddle_image_preprocess.cc
@@ -69,240 +69,6 @@ void ImagePreprocess::imageResize(const uint8_t* src,
                                   int dstw,
                                   int dsth) {
   resize(src, dst, srcFormat, srcw, srch, dstw, dsth);
-  /*
-    int size = srcw * srch;
-    if (srcw == dstw && srch == dsth) {
-      if (srcFormat == NV12 || srcFormat == NV21) {
-        size = srcw * (floor(1.5 * srch));
-      } else if (srcFormat == BGR || srcFormat == RGB) {
-        size = 3 * srcw * srch;
-      } else if (srcFormat == BGRA || srcFormat == RGBA) {
-        size = 4 * srcw * srch;
-      }
-      memcpy(dst, src, sizeof(uint8_t) * size);
-      return;
-    }
-    double scale_x = static_cast<double>(srcw / dstw);
-    double scale_y = static_cast<double>(srch / dsth);
-
-    int* buf = new int[dstw * 2 + dsth * 2];
-
-    int* xofs = buf;
-    int* yofs = buf + dstw;
-    int16_t* ialpha = reinterpret_cast<int16_t*>(buf + dstw + dsth);
-    int16_t* ibeta = reinterpret_cast<int16_t*>(buf + 2 * dstw + dsth);
-
-    compute_xy(
-        srcw, srch, dstw, dsth, scale_x, scale_y, xofs, yofs, ialpha, ibeta);
-
-    int w_out = dstw;
-    int w_in = srcw;
-    int num = 1;
-    int orih = dsth;
-    if (srcFormat == GRAY) {
-      num = 1;
-    } else if (srcFormat == NV12 || srcFormat == NV21) {
-      num = 1;
-      int hout = static_cast<int>(0.5 * dsth);
-      dsth += hout;
-    } else if (srcFormat == BGR || srcFormat == RGB) {
-      w_in = srcw * 3;
-      w_out = dstw * 3;
-      num = 3;
-
-    } else if (srcFormat == BGRA || srcFormat == RGBA) {
-      w_in = srcw * 4;
-      w_out = dstw * 4;
-      num = 4;
-    }
-
-    int* xofs1 = nullptr;
-    int* yofs1 = nullptr;
-    int16_t* ialpha1 = nullptr;
-    if (orih < dsth) {  // uv
-      int tmp = dsth - orih;
-      int w = dstw / 2;
-      xofs1 = new int[w];
-      yofs1 = new int[tmp];
-      ialpha1 = new int16_t[srcw];
-      compute_xy(srcw / 2,
-                 srch / 2,
-                 w,
-                 tmp,
-                 scale_x,
-                 scale_y,
-                 xofs1,
-                 yofs1,
-                 ialpha1,
-                 ibeta + orih);
-    }
-    int cnt = w_out >> 3;
-    int remain = w_out % 8;
-    int32x4_t _v2 = vdupq_n_s32(2);
-  #pragma omp parallel for
-    for (int dy = 0; dy < dsth; dy++) {
-      int16_t* rowsbuf0 = new int16_t[w_out];
-      int16_t* rowsbuf1 = new int16_t[w_out];
-      int sy = yofs[dy];
-      if (dy >= orih) {
-        xofs = xofs1;
-        yofs = yofs1;
-        ialpha = ialpha1;
-      }
-      if (sy < 0) {
-        memset(rowsbuf0, 0, sizeof(uint16_t) * w_out);
-        const uint8_t* S1 = src + srcw * (sy + 1);
-        const int16_t* ialphap = ialpha;
-        int16_t* rows1p = rowsbuf1;
-        for (int dx = 0; dx < dstw; dx++) {
-          int sx = xofs[dx] * num;  // num = 4
-          int16_t a0 = ialphap[0];
-          int16_t a1 = ialphap[1];
-
-          const uint8_t* S1pl = S1 + sx;
-          const uint8_t* S1pr = S1 + sx + num;
-          if (sx < 0) {
-            S1pl = S1;
-          }
-          for (int i = 0; i < num; i++) {
-            if (sx < 0) {
-              *rows1p++ = ((*S1pl++) * a1) >> 4;
-            } else {
-              *rows1p++ = ((*S1pl++) * a0 + (*S1pr++) * a1) >> 4;
-            }
-          }
-          ialphap += 2;
-        }
-      } else {
-        // hresize two rows
-        const uint8_t* S0 = src + w_in * (sy);
-        const uint8_t* S1 = src + w_in * (sy + 1);
-        const int16_t* ialphap = ialpha;
-        int16_t* rows0p = rowsbuf0;
-        int16_t* rows1p = rowsbuf1;
-        for (int dx = 0; dx < dstw; dx++) {
-          int sx = xofs[dx] * num;  // num = 4
-          int16_t a0 = ialphap[0];
-          int16_t a1 = ialphap[1];
-
-          const uint8_t* S0pl = S0 + sx;
-          const uint8_t* S0pr = S0 + sx + num;
-          const uint8_t* S1pl = S1 + sx;
-          const uint8_t* S1pr = S1 + sx + num;
-          if (sx < 0) {
-            S0pl = S0;
-            S1pl = S1;
-          }
-          for (int i = 0; i < num; i++) {
-            if (sx < 0) {
-              *rows0p = ((*S0pl++) * a1) >> 4;
-              *rows1p = ((*S1pl++) * a1) >> 4;
-              rows0p++;
-              rows1p++;
-            } else {
-              *rows0p++ = ((*S0pl++) * a0 + (*S0pr++) * a1) >> 4;
-              *rows1p++ = ((*S1pl++) * a0 + (*S1pr++) * a1) >> 4;
-            }
-          }
-          ialphap += 2;
-        }
-      }
-      int ind = dy * 2;
-      int16_t b0 = ibeta[ind];
-      int16_t b1 = ibeta[ind + 1];
-      int16x8_t _b0 = vdupq_n_s16(b0);
-      int16x8_t _b1 = vdupq_n_s16(b1);
-      uint8_t* dp_ptr = dst + dy * w_out;
-      int16_t* rows0p = rowsbuf0;
-      int16_t* rows1p = rowsbuf1;
-      int re_cnt = cnt;
-      if (re_cnt > 0) {
-  #ifdef __aarch64__
-        asm volatile(
-            "1: \n"
-            "ld1 {v0.8h}, [%[rows0p]], #16 \n"
-            "ld1 {v1.8h}, [%[rows1p]], #16 \n"
-            "orr v6.16b, %w[_v2].16b, %w[_v2].16b \n"
-            "orr v7.16b, %w[_v2].16b, %w[_v2].16b \n"
-            "smull v2.4s, v0.4h, %w[_b0].4h \n"
-            "smull2 v4.4s, v0.8h, %w[_b0].8h \n"
-            "smull v3.4s, v1.4h, %w[_b1].4h \n"
-            "smull2 v5.4s, v1.8h, %w[_b1].8h \n"
-
-            "ssra v6.4s, v2.4s, #16 \n"
-            "ssra v7.4s, v4.4s, #16 \n"
-            "ssra v6.4s, v3.4s, #16 \n"
-            "ssra v7.4s, v5.4s, #16 \n"
-
-            "shrn v0.4h, v6.4s, #2 \n"
-            "shrn2 v0.8h, v7.4s, #2 \n"
-            "subs %w[cnt], %w[cnt], #1 \n"
-            "sqxtun v1.8b, v0.8h \n"
-            "st1 {v1.8b}, [%[dp]], #8 \n"
-            "bne 1b \n"
-            : [rows0p] "+r"(rows0p),
-              [rows1p] "+r"(rows1p),
-              [cnt] "+r"(re_cnt),
-              [dp] "+r"(dp_ptr)
-            : [_b0] "w"(_b0), [_b1] "w"(_b1), [_v2] "w"(_v2)
-            : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
-  #else
-        asm volatile(
-            "mov        r4, #2          \n"
-            "vdup.s32   q12, r4         \n"
-            "0:                         \n"
-            "vld1.s16   {d2-d3}, [%[rows0p]]!\n"
-            "vld1.s16   {d6-d7}, [%[rows1p]]!\n"
-            "vorr.s32   q10, q12, q12   \n"
-            "vorr.s32   q11, q12, q12   \n"
-
-            "vmull.s16  q0, d2, %[_b0]     \n"
-            "vmull.s16  q1, d3, %[_b0]     \n"
-            "vmull.s16  q2, d6, %[_b1]     \n"
-            "vmull.s16  q3, d7, %[_b1]     \n"
-
-            "vsra.s32   q10, q0, #16    \n"
-            "vsra.s32   q11, q1, #16    \n"
-            "vsra.s32   q10, q2, #16    \n"
-            "vsra.s32   q11, q3, #16    \n"
-
-            "vshrn.s32  d20, q10, #2    \n"
-            "vshrn.s32  d21, q11, #2    \n"
-            "subs       %[cnt], #1          \n"
-            "vqmovun.s16 d20, q10        \n"
-            "vst1.8     {d20}, [%[dp]]!    \n"
-            "bne        0b              \n"
-            : [rows0p] "+r"(rows0p),
-              [rows1p] "+r"(rows1p),
-              [cnt] "+r"(re_cnt),
-              [dp] "+r"(dp_ptr)
-            : [_b0] "w"(_b0), [_b1] "w"(_b1)
-            : "cc",
-              "memory",
-              "r4",
-              "q0",
-              "q1",
-              "q2",
-              "q3",
-              "q8",
-              "q9",
-              "q10",
-              "q11",
-              "q12");
-
-  #endif  // __aarch64__
-      }
-      for (int i = 0; i < remain; i++) {
-        //             D[x] = (rows0[x]*b0 + rows1[x]*b1) >>
-        //             INTER_RESIZE_COEF_BITS;
-        *dp_ptr++ =
-            (uint8_t)(((int16_t)((b0 * (int16_t)(*rows0p++)) >> 16) +
-                       (int16_t)((b1 * (int16_t)(*rows1p++)) >> 16) + 2) >>
-                      2);
-      }
-    }
-    delete[] buf;
-    */
 }
 
 void ImagePreprocess::imageResize(const uint8_t* src, uint8_t* dst) {
diff --git a/lite/utils/cv/paddle_image_preprocess.h b/lite/utils/cv/paddle_image_preprocess.h
index 11673e19041883bfa6ca7a45f03ca3feca76dd20..5a46a9e48e8202fe29ec9fc7d950ccf15920cc32 100644
--- a/lite/utils/cv/paddle_image_preprocess.h
+++ b/lite/utils/cv/paddle_image_preprocess.h
@@ -133,7 +133,7 @@ class ImagePreprocess {
   * color format support 1-channel image, 3-channel image and 4-channel image
   * param src: input image data
   * param dst: output image data
-  * param srcFormat: input image format, support GRAY, BGR(GRB) and BGRA(RGBA)
+  * param srcFormat: input image format, supports GRAY, BGR(RGB) and BGRA(RGBA)
   * param srcw: input image width
   * param srch: input image height
   * param degree: Rotate degree, support 90, 180 and 270
@@ -158,7 +158,7 @@ class ImagePreprocess {
   * color format support 1-channel image, 3-channel image and 4-channel image
   * param src: input image data
   * param dst: output image data
-  * param srcFormat: input image format, support GRAY, BGR(GRB) and BGRA(RGBA)
+  * param srcFormat: input image format, supports GRAY, BGR(RGB) and BGRA(RGBA)
   * param srcw: input image width
   * param srch: input image height
   * param flip_param: flip parameter, support X, Y and XY
@@ -190,7 +190,7 @@ class ImagePreprocess {
   * NCHW
   * param src: input image data
   * param dstTensor: output tensor data
-  * param srcFormat: input image format, support BGR(GRB) and BGRA(RGBA)
+  * param srcFormat: input image format, supports BGR(RGB) and BGRA(RGBA)
   * param srcw: input image width
   * param srch: input image height
   * param layout: output tensor layout，support NHWC and NCHW