[arm]improve sgemv profile (#3241)

* fxi ff, test=develop * delete bgr_flip, test=develop

[arm]improve sgemv profile (#3241)
* fxi ff, test=develop * delete bgr_flip, test=develop
b740c549 · HappyAngel · GitHub · 0276b025 · b740c549 · b740c549
6 changed file
--- a/lite/backends/arm/math/sgemv.cc
+++ b/lite/backends/arm/math/sgemv.cc
@@ -983,10 +983,12 @@ void sgemv_trans(const int M,
  "vld1.32 {d8-d11}, [%[in]]!     @ load input, q4, q5\n"                      \
  "vld1.32 {d12-d15}, [%[w0]]!    @ load weights r0, q6,q7\n"                  \
  "vld1.32 {d16-d19}, [%[w1]]!    @ load weights r1, q8,q9\n"                  \
-  "vld1.32 {d20-d23}, [%[w2]]!    @ load weights r2, q10,q11\n"                \
-  "vld1.32 {d24-d27}, [%[w3]]!    @ load weights r3, q12,q13\n"                \
  "vmla.f32 q0, q4, q6            @ mul add\n"                                 \
+  "vld1.32 {d20-d23}, [%[w2]]!    @ load weights r2, q10,q11\n"                \
  "vmla.f32 q1, q4, q8            @ mul add\n"                                 \
+  "vld1.32 {d24-d27}, [%[w3]]!    @ load weights r3, q12,q13\n"                \
+  /*"vmla.f32 q0, q4, q6            @ mul add\n" */                            \
+  /*"vmla.f32 q1, q4, q8            @ mul add\n" */                            \
  "vmla.f32 q2, q4, q10           @ mul add\n"                                 \
  "vmla.f32 q3, q4, q12           @ mul add\n"                                 \
  "subs %[cnt], #1                @ sub loop count \n"                         \

--- a/lite/utils/cv/CMakeLists.txt
+++ b/lite/utils/cv/CMakeLists.txt
 if(LITE_WITH_CV AND (NOT LITE_WITH_FPGA) AND LITE_WITH_ARM)
    lite_cc_library(paddle_cv_arm SRCS
            image_convert.cc
+            bgr_rotate.cc
            paddle_image_preprocess.cc
            image2tensor.cc
            image_flip.cc

--- a/lite/utils/cv/bgr_rotate.cc
+++ b/lite/utils/cv/bgr_rotate.cc
--- a/lite/utils/cv/bgr_rotate.h
+++ b/lite/utils/cv/bgr_rotate.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <stdint.h>
+namespace paddle {
+namespace lite {
+namespace utils {
+namespace cv {
+void bgr_rotate_hwc(
+    const uint8_t* src, uint8_t* dst, int w_in, int h_in, int angle);
+}  // namespace cv
+}  // namespace utils
+}  // namespace lite
+}  // namespace paddle
--- a/lite/utils/cv/image_resize.cc
+++ b/lite/utils/cv/image_resize.cc
@@ -683,7 +683,6 @@ void resize(const uint8_t* src,
    w_in = srcw * 3;
    w_out = dstw * 3;
    num = 3;
-
  } else if (srcFormat == BGRA || srcFormat == RGBA) {
    w_in = srcw * 4;
    w_out = dstw * 4;
@@ -725,10 +724,10 @@ void resize(const uint8_t* src,
  int remain = w_out % 8;
  int32x4_t _v2 = vdupq_n_s32(2);
  int prev_sy1 = -1;
-#pragma omp parallel for
-  for (int dy = 0; dy < dsth; dy++) {
  int16_t* rowsbuf0 = new int16_t[w_out + 1];
  int16_t* rowsbuf1 = new int16_t[w_out + 1];
+#pragma omp parallel for
+  for (int dy = 0; dy < dsth; dy++) {
    int sy = yofs[dy];
    if (dy >= orih) {
      xofs = xofs1;
@@ -852,8 +851,6 @@ void resize(const uint8_t* src,
                    2);
    }
    ibeta += 2;
-    delete[] rowsbuf0;
-    delete[] rowsbuf1;
  }
  if (orih < dsth) {  // uv
    delete[] xofs1;
@@ -861,6 +858,8 @@ void resize(const uint8_t* src,
    delete[] ialpha1;
  }
  delete[] buf;
+  delete[] rowsbuf0;
+  delete[] rowsbuf1;
 }
 // compute xofs, yofs, alpha, beta
 void compute_xy(int srcw,

--- a/lite/utils/cv/image_rotate.cc
+++ b/lite/utils/cv/image_rotate.cc
@@ -15,6 +15,7 @@
 #include "lite/utils/cv/image_rotate.h"
 #include <math.h>
 #include <string.h>
+#include "lite/utils/cv/bgr_rotate.h"
 namespace paddle {
 namespace lite {
 namespace utils {
@@ -31,7 +32,8 @@ void ImageRotate::choose(const uint8_t* src,
  if (srcFormat == GRAY) {
    rotate_hwc1(src, dst, srcw, srch, degree);
  } else if (srcFormat == BGR || srcFormat == RGB) {
-    rotate_hwc3(src, dst, srcw, srch, degree);
+    // rotate_hwc3(src, dst, srcw, srch, degree);
+    bgr_rotate_hwc(src, dst, srcw, srch, static_cast<int>(degree));
  } else if (srcFormat == BGRA || srcFormat == RGBA) {
    rotate_hwc4(src, dst, srcw, srch, degree);
  } else {