未验证 提交 b740c549 编写于 作者: H HappyAngel 提交者: GitHub

[arm]improve sgemv profile (#3241)

* fix ff, test=develop

* delete bgr_flip, test=develop
上级 0276b025
...@@ -983,10 +983,12 @@ void sgemv_trans(const int M, ...@@ -983,10 +983,12 @@ void sgemv_trans(const int M,
"vld1.32 {d8-d11}, [%[in]]! @ load input, q4, q5\n" \ "vld1.32 {d8-d11}, [%[in]]! @ load input, q4, q5\n" \
"vld1.32 {d12-d15}, [%[w0]]! @ load weights r0, q6,q7\n" \ "vld1.32 {d12-d15}, [%[w0]]! @ load weights r0, q6,q7\n" \
"vld1.32 {d16-d19}, [%[w1]]! @ load weights r1, q8,q9\n" \ "vld1.32 {d16-d19}, [%[w1]]! @ load weights r1, q8,q9\n" \
"vld1.32 {d20-d23}, [%[w2]]! @ load weights r2, q10,q11\n" \
"vld1.32 {d24-d27}, [%[w3]]! @ load weights r3, q12,q13\n" \
"vmla.f32 q0, q4, q6 @ mul add\n" \ "vmla.f32 q0, q4, q6 @ mul add\n" \
"vld1.32 {d20-d23}, [%[w2]]! @ load weights r2, q10,q11\n" \
"vmla.f32 q1, q4, q8 @ mul add\n" \ "vmla.f32 q1, q4, q8 @ mul add\n" \
"vld1.32 {d24-d27}, [%[w3]]! @ load weights r3, q12,q13\n" \
/*"vmla.f32 q0, q4, q6 @ mul add\n" */ \
/*"vmla.f32 q1, q4, q8 @ mul add\n" */ \
"vmla.f32 q2, q4, q10 @ mul add\n" \ "vmla.f32 q2, q4, q10 @ mul add\n" \
"vmla.f32 q3, q4, q12 @ mul add\n" \ "vmla.f32 q3, q4, q12 @ mul add\n" \
"subs %[cnt], #1 @ sub loop count \n" \ "subs %[cnt], #1 @ sub loop count \n" \
......
if(LITE_WITH_CV AND (NOT LITE_WITH_FPGA) AND LITE_WITH_ARM) if(LITE_WITH_CV AND (NOT LITE_WITH_FPGA) AND LITE_WITH_ARM)
lite_cc_library(paddle_cv_arm SRCS lite_cc_library(paddle_cv_arm SRCS
image_convert.cc image_convert.cc
bgr_rotate.cc
paddle_image_preprocess.cc paddle_image_preprocess.cc
image2tensor.cc image2tensor.cc
image_flip.cc image_flip.cc
......
此差异已折叠。
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
namespace paddle {
namespace lite {
namespace utils {
namespace cv {
// Declaration of the rotation routine for 8-bit image buffers; the
// definition is not part of this header.
//
// Parameters:
//   src   - read-only input byte buffer.
//   dst   - output byte buffer written by the routine.
//   w_in  - input width.
//   h_in  - input height.
//   angle - rotation angle as an integer.
//
// NOTE(review): the name suggests 3-channel interleaved (HWC) BGR data and
// w_in/h_in measured in pixels -- confirm against the implementation.
// NOTE(review): the set of supported angle values and the required size of
// dst are not visible from this declaration -- confirm before calling.
void bgr_rotate_hwc(
const uint8_t* src, uint8_t* dst, int w_in, int h_in, int angle);
} // namespace cv
} // namespace utils
} // namespace lite
} // namespace paddle
...@@ -683,7 +683,6 @@ void resize(const uint8_t* src, ...@@ -683,7 +683,6 @@ void resize(const uint8_t* src,
w_in = srcw * 3; w_in = srcw * 3;
w_out = dstw * 3; w_out = dstw * 3;
num = 3; num = 3;
} else if (srcFormat == BGRA || srcFormat == RGBA) { } else if (srcFormat == BGRA || srcFormat == RGBA) {
w_in = srcw * 4; w_in = srcw * 4;
w_out = dstw * 4; w_out = dstw * 4;
...@@ -725,10 +724,10 @@ void resize(const uint8_t* src, ...@@ -725,10 +724,10 @@ void resize(const uint8_t* src,
int remain = w_out % 8; int remain = w_out % 8;
int32x4_t _v2 = vdupq_n_s32(2); int32x4_t _v2 = vdupq_n_s32(2);
int prev_sy1 = -1; int prev_sy1 = -1;
int16_t* rowsbuf0 = new int16_t[w_out + 1];
int16_t* rowsbuf1 = new int16_t[w_out + 1];
#pragma omp parallel for #pragma omp parallel for
for (int dy = 0; dy < dsth; dy++) { for (int dy = 0; dy < dsth; dy++) {
int16_t* rowsbuf0 = new int16_t[w_out + 1];
int16_t* rowsbuf1 = new int16_t[w_out + 1];
int sy = yofs[dy]; int sy = yofs[dy];
if (dy >= orih) { if (dy >= orih) {
xofs = xofs1; xofs = xofs1;
...@@ -852,8 +851,6 @@ void resize(const uint8_t* src, ...@@ -852,8 +851,6 @@ void resize(const uint8_t* src,
2); 2);
} }
ibeta += 2; ibeta += 2;
delete[] rowsbuf0;
delete[] rowsbuf1;
} }
if (orih < dsth) { // uv if (orih < dsth) { // uv
delete[] xofs1; delete[] xofs1;
...@@ -861,6 +858,8 @@ void resize(const uint8_t* src, ...@@ -861,6 +858,8 @@ void resize(const uint8_t* src,
delete[] ialpha1; delete[] ialpha1;
} }
delete[] buf; delete[] buf;
delete[] rowsbuf0;
delete[] rowsbuf1;
} }
// compute xofs, yofs, alpha, beta // compute xofs, yofs, alpha, beta
void compute_xy(int srcw, void compute_xy(int srcw,
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include "lite/utils/cv/image_rotate.h" #include "lite/utils/cv/image_rotate.h"
#include <math.h> #include <math.h>
#include <string.h> #include <string.h>
#include "lite/utils/cv/bgr_rotate.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
namespace utils { namespace utils {
...@@ -31,7 +32,8 @@ void ImageRotate::choose(const uint8_t* src, ...@@ -31,7 +32,8 @@ void ImageRotate::choose(const uint8_t* src,
if (srcFormat == GRAY) { if (srcFormat == GRAY) {
rotate_hwc1(src, dst, srcw, srch, degree); rotate_hwc1(src, dst, srcw, srch, degree);
} else if (srcFormat == BGR || srcFormat == RGB) { } else if (srcFormat == BGR || srcFormat == RGB) {
rotate_hwc3(src, dst, srcw, srch, degree); // rotate_hwc3(src, dst, srcw, srch, degree);
bgr_rotate_hwc(src, dst, srcw, srch, static_cast<int>(degree));
} else if (srcFormat == BGRA || srcFormat == RGBA) { } else if (srcFormat == BGRA || srcFormat == RGBA) {
rotate_hwc4(src, dst, srcw, srch, degree); rotate_hwc4(src, dst, srcw, srch, degree);
} else { } else {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册