未验证 提交 b740c549 编写于 作者: H HappyAngel 提交者: GitHub

[arm]improve sgemv profile (#3241)

* fix ff, test=develop

* delete bgr_flip, test=develop
上级 0276b025
......@@ -983,10 +983,12 @@ void sgemv_trans(const int M,
"vld1.32 {d8-d11}, [%[in]]! @ load input, q4, q5\n" \
"vld1.32 {d12-d15}, [%[w0]]! @ load weights r0, q6,q7\n" \
"vld1.32 {d16-d19}, [%[w1]]! @ load weights r1, q8,q9\n" \
"vld1.32 {d20-d23}, [%[w2]]! @ load weights r2, q10,q11\n" \
"vld1.32 {d24-d27}, [%[w3]]! @ load weights r3, q12,q13\n" \
"vmla.f32 q0, q4, q6 @ mul add\n" \
"vld1.32 {d20-d23}, [%[w2]]! @ load weights r2, q10,q11\n" \
"vmla.f32 q1, q4, q8 @ mul add\n" \
"vld1.32 {d24-d27}, [%[w3]]! @ load weights r3, q12,q13\n" \
/*"vmla.f32 q0, q4, q6 @ mul add\n" */ \
/*"vmla.f32 q1, q4, q8 @ mul add\n" */ \
"vmla.f32 q2, q4, q10 @ mul add\n" \
"vmla.f32 q3, q4, q12 @ mul add\n" \
"subs %[cnt], #1 @ sub loop count \n" \
......
if(LITE_WITH_CV AND (NOT LITE_WITH_FPGA) AND LITE_WITH_ARM)
lite_cc_library(paddle_cv_arm SRCS
image_convert.cc
bgr_rotate.cc
paddle_image_preprocess.cc
image2tensor.cc
image_flip.cc
......
此差异已折叠。
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
namespace paddle {
namespace lite {
namespace utils {
namespace cv {
// Declaration of a rotation routine for 8-bit image buffers; the definition
// lives in a separate translation unit that is not part of this header.
//
// Parameters:
//   src   - read-only input buffer of uint8_t elements.
//   dst   - output buffer of uint8_t elements.
//           NOTE(review): presumably must not alias src and must be large
//           enough for the rotated image - confirm against the definition.
//   w_in  - presumably the width of the source image in pixels - TODO confirm.
//   h_in  - presumably the height of the source image in pixels - TODO confirm.
//   angle - rotation angle passed as an int.
//           NOTE(review): the set of supported angles is decided by the
//           implementation and cannot be determined from this declaration.
//
// NOTE(review): the name suggests 3-channel (BGR) data in HWC layout; verify
// against the implementation before relying on it for other formats.
void bgr_rotate_hwc(
const uint8_t* src, uint8_t* dst, int w_in, int h_in, int angle);
} // namespace cv
} // namespace utils
} // namespace lite
} // namespace paddle
......@@ -683,7 +683,6 @@ void resize(const uint8_t* src,
w_in = srcw * 3;
w_out = dstw * 3;
num = 3;
} else if (srcFormat == BGRA || srcFormat == RGBA) {
w_in = srcw * 4;
w_out = dstw * 4;
......@@ -725,10 +724,10 @@ void resize(const uint8_t* src,
int remain = w_out % 8;
int32x4_t _v2 = vdupq_n_s32(2);
int prev_sy1 = -1;
int16_t* rowsbuf0 = new int16_t[w_out + 1];
int16_t* rowsbuf1 = new int16_t[w_out + 1];
#pragma omp parallel for
for (int dy = 0; dy < dsth; dy++) {
int16_t* rowsbuf0 = new int16_t[w_out + 1];
int16_t* rowsbuf1 = new int16_t[w_out + 1];
int sy = yofs[dy];
if (dy >= orih) {
xofs = xofs1;
......@@ -852,8 +851,6 @@ void resize(const uint8_t* src,
2);
}
ibeta += 2;
delete[] rowsbuf0;
delete[] rowsbuf1;
}
if (orih < dsth) { // uv
delete[] xofs1;
......@@ -861,6 +858,8 @@ void resize(const uint8_t* src,
delete[] ialpha1;
}
delete[] buf;
delete[] rowsbuf0;
delete[] rowsbuf1;
}
// compute xofs, yofs, alpha, beta
void compute_xy(int srcw,
......
......@@ -15,6 +15,7 @@
#include "lite/utils/cv/image_rotate.h"
#include <math.h>
#include <string.h>
#include "lite/utils/cv/bgr_rotate.h"
namespace paddle {
namespace lite {
namespace utils {
......@@ -31,7 +32,8 @@ void ImageRotate::choose(const uint8_t* src,
if (srcFormat == GRAY) {
rotate_hwc1(src, dst, srcw, srch, degree);
} else if (srcFormat == BGR || srcFormat == RGB) {
rotate_hwc3(src, dst, srcw, srch, degree);
// rotate_hwc3(src, dst, srcw, srch, degree);
bgr_rotate_hwc(src, dst, srcw, srch, static_cast<int>(degree));
} else if (srcFormat == BGRA || srcFormat == RGBA) {
rotate_hwc4(src, dst, srcw, srch, degree);
} else {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册