Unverified commit 08f6217b, authored by Xiaoyang LI, committed by GitHub

cherry-pick: fix bias quantize error && fix clang build error, test=develop (#2065)

Parent 66d7398e
@@ -165,6 +165,11 @@ function(lite_cc_binary TARGET)
)
cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ${args_DEPS})
target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
# strip binary target to reduce size
add_custom_command(TARGET ${TARGET} POST_BUILD
COMMAND "${CMAKE_STRIP}" -s
"${TARGET}"
COMMENT "Strip debug symbols from the final executable.")
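# NOTE: when cross-compiling, ${CMAKE_STRIP} is expected to come from the
# toolchain file (e.g. the Android NDK's), so the strip binary matches the
# target ABI.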
# collect targets need to compile for lite
if (NOT args_EXCLUDE_COMPILE_DEPS)
add_dependencies(lite_compile_deps ${TARGET})
@@ -207,6 +212,11 @@ function(lite_cc_test TARGET)
HVY_DEPS ${args_HVY_DEPS}
)
_lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS})
# strip binary target to reduce size
add_custom_command(TARGET ${TARGET} POST_BUILD
COMMAND "${CMAKE_STRIP}" -s
"${TARGET}"
COMMENT "Strip debug symbols from the final executable.")
target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
file(APPEND ${offline_test_registry_file} "${TARGET}\n")
......
@@ -68,7 +68,6 @@ class LITE_API Predictor {
GenRuntimeProgram();
}
program_->Run();
LOG(INFO) << "running";
}
// Get offset-th col of feed inputs.
......
@@ -58,8 +58,9 @@ void TestModel(const std::vector<Place>& valid_places,
std::vector<std::vector<float>> results;
// i = 1
// ground truth result from fluid
results.emplace_back(std::vector<float>(
{0.000227548, 0.000262385, 0.000260347, 0.000293865, 0.00025008}));
{0.0002451055, 0.0002585023, 0.0002659616, 0.0002823}));
auto* out = predictor.GetOutput(0);
ASSERT_EQ(out->dims().size(), 2);
ASSERT_EQ(out->dims()[0], 1);
......
@@ -6,6 +6,17 @@ if(NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM))
return()
endif()
set(script_dir ${CMAKE_CURRENT_SOURCE_DIR}/../../../tools/)
message(STATUS "generating arm dotprod code")
find_package(PythonInterp REQUIRED)
execute_process(COMMAND ${PYTHON_EXECUTABLE} ${script_dir}/convert_arm_sdot_to_machine_code.py
"--input_file=${CMAKE_CURRENT_SOURCE_DIR}/dotprod/__gemm_sdot_meta__.h"
"--output_file=${CMAKE_CURRENT_SOURCE_DIR}/dotprod/gemm_sdot.h"
RESULT_VARIABLE gen_code_ret)
if (NOT ${gen_code_ret} STREQUAL "0")
message(FATAL_ERROR "generating dotprod code quit with error: ${gen_code_ret}")
endif ()
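# Rationale (presumably the clang build fix named in the commit title): the
# Python script rewrites ARMv8.2 sdot instructions into raw ".word"
# machine-code encodings, so the kernels still assemble with toolchains whose
# assembler (e.g. clang's integrated assembler) lacks dotprod support.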
set(HAS_ARM_MATH_LIB_DIR OFF)
# will search name as "libmath_arm.${os}.${abi}.${lang}.a"
if(ARM_MATH_LIB_DIR AND EXISTS "${ARM_MATH_LIB_DIR}")
@@ -50,6 +61,25 @@ if (NOT HAS_ARM_MATH_LIB_DIR)
funcs.cc
packed_sgemm.cc
sgemm.cc
gemm_prepacked_int8.cc
gemm_s8.cc
sgemv.cc
gemv_arm_int8.cc
conv3x3s1_direct_fp32.cc
conv3x3s2_direct_fp32.cc
conv3x3s1_depthwise_fp32.cc
conv3x3s2_depthwise_fp32.cc
conv3x3s1_direct_int8.cc
conv3x3s2_direct_int8.cc
conv3x3s1_depthwise_int8.cc
conv3x3s2_depthwise_int8.cc
conv5x5s1_depthwise_int8.cc
conv5x5s1_depthwise_fp32.cc
conv5x5s2_depthwise_fp32.cc
conv_depthwise_3x3p0.cc
conv_depthwise_3x3p1.cc
conv_winograd_3x3.cc
conv_impl.cc
softmax.cc
scale.cc
pooling.cc
@@ -57,32 +87,13 @@ if (NOT HAS_ARM_MATH_LIB_DIR)
lrn.cc
decode_bboxes.cc
concat.cc
sgemv.cc
type_trans.cc
box_coder.cc
conv_impl.cc
conv_direct_3x3s1.cc
conv_direct_3x3s2.cc
conv_direct.cc
conv_depthwise_3x3_int8.cc
conv_depthwise_5x5s1_int8.cc
conv_depthwise_3x3p0.cc
conv_depthwise_3x3p1.cc
conv_depthwise_5x5s1.cc
conv_depthwise_5x5s2.cc
conv_depthwise.cc
conv_gemmlike.cc
conv_winograd_3x3.cc
conv_winograd.cc
split.cc
shuffle_channel.cc
activation.cc
yolo_box.cc
dropout.cc
gemm_prepacked_int8.cc
gemv_arm_int8.cc
conv3x3s1_direct_int8.cc
conv3x3s2_direct_int8.cc
power.cc
interpolate.cc
argmax.cc
......
(2 collapsed file diffs not shown.)
@@ -51,12 +51,7 @@ void conv_3x3s1_direct_fp32(const float* i_data,
const int win_round = wout_round + 2;
bool flag_relu = param.fuse_relu;
bool flag_bias = param.bias != nullptr;
// if (param.activation_param.has_active) {
// if (param.activation_param.active == Active_relu &&
// fabs(param.activation_param.negative_slope) < 1e-6f) {
// flag_relu = true;
// }
// }
int hout_r_block = (l2_size - 2 * win_round * ic) /
(win_round * ic + hout_c_block * wout_round * threads);
hout_r_block = hout_r_block > oh ? oh : hout_r_block;
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <arm_neon.h>
#include "lite/backends/arm/math/conv_block_utils.h"
#include "lite/backends/arm/math/conv_impl.h"
#include "lite/core/context.h"
#include "lite/operators/op_params.h"
#ifdef ARM_WITH_OMP
#include <omp.h>
#endif
namespace paddle {
namespace lite {
namespace arm {
namespace math {
void conv_3x3s2_depthwise_fp32(const float* i_data,
float* o_data,
int bs,
int oc,
int oh,
int ow,
int ic,
int ih,
int win,
const float* weights,
const float* bias,
const operators::ConvParam& param,
ARMContext* ctx) {
int threads = ctx->threads();
const int pad_h = param.paddings[0];
const int pad_w = param.paddings[1];
const int out_c_block = 4;
const int out_h_kernel = 1;
const int out_w_kernel = 4;
const int win_ext = ow * 2 + 1;
const int ow_round = ROUNDUP(ow, 4);
const int win_round = ROUNDUP(win_ext, 4);
const int hin_round = oh * 2 + 1;
const int prein_size = win_round * hin_round * out_c_block;
auto workspace_size =
threads * prein_size + win_round /*tmp zero*/ + ow_round /*tmp writer*/;
ctx->ExtendWorkspace(sizeof(float) * workspace_size);
bool flag_relu = param.fuse_relu;
bool flag_bias = param.bias != nullptr;
/// get workspace
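// workspace layout: | zero row: win_round | write-back scratch: ow_round |
// per-thread pre-packed input: threads * prein_size |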
auto ptr_zero = ctx->workspace_data<float>();
memset(ptr_zero, 0, sizeof(float) * win_round);
float* ptr_write = ptr_zero + win_round;
int size_in_channel = win * ih;
int size_out_channel = ow * oh;
int ws = -pad_w;
int we = ws + win_round;
int hs = -pad_h;
int he = hs + hin_round;
int w_loop = ow_round / 4;
auto remain = w_loop * 4 - ow;
bool flag_remain = remain > 0;
remain = 4 - remain;
remain = remain > 0 ? remain : 0;
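// after the clamp, `remain` holds the number of valid output columns in the
// last 4-wide block; flag_remain marks that block as partial (see flag_mask
// below)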
int row_len = win_round * out_c_block;
for (int n = 0; n < bs; ++n) {
const float* din_batch = i_data + n * ic * size_in_channel;
float* dout_batch = o_data + n * oc * size_out_channel;
#pragma omp parallel for num_threads(threads)
for (int c = 0; c < oc; c += out_c_block) {
#ifdef ARM_WITH_OMP
float* pre_din = ptr_write + ow_round + omp_get_thread_num() * prein_size;
#else
float* pre_din = ptr_write + ow_round;
#endif
/// const array size
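// prepack_input_nxwc4_dw gathers channels [c, c + 4) over rows [hs, he) and
// cols [ws, we) into a channel-interleaved (c4) buffer, substituting ptr_zero
// for out-of-range padding rows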
prepack_input_nxwc4_dw(
din_batch, pre_din, c, hs, he, ws, we, ic, win, ih, ptr_zero);
const float* weight_c = weights + c * 9; // kernel_w * kernel_h
float* dout_c00 = dout_batch + c * size_out_channel;
float bias_local[4] = {0, 0, 0, 0};
if (flag_bias) {
bias_local[0] = bias[c];
bias_local[1] = bias[c + 1];
bias_local[2] = bias[c + 2];
bias_local[3] = bias[c + 3];
}
#ifdef __aarch64__
float32x4_t w0 = vld1q_f32(weight_c); // w0, v23
float32x4_t w1 = vld1q_f32(weight_c + 4); // w1, v24
float32x4_t w2 = vld1q_f32(weight_c + 8); // w2, v25
float32x4_t w3 = vld1q_f32(weight_c + 12); // w3, v26
float32x4_t w4 = vld1q_f32(weight_c + 16); // w4, v27
float32x4_t w5 = vld1q_f32(weight_c + 20); // w5, v28
float32x4_t w6 = vld1q_f32(weight_c + 24); // w6, v29
float32x4_t w7 = vld1q_f32(weight_c + 28); // w7, v30
float32x4_t w8 = vld1q_f32(weight_c + 32); // w8, v31
#endif
for (int h = 0; h < oh; h += out_h_kernel) {
float* outc0 = dout_c00 + h * ow;
float* outc1 = outc0 + size_out_channel;
float* outc2 = outc1 + size_out_channel;
float* outc3 = outc2 + size_out_channel;
const float* inr0 = pre_din + h * 2 * row_len;
const float* inr1 = inr0 + row_len;
const float* inr2 = inr1 + row_len;
if (c + out_c_block > oc) {
// intentional fall-through: every output channel beyond oc is redirected
// to the ptr_write scratch buffer
switch (c + out_c_block - oc) {
case 3:
outc1 = ptr_write;  // fall through
case 2:
outc2 = ptr_write;  // fall through
case 1:
outc3 = ptr_write;
default:
break;
}
}
auto c0 = outc0;
auto c1 = outc1;
auto c2 = outc2;
auto c3 = outc3;
float pre_out[16];
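// pre_out is a 4x4 staging tile: for a partial last block the asm stores
// here instead of the real output rows, and only the valid `remain` columns
// are copied out in the flag_mask epilogue below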
for (int w = 0; w < w_loop; ++w) {
bool flag_mask = (w == w_loop - 1) && flag_remain;
if (flag_mask) {
c0 = outc0;
c1 = outc1;
c2 = outc2;
c3 = outc3;
outc0 = pre_out;
outc1 = pre_out + 4;
outc2 = pre_out + 8;
outc3 = pre_out + 12;
}
// clang-format off
#ifdef __aarch64__
asm volatile(
"ldr q8, [%[bias]]\n" /* load bias */
"ldp q0, q1, [%[inr0]], #32\n" /* load input r0*/
"and v19.16b, v8.16b, v8.16b\n"
"ldp q2, q3, [%[inr0]], #32\n" /* load input r0*/
"and v20.16b, v8.16b, v8.16b\n"
"ldp q4, q5, [%[inr0]], #32\n" /* load input r0*/
"and v21.16b, v8.16b, v8.16b\n"
"ldp q6, q7, [%[inr0]], #32\n" /* load input r0*/
"and v22.16b, v8.16b, v8.16b\n"
"ldr q8, [%[inr0]]\n" /* load input r0*/
/* r0 mul w0-w2, get out */
"fmla v19.4s , %[w0].4s, v0.4s\n" /* outr0 = w0 * r0, 0*/
"fmla v20.4s , %[w0].4s, v2.4s\n" /* outr1 = w0 * r0, 2*/
"fmla v21.4s , %[w0].4s, v4.4s\n" /* outr2 = w0 * r0, 4*/
"fmla v22.4s , %[w0].4s, v6.4s\n" /* outr3 = w0 * r0, 6*/
"fmla v19.4s , %[w1].4s, v1.4s\n" /* outr0 = w1 * r0, 1*/
"ldp q0, q1, [%[inr1]], #32\n" /* load input r1*/
"fmla v20.4s , %[w1].4s, v3.4s\n" /* outr1 = w1 * r0, 3*/
"fmla v21.4s , %[w1].4s, v5.4s\n" /* outr2 = w1 * r0, 5*/
"fmla v22.4s , %[w1].4s, v7.4s\n" /* outr3 = w1 * r0, 7*/
"fmla v19.4s , %[w2].4s, v2.4s\n" /* outr0 = w0 * r0, 2*/
"ldp q2, q3, [%[inr1]], #32\n" /* load input r1*/
"fmla v20.4s , %[w2].4s, v4.4s\n" /* outr1 = w0 * r0, 4*/
"ldp q4, q5, [%[inr1]], #32\n" /* load input r1*/
"fmla v21.4s , %[w2].4s, v6.4s\n" /* outr2 = w0 * r0, 6*/
"ldp q6, q7, [%[inr1]], #32\n" /* load input r1*/
"fmla v22.4s , %[w2].4s, v8.4s\n" /* outr3 = w0 * r0, 8*/
"ldr q8, [%[inr1]]\n" /* load input r1*/
/* r1, mul w3-w5, get out */
"fmla v19.4s , %[w3].4s, v0.4s\n" /* outr0 = w3 * r1, 0*/
"fmla v20.4s , %[w3].4s, v2.4s\n" /* outr1 = w3 * r1, 2*/
"fmla v21.4s , %[w3].4s, v4.4s\n" /* outr2 = w3 * r1, 4*/
"fmla v22.4s , %[w3].4s, v6.4s\n" /* outr3 = w3 * r1, 6*/
"fmla v19.4s , %[w4].4s, v1.4s\n" /* outr0 = w4 * r1, 1*/
"ldp q0, q1, [%[inr2]], #32\n" /* load input r2*/
"fmla v20.4s , %[w4].4s, v3.4s\n" /* outr1 = w4 * r1, 3*/
"fmla v21.4s , %[w4].4s, v5.4s\n" /* outr2 = w4 * r1, 5*/
"fmla v22.4s , %[w4].4s, v7.4s\n" /* outr3 = w4 * r1, 7*/
"fmla v19.4s , %[w5].4s, v2.4s\n" /* outr0 = w5 * r1, 2*/
"ldp q2, q3, [%[inr2]], #32\n" /* load input r2*/
"fmla v20.4s , %[w5].4s, v4.4s\n" /* outr1 = w5 * r1, 4*/
"ldp q4, q5, [%[inr2]], #32\n" /* load input r2*/
"fmla v21.4s , %[w5].4s, v6.4s\n" /* outr2 = w5 * r1, 6*/
"ldp q6, q7, [%[inr2]], #32\n" /* load input r2*/
"fmla v22.4s , %[w5].4s, v8.4s\n" /* outr3 = w5 * r1, 8*/
"ldr q8, [%[inr2]]\n" /* load input r2*/
/* r2, mul w6-w8, get out r0, r1 */
"fmla v19.4s , %[w6].4s, v0.4s\n" /* outr0 = w6 * r2, 0*/
"fmla v20.4s , %[w6].4s, v2.4s\n" /* outr1 = w6 * r2, 2*/
"fmla v21.4s , %[w6].4s, v4.4s\n" /* outr2 = w6 * r2, 4*/
"fmla v22.4s , %[w6].4s, v6.4s\n" /* outr3 = w6 * r2, 6*/
"fmla v19.4s , %[w7].4s, v1.4s\n" /* outr0 = w7 * r2, 1*/
"fmla v20.4s , %[w7].4s, v3.4s\n" /* outr1 = w7 * r2, 3*/
"fmla v21.4s , %[w7].4s, v5.4s\n" /* outr2 = w7 * r2, 5*/
"fmla v22.4s , %[w7].4s, v7.4s\n" /* outr3 = w7 * r2, 7*/
"fmla v19.4s , %[w8].4s, v2.4s\n" /* outr0 = w8 * r2, 2*/
"fmla v20.4s , %[w8].4s, v4.4s\n" /* outr1 = w8 * r2, 4*/
"fmla v21.4s , %[w8].4s, v6.4s\n" /* outr2 = w8 * r2, 6*/
"fmla v22.4s , %[w8].4s, v8.4s\n" /* outr3 = w8 * r2, 8*/
/* transpose */
"trn1 v0.4s, v19.4s, v20.4s\n" /* r0: a0a1c0c1*/
"trn2 v1.4s, v19.4s, v20.4s\n" /* r0: b0b1d0d1*/
"trn1 v2.4s, v21.4s, v22.4s\n" /* r0: a2a3c2c3*/
"trn2 v3.4s, v21.4s, v22.4s\n" /* r0: b2b3d2d3*/
"trn1 v19.2d, v0.2d, v2.2d\n" /* r0: a0a1a2a3*/
"trn2 v21.2d, v0.2d, v2.2d\n" /* r0: c0c1c2c3*/
"trn1 v20.2d, v1.2d, v3.2d\n" /* r0: b0b1b2b3*/
"trn2 v22.2d, v1.2d, v3.2d\n" /* r0: d0d1d2d3*/
/* relu */
"cbz %w[flag_relu], 0f\n" /* skip relu*/
"movi v0.4s, #0\n" /* for relu */
"fmax v19.4s, v19.4s, v0.4s\n"
"fmax v20.4s, v20.4s, v0.4s\n"
"fmax v21.4s, v21.4s, v0.4s\n"
"fmax v22.4s, v22.4s, v0.4s\n"
/* save result */
"0:\n"
"str q19, [%[outc0]], #16\n"
"str q20, [%[outc1]], #16\n"
"str q21, [%[outc2]], #16\n"
"str q22, [%[outc3]], #16\n"
:[inr0] "+r"(inr0), [inr1] "+r"(inr1),
[inr2] "+r"(inr2),
[outc0]"+r"(outc0), [outc1]"+r"(outc1),
[outc2]"+r"(outc2), [outc3]"+r"(outc3)
:[w0] "w"(w0), [w1] "w"(w1), [w2] "w"(w2),
[w3] "w"(w3), [w4] "w"(w4), [w5] "w"(w5),
[w6] "w"(w6), [w7] "w"(w7), [w8] "w"(w8),
[bias] "r" (bias_local), [flag_relu]"r"(flag_relu)
: "cc", "memory",
"v0","v1","v2","v3","v4","v5","v6","v7",
"v8", "v19","v20","v21","v22"
);
#else
asm volatile(
/* fill with bias */
"vld1.32 {d16-d17}, [%[bias]]\n" /* load bias */
/* load weights */
"vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w0-2, to q9-11 */
"vld1.32 {d0-d3}, [%[r0]]!\n" /* load input r0, 0,1*/
"vand.i32 q12, q8, q8\n"
"vld1.32 {d4-d7}, [%[r0]]!\n" /* load input r0, 2,3*/
"vand.i32 q13, q8, q8\n"
"vld1.32 {d8-d11}, [%[r0]]!\n" /* load input r0, 4,5*/
"vand.i32 q14, q8, q8\n"
"vld1.32 {d12-d15}, [%[r0]]!\n" /* load input r0, 6,7*/
"vand.i32 q15, q8, q8\n"
"vld1.32 {d16-d17}, [%[r0]]\n" /* load input r0, 8*/
/* mul r0 with w0, w1, w2 */
"vmla.f32 q12, q9, q0 @ w0 * inr0\n"
"vmla.f32 q13, q9, q2 @ w0 * inr2\n"
"vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w2, to q11 */
"vmla.f32 q14, q9, q4 @ w0 * inr4\n"
"vmla.f32 q15, q9, q6 @ w0 * inr6\n"
"vmla.f32 q12, q10, q1 @ w1 * inr1\n"
"vld1.32 {d0-d3}, [%[r1]]! @ load r1, 0, 1\n"
"vmla.f32 q13, q10, q3 @ w1 * inr3\n"
"vmla.f32 q14, q10, q5 @ w1 * inr5\n"
"vmla.f32 q15, q10, q7 @ w1 * inr7\n"
"vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w3-4, to q9-10 */
"vmla.f32 q12, q11, q2 @ w2 * inr2\n"
"vld1.32 {d4-d7}, [%[r1]]! @ load r1, 2, 3\n"
"vmla.f32 q13, q11, q4 @ w2 * inr4\n"
"vld1.32 {d8-d11}, [%[r1]]! @ load r1, 4, 5\n"
"vmla.f32 q14, q11, q6 @ w2 * inr6\n"
"vld1.32 {d12-d15}, [%[r1]]! @ load r1, 6, 7\n"
"vmla.f32 q15, q11, q8 @ w2 * inr8\n"
/* mul r1 with w3, w4, w5 */
"vmla.f32 q12, q9, q0 @ w3 * inr0\n"
"vmla.f32 q13, q9, q2 @ w3 * inr2\n"
"vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w5, to q11 */
"vmla.f32 q14, q9, q4 @ w3 * inr4\n"
"vmla.f32 q15, q9, q6 @ w3 * inr6\n"
"vld1.32 {d16-d17}, [%[r1]]\n" /* load input r1, 8*/
"vmla.f32 q12, q10, q1 @ w4 * inr1\n"
"vld1.32 {d0-d3}, [%[r2]]! @ load r2, 0, 1\n"
"vmla.f32 q13, q10, q3 @ w4 * inr3\n"
"vmla.f32 q14, q10, q5 @ w4 * inr5\n"
"vmla.f32 q15, q10, q7 @ w4 * inr7\n"
"vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w6-7, to q9-10 */
"vmla.f32 q12, q11, q2 @ w5 * inr2\n"
"vld1.32 {d4-d7}, [%[r2]]! @ load r2, 2, 3\n"
"vmla.f32 q13, q11, q4 @ w5 * inr4\n"
"vld1.32 {d8-d11}, [%[r2]]! @ load r2, 4, 5\n"
"vmla.f32 q14, q11, q6 @ w5 * inr6\n"
"vld1.32 {d12-d15}, [%[r2]]! @ load r2, 6, 7\n"
"vmla.f32 q15, q11, q8 @ w5 * inr8\n"
/* mul r2 with w6, w7, w8 */
"vmla.f32 q12, q9, q0 @ w6 * inr0\n"
"vmla.f32 q13, q9, q2 @ w6 * inr2\n"
"vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w8, to q11 */
"vmla.f32 q14, q9, q4 @ w6 * inr4\n"
"vmla.f32 q15, q9, q6 @ w6 * inr6\n"
"vld1.32 {d16-d17}, [%[r2]]\n" /* load input r2, 8*/
"vmla.f32 q12, q10, q1 @ w7 * inr1\n"
"vmla.f32 q13, q10, q3 @ w7 * inr3\n"
"vmla.f32 q14, q10, q5 @ w7 * inr5\n"
"vmla.f32 q15, q10, q7 @ w7 * inr7\n"
"sub %[wc0], %[wc0], #144 @ wc0 - 144 to start address\n"
"vmla.f32 q12, q11, q2 @ w8 * inr2\n"
"vmla.f32 q13, q11, q4 @ w8 * inr4\n"
"vmla.f32 q14, q11, q6 @ w8 * inr6\n"
"vmla.f32 q15, q11, q8 @ w8 * inr8\n"
/* transpose */
"vtrn.32 q12, q13\n" /* a0a1c0c1, b0b1d0d1*/
"vtrn.32 q14, q15\n" /* a2a3c2c3, b2b3d2d3*/
"vswp d25, d28\n" /* a0a1a2a3, c0c1c2c3*/
"vswp d27, d30\n" /* b0b1b2b3, d0d1d2d3*/
"cmp %[flag_relu], #0\n"
"beq 0f\n" /* skip relu*/
"vmov.u32 q0, #0\n"
"vmax.f32 q12, q12, q0\n"
"vmax.f32 q13, q13, q0\n"
"vmax.f32 q14, q14, q0\n"
"vmax.f32 q15, q15, q0\n"
"0:\n"
"vst1.32 {d24-d25}, [%[outc0]]!\n" /* save outc0*/
"vst1.32 {d26-d27}, [%[outc1]]!\n" /* save outc1*/
"vst1.32 {d28-d29}, [%[outc2]]!\n" /* save outc2*/
"vst1.32 {d30-d31}, [%[outc3]]!\n" /* save outc3*/
:[r0] "+r"(inr0), [r1] "+r"(inr1),
[r2] "+r"(inr2), [wc0] "+r" (weight_c),
[outc0]"+r"(outc0), [outc1]"+r"(outc1),
[outc2]"+r"(outc2), [outc3]"+r"(outc3)
:[bias] "r" (bias_local),
[flag_relu]"r"(flag_relu)
:"cc", "memory",
"q0","q1","q2","q3","q4","q5","q6","q7",
"q8", "q9","q10","q11","q12","q13","q14","q15"
);
#endif // __aarch64__
// clang-format on
if (flag_mask) {
for (int i = 0; i < remain; ++i) {
c0[i] = pre_out[i];
c1[i] = pre_out[i + 4];
c2[i] = pre_out[i + 8];
c3[i] = pre_out[i + 12];
}
}
}
}
}
}
}
} // namespace math
} // namespace arm
} // namespace lite
} // namespace paddle
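As a cross-check for the NEON kernel above, here is a minimal scalar sketch of the same computation. It is not part of the patch: it assumes plain NCHW tensors with unpacked per-channel 3x3 weights (the kernel itself consumes c4-packed input and weights), and the helper name is hypothetical.

#include <algorithm>

// Hypothetical scalar reference for conv_3x3s2_depthwise_fp32 (stride 2,
// zero padding, one 3x3 filter per channel, optional bias and fused relu).
static void depthwise_3x3s2_ref(const float* in, float* out,
                                int bs, int ch, int ih, int iw,
                                int oh, int ow, int pad_h, int pad_w,
                                const float* weights, const float* bias,
                                bool relu) {
  for (int n = 0; n < bs; ++n) {
    for (int c = 0; c < ch; ++c) {
      const float* src = in + (n * ch + c) * ih * iw;
      const float* wc = weights + c * 9;  // 3x3 kernel for this channel
      float* dst = out + (n * ch + c) * oh * ow;
      for (int oy = 0; oy < oh; ++oy) {
        for (int ox = 0; ox < ow; ++ox) {
          float sum = bias ? bias[c] : 0.f;
          for (int ky = 0; ky < 3; ++ky) {
            int iy = oy * 2 - pad_h + ky;
            if (iy < 0 || iy >= ih) continue;  // zero-padded row
            for (int kx = 0; kx < 3; ++kx) {
              int ix = ox * 2 - pad_w + kx;
              if (ix < 0 || ix >= iw) continue;  // zero-padded col
              sum += src[iy * iw + ix] * wc[ky * 3 + kx];
            }
          }
          dst[oy * ow + ox] = relu ? std::max(sum, 0.f) : sum;
        }
      }
    }
  }
}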
(1 collapsed file diff not shown.)
@@ -51,12 +51,7 @@ void conv_3x3s2_direct_fp32(const float* i_data,
const int win_round = wout_round * 2 /*stride_w*/ + 1;
bool flag_relu = param.fuse_relu;
bool flag_bias = param.bias != nullptr;
// if (param.activation_param.has_active) {
// if (param.activation_param.active == Active_relu &&
// fabs(param.activation_param.negative_slope) < 1e-6f) {
// flag_relu = true;
// }
// }
//! get h block
//! win_round * ic * hin_r_block + wout_round * hout_c_block * hout_r_block
//! * threads = l2_size
......
@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/arm/math/conv_depthwise.h"
#include <arm_neon.h>
#include "lite/backends/arm/math/conv_depthwise.h"
namespace paddle {
namespace lite {
@@ -5073,7 +5073,7 @@ void conv_depthwise_5x5s1_small_impl(const float* din,
int w_in_new = w_in + 2 * pad_new;
int h_out_new = h_out - 2 * pad_0;
int w_out_new = w_out - 2 * pad_0;
float zero_ptr[w_in_new + w_out];
float zero_ptr[w_in_new + w_out]; // NOLINT
memset(zero_ptr, 0, w_in_new * sizeof(float));
float* write_ptr = zero_ptr + w_in_new;
int pad_cnt = pad_0 >> 2;
@@ -5320,7 +5320,7 @@ void conv_depthwise_5x5s1_small_relu_impl(const float* din,
int pad_0 = pad - pad_new;
int h_in_new = h_in + 2 * pad_new;
int w_in_new = w_in + 2 * pad_new;
float zero_ptr[w_in_new + w_out];
float zero_ptr[w_in_new + w_out]; // NOLINT
memset(zero_ptr, 0, w_in_new * sizeof(float));
float* write_ptr = zero_ptr + w_in_new;
int h_out_new = h_out - 2 * pad_0;
@@ -9177,7 +9177,7 @@ void conv_depthwise_5x5s1_small_impl(const float* din,
int w_in_new = w_in + 2 * pad_new;
int h_out_new = h_out - 2 * pad_0;
int w_out_new = w_out - 2 * pad_0;
float zero_ptr[w_in_new + w_out];
float zero_ptr[w_in_new + w_out]; // NOLINT
memset(zero_ptr, 0, w_in_new * sizeof(float));
float* write_ptr = zero_ptr + w_in_new;
int pad_cnt = pad_0 >> 2;
@@ -9359,7 +9359,7 @@ void conv_depthwise_5x5s1_small_relu_impl(const float* din,
int w_in_new = w_in + 2 * pad_new;
int h_out_new = h_out - 2 * pad_0;
int w_out_new = w_out - 2 * pad_0;
float zero_ptr[w_in_new + w_out];
float zero_ptr[w_in_new + w_out]; // NOLINT
memset(zero_ptr, 0, w_in_new * sizeof(float));
float* write_ptr = zero_ptr + w_in_new;
int pad_cnt = pad_0 >> 2;
@@ -9523,21 +9523,21 @@ void conv_depthwise_5x5s1_small_relu_impl(const float* din,
}
#endif // __aarch64__
void conv_depthwise_5x5s1(const float* din,
float* dout,
int num,
int chout,
int hout,
int wout,
int chin,
int hin,
int win,
const float* weights,
const float* bias,
int pad,
bool flag_bias,
bool flag_relu,
ARMContext* ctx) {
void conv_depthwise_5x5s1_fp32(const float* din,
float* dout,
int num,
int chout,
int hout,
int wout,
int chin,
int hin,
int win,
const float* weights,
const float* bias,
int pad,
bool flag_bias,
bool flag_relu,
ARMContext* ctx) {
if (win < 4) {
if (flag_relu) {
conv_depthwise_5x5s1_small_relu_impl(din,
......
@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/arm/math/conv_depthwise.h"
#include <arm_neon.h>
#include "lite/backends/arm/math/conv_depthwise.h"
namespace paddle {
namespace lite {
@@ -80,21 +80,21 @@ void conv_depthwise_5x5s2p2_relu_s(const float* din,
bool flag_relu,
ARMContext* ctx);
void conv_depthwise_5x5s2(const float* din,
float* dout,
int num,
int chout,
int hout,
int wout,
int chin,
int hin,
int win,
const float* weights,
const float* bias,
int pad,
bool flag_bias,
bool flag_relu,
ARMContext* ctx) {
void conv_depthwise_5x5s2_fp32(const float* din,
float* dout,
int num,
int chout,
int hout,
int wout,
int chin,
int hin,
int win,
const float* weights,
const float* bias,
int pad,
bool flag_bias,
bool flag_relu,
ARMContext* ctx) {
if (pad == 2) {
if (win >= 9) {
if (flag_relu) {
......
@@ -16,20 +16,16 @@
#include <cmath>
#include <vector>
#include "lite/backends/arm/math/conv_impl.h"
#include "lite/core/context.h"
#include "lite/core/target_wrapper.h"
#include "lite/operators/op_params.h"
namespace paddle {
namespace lite {
namespace arm {
namespace math {
template <PrecisionType Ptype>
class DepthwiseConv
: public ImplBase<TARGET(kARM), Ptype, operators::ConvParam> {
public:
typedef void (*conv_dw_impl)(const float* i_data,
void conv_3x3s1_depthwise_fp32(const float* i_data,
float* o_data,
int bs,
int oc,
@@ -37,62 +33,125 @@ class DepthwiseConv
int ow,
int ic,
int ih,
int kw,
const float* w_data,
const float* b_data,
int win,
const float* weights,
const float* bias,
const operators::ConvParam& param,
Context<TARGET(kARM)>* ctx);
DepthwiseConv() = default;
~DepthwiseConv() {}
ARMContext* ctx);
virtual bool init(const operators::ConvParam& param,
Context<TARGET(kARM)>* ctx);
virtual bool create(const operators::ConvParam& param,
Context<TARGET(kARM)>* ctx);
virtual bool run(const operators::ConvParam& param);
private:
conv_dw_impl impl_{nullptr};
};
void conv_3x3s2_depthwise_fp32(const float* i_data,
float* o_data,
int bs,
int oc,
int oh,
int ow,
int ic,
int ih,
int win,
const float* weights,
const float* bias,
const operators::ConvParam& param,
ARMContext* ctx);
template <PrecisionType Ptype_out>
class DepthwiseConvInt8
: public ImplBase<TARGET(kARM), PRECISION(kInt8), operators::ConvParam> {
public:
typedef void (*conv_dw_int8_impl)(const int8_t* i_data,
int32_t* o_data,
int bs,
int oc,
int oh,
int ow,
int ic,
int ih,
int kw,
const int8_t* w_data,
const int32_t* b_data,
const operators::ConvParam& param,
Context<TARGET(kARM)>* ctx,
PrecisionType out_type,
const float* scale);
void conv_depthwise_3x3p0_fp32(const float* din,
float* dout,
int num,
int ch_out,
int h_out,
int w_out,
int ch_in,
int h_in,
int w_in,
const float* weights,
const float* bias,
int stride,
bool flag_bias,
bool flag_relu,
ARMContext* ctx);
DepthwiseConvInt8() = default;
~DepthwiseConvInt8() {}
void conv_depthwise_3x3p1_fp32(const float* din,
float* dout,
int num,
int ch_out,
int h_out,
int w_out,
int ch_in,
int h_in,
int w_in,
const float* weights,
const float* bias,
int stride,
bool flag_bias,
bool flag_relu,
ARMContext* ctx);
virtual bool init(const operators::ConvParam& param,
Context<TARGET(kARM)>* ctx);
template <typename Dtype>
void conv_depthwise_3x3s1_int8(Dtype* dout,
const int8_t* din,
const int8_t* weights,
const float* scale,
const float* bias,
bool flag_bias,
bool flag_relu,
int num,
int chin,
int hin,
int win,
int hout,
int wout,
int padw,
int padh,
ARMContext* ctx);
virtual bool create(const operators::ConvParam& param,
Context<TARGET(kARM)>* ctx);
template <typename Dtype>
void conv_depthwise_3x3s2_int8(Dtype* dout,
const int8_t* din,
const int8_t* weights,
const float* scale,
const float* bias,
bool flag_bias,
bool flag_relu,
int num,
int chin,
int hin,
int win,
int hout,
int wout,
int padw,
int padh,
ARMContext* ctx);
virtual bool run(const operators::ConvParam& param);
void conv_depthwise_5x5s1_fp32(const float* din,
float* dout,
int num,
int chout,
int hout,
int wout,
int chin,
int hin,
int win,
const float* weights,
const float* bias,
int pad,
bool flag_bias,
bool flag_relu,
ARMContext* ctx);
private:
conv_dw_int8_impl impl_{nullptr};
std::vector<float> w_scale_;
Tensor tmp_int32_out_;
};
void conv_depthwise_5x5s2_fp32(const float* din,
float* dout,
int num,
int chout,
int hout,
int wout,
int chin,
int hin,
int win,
const float* weights,
const float* bias,
int pad,
bool flag_bias,
bool flag_relu,
ARMContext* ctx);
} // namespace math
} // namespace arm
......
@@ -128,21 +128,21 @@ void conv_depthwise_3x3s2p0_bias_s_relu(float* dout,
const int w_out,
ARMContext* ctx);
void conv_depthwise_3x3p0(const float* din,
float* dout,
int num,
int ch_out,
int h_out,
int w_out,
int ch_in,
int h_in,
int w_in,
const float* weights,
const float* bias,
int stride,
bool flag_bias,
bool flag_relu,
ARMContext* ctx) {
void conv_depthwise_3x3p0_fp32(const float* din,
float* dout,
int num,
int ch_out,
int h_out,
int w_out,
int ch_in,
int h_in,
int w_in,
const float* weights,
const float* bias,
int stride,
bool flag_bias,
bool flag_relu,
ARMContext* ctx) {
if (stride == 1) {
if (flag_relu) {
if (w_in > 5) {
......
@@ -128,21 +128,21 @@ void conv_depthwise_3x3s2p1_bias_s_relu(float* dout,
const int w_out,
ARMContext* ctx);
void conv_depthwise_3x3p1(const float* din,
float* dout,
int num,
int ch_out,
int h_out,
int w_out,
int ch_in,
int h_in,
int w_in,
const float* weights,
const float* bias,
int stride,
bool flag_bias,
bool flag_relu,
ARMContext* ctx) {
void conv_depthwise_3x3p1_fp32(const float* din,
float* dout,
int num,
int ch_out,
int h_out,
int w_out,
int ch_in,
int h_in,
int w_in,
const float* weights,
const float* bias,
int stride,
bool flag_bias,
bool flag_relu,
ARMContext* ctx) {
if (stride == 1) {
if (flag_relu) {
if (w_in > 4) {
......
(6 collapsed file diffs not shown.)
@@ -102,7 +102,7 @@ void conv_winograd3x3(const float* din,
//! dot mul
//! transpose input, convert from ch_in * tile_h * tile_w * 64 to
//! 64 * ch_in * tile_h * tile_w
int hblock = get_hblock(ctx->arch());
int hblock = get_hblock(ctx);
int m_round = hblock * ((chout + hblock - 1) / hblock);
int stride_a = m_round * chin;
int stride_b = chin * size_tile;
......
(2 collapsed file diffs not shown.)
@@ -15,7 +15,6 @@
#pragma once
#include <cmath>
#include "lite/core/context.h"
#include "lite/core/device_info.h"
#include "lite/core/tensor.h"
namespace paddle {
@@ -34,7 +33,7 @@ const int NBLOCK_INT8_OTH = 16;
const int MBLOCK_INT8_DOT = 8;
const int NBLOCK_INT8_DOT = 12;
inline int get_hblock_int8(const ARMContext* ctx) {
inline int get_hblock_int8(ARMContext* ctx) {
#ifdef WITH_ARM_DOTPROD
if (ctx->has_dot()) {
return MBLOCK_INT8_DOT;
@@ -51,7 +50,7 @@ inline int get_hblock_int8(const ARMContext* ctx) {
const int MBLOCK_INT8_OTH = 4;
const int NBLOCK_INT8_OTH = 8;
inline int get_hblock_int8(const ARMContext* ctx) { return 4; }
inline int get_hblock_int8(ARMContext* ctx) { return 4; }
#endif // __aarch64__
void prepackA_int8(void* out,
@@ -75,7 +74,7 @@ void prepackA_int8(TensorLite* tout,
template <typename dtype>
void gemm_prepack_int8(const int8_t* A_packed,
const int8_t* B,
const int* bias,
const float* bias,
dtype* C,
int M,
int N,
@@ -87,7 +86,6 @@ void gemm_prepack_int8(const int8_t* A_packed,
ARMContext* ctx);
#define ROUNDUP(a, b) ((((a) + (b)-1) / (b)) * (b))
} // namespace math
} // namespace arm
} // namespace lite
......
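Reviewer note: the `bias` parameter of `gemm_prepack_int8` changing from `const int*` to `const float*` (and likewise for `gemv_int8` below) appears to be the "bias quantize error" fix named in the commit title: bias is now passed as float, presumably so it can be applied after scaling rather than re-quantized to int32.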
(2 collapsed file diffs not shown.)
@@ -14,7 +14,7 @@
#pragma once
#include <cmath>
#include "lite/core/device_info.h"
#include "lite/core/context.h"
namespace paddle {
namespace lite {
@@ -30,9 +30,10 @@ bool gemv_int8(const int8_t* A,
int M,
int N,
const float* scale,
bool is_bias = false,
const int* bias = nullptr,
bool is_relu = false);
bool is_bias,
const float* bias,
bool is_relu,
const ARMContext* ctx);
} // namespace math
} // namespace arm
......
(16 collapsed file diffs not shown.)
add_subdirectory(kernels)
add_subdirectory(math)
(11 collapsed file diffs not shown.)