// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/lite/arm/math/funcs.h"
#include <arm_neon.h>  // NEON intrinsics: vld1q_*, vaddq_*, vst1q_*

namespace paddle {
namespace lite {
namespace arm {
namespace math {

// Adds a per-channel bias to the FC output: for each of the `num` rows of
// `tensor` (row length `channel`), tensor[j * channel + c] += bias[c].
template <>
void fill_bias_fc(float *tensor, const float *bias, const int num,
                  const int channel) {
  int cnt = channel >> 4;     // 16-element blocks per row
  int remain = channel & 15;  // scalar tail elements per row

  for (int j = 0; j < num; ++j) {
    const float *ptr_bias = bias;
    float *ptr_out = tensor + j * channel;

    float32x4_t vout1;
    float32x4_t vout2;
    float32x4_t vout3;
    float32x4_t vout4;

    // Main loop: add bias to 16 floats per iteration with NEON intrinsics.
    for (int i = 0; i < cnt; ++i) {
      float32x4_t vin1 = vld1q_f32(ptr_out);
      float32x4_t vb1 = vld1q_f32(ptr_bias);

      float32x4_t vin2 = vld1q_f32(ptr_out + 4);
      float32x4_t vb2 = vld1q_f32(ptr_bias + 4);

      float32x4_t vin3 = vld1q_f32(ptr_out + 8);
      float32x4_t vb3 = vld1q_f32(ptr_bias + 8);

      float32x4_t vin4 = vld1q_f32(ptr_out + 12);
      float32x4_t vb4 = vld1q_f32(ptr_bias + 12);

      vout1 = vaddq_f32(vin1, vb1);
      vout2 = vaddq_f32(vin2, vb2);
      vout3 = vaddq_f32(vin3, vb3);
      vout4 = vaddq_f32(vin4, vb4);

      vst1q_f32(ptr_out, vout1);
      vst1q_f32(ptr_out + 4, vout2);
      vst1q_f32(ptr_out + 8, vout3);
      vst1q_f32(ptr_out + 12, vout4);

      ptr_out += 16;
      ptr_bias += 16;
    }

#if 0  // Disabled ARMv7 inline-assembly variant of the main loop.
    if (cnt > 0) {
      asm(
          "1:                                \n"
          "vld1.32 {d0-d1}, [%[ptr_out]]     @ load data\n"
          "vld1.32 {d2-d3}, [%[ptr_bias]]!   @ load data\n"
          "vadd.f32 q2, q0, q1               @ add bias\n"
          "vst1.32 {d4-d5}, [%[ptr_out]]!    @ store result\n"
          "subs %[cnt], #1                   @ loop count -1\n"
          "bne 1b                            @ jump to main loop\n"
          : [ptr_out] "+r"(ptr_out), [ptr_bias] "+r"(ptr_bias), [cnt] "+r"(cnt)
          :
          : "q0", "q1", "q2");
    }
#endif
    // Scalar tail: remaining (channel % 16) elements of this row.
    for (int r = remain; r > 0; r--) {
      *(ptr_out++) += *(ptr_bias++);
    }
  }
}

// Integer specialization: same per-row bias addition for int32 tensors.
template <>
void fill_bias_fc(int *tensor, const int *bias, const int num,
                  const int channel) {
  int cnt = channel >> 4;     // 16-element blocks per row
  int remain = channel & 15;  // scalar tail elements per row

  for (int j = 0; j < num; ++j) {
    const int *ptr_bias = bias;
    int *ptr_out = tensor + j * channel;

    int32x4_t vout1;
    int32x4_t vout2;
    int32x4_t vout3;
    int32x4_t vout4;

    // Main loop: add bias to 16 ints per iteration with NEON intrinsics.
    for (int i = 0; i < cnt; ++i) {
      int32x4_t vin1 = vld1q_s32(ptr_out);
      int32x4_t vb1 = vld1q_s32(ptr_bias);

      int32x4_t vin2 = vld1q_s32(ptr_out + 4);
      int32x4_t vb2 = vld1q_s32(ptr_bias + 4);

      int32x4_t vin3 = vld1q_s32(ptr_out + 8);
      int32x4_t vb3 = vld1q_s32(ptr_bias + 8);

      int32x4_t vin4 = vld1q_s32(ptr_out + 12);
      int32x4_t vb4 = vld1q_s32(ptr_bias + 12);

      vout1 = vaddq_s32(vin1, vb1);
      vout2 = vaddq_s32(vin2, vb2);
      vout3 = vaddq_s32(vin3, vb3);
      vout4 = vaddq_s32(vin4, vb4);

      vst1q_s32(ptr_out, vout1);
      vst1q_s32(ptr_out + 4, vout2);
      vst1q_s32(ptr_out + 8, vout3);
      vst1q_s32(ptr_out + 12, vout4);

      ptr_out += 16;
      ptr_bias += 16;
    }

#if 0  // Disabled ARMv7 inline-assembly variant of the main loop.
    if (cnt > 0) {
      asm(
          "1:                                \n"
          "vld1.32 {d0-d1}, [%[ptr_out]]     @ load data\n"
          "vld1.32 {d2-d3}, [%[ptr_bias]]!   @ load data\n"
          "vadd.s32 q2, q0, q1               @ add bias\n"
          "vst1.32 {d4-d5}, [%[ptr_out]]!    @ store result\n"
          "subs %[cnt], #1                   @ loop count -1\n"
          "bne 1b                            @ jump to main loop\n"
          : [ptr_out] "+r"(ptr_out), [ptr_bias] "+r"(ptr_bias), [cnt] "+r"(cnt)
          :
          : "q0", "q1", "q2");
    }
#endif
    // Scalar tail: remaining (channel % 16) elements of this row.
    for (int r = remain; r > 0; r--) {
      *(ptr_out++) += *(ptr_bias++);
    }
  }
}

}  // namespace math
}  // namespace arm
}  // namespace lite
}  // namespace paddle
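
// A minimal usage sketch (illustrative only; `out`, `bias_data`, `M`, and `N`
// are hypothetical names, assuming a row-major M x N FC output buffer and a
// bias array of length N):
//
//   paddle::lite::arm::math::fill_bias_fc(out, bias_data, M, N);
//   // Afterwards, out[i * N + c] has been incremented by bias_data[c]
//   // for every row i in [0, M) and channel c in [0, N).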