Commit 663d0fd4 authored by WangLiu, committed by GitHub

Merge pull request #410 from codeWorm2015/develop

fix #409 add relu assembler implementation
@@ -4,7 +4,8 @@ project(paddle-mobile)
 option(DEBUGING "enable debug mode" ON)
 option(USE_OPENMP "openmp support" OFF)
 option(USE_EXCEPTION "use std exception" ON)
+option(LOG_PROFILE "log profile" ON)
 # select the platform to build
 option(CPU "cpu" ON)
 option(MALI_GPU "mali gpu" OFF)
 option(FPGA "fpga" OFF)
@@ -45,6 +46,10 @@ else()
 add_definitions(-fno-exceptions)
 endif ()
+if (LOG_PROFILE)
+  add_definitions(-DPADDLE_MOBILE_PROFILE)
+endif()
 if(IS_MAC)
 add_definitions(-DX86)
 elseif(IS_IOS)
......
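Note on the build change: the new LOG_PROFILE option maps to the PADDLE_MOBILE_PROFILE compile definition added above. As a rough illustration of how such a flag typically gates profiling code on the C++ side, here is a minimal, self-contained sketch; the ProfileScope helper below is hypothetical and not part of this commit:

    #include <chrono>
    #include <iostream>
    #include <string>

    // Hypothetical helper: times a region only when the build defines
    // PADDLE_MOBILE_PROFILE (i.e. when cmake is configured with LOG_PROFILE=ON).
    class ProfileScope {
     public:
      explicit ProfileScope(std::string name) : name_(std::move(name)) {
    #ifdef PADDLE_MOBILE_PROFILE
        start_ = std::chrono::high_resolution_clock::now();
    #endif
      }
      ~ProfileScope() {
    #ifdef PADDLE_MOBILE_PROFILE
        auto end = std::chrono::high_resolution_clock::now();
        auto us =
            std::chrono::duration_cast<std::chrono::microseconds>(end - start_)
                .count();
        std::cout << name_ << " took " << us / 1000.0 << " ms\n";
    #endif
      }

     private:
      std::string name_;
    #ifdef PADDLE_MOBILE_PROFILE
      std::chrono::high_resolution_clock::time_point start_;
    #endif
    };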
@@ -37,13 +37,71 @@ void ReluKernel<CPU, float>::Compute(const ReluParam &param) const {
   auto *out = param.Out();
   auto *out_ptr = out->mutable_data<float>();
-  ReluFunctor<float> func_;
-  math::Transform trans;
-  trans(input_x_ptr, input_x_ptr + input_x->numel(), out_ptr, func_);
+  int numel = input_x->numel();
+  if (numel > 32) {
+    // Vectorized path: each loop iteration loads, clamps, and stores
+    // 32 floats (eight q registers); q8 holds the zero vector that
+    // serves as the ReLU threshold.
+    asm volatile(
+        "pld [%[input_x_ptr], #0] \n\t"
+        "vmov.f32 q8, #0.0 \n\t"
+        "subs %[num], %[num], #32 \n\t"
+        "blt end_num_%= \n\t"
+        "loop_num_%=: \n\t"
+        "pld [%[input_x_ptr], #1024] \n\t"
+        // for (int i = 0; i < input_x->numel(); i++) {
+        //   out_ptr[i] = input_x_ptr[i] > 0 ? input_x_ptr[i] : 0;
+        // }
"vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t"
"vmax.f32 q0, q0, q8 \n\t"
"vmax.f32 q1, q1, q8 \n\t"
"vmax.f32 q2, q2, q8 \n\t"
"vmax.f32 q3, q3, q8 \n\t"
"vmax.f32 q4, q4, q8 \n\t"
"vmax.f32 q5, q5, q8 \n\t"
"vmax.f32 q6, q6, q8 \n\t"
"vmax.f32 q7, q7, q8 \n\t"
"vst1.32 {q0, q1}, [%[out_ptr]]! \n\t"
"vst1.32 {q2, q3}, [%[out_ptr]]! \n\t"
"vst1.32 {q4, q5}, [%[out_ptr]]! \n\t"
"vst1.32 {q6, q7}, [%[out_ptr]]! \n\t"
"subs %[num], %[num], #32 \n\t"
"bge loop_num_%= \n\t"
"end_num_%=: \n\t"
"cmp %[num], #0 \n\t"
"bge end_%= \n\t"
"mov r6, #4 \n\t"
"mul r5, %[num], r6 \n\t"
"add %[input_x_ptr], %[input_x_ptr], r5 \n\t"
"vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t"
"vmax.f32 q0, q0, q8 \n\t"
"vmax.f32 q1, q1, q8 \n\t"
"vmax.f32 q2, q2, q8 \n\t"
"vmax.f32 q3, q3, q8 \n\t"
"vmax.f32 q4, q4, q8 \n\t"
"vmax.f32 q5, q5, q8 \n\t"
"vmax.f32 q6, q6, q8 \n\t"
"vmax.f32 q7, q7, q8 \n\t"
"add %[out_ptr], %[out_ptr], r5 \n\t"
"vst1.32 {q0, q1}, [%[out_ptr]]! \n\t"
"vst1.32 {q2, q3}, [%[out_ptr]]! \n\t"
"vst1.32 {q4, q5}, [%[out_ptr]]! \n\t"
"vst1.32 {q6, q7}, [%[out_ptr]]! \n\t"
"end_%=: \n\t"
+        // The asm writes the pointers and the counter back (post-increment
+        // addressing and subs), so they must be read-write outputs ("+r"),
+        // not input-only operands.
+        : [out_ptr] "+r"(out_ptr), [input_x_ptr] "+r"(input_x_ptr),
+          [num] "+r"(numel)
+        :
+        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
+          "r5", "r6");
+  } else {
+    ReluFunctor<float> func_;
+    math::Transform trans;
+    trans(input_x_ptr, input_x_ptr + numel, out_ptr, func_);
+  }
 }
 } // namespace operators
 } // namespace paddle_mobile
......
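For readers who do not read A32 NEON assembly: the block above computes out[i] = max(in[i], 0) over 32 floats per iteration, then finishes by rewinding the pointers so the last 32 elements are reprocessed with overlap. A rough NEON-intrinsics equivalent is sketched below; relu_neon is an illustrative stand-in written for this note, not code from this commit, and it assumes numel > 32 just like the assembly path:

    #include <arm_neon.h>

    // Illustrative stand-in for the assembly above: out[i] = max(in[i], 0.f).
    // Assumes numel > 32; the kernel's scalar fallback covers smaller tensors.
    void relu_neon(const float *in, float *out, int numel) {
      const float32x4_t zero = vdupq_n_f32(0.f);
      int i = 0;
      // Main loop: 16 floats per iteration (the assembly unrolls to 32).
      for (; i + 16 <= numel; i += 16) {
        vst1q_f32(out + i, vmaxq_f32(vld1q_f32(in + i), zero));
        vst1q_f32(out + i + 4, vmaxq_f32(vld1q_f32(in + i + 4), zero));
        vst1q_f32(out + i + 8, vmaxq_f32(vld1q_f32(in + i + 8), zero));
        vst1q_f32(out + i + 12, vmaxq_f32(vld1q_f32(in + i + 12), zero));
      }
      // Tail: redo the last 16 elements with overlap, mirroring the asm's
      // pointer-rewind trick (valid because numel > 32 guarantees the
      // overlapped region was already written correctly).
      if (i < numel) {
        int j = numel - 16;
        vst1q_f32(out + j, vmaxq_f32(vld1q_f32(in + j), zero));
        vst1q_f32(out + j + 4, vmaxq_f32(vld1q_f32(in + j + 4), zero));
        vst1q_f32(out + j + 8, vmaxq_f32(vld1q_f32(in + j + 8), zero));
        vst1q_f32(out + j + 12, vmaxq_f32(vld1q_f32(in + j + 12), zero));
      }
    }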
@@ -20,9 +20,9 @@ int main() {
   paddle_mobile::Loader<paddle_mobile::CPU> loader;
   bool optimize = false;
   auto time1 = time();
-  // auto program = loader.Load(g_googlenet, optimize);
-  auto program = loader.Load(g_googlenet_combine + "/model",
-                             g_googlenet_combine + "/params", optimize);
+  auto program = loader.Load(g_googlenet, optimize);
+  // auto program = loader.Load(g_googlenet_combine + "/model",
+  //                            g_googlenet_combine + "/params", optimize);
   auto time2 = time();
   DLOG << "load cost :" << time_diff(time1, time2) << "ms\n";
   paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, optimize);
......
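The time() and time_diff() calls in this test come from the project's test helpers. A minimal sketch of helpers matching these call sites, based on std::chrono; this is an assumed shape for illustration, not necessarily the project's exact code:

    #include <chrono>

    using Time = std::chrono::high_resolution_clock::time_point;

    // Assumed shape of the test helper used above.
    inline Time time() { return std::chrono::high_resolution_clock::now(); }

    // Elapsed time between two time points, in milliseconds (the test
    // prints the result with an "ms" suffix).
    inline double time_diff(Time t1, Time t2) {
      auto us = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1);
      return us.count() / 1000.0;
    }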