提交 911e15cc 编写于 作者: L liuruilong

add relu asm implementation

上级 c9a81686
...@@ -4,7 +4,8 @@ project(paddle-mobile) ...@@ -4,7 +4,8 @@ project(paddle-mobile)
option(DEBUGING "enable debug mode" ON) option(DEBUGING "enable debug mode" ON)
option(USE_OPENMP "openmp support" OFF) option(USE_OPENMP "openmp support" OFF)
option(USE_EXCEPTION "use std exception" ON) option(USE_EXCEPTION "use std exception" ON)
option(LOG_PROFILE "log profile" ON)
# select the platform to build
option(CPU "cpu" ON) option(CPU "cpu" ON)
option(MALI_GPU "mali gpu" OFF) option(MALI_GPU "mali gpu" OFF)
option(FPGA "fpga" OFF) option(FPGA "fpga" OFF)
...@@ -45,6 +46,10 @@ else() ...@@ -45,6 +46,10 @@ else()
add_definitions(-fno-exceptions) add_definitions(-fno-exceptions)
endif () endif ()
# When the LOG_PROFILE option is enabled, add the -DPADDLE_MOBILE_PROFILE
# compile definition so sources can test for PADDLE_MOBILE_PROFILE.
if (LOG_PROFILE)
add_definitions(-DPADDLE_MOBILE_PROFILE)
endif()
if(IS_MAC) if(IS_MAC)
add_definitions(-DX86) add_definitions(-DX86)
elseif(IS_IOS) elseif(IS_IOS)
......
...@@ -37,13 +37,70 @@ void ReluKernel<CPU, float>::Compute(const ReluParam &param) const { ...@@ -37,13 +37,70 @@ void ReluKernel<CPU, float>::Compute(const ReluParam &param) const {
auto *out = param.Out(); auto *out = param.Out();
auto *out_ptr = out->mutable_data<float>(); auto *out_ptr = out->mutable_data<float>();
ReluFunctor<float> func_; int numel = input_x->numel();
math::Transform trans; if (numel > 32) {
trans(input_x_ptr, input_x_ptr + input_x->numel(), out_ptr, func_); asm volatile(
"pld [%[input_x_ptr], #0] \n\t"
"vmov.f32 q8, #0.0 \n\t"
"subs %[num], %[num], #32 \n\t"
"blt end_num_%= \n\t"
"loop_num_%=: \n\t"
"pld [%[input_x_ptr], #1024] \n\t"
// for (int i = 0; i < input_x->numel(); i++) { "vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t"
// out_ptr[i] = input_x_ptr[i] > 0 ? input_x_ptr[i] : 0; "vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t"
// } "vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t"
"vmax.f32 q0, q0, q8 \n\t"
"vmax.f32 q1, q1, q8 \n\t"
"vmax.f32 q2, q2, q8 \n\t"
"vmax.f32 q3, q3, q8 \n\t"
"vmax.f32 q4, q4, q8 \n\t"
"vmax.f32 q5, q5, q8 \n\t"
"vmax.f32 q6, q6, q8 \n\t"
"vmax.f32 q7, q7, q8 \n\t"
"vst1.32 {q0, q1}, [%[out_ptr]]! \n\t"
"vst1.32 {q2, q3}, [%[out_ptr]]! \n\t"
"vst1.32 {q4, q5}, [%[out_ptr]]! \n\t"
"vst1.32 {q6, q7}, [%[out_ptr]]! \n\t"
"subs %[num], %[num], #32 \n\t"
"bge loop_num_%= \n\t"
"end_num_%=: \n\t"
"cmp %[num], #0 \n\t"
"bge end_%= \n\t"
"mov r6, #4 \n\t"
"mul r5, %[num], r6 \n\t"
"add %[input_x_ptr], %[input_x_ptr], r5 \n\t"
"vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t"
"vmax.f32 q0, q0, q8 \n\t"
"vmax.f32 q1, q1, q8 \n\t"
"vmax.f32 q2, q2, q8 \n\t"
"vmax.f32 q3, q3, q8 \n\t"
"vmax.f32 q4, q4, q8 \n\t"
"vmax.f32 q5, q5, q8 \n\t"
"vmax.f32 q6, q6, q8 \n\t"
"vmax.f32 q7, q7, q8 \n\t"
"add %[out_ptr], %[out_ptr], r5 \n\t"
"vst1.32 {q0, q1}, [%[out_ptr]]! \n\t"
"vst1.32 {q2, q3}, [%[out_ptr]]! \n\t"
"vst1.32 {q4, q5}, [%[out_ptr]]! \n\t"
"vst1.32 {q6, q7}, [%[out_ptr]]! \n\t"
"end_%=: \n\t"
:
:[out_ptr]"r"(out_ptr), [input_x_ptr]"r"(input_x_ptr), [num]"r"(numel)
:"memory", "q0", "q1", "q2", "q3", "q4","q5","q6", "q7", "q8", "r5", "r6"
);
} else {
ReluFunctor<float> func_;
math::Transform trans;
trans(input_x_ptr, input_x_ptr + numel, out_ptr, func_);
}
} }
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
...@@ -20,9 +20,9 @@ int main() { ...@@ -20,9 +20,9 @@ int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader; paddle_mobile::Loader<paddle_mobile::CPU> loader;
bool optimize = false; bool optimize = false;
auto time1 = time(); auto time1 = time();
// auto program = loader.Load(g_googlenet, optimize); auto program = loader.Load(g_googlenet, optimize);
auto program = loader.Load(g_googlenet_combine + "/model", // auto program = loader.Load(g_googlenet_combine + "/model",
g_googlenet_combine + "/params", optimize); // g_googlenet_combine + "/params", optimize);
auto time2 = time(); auto time2 = time();
DLOG << "load cost :" << time_diff(time1, time2) << "ms\n"; DLOG << "load cost :" << time_diff(time1, time2) << "ms\n";
paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, optimize); paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, optimize);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册