diff --git a/CMakeLists.txt b/CMakeLists.txt index 99032ada82dc6c0e085bc9d4e6b98ee204f63d33..83da232f9f56565028ca95876f7ecfbbf355a253 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,7 +4,8 @@ project(paddle-mobile) option(DEBUGING "enable debug mode" ON) option(USE_OPENMP "openmp support" OFF) option(USE_EXCEPTION "use std exception" ON) - +option(LOG_PROFILE "log profile" ON) +# select the platform to build option(CPU "cpu" ON) option(MALI_GPU "mali gpu" OFF) option(FPGA "fpga" OFF) @@ -45,6 +46,10 @@ else() add_definitions(-fno-exceptions) endif () +if (LOG_PROFILE) + add_definitions(-DPADDLE_MOBILE_PROFILE) +endif() + if(IS_MAC) add_definitions(-DX86) elseif(IS_IOS) diff --git a/src/framework/program/program-optimize/program_optimize.cpp b/src/framework/program/program-optimize/program_optimize.cpp index 15724523ded18e14cecf5d5aacf506992dadb3b4..2e802b120a6effd33bbe68048123c33f76a8aee8 100644 --- a/src/framework/program/program-optimize/program_optimize.cpp +++ b/src/framework/program/program-optimize/program_optimize.cpp @@ -106,11 +106,14 @@ std::shared_ptr ProgramOptimize::FushionOptimize( } std::vector> op_descs; - for (int m = 0; m < nodes.size(); ++m) { - auto &node = nodes[m]; - op_descs.push_back(node->op_desc_); + if (add_split) { + GenerateOps(&op_descs, begin_node.get(), add_split); + } else { + for (int m = 0; m < nodes.size(); ++m) { + auto &node = nodes[m]; + op_descs.push_back(node->op_desc_); + } } - // GenerateOps(&op_descs, begin_node.get()); block->ops_ = op_descs; } @@ -267,12 +270,12 @@ void ProgramOptimize::GenerateOps( } void ProgramOptimize::GenerateOps( - std::vector> *op_descs, - Node *begin_node) { + std::vector> *op_descs, Node *begin_node, + bool can_add_split) { // std::vector> *op_desc, // Node *input_node, Node *current_node, bool adding_thread, int // thread_num - if (false) { + if (can_add_split) { this->GenerateOps(op_descs, begin_node, begin_node, false, -1, nullptr); } else { this->GenerateOps(op_descs, begin_node, begin_node); diff --git a/src/framework/program/program-optimize/program_optimize.h b/src/framework/program/program-optimize/program_optimize.h index 93943cf83951565d91f67bfa77881dbcb130278d..ae632da4bdf004f23e7dab86ab06a4e007fdb75b 100644 --- a/src/framework/program/program-optimize/program_optimize.h +++ b/src/framework/program/program-optimize/program_optimize.h @@ -34,7 +34,7 @@ class ProgramOptimize { int current_block_; std::vector> new_blocks_; void GenerateOps(std::vector> *op_descs, - Node *begin_node); + Node *begin_node, bool can_add_split); void GenerateOps(std::vector> *op_desc, Node *input_node, Node *current_node); void GenerateOps(std::vector> *op_desc, diff --git a/src/io/io.cpp b/src/io/io.cpp index 019770399e29ea8bdd896b2348a23c09a5d27a95..c60113bb7882b7482cf5a23e4ad48adb6ec63de8 100644 --- a/src/io/io.cpp +++ b/src/io/io.cpp @@ -76,8 +76,9 @@ static size_t ReadBuffer(const char *file_name, uint8_t **out) { template const framework::Program Loader::Load( - const std::string &dirname, bool optimize) { - auto program = this->LoadProgram(dirname + "/__model__", optimize); + const std::string &dirname, bool optimize, bool can_add_split) { + auto program = + this->LoadProgram(dirname + "/__model__", optimize, can_add_split); program.model_path = dirname; return program; } @@ -94,7 +95,7 @@ const framework::Program Loader::Load( template const framework::Program Loader::LoadProgram( - const std::string &model_path, bool optimize) { + const std::string &model_path, bool optimize, bool can_add_split) { std::string model_filename = model_path; PaddleMobile__Framework__Proto__ProgramDesc *c_program; uint8_t *buf = NULL; @@ -146,7 +147,7 @@ const framework::Program Loader::LoadProgram( if (optimize) { framework::ProgramOptimize program_optimize; program.optimizeProgram = - program_optimize.FushionOptimize(originProgramDesc); + program_optimize.FushionOptimize(originProgramDesc, can_add_split); } if (optimize) { program.optimizeProgram->Description("optimize: "); @@ -310,6 +311,7 @@ void Executor::InitMemory() { template void Executor::InitCombineMemory() { + LOG(kLOG_INFO) << " begin init combine memory"; char *origin_data = Get_binary_data(program_.para_path); char *data = origin_data; for (const auto &block : to_predict_program_->Blocks()) { @@ -330,6 +332,7 @@ void Executor::InitCombineMemory() { } } delete origin_data; + LOG(kLOG_INFO) << " end init combine memory "; } template diff --git a/src/io/io.h b/src/io/io.h index fb18ca0cc1768f5cfe39acfcba7d0117a67e1de5..a1fbf158c2b026336d363db512cb44fe58ee93db 100644 --- a/src/io/io.h +++ b/src/io/io.h @@ -35,7 +35,8 @@ class Loader { * @b 加载分开形式的 fluid 模型 * */ const framework::Program Load(const std::string &dirname, - bool optimize = false); + bool optimize = false, + bool can_add_split = false); /* * @b load combine format fluid mode @@ -47,7 +48,8 @@ class Loader { private: const framework::Program LoadProgram(const std::string &model_path, - bool optimize = false); + bool optimize = false, + bool can_add_split = false); }; template diff --git a/src/operators/kernel/arm/relu_kernel.cpp b/src/operators/kernel/arm/relu_kernel.cpp index 854fa1d185ddb002aa37a10ade0683d841af8793..e7e0941a4d0bf48d86525cc52ee33301cdcbf67e 100644 --- a/src/operators/kernel/arm/relu_kernel.cpp +++ b/src/operators/kernel/arm/relu_kernel.cpp @@ -37,13 +37,71 @@ void ReluKernel::Compute(const ReluParam ¶m) const { auto *out = param.Out(); auto *out_ptr = out->mutable_data(); - ReluFunctor func_; - math::Transform trans; - trans(input_x_ptr, input_x_ptr + input_x->numel(), out_ptr, func_); + int numel = input_x->numel(); + if (numel > 32) { + asm volatile( + "pld [%[input_x_ptr], #0] \n\t" + "vmov.f32 q8, #0.0 \n\t" + "subs %[num], %[num], #32 \n\t" + "blt end_num_%= \n\t" + "loop_num_%=: \n\t" + "pld [%[input_x_ptr], #1024] \n\t" - // for (int i = 0; i < input_x->numel(); i++) { - // out_ptr[i] = input_x_ptr[i] > 0 ? input_x_ptr[i] : 0; - // } + "vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t" + "vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t" + "vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t" + + "vmax.f32 q0, q0, q8 \n\t" + "vmax.f32 q1, q1, q8 \n\t" + "vmax.f32 q2, q2, q8 \n\t" + "vmax.f32 q3, q3, q8 \n\t" + "vmax.f32 q4, q4, q8 \n\t" + "vmax.f32 q5, q5, q8 \n\t" + "vmax.f32 q6, q6, q8 \n\t" + "vmax.f32 q7, q7, q8 \n\t" + + "vst1.32 {q0, q1}, [%[out_ptr]]! \n\t" + "vst1.32 {q2, q3}, [%[out_ptr]]! \n\t" + "vst1.32 {q4, q5}, [%[out_ptr]]! \n\t" + "vst1.32 {q6, q7}, [%[out_ptr]]! \n\t" + + "subs %[num], %[num], #32 \n\t" + "bge loop_num_%= \n\t" + "end_num_%=: \n\t" + "cmp %[num], #0 \n\t" + "bge end_%= \n\t" + "mov r6, #4 \n\t" + "mul r5, %[num], r6 \n\t" + "add %[input_x_ptr], %[input_x_ptr], r5 \n\t" + "vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t" + "vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t" + "vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t" + "vmax.f32 q0, q0, q8 \n\t" + "vmax.f32 q1, q1, q8 \n\t" + "vmax.f32 q2, q2, q8 \n\t" + "vmax.f32 q3, q3, q8 \n\t" + "vmax.f32 q4, q4, q8 \n\t" + "vmax.f32 q5, q5, q8 \n\t" + "vmax.f32 q6, q6, q8 \n\t" + "vmax.f32 q7, q7, q8 \n\t" + "add %[out_ptr], %[out_ptr], r5 \n\t" + "vst1.32 {q0, q1}, [%[out_ptr]]! \n\t" + "vst1.32 {q2, q3}, [%[out_ptr]]! \n\t" + "vst1.32 {q4, q5}, [%[out_ptr]]! \n\t" + "vst1.32 {q6, q7}, [%[out_ptr]]! \n\t" + "end_%=: \n\t" + : + : + [out_ptr] "r"(out_ptr), [input_x_ptr] "r"(input_x_ptr), [num] "r"(numel) + : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "r5", + "r6"); + } else { + ReluFunctor func_; + math::Transform trans; + trans(input_x_ptr, input_x_ptr + numel, out_ptr, func_); + } } } // namespace operators } // namespace paddle_mobile diff --git a/test/framework/test_load.cpp b/test/framework/test_load.cpp index 2300f05c99a122b352d888a45ca3c6ef082469ba..32d314826f8d6bd4e504b16cd78464d660919a30 100644 --- a/test/framework/test_load.cpp +++ b/test/framework/test_load.cpp @@ -19,9 +19,10 @@ int main() { paddle_mobile::Loader loader; // ../../../test/models/googlenet // ../../../test/models/mobilenet - auto program = loader.Load(g_resnet, true); - loader.Load(g_googlenet_combine + "/model", g_googlenet_combine + "/params", - true); + auto program = loader.Load(g_googlenet, true, true); + // loader.Load(g_googlenet_combine + "/model", g_googlenet_combine + + // "/params", + // true); program.originProgram->Description("program desc: "); return 0; diff --git a/test/net/test_googlenet.cpp b/test/net/test_googlenet.cpp index ab4fd2fe0d1eaaa58fabc38fbf512a0b860c36f0..d25a9eb7ce83876ca339adf8aff1a027b70ac611 100644 --- a/test/net/test_googlenet.cpp +++ b/test/net/test_googlenet.cpp @@ -20,9 +20,9 @@ int main() { paddle_mobile::Loader loader; bool optimize = false; auto time1 = time(); - // auto program = loader.Load(g_googlenet, optimize); - auto program = loader.Load(g_googlenet_combine + "/model", - g_googlenet_combine + "/params", optimize); + auto program = loader.Load(g_googlenet, optimize); + // auto program = loader.Load(g_googlenet_combine + "/model", + // g_googlenet_combine + "/params", optimize); auto time2 = time(); DLOG << "load cost :" << time_diff(time1, time2) << "ms\n"; paddle_mobile::Executor executor(program, 1, optimize);