提交 eb2837c6 编写于 作者: D dolphin8 提交者: GitHub

Merge branch 'develop' into develop

......@@ -4,7 +4,8 @@ project(paddle-mobile)
option(DEBUGING "enable debug mode" ON)
option(USE_OPENMP "openmp support" OFF)
option(USE_EXCEPTION "use std exception" ON)
option(LOG_PROFILE "log profile" ON)
# select the platform to build
option(CPU "cpu" ON)
option(MALI_GPU "mali gpu" OFF)
option(FPGA "fpga" OFF)
......@@ -45,6 +46,10 @@ else()
add_definitions(-fno-exceptions)
endif ()
if (LOG_PROFILE)
add_definitions(-DPADDLE_MOBILE_PROFILE)
endif()
if(IS_MAC)
add_definitions(-DX86)
elseif(IS_IOS)
......
......@@ -106,11 +106,14 @@ std::shared_ptr<ProgramDesc> ProgramOptimize::FushionOptimize(
}
std::vector<std::shared_ptr<framework::OpDesc>> op_descs;
for (int m = 0; m < nodes.size(); ++m) {
auto &node = nodes[m];
op_descs.push_back(node->op_desc_);
if (add_split) {
GenerateOps(&op_descs, begin_node.get(), add_split);
} else {
for (int m = 0; m < nodes.size(); ++m) {
auto &node = nodes[m];
op_descs.push_back(node->op_desc_);
}
}
// GenerateOps(&op_descs, begin_node.get());
block->ops_ = op_descs;
}
......@@ -267,12 +270,12 @@ void ProgramOptimize::GenerateOps(
}
void ProgramOptimize::GenerateOps(
std::vector<std::shared_ptr<framework::OpDesc>> *op_descs,
Node *begin_node) {
std::vector<std::shared_ptr<framework::OpDesc>> *op_descs, Node *begin_node,
bool can_add_split) {
// std::vector<std::shared_ptr<framework::OpDesc>> *op_desc,
// Node *input_node, Node *current_node, bool adding_thread, int
// thread_num
if (false) {
if (can_add_split) {
this->GenerateOps(op_descs, begin_node, begin_node, false, -1, nullptr);
} else {
this->GenerateOps(op_descs, begin_node, begin_node);
......
......@@ -34,7 +34,7 @@ class ProgramOptimize {
int current_block_;
std::vector<std::shared_ptr<BlockDesc>> new_blocks_;
void GenerateOps(std::vector<std::shared_ptr<framework::OpDesc>> *op_descs,
Node *begin_node);
Node *begin_node, bool can_add_split);
void GenerateOps(std::vector<std::shared_ptr<framework::OpDesc>> *op_desc,
Node *input_node, Node *current_node);
void GenerateOps(std::vector<std::shared_ptr<framework::OpDesc>> *op_desc,
......
......@@ -76,8 +76,9 @@ static size_t ReadBuffer(const char *file_name, uint8_t **out) {
template <typename Dtype, Precision P>
const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
const std::string &dirname, bool optimize) {
auto program = this->LoadProgram(dirname + "/__model__", optimize);
const std::string &dirname, bool optimize, bool can_add_split) {
auto program =
this->LoadProgram(dirname + "/__model__", optimize, can_add_split);
program.model_path = dirname;
return program;
}
......@@ -94,7 +95,7 @@ const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
template <typename Dtype, Precision P>
const framework::Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
const std::string &model_path, bool optimize) {
const std::string &model_path, bool optimize, bool can_add_split) {
std::string model_filename = model_path;
PaddleMobile__Framework__Proto__ProgramDesc *c_program;
uint8_t *buf = NULL;
......@@ -146,7 +147,7 @@ const framework::Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
if (optimize) {
framework::ProgramOptimize program_optimize;
program.optimizeProgram =
program_optimize.FushionOptimize(originProgramDesc);
program_optimize.FushionOptimize(originProgramDesc, can_add_split);
}
if (optimize) {
program.optimizeProgram->Description("optimize: ");
......@@ -310,6 +311,7 @@ void Executor<Dtype, P>::InitMemory() {
template <typename Dtype, Precision P>
void Executor<Dtype, P>::InitCombineMemory() {
LOG(kLOG_INFO) << " begin init combine memory";
char *origin_data = Get_binary_data(program_.para_path);
char *data = origin_data;
for (const auto &block : to_predict_program_->Blocks()) {
......@@ -330,6 +332,7 @@ void Executor<Dtype, P>::InitCombineMemory() {
}
}
delete origin_data;
LOG(kLOG_INFO) << " end init combine memory ";
}
template <typename Dtype, Precision P>
......
......@@ -35,7 +35,8 @@ class Loader {
* @b 加载分开形式的 fluid 模型
* */
const framework::Program<Dtype, P> Load(const std::string &dirname,
bool optimize = false);
bool optimize = false,
bool can_add_split = false);
/*
* @b load combine format fluid mode
......@@ -47,7 +48,8 @@ class Loader {
private:
const framework::Program<Dtype, P> LoadProgram(const std::string &model_path,
bool optimize = false);
bool optimize = false,
bool can_add_split = false);
};
template <typename Dtype = CPU, Precision P = Precision::FP32>
......
......@@ -37,13 +37,71 @@ void ReluKernel<CPU, float>::Compute(const ReluParam &param) const {
auto *out = param.Out();
auto *out_ptr = out->mutable_data<float>();
ReluFunctor<float> func_;
math::Transform trans;
trans(input_x_ptr, input_x_ptr + input_x->numel(), out_ptr, func_);
int numel = input_x->numel();
if (numel > 32) {
asm volatile(
"pld [%[input_x_ptr], #0] \n\t"
"vmov.f32 q8, #0.0 \n\t"
"subs %[num], %[num], #32 \n\t"
"blt end_num_%= \n\t"
"loop_num_%=: \n\t"
"pld [%[input_x_ptr], #1024] \n\t"
// for (int i = 0; i < input_x->numel(); i++) {
// out_ptr[i] = input_x_ptr[i] > 0 ? input_x_ptr[i] : 0;
// }
"vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t"
"vmax.f32 q0, q0, q8 \n\t"
"vmax.f32 q1, q1, q8 \n\t"
"vmax.f32 q2, q2, q8 \n\t"
"vmax.f32 q3, q3, q8 \n\t"
"vmax.f32 q4, q4, q8 \n\t"
"vmax.f32 q5, q5, q8 \n\t"
"vmax.f32 q6, q6, q8 \n\t"
"vmax.f32 q7, q7, q8 \n\t"
"vst1.32 {q0, q1}, [%[out_ptr]]! \n\t"
"vst1.32 {q2, q3}, [%[out_ptr]]! \n\t"
"vst1.32 {q4, q5}, [%[out_ptr]]! \n\t"
"vst1.32 {q6, q7}, [%[out_ptr]]! \n\t"
"subs %[num], %[num], #32 \n\t"
"bge loop_num_%= \n\t"
"end_num_%=: \n\t"
"cmp %[num], #0 \n\t"
"bge end_%= \n\t"
"mov r6, #4 \n\t"
"mul r5, %[num], r6 \n\t"
"add %[input_x_ptr], %[input_x_ptr], r5 \n\t"
"vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t"
"vmax.f32 q0, q0, q8 \n\t"
"vmax.f32 q1, q1, q8 \n\t"
"vmax.f32 q2, q2, q8 \n\t"
"vmax.f32 q3, q3, q8 \n\t"
"vmax.f32 q4, q4, q8 \n\t"
"vmax.f32 q5, q5, q8 \n\t"
"vmax.f32 q6, q6, q8 \n\t"
"vmax.f32 q7, q7, q8 \n\t"
"add %[out_ptr], %[out_ptr], r5 \n\t"
"vst1.32 {q0, q1}, [%[out_ptr]]! \n\t"
"vst1.32 {q2, q3}, [%[out_ptr]]! \n\t"
"vst1.32 {q4, q5}, [%[out_ptr]]! \n\t"
"vst1.32 {q6, q7}, [%[out_ptr]]! \n\t"
"end_%=: \n\t"
:
:
[out_ptr] "r"(out_ptr), [input_x_ptr] "r"(input_x_ptr), [num] "r"(numel)
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "r5",
"r6");
} else {
ReluFunctor<float> func_;
math::Transform trans;
trans(input_x_ptr, input_x_ptr + numel, out_ptr, func_);
}
}
} // namespace operators
} // namespace paddle_mobile
......
......@@ -19,9 +19,10 @@ int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
// ../../../test/models/googlenet
// ../../../test/models/mobilenet
auto program = loader.Load(g_resnet, true);
loader.Load(g_googlenet_combine + "/model", g_googlenet_combine + "/params",
true);
auto program = loader.Load(g_googlenet, true, true);
// loader.Load(g_googlenet_combine + "/model", g_googlenet_combine +
// "/params",
// true);
program.originProgram->Description("program desc: ");
return 0;
......
......@@ -20,9 +20,9 @@ int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
bool optimize = false;
auto time1 = time();
// auto program = loader.Load(g_googlenet, optimize);
auto program = loader.Load(g_googlenet_combine + "/model",
g_googlenet_combine + "/params", optimize);
auto program = loader.Load(g_googlenet, optimize);
// auto program = loader.Load(g_googlenet_combine + "/model",
// g_googlenet_combine + "/params", optimize);
auto time2 = time();
DLOG << "load cost :" << time_diff(time1, time2) << "ms\n";
paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, optimize);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册