From 5f1070ea2f9679d11eec459f4252b5a84972afcf Mon Sep 17 00:00:00 2001
From: yangfei
Date: Sat, 29 Sep 2018 17:11:28 +0800
Subject: [PATCH] load memory for CLImage in GPU_CL mode

---
 src/framework/executor.cpp | 238 +++++++++++++++++++++++++++++--------
 1 file changed, 186 insertions(+), 52 deletions(-)

diff --git a/src/framework/executor.cpp b/src/framework/executor.cpp
index 26cfa4ecba..fc1030b639 100644
--- a/src/framework/executor.cpp
+++ b/src/framework/executor.cpp
@@ -60,13 +60,13 @@ char *Get_binary_data(std::string filename) {

 #pragma mark - executor
-template <typename Dtype, Precision P>
+template <typename Dtype, Precision P>
 Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
                              bool use_optimize, bool loddable)
-    : program_(p),
-      batch_size_(batch_size),
-      use_optimize_(use_optimize),
-      loddable_(loddable) {
+    : program_(p),
+      batch_size_(batch_size),
+      use_optimize_(use_optimize),
+      loddable_(loddable) {
   if (use_optimize_) {
     to_predict_program_ = program_.optimizeProgram;
   } else {
@@ -77,7 +77,7 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
   PADDLE_MOBILE_ENFORCE(to_predict_program_ != nullptr,
                         "to_predict_program_ == NULL!");
   const std::vector<std::shared_ptr<framework::BlockDesc>> blocks =
-      to_predict_program_->Blocks();
+      to_predict_program_->Blocks();
 #ifdef PADDLE_EXECUTOR_MULTITHREAD
   depManager.resize(blocks.size());
 #endif
@@ -89,8 +89,8 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
       std::shared_ptr<framework::OpDesc> op = ops[j];
       DLOG << "create op: " << j << " " << op->Type();
       auto op_base = framework::OpRegistry<Dtype>::CreateOp(
-          op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(),
-          program_.scope);
+          op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(),
+          program_.scope);
       // use pre_infershape to pre resize , but if u use an lod mode tensor u
       // need to resize in runtime
       if (!loddable_) {
@@ -109,7 +109,7 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
     InitMemory();
   }
   std::shared_ptr<framework::BlockDesc> to_predict_block =
-      to_predict_program_->Block(0);
+      to_predict_program_->Block(0);
   auto &ops = ops_of_block_[*to_predict_block.get()];
   int i = 0;
   for (const auto &op : ops) {
@@ -118,7 +118,7 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
   }
 }

-template <typename Dtype, Precision P>
+template <typename Dtype, Precision P>
 void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
                                     framework::LoDTensor *tensor, char **data) {
   // 1. version
@@ -226,7 +226,7 @@ void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
   }
 }

-template <typename Dtype, Precision P>
+template <typename Dtype, Precision P>
 void Executor<Dtype, P>::InitMemory() {
   for (const auto &block : to_predict_program_->Blocks()) {
     for (const auto &var_desc : block->Vars()) {
@@ -238,7 +238,7 @@ void Executor<Dtype, P>::InitMemory() {
         }

         char *origin_data =
-            Get_binary_data(program_.model_path + "/" + var_desc->Name());
+            Get_binary_data(program_.model_path + "/" + var_desc->Name());
         char *data = origin_data;
         LoadMemory(*var_desc, tensor, &data);
@@ -251,21 +251,21 @@ void Executor<Dtype, P>::InitMemory() {
           is_mute_match = varInputMemory(var_desc, var, tensor);

           PADDLE_MOBILE_ENFORCE(
-              is_mute_match,
-              "got unhandled var_desc->Tensor_desc().DataType(): %d",
-              var_desc->Tensor_desc().DataType());
+              is_mute_match,
+              "got unhandled var_desc->Tensor_desc().DataType(): %d",
+              var_desc->Tensor_desc().DataType());
         }
       }
     }
   }
 }

-template <typename Dtype, Precision P>
+template <typename Dtype, Precision P>
 void Executor<Dtype, P>::InitCombineMemory() {
   char *origin_data;
   if (program_.combined_params_buf && program_.combined_params_len) {
     LOG(kLOG_INFO) << "use outter memory";
-    origin_data = (char *) program_.combined_params_buf;
+    origin_data = (char *)program_.combined_params_buf;
   } else {
     LOG(kLOG_INFO) << " begin init combine memory";
     origin_data = Get_binary_data(program_.para_path);
@@ -289,9 +289,9 @@ void Executor<Dtype, P>::InitCombineMemory() {
           is_mute_match = varInputMemory(var_desc, var, tensor);

           PADDLE_MOBILE_ENFORCE(
-              is_mute_match,
-              "got unhandled var_desc->Tensor_desc().DataType(): %d",
-              var_desc->Tensor_desc().DataType());
+              is_mute_match,
+              "got unhandled var_desc->Tensor_desc().DataType(): %d",
+              var_desc->Tensor_desc().DataType());
         }
       }
     }
@@ -300,10 +300,10 @@ void Executor<Dtype, P>::InitCombineMemory() {
   LOG(kLOG_INFO) << " end init combine memory ";
 }

-template <typename Dtype, Precision P>
+template <typename Dtype, Precision P>
 bool Executor<Dtype, P>::varInputMemory(
-    const std::shared_ptr<framework::VarDesc> &var_desc, Variable *var,
-    framework::LoDTensor *tensor) const {
+    const std::shared_ptr<framework::VarDesc> &var_desc, Variable *var,
+    framework::LoDTensor *tensor) const {
   bool is_mute_match = false;
   switch (var_desc->Tensor_desc().DataType()) {
     case framework::VARTYPE_TYPE_FP16: {
@@ -338,24 +338,22 @@ bool Executor<Dtype, P>::varInputMemory(
       break;
     }

-    default: {
-      break;
-    }
+    default: { break; }
   }
   return is_mute_match;
 }

-template <typename Dtype, Precision P>
+template <typename Dtype, Precision P>
 std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
-    const framework::Tensor &t) {
+    const framework::Tensor &t) {
   framework::Variable *g_feed_value = program_.scope->Var("feed");
   framework::Tensor *feed_tensor =
-      g_feed_value->GetMutable<framework::LoDTensor>();
+      g_feed_value->GetMutable<framework::LoDTensor>();
   feed_tensor->Resize(t.dims());
   feed_tensor->ShareDataWith(t);
   std::shared_ptr<framework::BlockDesc> to_predict_block =
-      to_predict_program_->Block(0);
+      to_predict_program_->Block(0);
   auto &ops = ops_of_block_[*to_predict_block.get()];

 #ifdef PADDLE_MOBILE_PROFILE
@@ -435,8 +433,8 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
   std::vector<std::string> out_keys = (*last_op)->GetOutKeys();
   PADDLE_MOBILE_ENFORCE(out_keys.size() > 0, "the last op contains no output");
   framework::LoDTensor *output_tensor =
-      framework::GetVarValue<framework::LoDTensor>(out_keys[0], output_map,
-                                                   *(program_.scope));
+      framework::GetVarValue<framework::LoDTensor>(out_keys[0], output_map,
+                                                   *(program_.scope));
 #ifdef PADDLE_MOBILE_PROFILE
 #ifdef PADDLE_EXECUTOR_MULTITHREAD
   // TODO(haipeng): expose profile info as an interface, user can get them to
@@ -488,18 +486,18 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
   return std::make_shared<framework::Tensor>(framework::Tensor(*output_tensor));
 }

-template <typename Dtype, Precision P>
+template <typename Dtype, Precision P>
 std::shared_ptr<framework::LoDTensor> Executor<Dtype, P>::PredictLod(
-    const framework::LoDTensor &t) {
+    const framework::LoDTensor &t) {
   framework::Variable *g_feed_value =
       program_.scope->Var("feed");
   framework::LoDTensor *feed_tensor =
-      g_feed_value->GetMutable<framework::LoDTensor>();
+      g_feed_value->GetMutable<framework::LoDTensor>();
   feed_tensor->Resize(t.dims());
   feed_tensor->ShareDataWith(t);
   feed_tensor->set_lod(t.lod());
   std::shared_ptr<framework::BlockDesc> to_predict_block =
-      to_predict_program_->Block(0);
+      to_predict_program_->Block(0);
   auto &ops = ops_of_block_[*to_predict_block.get()];

@@ -584,8 +582,8 @@ std::shared_ptr<framework::LoDTensor> Executor<Dtype, P>::PredictLod(
   std::vector<std::string> out_keys = (*last_op)->GetOutKeys();
   PADDLE_MOBILE_ENFORCE(out_keys.size() > 0, "the last op contains no output");
   framework::LoDTensor *output_tensor =
-      framework::GetVarValue<framework::LoDTensor>(out_keys[0], output_map,
-                                                   *(program_.scope));
+      framework::GetVarValue<framework::LoDTensor>(out_keys[0], output_map,
+                                                   *(program_.scope));
 #ifdef PADDLE_MOBILE_PROFILE
 #ifdef PADDLE_EXECUTOR_MULTITHREAD
   // TODO(haipeng): expose profile info as an interface, user can get them to
@@ -635,22 +633,22 @@ std::shared_ptr<framework::LoDTensor> Executor<Dtype, P>::PredictLod(
   printf("====================[---------]======================\n");
 #endif
   return std::make_shared<framework::LoDTensor>(
-      framework::LoDTensor(*output_tensor));
+      framework::LoDTensor(*output_tensor));
 }

-template <typename Dtype, Precision P>
+template <typename Dtype, Precision P>
 std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
-    const framework::Tensor &t, int block_id) {
+    const framework::Tensor &t, int block_id) {
   return Predict(t);
 }

-template <typename Dtype, Precision P>
+template <typename Dtype, Precision P>
 std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
-    const std::vector<Ptype> &input, const std::vector<int64_t> &dims) {
+    const std::vector<Ptype> &input, const std::vector<int64_t> &dims) {
   framework::Tensor tensor(input, framework::make_ddim(dims));
   std::shared_ptr<framework::Tensor> output_tensor = Predict(tensor, 0);
   Executor<Dtype, P>::Ptype *output_ptr =
-      output_tensor->data<typename Executor<Dtype, P>::Ptype>();
+      output_tensor->data<typename Executor<Dtype, P>::Ptype>();
   std::vector<typename Executor<Dtype, P>::Ptype> result_vector;
   for (int j = 0; j < output_tensor->numel(); ++j) {
     result_vector.push_back(output_ptr[j]);
@@ -730,17 +728,153 @@ void Executor<Dtype, P>::Predict_To(int end) {
 };
 #endif

-template
-class Executor<CPU, Precision::FP32>;
+#ifdef PADDLE_MOBILE_FPGA
+
+template <typename Dtype, Precision P>
+void Executor<Dtype, P>::InjectVariable(const framework::Tensor &t,
+                                        string var_name) {
+  framework::Variable *g_feed_value = program_.scope->Var(var_name);
+  framework::Tensor *feed_tensor =
+      g_feed_value->GetMutable<framework::LoDTensor>();
+  feed_tensor->Resize(t.dims());
+  feed_tensor->ShareDataWith(t);
+};
+
+template <typename Dtype, Precision P>
+void Executor<Dtype, P>::FeedData(const framework::Tensor &t) {
+  InjectVariable(t, "feed");
+};
+
+template <typename Dtype, Precision P>
+std::shared_ptr<framework::Tensor> Executor<Dtype, P>::FetchResult(int id) {
+  std::shared_ptr<framework::BlockDesc> to_predict_block =
+      to_predict_program_->Block(0);
+  auto &ops = ops_of_block_[*to_predict_block.get()];
+
+  PADDLE_MOBILE_ENFORCE(id < ops.size(), "Index out of range");
+  auto last_op = id < 0 ? ops[ops.size() - 1] : ops[id];
+  auto output_map = last_op->Outputs();
+  std::vector<std::string> out_keys = last_op->GetOutKeys();
+  PADDLE_MOBILE_ENFORCE(!out_keys.empty(), "the last op contains no output");
+  auto *output_tensor = framework::GetVarValue<framework::LoDTensor>(
+      out_keys[0], output_map, *(program_.scope));
+  return std::make_shared<framework::Tensor>(framework::Tensor(*output_tensor));
+};
+
+template <typename Dtype, Precision P>
+void Executor<Dtype, P>::Predict_From_To(int start, int end) {
+  std::shared_ptr<framework::BlockDesc> to_predict_block =
+      to_predict_program_->Block(0);
+  auto &ops = ops_of_block_[*to_predict_block.get()];
+  end = end < 0 ? (int)ops.size() : end;
+  PADDLE_MOBILE_ENFORCE(start >= 0 && start < end && end <= ops.size(),
+                        "start or end parameter is wrong");
+
+#ifdef PADDLE_MOBILE_PROFILE
+  std::vector<ProfInfo> profile(ops.size());
+#endif
+  for (int i = start; i < end; i++) {
+#ifdef PADDLE_MOBILE_PROFILE
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
+#endif
+    DLOG << "Running op: " << i << " " << ops[i]->Type();
+    ops[i]->Run();
+
+#ifdef PADDLE_MOBILE_PROFILE
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
+#endif
+  }
+};
+
+template <typename Dtype, Precision P>
+void Executor<Dtype, P>::Predict_From(int start) {
+  Predict_From_To(start);
+};
+
+template <typename Dtype, Precision P>
+void Executor<Dtype, P>::Predict_To(int end) {
+  Predict_From_To(0, end);
+};
+#endif
+
+#ifdef PADDLE_MOBILE_CL
-template
-class Executor<GPU_MALI, Precision::FP32>;
+template <>
+void Executor<GPU_CL, Precision::FP32>::InitMemory() {
+  for (const auto &block : to_predict_program_->Blocks()) {
+    for (const auto &var_desc : block->Vars()) {
+      auto var = program_.scope->Var(var_desc->Name());
+      if (var_desc->Persistable()) {
+        auto cl_image = var->template GetMutable<framework::CLImage>();
+        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
+          continue;
+        }
-template
-class Executor<GPU_CL, Precision::FP32>;
+        char *origin_data =
+            Get_binary_data(program_.model_path + "/" + var_desc->Name());
+        cl_context context = program_.scope->GetCLScpoe()->Context();
-template
-class Executor<FPGA, Precision::FP32>;
+        float *tensorInput = (float *)origin_data;
+        framework::DDim ddim = cl_image->dims();
+        cl_image->Init(context, tensorInput, ddim);
+        delete origin_data;
+      }
+    }
+  }
+}
+template <>
+void Executor<GPU_CL, Precision::FP32>::InitCombineMemory() {
+  char *origin_data;
+  if (program_.combined_params_buf && program_.combined_params_len) {
+    LOG(kLOG_INFO) << "use outter memory";
+    origin_data = (char *)program_.combined_params_buf;
+  } else {
+    LOG(kLOG_INFO) << " begin init combine memory";
+    origin_data = Get_binary_data(program_.para_path);
+  }
+  PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "origin_data==nullptr!!!");
+  float *data = (float *)origin_data;
+
+  for (const auto &block : to_predict_program_->Blocks()) {
+    for (const auto &var_desc : block->Vars()) {
+      auto var = program_.scope->Var(var_desc->Name());
+      if (var_desc->Persistable()) {
+        auto cl_image = var->template GetMutable<framework::CLImage>();
+        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
+          continue;
+        }
+
+        cl_context context = program_.scope->GetCLScpoe()->Context();
+
+        framework::DDim ddim = cl_image->dims();
+
+        int numel = 1;
+        for (int i = 0; i < ddim.size(); i++) {
+          numel = numel * ddim[i];
+        }
+        float *tensorInput = data;
+        data += numel;
+        cl_image->Init(context, tensorInput, ddim);
+      }
+    }
+  }
+  delete origin_data;
+  LOG(kLOG_INFO) << " end init combine memory ";
+}
+
+#endif
+
+template class Executor<CPU, Precision::FP32>;
+
+template class Executor<GPU_MALI, Precision::FP32>;
+
+template class Executor<GPU_CL, Precision::FP32>;
+
+template class Executor<FPGA, Precision::FP32>;
+
+}  // namespace framework
 }  // namespace paddle_mobile
--
GitLab
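
Reviewer note (not part of the patch): after this change, constructing an Executor for the GPU_CL device loads every persistable variable into a framework::CLImage instead of a LoDTensor, through the specialized InitMemory()/InitCombineMemory() above. The sketch below shows one way that path would be exercised end to end; it assumes the paddle_mobile::PaddleMobile<GPU_CL> wrapper from src/io/paddle_mobile.h, a Load(model_path, para_path, optimize) overload for combined-parameter models, and placeholder model paths, so the exact names and signatures should be checked against the programs under test/net.

// Minimal sketch only -- drives the CLImage loading path added above.
// Assumed names: PaddleMobile<GPU_CL>, Load(model, params, optimize),
// Predict(input, dims); the model paths are placeholders.
#include <cstdint>
#include <vector>
#include "io/paddle_mobile.h"

int main() {
  paddle_mobile::PaddleMobile<paddle_mobile::GPU_CL> engine;
  // Loading a combined model builds an Executor<GPU_CL, Precision::FP32>;
  // its constructor calls InitCombineMemory(), which now copies each
  // persistable variable's float data into a CLImage via CLImage::Init().
  if (engine.Load("../models/mobilenet/model", "../models/mobilenet/params",
                  true)) {
    std::vector<float> input(1 * 3 * 224 * 224, 0.5f);
    std::vector<int64_t> dims{1, 3, 224, 224};
    std::vector<float> output = engine.Predict(input, dims);
  }
  return 0;
}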