// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "paddle/fluid/inference/api/analysis_predictor.h" #include #include #include #include #include #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/var_type_traits.h" #include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" #include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/profiler.h" #if PADDLE_WITH_TENSORRT #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #endif DECLARE_bool(profile); namespace paddle { using contrib::AnalysisConfig; namespace { bool IsPersistable(const framework::VarDesc *var) { if (var->Persistable() && var->GetType() != framework::proto::VarType::FEED_MINIBATCH && var->GetType() != framework::proto::VarType::FETCH_LIST) { return true; } return false; } } // namespace bool AnalysisPredictor::Init( const std::shared_ptr &parent_scope, const std::shared_ptr &program) { VLOG(3) << "Predictor::init()"; if (FLAGS_profile) { LOG(WARNING) << "Profiler is actived, might affect the performance"; LOG(INFO) << "You can turn off by set gflags '-profile false'"; auto tracking_device = config_.use_gpu() ? platform::ProfilerState::kAll : platform::ProfilerState::kCPU; platform::EnableProfiler(tracking_device); } // no matter with or without MKLDNN paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads()); if (!PrepareScope(parent_scope)) { return false; } if (!CreateExecutor()) { return false; } if (!PrepareProgram(program)) { return false; } // Prepare executor, create local variables. if (!PrepareExecutor()) { return true; } // Get the feed_target_names and fetch_target_names PrepareFeedFetch(); return true; } bool AnalysisPredictor::PrepareScope( const std::shared_ptr &parent_scope) { if (parent_scope) { PADDLE_ENFORCE_NOT_NULL( parent_scope, "Both program and parent_scope should be set in Clone mode."); scope_ = parent_scope; status_is_cloned_ = true; } else { paddle::framework::InitDevices(false); scope_.reset(new paddle::framework::Scope()); status_is_cloned_ = false; } sub_scope_ = &scope_->NewScope(); return true; } bool AnalysisPredictor::PrepareProgram( const std::shared_ptr &program) { if (!program) { if (!LoadProgramDesc()) return false; // Optimize the program, and load parameters and modify them in the // scope_. // This will change the scope_ address. if (config_.ir_optim()) { status_ir_optim_enabled_ = true; OptimizeInferenceProgram(); } else { // If the parent_scope is passed, we assert that the persistable variables // are already created, so just create the no persistable variables. // If not cloned, the parameters should be loaded // OptimizeInferenceProgram. // So in both cases, just the local variables are needed to load, not the // parematers. executor_->CreateVariables(*inference_program_, 0, true, sub_scope_); // Load parameters LOG(INFO) << "load parameters "; LoadParameters(); } } else { // If the program is passed from external, no need to optimize it, this // logic is used in the clone scenario. inference_program_ = program; } executor_->CreateVariables(*inference_program_, 0, false, sub_scope_); return true; } bool AnalysisPredictor::CreateExecutor() { if (config_.use_gpu_) { status_use_gpu_ = true; place_ = paddle::platform::CUDAPlace(config_.device_id_); } else { place_ = paddle::platform::CPUPlace(); } executor_.reset(new paddle::framework::NaiveExecutor(place_)); return true; } bool AnalysisPredictor::PrepareExecutor() { executor_->Prepare(sub_scope_, *inference_program_, 0, config_.use_feed_fetch_ops_); PADDLE_ENFORCE_NOT_NULL(sub_scope_); return true; } void AnalysisPredictor::SetMkldnnThreadID(int tid) { #ifdef PADDLE_WITH_MKLDNN platform::set_cur_thread_id(tid); #else LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN"; #endif } bool AnalysisPredictor::Run(const std::vector &inputs, std::vector *output_data, int batch_size) { VLOG(3) << "Predictor::predict"; inference::Timer timer; timer.tic(); // set feed variable framework::Scope *scope = sub_scope_ ? sub_scope_ : scope_.get(); if (!SetFeed(inputs, scope)) { LOG(ERROR) << "fail to set feed"; return false; } // Run the inference program // if share variables, we need not create variables executor_->Run(); // get fetch variable if (!GetFetch(output_data, scope)) { LOG(ERROR) << "fail to get fetches"; return false; } // Collect variable shapes for memory optimization. if (need_collect_var_shapes_for_memory_optim()) { CollectVarShapes(); } VLOG(3) << "predict cost: " << timer.toc() << "ms"; // All the containers in the scope will be hold in inference, but the // operators assume that the container will be reset after each batch. // Here is a bugfix, collect all the container variables, and reset then to a // bool; the next time, the operator will call MutableData and construct a new // container again, so that the container will be empty for each batch. tensor_array_batch_cleaner_.CollectNoTensorVars(sub_scope_); tensor_array_batch_cleaner_.ResetNoTensorVars(); return true; } bool AnalysisPredictor::SetFeed(const std::vector &inputs, framework::Scope *scope) { VLOG(3) << "Predictor::set_feed"; if (inputs.size() != feeds_.size()) { LOG(ERROR) << "wrong feed input size, need " << feeds_.size() << " but get " << inputs.size(); return false; } // Cache the inputs memory for better concurrency performance. feed_tensors_.resize(inputs.size()); for (size_t i = 0; i < inputs.size(); ++i) { auto &input = feed_tensors_[i]; framework::DDim ddim = framework::make_ddim(inputs[i].shape); void *input_ptr; if (inputs[i].dtype == PaddleDType::INT64) { input_ptr = input.mutable_data(ddim, place_); } else if (inputs[i].dtype == PaddleDType::FLOAT32) { input_ptr = input.mutable_data(ddim, place_); } else { LOG(ERROR) << "unsupported feed type " << inputs[i].dtype; return false; } if (platform::is_cpu_place(place_)) { // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. std::memcpy(static_cast(input_ptr), inputs[i].data.data(), inputs[i].data.length()); } else { #ifdef PADDLE_WITH_CUDA platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto *dev_ctx = static_cast(pool.Get(place_)); auto dst_gpu_place = boost::get(place_); memory::Copy(dst_gpu_place, static_cast(input_ptr), platform::CPUPlace(), inputs[i].data.data(), inputs[i].data.length(), dev_ctx->stream()); #else PADDLE_THROW("Not compile with CUDA, should not reach here."); #endif } // TODO(Superjomn) Low performance, need optimization for heavy LoD copy. framework::LoD lod; for (auto &level : inputs[i].lod) { lod.emplace_back(level); } input.set_lod(lod); int idx = -1; if (config_.specify_input_name_) { auto name = inputs[i].name; if (feed_names_.find(name) == feed_names_.end()) { LOG(ERROR) << "feed names from program do not have name: [" << name << "] from specified input"; } idx = feed_names_[name]; } else { idx = boost::get(feeds_[i]->GetAttr("col")); } framework::SetFeedVariable(scope, input, "feed", idx); } return true; } template void AnalysisPredictor::GetFetchOne(const framework::LoDTensor &fetch, PaddleTensor *output) { // set shape. auto shape = framework::vectorize(fetch.dims()); output->shape.assign(shape.begin(), shape.end()); // set data. const T *data = fetch.data(); int num_elems = inference::VecReduceToInt(shape); output->data.Resize(num_elems * sizeof(T)); // The fetched tensor output by fetch op, should always in CPU memory, so just // copy. memcpy(output->data.data(), data, num_elems * sizeof(T)); // set lod output->lod.clear(); for (auto &level : fetch.lod()) { output->lod.emplace_back(level.begin(), level.end()); } } bool AnalysisPredictor::GetFetch(std::vector *outputs, framework::Scope *scope) { VLOG(3) << "Predictor::get_fetch"; outputs->resize(fetchs_.size()); for (size_t i = 0; i < fetchs_.size(); ++i) { int idx = boost::get(fetchs_[i]->GetAttr("col")); PADDLE_ENFORCE((size_t)idx == i); framework::LoDTensor &fetch = framework::GetFetchVariable(*scope, "fetch", idx); auto type = fetch.type(); auto output = &(outputs->at(i)); output->name = fetchs_[idx]->Input("X")[0]; if (type == framework::proto::VarType::FP32) { GetFetchOne(fetch, output); output->dtype = PaddleDType::FLOAT32; } else if (type == framework::proto::VarType::INT64) { GetFetchOne(fetch, output); output->dtype = PaddleDType::INT64; } else { LOG(ERROR) << "unknown type, only support float32 and int64 now."; } } return true; } // NOTE All the members in AnalysisConfig should be copied to Argument. void AnalysisPredictor::OptimizeInferenceProgram() { status_program_optimized_ = true; argument_.SetUseGPU(config_.use_gpu()); argument_.SetGPUDeviceId(config_.gpu_device_id()); argument_.SetEnableMemoryOptim(config_.enable_memory_optim()); argument_.SetMemoryOptimForceUpdate(config_.memory_optim_force_update_); argument_.SetModelFromMemory(config_.model_from_memory_); // Analyze inference_program if (!config_.model_dir().empty()) { argument_.SetModelDir(config_.model_dir()); } else { PADDLE_ENFORCE( !config_.params_file().empty(), "Either model_dir or (param_file, prog_file) should be set."); PADDLE_ENFORCE(!config_.prog_file().empty()); argument_.SetModelProgramPath(config_.prog_file()); argument_.SetModelParamsPath(config_.params_file()); } if (config_.use_gpu() && config_.tensorrt_engine_enabled()) { LOG(INFO) << "TensorRT subgraph engine is enabled"; argument_.SetUseTensorRT(true); argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_); argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_); argument_.SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_); } if (config_.use_mkldnn_) { LOG(INFO) << "MKLDNN is enabled"; argument_.SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_); } auto passes = config_.pass_builder()->AllPasses(); if (!config_.ir_optim()) { passes.clear(); LOG(INFO) << "ir_optim is turned off, no IR pass will be executed"; } argument_.SetIrAnalysisPasses(passes); argument_.SetAnalysisPasses(config_.pass_builder()->AnalysisPasses()); argument_.SetScopeNotOwned(const_cast(scope_.get())); Analyzer().Run(&argument_); PADDLE_ENFORCE(argument_.scope_valid()); VLOG(5) << "to prepare executor"; ARGUMENT_CHECK_FIELD((&argument_), ir_analyzed_program); inference_program_.reset( new framework::ProgramDesc(argument_.ir_analyzed_program())); LOG(INFO) << "== optimize end =="; } template <> std::unique_ptr CreatePaddlePredictor< AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) { VLOG(3) << "create AnalysisConfig"; if (config.use_gpu()) { // 1. GPU memeroy PADDLE_ENFORCE_GT(config.memory_pool_init_size_mb(), 0.f); PADDLE_ENFORCE_GE(config.gpu_device_id(), 0, "Invalid device id %d", config.gpu_device_id()); std::vector flags; float fraction_of_gpu_memory = config.fraction_of_gpu_memory_for_pool(); if (fraction_of_gpu_memory > 0.95f) { LOG(ERROR) << "Allocate too much memory for the GPU memory pool, assigned " << config.memory_pool_init_size_mb() << " MB"; LOG(ERROR) << "Try to shink the value by setting AnalysisConfig::EnableGpu(...)"; } if (fraction_of_gpu_memory >= 0.0f || fraction_of_gpu_memory <= 0.95f) { flags.push_back("dummpy"); std::string flag = "--fraction_of_gpu_memory_to_use=" + std::to_string(fraction_of_gpu_memory); flags.push_back(flag); VLOG(3) << "set flag: " << flag; framework::InitGflags(flags); } } std::unique_ptr predictor(new AnalysisPredictor(config)); if (!dynamic_cast(predictor.get())->Init(nullptr)) { return nullptr; } return std::move(predictor); } void AnalysisPredictor::PrepareFeedFetch() { PADDLE_ENFORCE_NOT_NULL(sub_scope_); CreateFeedFetchVar(sub_scope_); for (auto *op : inference_program_->Block(0).AllOps()) { if (op->Type() == "feed") { int idx = boost::get(op->GetAttr("col")); if (feeds_.size() <= static_cast(idx)) { feeds_.resize(idx + 1); } feeds_[idx] = op; feed_names_[op->Output("Out")[0]] = idx; } else if (op->Type() == "fetch") { int idx = boost::get(op->GetAttr("col")); if (fetchs_.size() <= static_cast(idx)) { fetchs_.resize(idx + 1); } fetchs_[idx] = op; } } } void AnalysisPredictor::CreateFeedFetchVar(framework::Scope *scope) { PADDLE_ENFORCE_NOT_NULL(scope); auto *var = scope->Var("feed"); var->GetMutable(); var = scope->Var("fetch"); var->GetMutable(); } std::unique_ptr AnalysisPredictor::GetInputTensor( const std::string &name) { PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name); std::unique_ptr res( new ZeroCopyTensor(static_cast(executor_->scope()))); res->input_or_output_ = true; res->SetName(name); return res; } std::unique_ptr AnalysisPredictor::GetOutputTensor( const std::string &name) { PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name); std::unique_ptr res( new ZeroCopyTensor(static_cast(executor_->scope()))); res->input_or_output_ = false; res->SetName(name); return res; } bool AnalysisPredictor::ZeroCopyRun() { executor_->Run(); // Fix TensorArray reuse not cleaned bug. tensor_array_batch_cleaner_.CollectTensorArrays(sub_scope_); tensor_array_batch_cleaner_.ResetTensorArray(); return true; } bool AnalysisPredictor::LoadProgramDesc() { // Initialize the inference program std::string filename; if (!config_.model_dir().empty()) { filename = config_.model_dir() + "/__model__"; } else if (!config_.prog_file().empty() && !config_.params_file().empty()) { // All parameters are saved in a single file. // The file names should be consistent with that used // in Python API `fluid.io.save_inference_model`. filename = config_.prog_file(); } else { if (config_.model_dir().empty() && config_.prog_file().empty()) { LOG(ERROR) << "Either model_dir or (prog_file, param_file) should be set."; return false; } LOG(ERROR) << string::Sprintf( "not valid model path '%s' or program path '%s'.", config_.model_dir(), config_.params_file()); return false; } // Create ProgramDesc framework::proto::ProgramDesc proto; if (!config_.model_from_memory()) { std::string pb_content; // Read binary std::ifstream fin(filename, std::ios::in | std::ios::binary); PADDLE_ENFORCE(static_cast(fin.is_open()), "Cannot open file %s", filename); fin.seekg(0, std::ios::end); pb_content.resize(fin.tellg()); fin.seekg(0, std::ios::beg); fin.read(&(pb_content.at(0)), pb_content.size()); fin.close(); proto.ParseFromString(pb_content); } else { proto.ParseFromString(config_.prog_file()); } inference_program_.reset(new framework::ProgramDesc(proto)); return true; } bool AnalysisPredictor::LoadParameters() { PADDLE_ENFORCE_NOT_NULL(inference_program_.get(), "The inference program should be loaded first."); const auto &global_block = inference_program_->MutableBlock(0); // create a temporary program to load parameters. std::unique_ptr load_program( new framework::ProgramDesc()); framework::BlockDesc *load_block = load_program->MutableBlock(0); std::vector params; for (auto *var : global_block->AllVars()) { if (IsPersistable(var)) { VLOG(3) << "persistable variable's name: " << var->Name(); framework::VarDesc *new_var = load_block->Var(var->Name()); new_var->SetShape(var->GetShape()); new_var->SetDataType(var->GetDataType()); new_var->SetType(var->GetType()); new_var->SetLoDLevel(var->GetLoDLevel()); new_var->SetPersistable(true); if (!config_.params_file().empty()) { params.push_back(new_var->Name()); } else { // append_op framework::OpDesc *op = load_block->AppendOp(); op->SetType("load"); op->SetOutput("Out", {new_var->Name()}); op->SetAttr("file_path", {config_.model_dir() + "/" + new_var->Name()}); op->CheckAttrs(); } } } if (!config_.params_file().empty()) { // sort paramlist to have consistent ordering std::sort(params.begin(), params.end()); // append just the load_combine op framework::OpDesc *op = load_block->AppendOp(); op->SetType("load_combine"); op->SetOutput("Out", params); op->SetAttr("file_path", {config_.params_file()}); op->CheckAttrs(); } // Use NaiveExecutor to Load parameters. framework::NaiveExecutor e(place_); e.Prepare(scope_.get(), *load_program, 0, false); e.Run(); VLOG(3) << "get " << scope_->LocalVarNames().size() << " vars after load"; return true; } AnalysisPredictor::~AnalysisPredictor() { if (FLAGS_profile) { platform::DisableProfiler(platform::EventSortingKey::kTotal, "./profile.log"); } if (sub_scope_) { scope_->DeleteScope(sub_scope_); } // TODO(Superjomn) deduce the directory path. std::string out_path = inference::analysis::GetMemoryCachePath( config_.model_dir(), config_.prog_file()); if (need_collect_var_shapes_for_memory_optim()) { SerializeBatchVarShapes(out_path); } } std::unique_ptr AnalysisPredictor::Clone() { std::lock_guard lk(clone_mutex_); auto *x = new AnalysisPredictor(config_); x->Init(scope_, inference_program_); return std::unique_ptr(x); } void AnalysisPredictor::CollectVarShapes() { VLOG(4) << "Collecting var shapes"; if (batch_var_shapes_.size() >= max_shape_collect_count_) return; std::map> var_shapes; for (auto var_name : inference_program_->Block(0).LocalVarNames()) { auto *var = sub_scope_->FindVar(var_name); PADDLE_ENFORCE_NOT_NULL(var); if (var->Type() == framework::VarTypeTrait::kId || var->Type() == framework::VarTypeTrait::kId) { auto &tensor = var->Get(); auto shape = framework::vectorize(tensor.dims()); var_shapes[var_name].assign(shape.begin(), shape.end()); } } batch_var_shapes_.push_back(var_shapes); LOG_FIRST_N(INFO, 1) << "Collected " << batch_var_shapes_.size() << " batch of var shapes for analysis"; } void AnalysisPredictor::SerializeBatchVarShapes(const std::string &path) { LOG(INFO) << "serialize batch var shapes to " << path; std::ofstream file(path); if (!file.is_open()) { LOG(ERROR) << "failed to serialize the var shapes to " << path; return; } // The sirialized data format: // :dim0,dim1,dim2,; for (auto &batch : batch_var_shapes_) { for (auto &ele : batch) { file << ele.first << ":"; for (size_t i = 0; i < ele.second.size() - 1; i++) { file << ele.second[i] << ","; } file << ele.second.back() << ";"; } file << "\n"; } } bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() { if (need_collect_var_shapes_ >= 0) return need_collect_var_shapes_; bool need = false; // check if the cache exists if (!config_.enable_memory_optim()) { need = false; } else if (config_.enable_memory_optim() && !inference::IsFileExists(inference::analysis::GetMemoryCachePath( config_.model_dir(), config_.prog_file()))) { need = true; } else if (config_.enable_memory_optim() && config_.memory_optim_force_update_) { need = true; } need_collect_var_shapes_ = need ? 1 : 0; return need; } template <> std::unique_ptr CreatePaddlePredictor( const contrib::AnalysisConfig &config) { return CreatePaddlePredictor(config); } } // namespace paddle #if PADDLE_WITH_TENSORRT USE_TRT_CONVERTER(elementwise_add_weight); USE_TRT_CONVERTER(elementwise_add_tensor); USE_TRT_CONVERTER(elementwise_sub_tensor); USE_TRT_CONVERTER(elementwise_div_tensor); USE_TRT_CONVERTER(elementwise_mul_tensor); USE_TRT_CONVERTER(elementwise_max_tensor); USE_TRT_CONVERTER(elementwise_min_tensor); USE_TRT_CONVERTER(elementwise_pow_tensor); USE_TRT_CONVERTER(mul); USE_TRT_CONVERTER(conv2d); USE_TRT_CONVERTER(relu); USE_TRT_CONVERTER(sigmoid); USE_TRT_CONVERTER(tanh); USE_TRT_CONVERTER(fc); USE_TRT_CONVERTER(pool2d); USE_TRT_CONVERTER(softmax); USE_TRT_CONVERTER(batch_norm); USE_TRT_CONVERTER(concat); USE_TRT_CONVERTER(dropout); USE_TRT_CONVERTER(pad); USE_TRT_CONVERTER(split); USE_TRT_CONVERTER(prelu); USE_TRT_CONVERTER(conv2d_transpose); USE_TRT_CONVERTER(leaky_relu); #endif