// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "paddle/fluid/inference/api/analysis_predictor.h" #include #include #include #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" #include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/profiler.h" DECLARE_bool(profile); DECLARE_int32(paddle_num_threads); namespace paddle { using contrib::AnalysisConfig; bool AnalysisPredictor::Init( const std::shared_ptr &parent_scope, const std::shared_ptr &program) { VLOG(3) << "Predictor::init()"; #if !defined(_WIN32) if (FLAGS_profile) { LOG(WARNING) << "Profiler is actived, might affect the performance"; LOG(INFO) << "You can turn off by set gflags '-profile false'"; auto tracking_device = config_.use_gpu ? platform::ProfilerState::kAll : platform::ProfilerState::kCPU; platform::EnableProfiler(tracking_device); } #endif // no matter with or without MKLDNN paddle::platform::SetNumThreads(FLAGS_paddle_num_threads); if (config_.use_gpu) { place_ = paddle::platform::CUDAPlace(config_.device); LOG(WARNING) << "ir optimize only supports CPU currently, enable_ir_optim " "is turned false."; config_.enable_ir_optim = false; } else { place_ = paddle::platform::CPUPlace(); } if (parent_scope) { scope_ = parent_scope; sub_scope_ = &(parent_scope->NewScope()); } else { paddle::framework::InitDevices(false); scope_.reset(new paddle::framework::Scope()); } executor_.reset(new paddle::framework::NaiveExecutor(place_)); if (!program) { if (!LoadProgramDesc()) return false; OptimizeInferenceProgram(); } else { inference_program_ = program; } if (config_._use_mkldnn) { executor_->EnableMKLDNN(*inference_program_); } executor_->Prepare(scope_.get(), *inference_program_, 0, config_.use_feed_fetch_ops); // Get the feed_target_names and fetch_target_names PrepareFeedFetch(); return true; } bool AnalysisPredictor::Run(const std::vector &inputs, std::vector *output_data, int batch_size) { VLOG(3) << "Predictor::predict"; inference::Timer timer; timer.tic(); // set feed variable std::vector feeds; framework::Scope *scope = sub_scope_ ? sub_scope_ : scope_.get(); if (!SetFeed(inputs, scope)) { LOG(ERROR) << "fail to set feed"; return false; } // Run the inference program // if share variables, we need not create variables executor_->Run(); // get fetch variable if (!GetFetch(output_data, scope)) { LOG(ERROR) << "fail to get fetches"; return false; } VLOG(3) << "predict cost: " << timer.toc() << "ms"; return true; } bool AnalysisPredictor::SetFeed(const std::vector &inputs, framework::Scope *scope) { VLOG(3) << "Predictor::set_feed"; if (inputs.size() != feeds_.size()) { LOG(ERROR) << "wrong feed input size, need " << feeds_.size() << " but get " << inputs.size(); return false; } // Cache the inputs memory for better concurrency performance. feed_tensors_.resize(inputs.size()); for (size_t i = 0; i < inputs.size(); ++i) { auto &input = feed_tensors_[i]; framework::DDim ddim = framework::make_ddim(inputs[i].shape); void *input_ptr; if (inputs[i].dtype == PaddleDType::INT64) { input_ptr = input.mutable_data(ddim, platform::CPUPlace()); } else if (inputs[i].dtype == PaddleDType::FLOAT32) { input_ptr = input.mutable_data(ddim, platform::CPUPlace()); } else { LOG(ERROR) << "unsupported feed type " << inputs[i].dtype; return false; } // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. std::memcpy(static_cast(input_ptr), inputs[i].data.data(), inputs[i].data.length()); // TODO(Superjomn) Low performance, need optimization for heavy LoD copy. framework::LoD lod; for (auto &level : inputs[i].lod) { lod.emplace_back(level); } input.set_lod(lod); int idx = -1; if (config_.specify_input_name) { idx = feed_names_[inputs[i].name]; } else { idx = boost::get(feeds_[i]->GetAttr("col")); } framework::SetFeedVariable(scope, input, "feed", idx); } return true; } template void AnalysisPredictor::GetFetchOne(const framework::LoDTensor &fetch, PaddleTensor *output) { // set shape. auto shape = framework::vectorize(fetch.dims()); output->shape.assign(shape.begin(), shape.end()); // set data. const T *data = fetch.data(); int num_elems = inference::VecReduceToInt(shape); output->data.Resize(num_elems * sizeof(T)); // The fetched tensor output by fetch op, should always in CPU memory, so just // copy. memcpy(output->data.data(), data, num_elems * sizeof(T)); // set lod output->lod.clear(); for (auto &level : fetch.lod()) { output->lod.emplace_back(level.begin(), level.end()); } } bool AnalysisPredictor::GetFetch(std::vector *outputs, framework::Scope *scope) { VLOG(3) << "Predictor::get_fetch"; outputs->resize(fetchs_.size()); for (size_t i = 0; i < fetchs_.size(); ++i) { int idx = boost::get(fetchs_[i]->GetAttr("col")); PADDLE_ENFORCE((size_t)idx == i); framework::LoDTensor &fetch = framework::GetFetchVariable(*scope, "fetch", idx); auto type = fetch.type(); auto output = &(outputs->at(i)); if (type == typeid(float)) { GetFetchOne(fetch, output); output->dtype = PaddleDType::FLOAT32; } else if (type == typeid(int64_t)) { GetFetchOne(fetch, output); output->dtype = PaddleDType::INT64; } else { LOG(ERROR) << "unknown type, only support float32 and int64 now."; } } return true; } void AnalysisPredictor::OptimizeInferenceProgram() { LOG(INFO) << "optimize begin"; FLAGS_IA_enable_ir = config_.enable_ir_optim; FLAGS_IA_enable_tensorrt_subgraph_engine = false; FLAGS_IA_output_storage_path = ""; // Don't output the model. // Analyze inference_program if (!config_.model_dir.empty()) { argument_.fluid_model_dir.reset(new std::string(config_.model_dir)); } else { PADDLE_ENFORCE( !config_.param_file.empty(), "Either model_dir or (param_file, prog_file) should be set."); PADDLE_ENFORCE(!config_.prog_file.empty()); argument_.fluid_model_program_path.reset( new std::string(config_.prog_file)); argument_.fluid_model_param_path.reset(new std::string(config_.param_file)); } argument_.origin_program_desc.reset( new ProgramDesc(*inference_program_->Proto())); bool use_mkldnn = config_._use_mkldnn; switch (config_.ir_mode) { case contrib::AnalysisConfig::IrPassMode::kExclude: Analyzer() .IncludeAllIrPasses() .SetUseMkldnn(use_mkldnn) .DisableIrPasses(use_mkldnn ? config_.ir_mkldnn_passes : config_.ir_passes) .Run(&argument_); break; case contrib::AnalysisConfig::IrPassMode::kInclude: Analyzer() .SetUseMkldnn(use_mkldnn) .IncludeIrPasses(use_mkldnn ? config_.ir_mkldnn_passes : config_.ir_passes) .Run(&argument_); break; default: LOG(ERROR) << "Only kExclude and kInclude modes are supoorted yet."; } CHECK(argument_.transformed_program_desc); VLOG(5) << "to prepare executor"; inference_program_.reset( new framework::ProgramDesc(*argument_.transformed_program_desc)); if (argument_.Has(framework::ir::kParamScopeAttr)) { // Update scope. scope_.reset( argument_.Release(framework::ir::kParamScopeAttr)); } LOG(INFO) << "== optimize end =="; } template <> std::unique_ptr CreatePaddlePredictor< AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) { VLOG(3) << "create AnalysisConfig"; if (config.use_gpu) { // 1. GPU memeroy PADDLE_ENFORCE_GT( config.fraction_of_gpu_memory, 0.f, "fraction_of_gpu_memory in the config should be set to range (0., 1.]"); PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device); std::vector flags; if (config.fraction_of_gpu_memory >= 0.0f || config.fraction_of_gpu_memory <= 0.95f) { flags.push_back("dummpy"); std::string flag = "--fraction_of_gpu_memory_to_use=" + std::to_string(config.fraction_of_gpu_memory); flags.push_back(flag); VLOG(3) << "set flag: " << flag; framework::InitGflags(flags); } } std::unique_ptr predictor(new AnalysisPredictor(config)); if (!dynamic_cast(predictor.get())->Init(nullptr)) { return nullptr; } return predictor; } void AnalysisPredictor::PrepareFeedFetch() { for (auto *op : inference_program_->Block(0).AllOps()) { if (op->Type() == "feed") { int idx = boost::get(op->GetAttr("col")); if (feeds_.size() <= static_cast(idx)) { feeds_.resize(idx + 1); } feeds_[idx] = op; feed_names_[op->Output("Out")[0]] = idx; } else if (op->Type() == "fetch") { int idx = boost::get(op->GetAttr("col")); if (fetchs_.size() <= static_cast(idx)) { fetchs_.resize(idx + 1); } fetchs_[idx] = op; } } } std::unique_ptr AnalysisPredictor::GetInputTensor( const std::string &name) { PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name); std::unique_ptr res( new ZeroCopyTensor(static_cast(executor_->scope()))); res->input_or_output_ = true; res->SetName(name); return res; } std::unique_ptr AnalysisPredictor::GetOutputTensor( const std::string &name) { PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name); std::unique_ptr res( new ZeroCopyTensor(static_cast(executor_->scope()))); res->input_or_output_ = false; res->SetName(name); return res; } bool AnalysisPredictor::ZeroCopyRun() { executor_->Run(); return true; } bool AnalysisPredictor::LoadProgramDesc() { // Initialize the inference program std::unique_ptr tmp_exe( new framework::Executor(platform::CPUPlace())); if (!config_.model_dir.empty()) { // Parameters are saved in separate files sited in // the specified `dirname`. inference_program_ = paddle::inference::Load( static_cast(tmp_exe.get()), scope_.get(), config_.model_dir); } else if (!config_.prog_file.empty() && !config_.param_file.empty()) { // All parameters are saved in a single file. // The file names should be consistent with that used // in Python API `fluid.io.save_inference_model`. inference_program_ = paddle::inference::Load( static_cast(tmp_exe.get()), scope_.get(), config_.prog_file, config_.param_file); } else { LOG(ERROR) << string::Sprintf( "not valid model path '%s' or program path '%s'.", config_.model_dir, config_.param_file); return false; } return true; } AnalysisPredictor::~AnalysisPredictor() { #if !defined(_WIN32) if (FLAGS_profile) { platform::DisableProfiler(platform::EventSortingKey::kTotal, "./profile.log"); } #endif if (sub_scope_) { scope_->DeleteScope(sub_scope_); } } std::unique_ptr AnalysisPredictor::Clone() { auto *x = new AnalysisPredictor(config_); x->Init(scope_, inference_program_); return std::unique_ptr(x); } template <> std::unique_ptr CreatePaddlePredictor( const contrib::AnalysisConfig &config) { return CreatePaddlePredictor(config); } } // namespace paddle