// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/inference/api/analysis_predictor.h"
#include <glog/logging.h>
#include <algorithm>
#include <fstream>
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#if PADDLE_WITH_TENSORRT
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
#endif
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/profiler.h"

DECLARE_bool(profile);

namespace paddle {

using contrib::AnalysisConfig;
using inference::Singleton;
#if PADDLE_WITH_TENSORRT
using inference::tensorrt::TRTInt8Calibrator;
using inference::tensorrt::TRTCalibratorEngine;
using inference::tensorrt::TRTCalibratorEngineManager;
#endif

namespace {
bool IsPersistable(const framework::VarDesc *var) {
  if (var->Persistable() &&
      var->GetType() != framework::proto::VarType::FEED_MINIBATCH &&
      var->GetType() != framework::proto::VarType::FETCH_LIST) {
    return true;
  }
  return false;
}
}  // namespace

bool AnalysisPredictor::Init(
    const std::shared_ptr<framework::Scope> &parent_scope,
    const std::shared_ptr<framework::ProgramDesc> &program) {
  VLOG(3) << "Predictor::init()";
  if (FLAGS_profile) {
    LOG(WARNING) << "Profiler is activated, which might affect the performance";
    LOG(INFO) << "You can turn it off by setting the gflag '-profile false'";
    auto tracking_device = config_.use_gpu() ? platform::ProfilerState::kAll
                                             : platform::ProfilerState::kCPU;
    platform::EnableProfiler(tracking_device);
  }

  // Set the number of CPU math library threads, with or without MKLDNN.
  paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());

  if (!PrepareScope(parent_scope)) {
    return false;
  }
  if (!CreateExecutor()) {
    return false;
  }
  if (!PrepareProgram(program)) {
    return false;
  }

  // Prepare executor, create local variables.
  if (!PrepareExecutor()) {
    return false;
  }

  // Get the feed_target_names and fetch_target_names
  PrepareFeedFetch();

  return true;
}

bool AnalysisPredictor::PrepareScope(
    const std::shared_ptr<framework::Scope> &parent_scope) {
  if (parent_scope) {
    PADDLE_ENFORCE_NOT_NULL(
        parent_scope,
        "Both program and parent_scope should be set in Clone mode.");
    scope_ = parent_scope;
    status_is_cloned_ = true;
  } else {
    paddle::framework::InitDevices(false);
    scope_.reset(new paddle::framework::Scope());
    status_is_cloned_ = false;
  }
  sub_scope_ = &scope_->NewScope();
  return true;
}
bool AnalysisPredictor::PrepareProgram(
    const std::shared_ptr<framework::ProgramDesc> &program) {
  if (!program) {
    if (!LoadProgramDesc()) return false;

    // Optimize the program, and load parameters and modify them in the
    // scope_.
    // This will change the scope_ address.
    if (config_.ir_optim()) {
      status_ir_optim_enabled_ = true;
      OptimizeInferenceProgram();
    } else {
      // If the parent_scope is passed, we assert that the persistable
      // variables are already created, so only the non-persistable variables
      // are created here.

      // If not cloned, the parameters should be loaded by
      // OptimizeInferenceProgram.
      // So in both cases, only the local variables need to be created, not
      // the parameters.
      executor_->CreateVariables(*inference_program_, 0, true, sub_scope_);

      // Load parameters
      LOG(INFO) << "load parameters ";
      LoadParameters();
    }
  } else {
    // If the program is passed from outside, there is no need to optimize it;
    // this path is used in the clone scenario.
    inference_program_ = program;
  }

  executor_->CreateVariables(*inference_program_, 0, false, sub_scope_);

  return true;
}
bool AnalysisPredictor::CreateExecutor() {
  if (config_.use_gpu_) {
    status_use_gpu_ = true;
    place_ = paddle::platform::CUDAPlace(config_.device_id_);
  } else {
    place_ = paddle::platform::CPUPlace();
  }
  executor_.reset(new paddle::framework::NaiveExecutor(place_));
  return true;
}
bool AnalysisPredictor::PrepareExecutor() {
  executor_->Prepare(sub_scope_, *inference_program_, 0,
                     config_.use_feed_fetch_ops_);

  PADDLE_ENFORCE_NOT_NULL(sub_scope_);

  return true;
}

void AnalysisPredictor::SetMkldnnThreadID(int tid) {
#ifdef PADDLE_WITH_MKLDNN
  platform::set_cur_thread_id(tid);
#else
  LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN";
#endif
}

bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
                            std::vector<PaddleTensor> *output_data,
                            int batch_size) {
  VLOG(3) << "Predictor::predict";
  inference::Timer timer;
  timer.tic();
  // set feed variable
  framework::Scope *scope = sub_scope_ ? sub_scope_ : scope_.get();
  if (!SetFeed(inputs, scope)) {
    LOG(ERROR) << "fail to set feed";
    return false;
  }

  // Run the inference program.
  // If variables are shared, we do not need to create them again.
  executor_->Run();

  // get fetch variable
  if (!GetFetch(output_data, scope)) {
    LOG(ERROR) << "fail to get fetches";
    return false;
  }
  VLOG(3) << "predict cost: " << timer.toc() << "ms";

  // All the containers in the scope will be held during inference, but the
  // operators assume that the containers are reset after each batch.
  // Here is a bugfix: collect all the container variables and reset them to a
  // bool; the next time, the operator will call MutableData and construct a
  // new container, so the container is empty for each batch.
  tensor_array_batch_cleaner_.CollectNoTensorVars(sub_scope_);
  tensor_array_batch_cleaner_.ResetNoTensorVars();
  return true;
}

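// SetFeed converts the user-provided PaddleTensor inputs into LoDTensors:
// the raw buffers are copied to the execution place (std::memcpy on CPU,
// memory::Copy on the device stream for GPU), the LoD is restored, and each
// tensor is bound to its feed slot, looked up by name or by the feed op's
// "col" attribute.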
bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
                                framework::Scope *scope) {
  VLOG(3) << "Predictor::set_feed";
  if (inputs.size() != feeds_.size()) {
    LOG(ERROR) << "wrong feed input size, need " << feeds_.size() << " but get "
               << inputs.size();
    return false;
  }

  // Cache the inputs memory for better concurrency performance.
  feed_tensors_.resize(inputs.size());

  for (size_t i = 0; i < inputs.size(); ++i) {
    auto &input = feed_tensors_[i];
    framework::DDim ddim = framework::make_ddim(inputs[i].shape);
    void *input_ptr;
    if (inputs[i].dtype == PaddleDType::INT64) {
      input_ptr = input.mutable_data<int64_t>(ddim, place_);
    } else if (inputs[i].dtype == PaddleDType::FLOAT32) {
      input_ptr = input.mutable_data<float>(ddim, place_);
    } else {
      LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
      return false;
    }

    if (platform::is_cpu_place(place_)) {
      // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
      std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
                  inputs[i].data.length());
    } else {
#ifdef PADDLE_WITH_CUDA
      platform::DeviceContextPool &pool =
          platform::DeviceContextPool::Instance();
      auto *dev_ctx =
          static_cast<const platform::CUDADeviceContext *>(pool.Get(place_));
      auto dst_gpu_place = boost::get<platform::CUDAPlace>(place_);
      memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr),
                   platform::CPUPlace(), inputs[i].data.data(),
                   inputs[i].data.length(), dev_ctx->stream());
#else
      PADDLE_THROW("Not compiled with CUDA, should not reach here.");
#endif
    }
    // TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
    framework::LoD lod;
    for (auto &level : inputs[i].lod) {
      lod.emplace_back(level);
    }
    input.set_lod(lod);
    int idx = -1;
    if (config_.specify_input_name_) {
      auto name = inputs[i].name;
      if (feed_names_.find(name) == feed_names_.end()) {
        LOG(ERROR) << "The feed names from the program do not include the name "
                   << "[" << name << "] from the specified input";
      }
      idx = feed_names_[name];
    } else {
      idx = boost::get<int>(feeds_[i]->GetAttr("col"));
    }
    framework::SetFeedVariable(scope, input, "feed", idx);
  }
  return true;
}

template <typename T>
void AnalysisPredictor::GetFetchOne(const framework::LoDTensor &fetch,
                                    PaddleTensor *output) {
  // set shape.
  auto shape = framework::vectorize(fetch.dims());
  output->shape.assign(shape.begin(), shape.end());
  // set data.
  const T *data = fetch.data<T>();
  int num_elems = inference::VecReduceToInt(shape);
  output->data.Resize(num_elems * sizeof(T));
  // The fetched tensor output by the fetch op should always be in CPU memory,
  // so just copy.
  memcpy(output->data.data(), data, num_elems * sizeof(T));
  // set lod
  output->lod.clear();
  for (auto &level : fetch.lod()) {
    output->lod.emplace_back(level.begin(), level.end());
  }
}

bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
                                 framework::Scope *scope) {
  VLOG(3) << "Predictor::get_fetch";
  outputs->resize(fetchs_.size());
  for (size_t i = 0; i < fetchs_.size(); ++i) {
    int idx = boost::get<int>(fetchs_[i]->GetAttr("col"));
    PADDLE_ENFORCE((size_t)idx == i);
    framework::LoDTensor &fetch =
        framework::GetFetchVariable(*scope, "fetch", idx);
    auto type = fetch.type();
    auto output = &(outputs->at(i));
    output->name = fetchs_[idx]->Input("X")[0];
    if (type == framework::proto::VarType::FP32) {
      GetFetchOne<float>(fetch, output);
      output->dtype = PaddleDType::FLOAT32;
    } else if (type == framework::proto::VarType::INT64) {
      GetFetchOne<int64_t>(fetch, output);
      output->dtype = PaddleDType::INT64;
    } else {
      LOG(ERROR) << "unknown type, only float32 and int64 are supported now.";
    }
  }
  return true;
}

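// OptimizeInferenceProgram copies the relevant AnalysisConfig fields into
// argument_ (model paths, TensorRT and MKLDNN options), runs the Analyzer
// with the configured IR passes, and replaces inference_program_ with the
// analyzed program.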
// NOTE All the members in AnalysisConfig should be copied to Argument.
void AnalysisPredictor::OptimizeInferenceProgram() {
  status_program_optimized_ = true;

  argument_.SetUseGPU(config_.use_gpu());
  argument_.SetGPUDeviceId(config_.gpu_device_id());
  argument_.SetModelFromMemory(config_.model_from_memory_);
  // Analyze inference_program
  if (!config_.model_dir().empty()) {
    argument_.SetModelDir(config_.model_dir());
    argument_.SetModelPath(config_.model_dir());
  } else {
    PADDLE_ENFORCE(
        !config_.params_file().empty(),
        "Either model_dir or (param_file, prog_file) should be set.");
    PADDLE_ENFORCE(!config_.prog_file().empty());
    std::string dir = inference::analysis::GetDirRoot(config_.prog_file());

    argument_.SetModelPath(dir);
    argument_.SetModelProgramPath(config_.prog_file());
    argument_.SetModelParamsPath(config_.params_file());
  }

  if (config_.use_gpu() && config_.tensorrt_engine_enabled()) {
    argument_.SetUseTensorRT(true);
    argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_);
    argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_);
    argument_.SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_);
    argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_);
  }

  if (config_.use_mkldnn_) {
    argument_.SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_);
  }

  auto passes = config_.pass_builder()->AllPasses();
  if (!config_.ir_optim()) passes.clear();
  argument_.SetIrAnalysisPasses(passes);
  argument_.SetScopeNotOwned(const_cast<framework::Scope *>(scope_.get()));
  Analyzer().Run(&argument_);

  PADDLE_ENFORCE(argument_.scope_valid());
  VLOG(5) << "to prepare executor";
  ARGUMENT_CHECK_FIELD((&argument_), ir_analyzed_program);
  inference_program_.reset(
      new framework::ProgramDesc(argument_.ir_analyzed_program()));
  LOG(INFO) << "== optimize end ==";
}

template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
    AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) {
  VLOG(3) << "create AnalysisConfig";
  if (config.use_gpu()) {
    // 1. GPU memory
    PADDLE_ENFORCE_GT(config.memory_pool_init_size_mb(), 0.f);
    PADDLE_ENFORCE_GE(config.gpu_device_id(), 0, "Invalid device id %d",
                      config.gpu_device_id());
    std::vector<std::string> flags;

    float fraction_of_gpu_memory = config.fraction_of_gpu_memory_for_pool();
    if (fraction_of_gpu_memory > 0.95f) {
      LOG(ERROR)
          << "Allocate too much memory for the GPU memory pool, assigned "
          << config.memory_pool_init_size_mb() << " MB";
      LOG(ERROR)
          << "Try to shrink the value by setting AnalysisConfig::EnableGpu(...)";
    }

    if (fraction_of_gpu_memory >= 0.0f && fraction_of_gpu_memory <= 0.95f) {
      flags.push_back("dummy");
      std::string flag = "--fraction_of_gpu_memory_to_use=" +
                         std::to_string(fraction_of_gpu_memory);
      flags.push_back(flag);
      VLOG(3) << "set flag: " << flag;
      framework::InitGflags(flags);
    }
  }

  std::unique_ptr<PaddlePredictor> predictor(new AnalysisPredictor(config));
  if (!dynamic_cast<AnalysisPredictor *>(predictor.get())->Init(nullptr)) {
    return nullptr;
  }
  return std::move(predictor);
}
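
// A minimal usage sketch for the factory above, kept as a comment. The model
// path is a placeholder and the exact AnalysisConfig setters (e.g. SetModel)
// may differ between releases:
//
//   contrib::AnalysisConfig config;
//   config.SetModel("/path/to/inference_model");  // hypothetical path
//   auto predictor = CreatePaddlePredictor(config);
//   std::vector<PaddleTensor> inputs;   // fill name/shape/dtype/data
//   std::vector<PaddleTensor> outputs;
//   predictor->Run(inputs, &outputs);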

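// PrepareFeedFetch scans block 0 for feed/fetch ops and indexes them by their
// "col" attribute, so inputs can be addressed by name or position and outputs
// can be fetched by position.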
void AnalysisPredictor::PrepareFeedFetch() {
  PADDLE_ENFORCE_NOT_NULL(sub_scope_);
  CreateFeedFetchVar(sub_scope_);
  for (auto *op : inference_program_->Block(0).AllOps()) {
    if (op->Type() == "feed") {
      int idx = boost::get<int>(op->GetAttr("col"));
      if (feeds_.size() <= static_cast<size_t>(idx)) {
        feeds_.resize(idx + 1);
      }
      feeds_[idx] = op;
      feed_names_[op->Output("Out")[0]] = idx;
    } else if (op->Type() == "fetch") {
      int idx = boost::get<int>(op->GetAttr("col"));
      if (fetchs_.size() <= static_cast<size_t>(idx)) {
        fetchs_.resize(idx + 1);
      }
      fetchs_[idx] = op;
    }
  }
}

void AnalysisPredictor::CreateFeedFetchVar(framework::Scope *scope) {
  PADDLE_ENFORCE_NOT_NULL(scope);
  auto *var = scope->Var("feed");
  var->GetMutable<framework::FeedFetchList>();
  var = scope->Var("fetch");
  var->GetMutable<framework::FeedFetchList>();
}

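// The zero-copy API below avoids the extra feed/fetch copies made by Run().
// A minimal sketch, assuming feed/fetch ops were disabled in the config; the
// tensor names and shape are placeholders and the exact ZeroCopyTensor
// helpers may differ between releases:
//
//   auto input = predictor->GetInputTensor("x");        // hypothetical name
//   input->Reshape({1, 3, 224, 224});                   // hypothetical shape
//   float *in_data = input->mutable_data<float>(PaddlePlace::kCPU);
//   // ... fill in_data ...
//   predictor->ZeroCopyRun();
//   auto output = predictor->GetOutputTensor("out");    // hypothetical name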
std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
    const std::string &name) {
  PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name);
  std::unique_ptr<ZeroCopyTensor> res(
      new ZeroCopyTensor(static_cast<void *>(executor_->scope())));
  res->input_or_output_ = true;
  res->SetName(name);
  return res;
}

std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
    const std::string &name) {
  PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name);
  std::unique_ptr<ZeroCopyTensor> res(
      new ZeroCopyTensor(static_cast<void *>(executor_->scope())));
  res->input_or_output_ = false;
  res->SetName(name);
  return res;
}

bool AnalysisPredictor::ZeroCopyRun() {
  executor_->Run();
  // Fix TensorArray reuse not cleaned bug.
  tensor_array_batch_cleaner_.CollectTensorArrays(sub_scope_);
  tensor_array_batch_cleaner_.ResetTensorArray();
  return true;
}

bool AnalysisPredictor::LoadProgramDesc() {
  // Initialize the inference program
  std::string filename;
  if (!config_.model_dir().empty()) {
    filename = config_.model_dir() + "/__model__";
  } else if (!config_.prog_file().empty() && !config_.params_file().empty()) {
    // All parameters are saved in a single file.
    // The file names should be consistent with that used
    // in Python API `fluid.io.save_inference_model`.
    filename = config_.prog_file();
  } else {
    if (config_.model_dir().empty() && config_.prog_file().empty()) {
      LOG(ERROR)
          << "Either model_dir or (prog_file, param_file) should be set.";
      return false;
    }
    LOG(ERROR) << string::Sprintf(
        "Not a valid model path '%s' or program path '%s'.",
        config_.model_dir(), config_.prog_file());
    return false;
  }

  // Create ProgramDesc
  framework::proto::ProgramDesc proto;
  if (!config_.model_from_memory()) {
    std::string pb_content;
    // Read binary
    std::ifstream fin(filename, std::ios::in | std::ios::binary);
    PADDLE_ENFORCE(static_cast<bool>(fin.is_open()), "Cannot open file %s",
                   filename);
    fin.seekg(0, std::ios::end);
    pb_content.resize(fin.tellg());
    fin.seekg(0, std::ios::beg);
    fin.read(&(pb_content.at(0)), pb_content.size());
    fin.close();

    proto.ParseFromString(pb_content);
  } else {
    proto.ParseFromString(config_.prog_file());
  }
  inference_program_.reset(new framework::ProgramDesc(proto));
  return true;
}
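
// LoadParameters builds a temporary program that contains one "load" op per
// persistable variable, or a single "load_combine" op when all parameters are
// stored in one file, and runs it with a NaiveExecutor to fill scope_.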

bool AnalysisPredictor::LoadParameters() {
  PADDLE_ENFORCE_NOT_NULL(inference_program_.get(),
                          "The inference program should be loaded first.");

  const auto &global_block = inference_program_->MutableBlock(0);

  // create a temporary program to load parameters.

  std::unique_ptr<framework::ProgramDesc> load_program(
      new framework::ProgramDesc());
  framework::BlockDesc *load_block = load_program->MutableBlock(0);
  std::vector<std::string> params;

  for (auto *var : global_block->AllVars()) {
    if (IsPersistable(var)) {
      VLOG(3) << "persistable variable's name: " << var->Name();

      framework::VarDesc *new_var = load_block->Var(var->Name());
      new_var->SetShape(var->GetShape());
      new_var->SetDataType(var->GetDataType());
      new_var->SetType(var->GetType());
      new_var->SetLoDLevel(var->GetLoDLevel());
      new_var->SetPersistable(true);

      if (!config_.params_file().empty()) {
        params.push_back(new_var->Name());
      } else {
        // append_op
        framework::OpDesc *op = load_block->AppendOp();
        op->SetType("load");
        op->SetOutput("Out", {new_var->Name()});
        op->SetAttr("file_path", {config_.model_dir() + "/" + new_var->Name()});
        op->CheckAttrs();
      }
    }
  }

  if (!config_.params_file().empty()) {
    // sort paramlist to have consistent ordering
    std::sort(params.begin(), params.end());
    // append just the load_combine op
    framework::OpDesc *op = load_block->AppendOp();
    op->SetType("load_combine");
    op->SetOutput("Out", params);
    op->SetAttr("file_path", {config_.params_file()});
    op->CheckAttrs();
  }

  // Use NaiveExecutor to Load parameters.
  framework::NaiveExecutor e(place_);
  e.Prepare(scope_.get(), *load_program, 0, false);
  e.Run();
  VLOG(3) << "get " << scope_->LocalVarNames().size() << " vars after load";

  return true;
}

#if PADDLE_WITH_TENSORRT
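// SaveTrtCalibToDisk waits for the INT8 calibration threads of every
// tensorrt_engine op to finish, then writes each engine's calibration table
// to the path returned by GetTrtCalibPath so a later run can reuse it.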
bool AnalysisPredictor::SaveTrtCalibToDisk() {
  PADDLE_ENFORCE(config_.tensorrt_engine_enabled(),
                 "This func can be invoked only in trt mode");
  auto &block = inference_program_->Block(0);
  for (auto &op_desc : block.AllOps()) {
    if (op_desc->Type() == "tensorrt_engine") {
      std::string engine_name =
          boost::get<std::string>(op_desc->GetAttr("engine_key"));
      if (!Singleton<TRTCalibratorEngineManager>::Global().Has(engine_name)) {
        LOG(ERROR) << "You should run the predictor (with TRT) on real data "
                      "to generate the calibration info";
        return false;
      }
      TRTCalibratorEngine *calib_engine =
          Singleton<TRTCalibratorEngineManager>::Global().Get(engine_name);
      LOG(INFO) << "Wait for calib threads done.";
      calib_engine->calib_->waitAndSetDone();
      LOG(INFO) << "Finish wait.";
      calib_engine->thr_->join();
      std::string calibration_table_data =
          calib_engine->calib_->getCalibrationTableAsString();

      if (calibration_table_data.empty()) {
        LOG(ERROR) << "The calibration table is empty.";
        return false;
      }

      std::string calibration_table_data_path =
          inference::analysis::GetTrtCalibPath(argument_.model_path(),
                                               engine_name);

      std::ofstream ofile(calibration_table_data_path, std::ios::out);
      LOG(INFO) << "Write Paddle-TRT INT8 calibration table data to file "
                << calibration_table_data_path;
      ofile << calibration_table_data;
      ofile.close();
    }
  }
  // Free all calibrator resources.
  Singleton<TRTCalibratorEngineManager>::Global().DeleteALL();
  return true;
}
#endif

AnalysisPredictor::~AnalysisPredictor() {
#if PADDLE_WITH_TENSORRT
  if (config_.tensorrt_engine_enabled() &&
      config_.tensorrt_precision_mode_ == AnalysisConfig::Precision::kInt8 &&
      Singleton<TRTCalibratorEngineManager>::Global().Has()) {
    SaveTrtCalibToDisk();
  }
#endif
  if (FLAGS_profile) {
    platform::DisableProfiler(platform::EventSortingKey::kTotal,
                              "./profile.log");
  }
  if (sub_scope_) {
    scope_->DeleteScope(sub_scope_);
  }
}

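// Clone creates a predictor that shares this predictor's scope (and thus its
// parameters) and its inference program; the clone takes the code paths
// guarded by status_is_cloned_.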
std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {
  auto *x = new AnalysisPredictor(config_);
  x->Init(scope_, inference_program_);
  return std::unique_ptr<PaddlePredictor>(x);
}

template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<contrib::AnalysisConfig>(
    const contrib::AnalysisConfig &config) {
  return CreatePaddlePredictor<contrib::AnalysisConfig,
                               PaddleEngineKind::kAnalysis>(config);
}

}  // namespace paddle

#if PADDLE_WITH_TENSORRT
USE_TRT_CONVERTER(elementwise_add_weight);
USE_TRT_CONVERTER(elementwise_add_tensor);
USE_TRT_CONVERTER(elementwise_sub_tensor);
USE_TRT_CONVERTER(elementwise_div_tensor);
USE_TRT_CONVERTER(elementwise_mul_tensor);
USE_TRT_CONVERTER(elementwise_max_tensor);
USE_TRT_CONVERTER(elementwise_min_tensor);
USE_TRT_CONVERTER(elementwise_pow_tensor);
USE_TRT_CONVERTER(mul);
USE_TRT_CONVERTER(conv2d);
USE_TRT_CONVERTER(relu);
USE_TRT_CONVERTER(sigmoid);
USE_TRT_CONVERTER(tanh);
USE_TRT_CONVERTER(fc);
USE_TRT_CONVERTER(pool2d);
USE_TRT_CONVERTER(softmax);
USE_TRT_CONVERTER(batch_norm);
USE_TRT_CONVERTER(concat);
USE_TRT_CONVERTER(dropout);
USE_TRT_CONVERTER(pad);
USE_TRT_CONVERTER(split);
USE_TRT_CONVERTER(prelu);
USE_TRT_CONVERTER(conv2d_transpose);
USE_TRT_CONVERTER(leaky_relu);
#endif