/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <glog/logging.h>
#include <algorithm>
#include <map>
#include <memory>
#include <set>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/profiler.h"

DEFINE_bool(profile, false, "Turn on profiler for fluid");

namespace paddle {
namespace {
using paddle::inference::Timer;

template <class T>
std::string num2str(T a) {
  std::stringstream istr;
  istr << a;
  return istr.str();
}
}  // namespace

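// Scan block 0 of the inference program and index its feed/fetch ops by
// their "col" attribute, so inputs and outputs can later be bound by
// position (and by name for feeds).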
void NativePaddlePredictor::PrepareFeedFetch() {
  for (auto *op : inference_program_->Block(0).AllOps()) {
    if (op->Type() == "feed") {
      int idx = BOOST_GET_CONST(int, op->GetAttr("col"));
      if (feeds_.size() <= static_cast<size_t>(idx)) {
        feeds_.resize(idx + 1);
      }
      feeds_[idx] = op;
      feed_names_[op->Output("Out")[0]] = idx;
    } else if (op->Type() == "fetch") {
      int idx = BOOST_GET_CONST(int, op->GetAttr("col"));
      if (fetchs_.size() <= static_cast<size_t>(idx)) {
        fetchs_.resize(idx + 1);
      }
      fetchs_[idx] = op;
    }
  }
}

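// Set up the predictor: choose the place (CPU/GPU/XPU), create or reuse the
// scope, build an executor, load the inference program from `model_dir` or
// from `prog_file` + `param_file`, and prepare the feed/fetch mapping.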
bool NativePaddlePredictor::Init(
    std::shared_ptr<framework::Scope> parent_scope) {
  VLOG(3) << "Predictor::init()";
  if (FLAGS_profile) {
    LOG(WARNING) << "Profiler is activated, which may affect the performance";
    LOG(INFO) << "You can turn it off by setting the gflag '-profile false'";

    auto tracking_device = config_.use_gpu ? platform::ProfilerState::kAll
                                           : platform::ProfilerState::kCPU;
    platform::EnableProfiler(tracking_device);
  }

  // Applies with or without MKLDNN.
  paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());

  if (config_.use_gpu) {
    PADDLE_ENFORCE_EQ(config_.use_xpu, false,
                      platform::errors::InvalidArgument(
                          "Only one choice can be made between GPU and XPU."));
    place_ = paddle::platform::CUDAPlace(config_.device);
  } else if (config_.use_xpu) {
    place_ = paddle::platform::XPUPlace(config_.device);
  } else {
    place_ = paddle::platform::CPUPlace();
  }
  if (parent_scope) {
    scope_ = parent_scope;
    sub_scope_ = &(parent_scope->NewScope());
    PADDLE_ENFORCE_NOT_NULL(sub_scope_,
                            platform::errors::PreconditionNotMet(
                                "The sub_scope should not be nullptr."));
  } else {
    paddle::framework::InitDevices();
    scope_.reset(new paddle::framework::Scope());
  }

  executor_.reset(new paddle::framework::Executor(place_));

  // Initialize the inference program.
  if (!config_.model_dir.empty()) {
    // Parameters are saved in separate files located in
    // the specified `dirname`.
    inference_program_ = paddle::inference::Load(executor_.get(), scope_.get(),
                                                 config_.model_dir);
  } else if (!config_.prog_file.empty() && !config_.param_file.empty()) {
    // All parameters are saved in a single file.
    // The file names should be consistent with those used
    // in the Python API `fluid.io.save_inference_model`.
    inference_program_ = paddle::inference::Load(
        executor_.get(), scope_.get(), config_.prog_file, config_.param_file);
  } else {
    LOG(ERROR) << "failed to load inference model from " << config_.model_dir;
    return false;
  }

  ctx_ = executor_->Prepare(*inference_program_, 0);
  executor_->CreateVariables(*inference_program_,
                             sub_scope_ ? sub_scope_ : scope_.get(), 0);

  // Get the feed_target_names and fetch_target_names.
  PrepareFeedFetch();
  return true;
}

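// Flush the profiler (if enabled) and release the sub-scope created in Init.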
NativePaddlePredictor::~NativePaddlePredictor() {
  if (FLAGS_profile) {
    platform::DisableProfiler(platform::EventSortingKey::kTotal,
                              "./profile.log");
  }
  if (sub_scope_) {
    scope_->DeleteScope(sub_scope_);
  }
}

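// One synchronous inference pass: copy `inputs` into feed variables, run the
// prepared program, and copy fetch variables back into `output_data`.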
bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
                                std::vector<PaddleTensor> *output_data,
                                int batch_size) {
#ifndef PADDLE_ON_INFERENCE
  LOG_FIRST_N(WARNING, 5) << "The NaiveExecutor cannot work properly if the "
                             "cmake flag ON_INFER is not set.";
  LOG_FIRST_N(WARNING, 5) << "Unlike the training phase, all the scopes and "
                             "variables will be reused to save the allocation "
                             "overhead.";
  LOG_FIRST_N(WARNING, 5) << "Please re-compile the inference library by "
                             "setting the cmake flag ON_INFER=ON if you are "
                             "running Paddle Inference.";
#endif  // PADDLE_ON_INFERENCE
  if (UNLIKELY(config_.cpu_math_library_num_threads() > 1)) {
    paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
  }
  VLOG(3) << "Predictor::predict";
  Timer timer;
  timer.tic();
  // Set the feed variables.
  framework::Scope *scope = sub_scope_ != nullptr ? sub_scope_ : scope_.get();
  if (!SetFeed(inputs, scope)) {
    LOG(ERROR) << "failed to set feed";
    return false;
  }
  // Run the inference program.
  // If variables are shared, we need not create them each time.
  VLOG(4) << "Run prepared context";
  executor_->RunPreparedContext(ctx_.get(), scope,
                                false, /* don't create local scope each time*/
                                false /* don't create variable each time */);
  VLOG(4) << "Finish prepared context";
  // Get the fetch variables.
  if (!GetFetch(output_data, scope)) {
    LOG(ERROR) << "failed to get fetches";
    return false;
  }
  VLOG(3) << "predict cost: " << timer.toc() << "ms";

  // Some other vector-like containers are not cleaned after each batch;
  // reset them here.
  tensor_array_batch_cleaner_.CollectNoTensorVars(scope_.get());
  tensor_array_batch_cleaner_.ResetNoTensorVars();
  return true;
}

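// Create a new predictor initialized from this predictor's config; guarded by
// a mutex so concurrent Clone() calls are safe.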
std::unique_ptr<PaddlePredictor> NativePaddlePredictor::Clone() {
  std::lock_guard<std::mutex> lk(clone_mutex_);
  VLOG(3) << "Predictor::clone";
  std::unique_ptr<PaddlePredictor> cls(new NativePaddlePredictor(config_));
  // Hot fix for the bug that results differ in multi-threaded runs.
  // TODO(Superjomn) re-implement a real clone here.
  PADDLE_ENFORCE_NOT_NULL(
      dynamic_cast<NativePaddlePredictor *>(cls.get()),
      platform::errors::PreconditionNotMet(
          "Dynamic_cast from PaddlePredictor to NativePaddlePredictor failed"));
  if (!dynamic_cast<NativePaddlePredictor *>(cls.get())->Init(nullptr)) {
    LOG(ERROR) << "failed to call Init";
    return nullptr;
  }

#ifdef __clang__
  // fix clang compile error
  return cls;
#else
  // fix manylinux compile error.
  return std::move(cls);
#endif
}

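// Copy user-provided PaddleTensor inputs into the cached feed LoDTensors on
// the target place (CPU, GPU or XPU) and register them as feed variables.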
bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
                                    framework::Scope *scope) {
  VLOG(3) << "Predictor::set_feed";
  if (inputs.size() != feeds_.size()) {
    LOG(ERROR) << "wrong feed input size, need " << feeds_.size() << " but got "
               << inputs.size();
    return false;
  }

  // Cache the inputs memory for better concurrency performance.
  feed_tensors_.resize(inputs.size());

  for (size_t i = 0; i < inputs.size(); ++i) {
    auto &input = feed_tensors_[i];
    framework::DDim ddim = framework::make_ddim(inputs[i].shape);
    void *input_ptr;
    if (inputs[i].dtype == PaddleDType::INT64) {
      input_ptr = input.mutable_data<int64_t>(ddim, place_);
    } else if (inputs[i].dtype == PaddleDType::FLOAT32) {
      input_ptr = input.mutable_data<float>(ddim, place_);
    } else if (inputs[i].dtype == PaddleDType::INT32) {
      input_ptr = input.mutable_data<int32_t>(ddim, place_);
    } else {
      LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
      return false;
    }

    PADDLE_ENFORCE_NOT_NULL(input_ptr,
                            platform::errors::InvalidArgument(
                                "The input_ptr should not be nullptr."));
    PADDLE_ENFORCE_NOT_NULL(
        inputs[i].data.data(),
        platform::errors::InvalidArgument(
            "The data of input tensor should not be null."));
    if (platform::is_cpu_place(place_)) {
      // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
      std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
                  inputs[i].data.length());
    } else if (platform::is_gpu_place(place_)) {
      PADDLE_ENFORCE_EQ(
          platform::is_xpu_place(place_), false,
          platform::errors::InvalidArgument(
              "Only one choice can be made between GPU and XPU."));
#ifdef PADDLE_WITH_CUDA
      platform::DeviceContextPool &pool =
          platform::DeviceContextPool::Instance();
      auto *dev_ctx =
          static_cast<const platform::CUDADeviceContext *>(pool.Get(place_));
      auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place_);
      memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr),
                   platform::CPUPlace(), inputs[i].data.data(),
                   inputs[i].data.length(), dev_ctx->stream());
#else
      PADDLE_THROW(platform::errors::Unavailable(
          "Not compiled with CUDA, should not reach here."));
#endif
    } else {
#ifdef PADDLE_WITH_XPU
      auto dst_xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_);
      memory::Copy(dst_xpu_place, static_cast<void *>(input_ptr),
                   platform::CPUPlace(), inputs[i].data.data(),
                   inputs[i].data.length());
#else
      PADDLE_THROW(platform::errors::Unavailable(
          "Not compiled with XPU, should not reach here."));
#endif
    }

    // TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
    framework::LoD lod;
    for (auto &level : inputs[i].lod) {
      lod.emplace_back(level);
    }
    input.set_lod(lod);
    int idx = -1;
    if (config_.specify_input_name) {
      idx = feed_names_[inputs[i].name];
    } else {
      idx = BOOST_GET_CONST(int, feeds_[i]->GetAttr("col"));
    }
    framework::SetFeedVariable(scope, input, "feed", idx);
  }
  return true;
}
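
// Copy one fetched LoDTensor into a PaddleTensor: shape, data, and LoD.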
template <typename T>
void NativePaddlePredictor::GetFetchOne(const framework::LoDTensor &fetch,
                                        PaddleTensor *output) {
  // Set shape.
  auto shape = framework::vectorize(fetch.dims());
  output->shape.assign(shape.begin(), shape.end());
  // Set data.
  const T *data = fetch.data<T>();
  int num_elems = inference::VecReduceToInt(shape);
  output->data.Resize(num_elems * sizeof(T));
  // The tensor produced by the fetch op should always be in CPU memory, so
  // just copy it.
  memcpy(output->data.data(), data, num_elems * sizeof(T));
  // Set lod.
  output->lod.clear();
  for (auto &level : fetch.lod()) {
    output->lod.emplace_back(level.begin(), level.end());
  }
}

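// Read every fetch variable from the scope and convert it into the
// corresponding PaddleTensor in `outputs`.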
bool NativePaddlePredictor::GetFetch(std::vector<PaddleTensor> *outputs,
                                     framework::Scope *scope) {
  VLOG(3) << "Predictor::get_fetch";
  outputs->resize(fetchs_.size());
  for (size_t i = 0; i < fetchs_.size(); ++i) {
    int idx = BOOST_GET_CONST(int, fetchs_[i]->GetAttr("col"));
    PADDLE_ENFORCE_EQ(
        static_cast<size_t>(idx), i,
        platform::errors::InvalidArgument(
            "Fetch op's col attr(%d) should be equal to the index(%d)", idx,
            i));
    framework::FetchType &fetch_var =
        framework::GetFetchVariable(*scope, "fetch", idx);
    auto fetch = BOOST_GET_CONST(framework::LoDTensor, fetch_var);
    auto type = fetch.type();
    auto output = &(outputs->at(i));
    output->name = fetchs_[idx]->Input("X")[0];
    if (type == framework::DataTypeTrait<float>::DataType()) {
      GetFetchOne<float>(fetch, output);
      output->dtype = PaddleDType::FLOAT32;
    } else if (type == framework::DataTypeTrait<int64_t>::DataType()) {
      GetFetchOne<int64_t>(fetch, output);
      output->dtype = PaddleDType::INT64;
    } else if (type == framework::DataTypeTrait<int32_t>::DataType()) {
      GetFetchOne<int32_t>(fetch, output);
      output->dtype = PaddleDType::INT32;
    } else {
      LOG(ERROR) << "unknown type; only float32, int64 and int32 are "
                    "supported now.";
    }
  }
  return true;
}

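// Factory specialization for the native engine: validates the GPU settings,
// forwards the GPU memory fraction to gflags, then constructs and initializes
// a NativePaddlePredictor.
//
// Illustrative usage sketch (the model path, shapes and data filling below
// are placeholders, not taken from this file):
//
//   NativeConfig config;
//   config.model_dir = "/path/to/inference_model";  // hypothetical path
//   config.use_gpu = false;
//   auto predictor =
//       CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
//
//   PaddleTensor input;
//   input.shape = {1, 3, 224, 224};
//   input.dtype = PaddleDType::FLOAT32;
//   // ... fill input.data with 1 * 3 * 224 * 224 floats ...
//   std::vector<PaddleTensor> outputs;
//   predictor->Run({input}, &outputs, 1);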
template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
    NativeConfig, PaddleEngineKind::kNative>(const NativeConfig &config) {
  // TODO(NHZlX): Should add the link to the doc of
  // paddle_infer::CreatePredictor<paddle_infer::Config>
  VLOG(3) << "create NativePaddlePredictor";
  if (config.use_gpu) {
    // 1. GPU memory
    PADDLE_ENFORCE_GE(config.fraction_of_gpu_memory, 0.f,
                      platform::errors::InvalidArgument(
                          "fraction_of_gpu_memory in the config should be set "
                          "to range (0., 1.]"));
    PADDLE_ENFORCE_GE(config.device, 0,
                      platform::errors::PreconditionNotMet(
                          "Invalid device id %d, the device id should be "
                          "greater than or equal to 0.",
                          config.device));
    std::vector<std::string> flags;
    if (config.fraction_of_gpu_memory >= 0.0f &&
        config.fraction_of_gpu_memory <= 0.95f) {
      flags.push_back("dummy");
      std::string flag = "--fraction_of_gpu_memory_to_use=" +
                         num2str<float>(config.fraction_of_gpu_memory);
      flags.push_back(flag);
      VLOG(3) << "set flag: " << flag;
      framework::InitGflags(flags);
    }
  }

  std::unique_ptr<PaddlePredictor> predictor(new NativePaddlePredictor(config));
  PADDLE_ENFORCE_NOT_NULL(
      dynamic_cast<NativePaddlePredictor *>(predictor.get()),
      platform::errors::PreconditionNotMet(
          "Dynamic_cast from PaddlePredictor to NativePaddlePredictor failed"));
  if (!dynamic_cast<NativePaddlePredictor *>(predictor.get())->Init(nullptr)) {
    return nullptr;
  }
#ifdef __clang__
  // fix clang compile error
  return predictor;
#else
  return std::move(predictor);
#endif
}

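// Deprecated single-argument entry point, kept for backward compatibility;
// it forwards to the kNative specialization above.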
template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<NativeConfig>(
    const NativeConfig &config) {
  LOG(WARNING) << "Deprecated. Please use CreatePredictor instead.";
  return CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
}

}  // namespace paddle