/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/inference/api/api_impl.h"

#include <glog/logging.h>

#include <memory>
#include <sstream>
#include <string>

#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"

DEFINE_bool(profile, false, "Turn on profiler for fluid");

namespace paddle {
namespace {
using paddle::inference::Timer;

template <class T>
std::string num2str(T a) {
  std::stringstream istr;
  istr << a;
  return istr.str();
}
}  // namespace

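// Scan block 0 of the inference program and index its feed/fetch ops by their
// "col" attribute, so that Run() can later bind inputs and outputs by position
// or by name.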
void NativePaddlePredictor::PrepareFeedFetch() {
  for (auto *op : inference_program_->Block(0).AllOps()) {
    if (op->Type() == "feed") {
      int idx = BOOST_GET_CONST(int, op->GetAttr("col"));
      if (feeds_.size() <= static_cast<size_t>(idx)) {
        feeds_.resize(idx + 1);
      }
      feeds_[idx] = op;
      feed_names_[op->Output("Out")[0]] = idx;
    } else if (op->Type() == "fetch") {
      int idx = BOOST_GET_CONST(int, op->GetAttr("col"));
      if (fetchs_.size() <= static_cast<size_t>(idx)) {
        fetchs_.resize(idx + 1);
      }
      fetchs_[idx] = op;
    }
  }
}

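// Set up the predictor: choose the execution place (CUDA/XPU/NPU/CPU), create
// or reuse a scope, load the inference program from either a model directory
// or a prog_file/param_file pair, and prepare the executor context.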
bool NativePaddlePredictor::Init(
    std::shared_ptr<framework::Scope> parent_scope) {
  VLOG(3) << "Predictor::init()";
  if (FLAGS_profile) {
    LOG(WARNING) << "Profiler is actived, might affect the performance";
    LOG(INFO) << "You can turn off by set gflags '-profile false'";

    auto tracking_device = config_.use_gpu ? platform::ProfilerState::kAll
                                           : platform::ProfilerState::kCPU;
    platform::EnableProfiler(tracking_device);
  }

  // Set the number of math-library threads; this applies with or without MKLDNN.
  paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());

  if (config_.use_gpu) {
    PADDLE_ENFORCE_EQ(config_.use_xpu, false,
                      platform::errors::InvalidArgument(
                          "Only one choice can be made between CPU and XPU."));
    place_ = paddle::platform::CUDAPlace(config_.device);
  } else if (config_.use_xpu) {
    place_ = paddle::platform::XPUPlace(config_.device);
  } else if (config_.use_npu) {
    place_ = paddle::platform::NPUPlace(config_.device);
  } else {
    place_ = paddle::platform::CPUPlace();
  }
  if (parent_scope) {
    scope_ = parent_scope;
    sub_scope_ = &(parent_scope->NewScope());
    PADDLE_ENFORCE_NOT_NULL(sub_scope_,
                            platform::errors::PreconditionNotMet(
                                "The sub_scope should not be nullptr."));
  } else {
    paddle::framework::InitDevices();
    paddle::framework::InitDefaultKernelSignatureMap();
    scope_.reset(new paddle::framework::Scope());
  }

  executor_.reset(new paddle::framework::Executor(place_));

  // Initialize the inference program
  if (!config_.model_dir.empty()) {
    // Parameters are saved in separate files located in
    // the specified `dirname`.
    inference_program_ = paddle::inference::Load(executor_.get(), scope_.get(),
                                                 config_.model_dir);
  } else if (!config_.prog_file.empty() && !config_.param_file.empty()) {
    // All parameters are saved in a single file.
    // The file names should be consistent with that used
    // in Python API `fluid.io.save_inference_model`.
    inference_program_ = paddle::inference::Load(
        executor_.get(), scope_.get(), config_.prog_file, config_.param_file);
  } else {
    LOG(ERROR) << "fail to load inference model from " << config_.model_dir;
    return false;
  }

  ctx_ = executor_->Prepare(*inference_program_, 0);
  executor_->CreateVariables(*inference_program_,
                             sub_scope_ ? sub_scope_ : scope_.get(), 0);

  // Get the feed_target_names and fetch_target_names
  PrepareFeedFetch();
  return true;
}

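// Stop the profiler (writing ./profile.log) if profiling was enabled, and
// release the sub-scope created in Init().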
NativePaddlePredictor::~NativePaddlePredictor() {
  if (FLAGS_profile) {
    platform::DisableProfiler(platform::EventSortingKey::kTotal,
                              "./profile.log");
  }
  if (sub_scope_) {
    scope_->DeleteScope(sub_scope_);
  }
}

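// Copy `inputs` into the feed variables, run the prepared executor context,
// and copy the fetch variables back into `output_data`. The `batch_size`
// argument is not used by this implementation.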
bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
                                std::vector<PaddleTensor> *output_data,
                                int batch_size) {
#ifndef PADDLE_ON_INFERENCE
  LOG_FIRST_N(WARNING, 5) << "The NaiveExecutor can not work properly if the "
                             "cmake flag ON_INFER is not set.";
  LOG_FIRST_N(WARNING, 5) << "Unlike the training phase, all the scopes and "
                             "variables will be reused to save the allocation "
                             "overhead.";
  LOG_FIRST_N(WARNING, 5) << "Please re-compile the inference library by "
                             "setting the cmake flag ON_INFER=ON if you are "
                             "running Paddle Inference";
#endif  // PADDLE_ON_INFERENCE
  if (UNLIKELY(config_.cpu_math_library_num_threads() > 1)) {
    paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
  }
  VLOG(3) << "Predictor::predict";
  Timer timer;
  timer.tic();
  // set feed variable
  framework::Scope *scope = sub_scope_ != nullptr ? sub_scope_ : scope_.get();
  if (!SetFeed(inputs, scope)) {
    LOG(ERROR) << "fail to set feed";
    return false;
  }
  // Run the inference program
  // if share variables, we need not create variables
  VLOG(4) << "Run prepared context";
  executor_->RunPreparedContext(ctx_.get(), scope,
                                false, /* don't create local scope each time*/
                                false /* don't create variable each time */);
  VLOG(4) << "Finish prepared context";
  // get fetch variable
  if (!GetFetch(output_data, scope)) {
    LOG(ERROR) << "fail to get fetches";
    return false;
  }
  VLOG(3) << "predict cost: " << timer.toc() << "ms";

  // Clean up other vector-like containers that are not reset after each batch.
  tensor_array_batch_cleaner_.CollectNoTensorVars(scope_.get());
  tensor_array_batch_cleaner_.ResetNoTensorVars();
  return true;
}

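// Create a new predictor that shares this predictor's configuration. The
// `stream` argument is not used by the native implementation.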
std::unique_ptr<PaddlePredictor> NativePaddlePredictor::Clone(void *stream) {
  std::lock_guard<std::mutex> lk(clone_mutex_);
  VLOG(3) << "Predictor::clone";
  std::unique_ptr<PaddlePredictor> cls(new NativePaddlePredictor(config_));
  // Hot fix for the bug that results differ under multi-threading.
  // TODO(Superjomn) re-implement a real clone here.
  PADDLE_ENFORCE_NOT_NULL(
      dynamic_cast<NativePaddlePredictor *>(cls.get()),
      platform::errors::PreconditionNotMet(
          "Dynamic_cast from PaddlePredictor to NativePaddlePredictor failed"));
  if (!dynamic_cast<NativePaddlePredictor *>(cls.get())->Init(nullptr)) {
    LOG(ERROR) << "fail to call Init";
    return nullptr;
  }
  return cls;
}

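// Convert the user-provided PaddleTensors into framework::LoDTensors, copy the
// data to the target place (CPU/GPU/XPU/NPU), and register each tensor as a
// feed variable in `scope`.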
bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
                                    framework::Scope *scope) {
  VLOG(3) << "Predictor::set_feed";
  if (inputs.size() != feeds_.size()) {
    LOG(ERROR) << "wrong feed input size, need " << feeds_.size() << " but get "
               << inputs.size();
    return false;
  }

  // Cache the inputs memory for better concurrency performance.
  feed_tensors_.resize(inputs.size());

  for (size_t i = 0; i < inputs.size(); ++i) {
    auto &input = feed_tensors_[i];
    framework::DDim ddim = phi::make_ddim(inputs[i].shape);
    void *input_ptr;
    if (inputs[i].dtype == PaddleDType::INT64) {
      input_ptr = input.mutable_data<int64_t>(ddim, place_);
    } else if (inputs[i].dtype == PaddleDType::FLOAT32) {
      input_ptr = input.mutable_data<float>(ddim, place_);
    } else if (inputs[i].dtype == PaddleDType::INT32) {
      input_ptr = input.mutable_data<int32_t>(ddim, place_);
    } else {
      LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
      return false;
    }

    PADDLE_ENFORCE_NOT_NULL(input_ptr,
                            platform::errors::InvalidArgument(
                                "The input_ptr should not be nullptr."));
    PADDLE_ENFORCE_NOT_NULL(
        inputs[i].data.data(),
        platform::errors::InvalidArgument(
            "The data of input tensor should not be null."));
    if (platform::is_cpu_place(place_)) {
      // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
      std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
                  inputs[i].data.length());
    } else if (platform::is_gpu_place(place_)) {
      PADDLE_ENFORCE_EQ(
          platform::is_xpu_place(place_), false,
          platform::errors::InvalidArgument(
              "Only one choice can be made between CPU and XPU."));
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
      platform::DeviceContextPool &pool =
          platform::DeviceContextPool::Instance();
      auto *dev_ctx =
          static_cast<const platform::CUDADeviceContext *>(pool.Get(place_));
      auto dst_gpu_place = place_;
      memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr),
                   platform::CPUPlace(), inputs[i].data.data(),
                   inputs[i].data.length(), dev_ctx->stream());
#else
      PADDLE_THROW(platform::errors::Unavailable(
          "Not compile with CUDA, should not reach here."));
#endif
    } else if (platform::is_xpu_place(place_)) {
#ifdef PADDLE_WITH_XPU
      auto dst_xpu_place = place_;
      memory::Copy(dst_xpu_place, static_cast<void *>(input_ptr),
                   platform::CPUPlace(), inputs[i].data.data(),
                   inputs[i].data.length());
#else
      PADDLE_THROW(platform::errors::Unavailable(
          "Not compile with XPU, should not reach here."));
#endif
    } else {
#ifdef PADDLE_WITH_ASCEND_CL
      platform::DeviceContextPool &pool =
          platform::DeviceContextPool::Instance();
      auto *dev_ctx =
          static_cast<const platform::NPUDeviceContext *>(pool.Get(place_));
      auto dst_npu_place = place_;
      memory::Copy(dst_npu_place, static_cast<void *>(input_ptr),
                   platform::CPUPlace(), inputs[i].data.data(),
                   inputs[i].data.length(), dev_ctx->stream());
#else
      PADDLE_THROW(platform::errors::Unavailable(
          "Not compile with NPU, should not reach here."));
#endif
    }

    // TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
    framework::LoD lod;
    for (auto &level : inputs[i].lod) {
      lod.emplace_back(level);
    }
    input.set_lod(lod);
    int idx = -1;
    if (config_.specify_input_name) {
      idx = feed_names_[inputs[i].name];
    } else {
      idx = BOOST_GET_CONST(int, feeds_[i]->GetAttr("col"));
    }
    framework::SetFeedVariable(scope, input, "feed", idx);
  }
  return true;
}
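
// Copy one fetched LoDTensor into a PaddleTensor: shape, raw data (always in
// CPU memory) and LoD information.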
template <typename T>
void NativePaddlePredictor::GetFetchOne(const framework::LoDTensor &fetch,
                                        PaddleTensor *output) {
  // set shape.
  auto shape = phi::vectorize(fetch.dims());
  output->shape.assign(shape.begin(), shape.end());
  // set data.
  const T *data = fetch.data<T>();
  int num_elems = inference::VecReduceToInt(shape);
  output->data.Resize(num_elems * sizeof(T));
  // The tensor produced by the fetch op should always be in CPU memory, so a
  // plain copy is sufficient.
  memcpy(output->data.data(), data, num_elems * sizeof(T));
  // set lod
  output->lod.clear();
  for (auto &level : fetch.lod()) {
    output->lod.emplace_back(level.begin(), level.end());
  }
}

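// Read every fetch variable back from `scope`, check that each fetch op's
// "col" attribute matches its position, and convert the results into
// PaddleTensors of the proper dtype (float32, int64 or int32).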
bool NativePaddlePredictor::GetFetch(std::vector<PaddleTensor> *outputs,
                                     framework::Scope *scope) {
  VLOG(3) << "Predictor::get_fetch";
  outputs->resize(fetchs_.size());
  for (size_t i = 0; i < fetchs_.size(); ++i) {
    int idx = BOOST_GET_CONST(int, fetchs_[i]->GetAttr("col"));
    PADDLE_ENFORCE_EQ(
        static_cast<size_t>(idx), i,
        platform::errors::InvalidArgument(
            "Fetch op's col attr(%d) should be equal to the index(%d)", idx,
            i));
    framework::FetchType &fetch_var =
        framework::GetFetchVariable(*scope, "fetch", idx);
    auto fetch = BOOST_GET_CONST(framework::LoDTensor, fetch_var);
    auto type = framework::TransToProtoVarType(fetch.dtype());
    auto output = &(outputs->at(i));
    output->name = fetchs_[idx]->Input("X")[0];
    if (type == framework::DataTypeTrait<float>::DataType()) {
      GetFetchOne<float>(fetch, output);
      output->dtype = PaddleDType::FLOAT32;
    } else if (type == framework::DataTypeTrait<int64_t>::DataType()) {
      GetFetchOne<int64_t>(fetch, output);
      output->dtype = PaddleDType::INT64;
    } else if (type == framework::DataTypeTrait<int32_t>::DataType()) {
      GetFetchOne<int32_t>(fetch, output);
      output->dtype = PaddleDType::INT32;
    } else {
      LOG(ERROR) << "unknown type, only support float32, int64 and int32 now.";
    }
  }
  return true;
}

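// A minimal usage sketch (the paths and field values below are illustrative
// only):
//
//   NativeConfig config;
//   config.model_dir = "./my_model_dir";  // or set prog_file/param_file
//   config.use_gpu = false;
//   auto predictor =
//       CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
//   std::vector<PaddleTensor> inputs, outputs;
//   // ... fill `inputs` with shape, dtype and data ...
//   predictor->Run(inputs, &outputs);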
template <>
std::unique_ptr<PaddlePredictor>
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(
    const NativeConfig &config) {
  // TODO(NHZlX): Should add the link to the doc of
  // paddle_infer::CreatePredictor<paddle_infer::Config>
  VLOG(3) << "create NativePaddlePredictor";
  if (config.use_gpu) {
    // 1. GPU memory
    PADDLE_ENFORCE_GE(config.fraction_of_gpu_memory, 0.f,
                      platform::errors::InvalidArgument(
                          "fraction_of_gpu_memory in the config should be set "
                          "to range (0., 1.]"));
    PADDLE_ENFORCE_GE(config.device, 0,
                      platform::errors::PreconditionNotMet(
                          "Invalid device id %d, the device id should be "
                          "greater than or equal to 0.",
                          config.device));
    std::vector<std::string> flags;
    if (config.fraction_of_gpu_memory >= 0.0f &&
        config.fraction_of_gpu_memory <= 0.95f) {
      flags.push_back("dummpy");
      std::string flag = "--fraction_of_gpu_memory_to_use=" +
                         num2str<float>(config.fraction_of_gpu_memory);
      flags.push_back(flag);
      VLOG(3) << "set flag: " << flag;
      framework::InitGflags(flags);
    }
  }

  std::unique_ptr<PaddlePredictor> predictor(new NativePaddlePredictor(config));
  PADDLE_ENFORCE_NOT_NULL(
      dynamic_cast<NativePaddlePredictor *>(predictor.get()),
      platform::errors::PreconditionNotMet(
          "Dynamic_cast from PaddlePredictor to NativePaddlePredictor failed"));
  if (!dynamic_cast<NativePaddlePredictor *>(predictor.get())->Init(nullptr)) {
    return nullptr;
  }
  return predictor;
}

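// Deprecated entry point kept for backward compatibility; it simply forwards
// to the kNative specialization above.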
template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<NativeConfig>(
    const NativeConfig &config) {
  LOG(WARNING) << "Deprecated. Please use CreatePredictor instead.";
  return CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
}

}  // namespace paddle