/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <glog/logging.h>
#include <memory>
#include <sstream>
#include <string>

#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"

DEFINE_bool(profile, false, "Turn on profiler for fluid");

namespace paddle {
namespace {
using paddle::inference::Timer;

template <class T>
std::string num2str(T a) {
  std::stringstream istr;
  istr << a;
  return istr.str();
}
}  // namespace

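// Scans block 0 of the inference program for "feed" and "fetch" ops and
// records them by their "col" attribute, so that inputs and outputs can be
// bound by index (and, for feeds, by variable name) at Run() time.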
void NativePaddlePredictor::PrepareFeedFetch() {
  for (auto *op : inference_program_->Block(0).AllOps()) {
    if (op->Type() == "feed") {
      int idx = BOOST_GET_CONST(int, op->GetAttr("col"));
      if (feeds_.size() <= static_cast<size_t>(idx)) {
        feeds_.resize(idx + 1);
      }
      feeds_[idx] = op;
      feed_names_[op->Output("Out")[0]] = idx;
    } else if (op->Type() == "fetch") {
      int idx = BOOST_GET_CONST(int, op->GetAttr("col"));
      if (fetchs_.size() <= static_cast<size_t>(idx)) {
        fetchs_.resize(idx + 1);
      }
      fetchs_[idx] = op;
    }
  }
}

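// Initializes the predictor: enables profiling if requested, selects the
// place (CPU/GPU/XPU/NPU), creates or reuses a scope, builds an Executor,
// loads the inference program from model_dir or prog_file/param_file, and
// prepares the execution context.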
bool NativePaddlePredictor::Init(
    std::shared_ptr<framework::Scope> parent_scope) {
  VLOG(3) << "Predictor::init()";
  if (FLAGS_profile) {
    LOG(WARNING) << "Profiler is activated, which might affect the performance.";
    LOG(INFO) << "You can turn it off by setting the gflag '-profile false'.";

    auto tracking_device = config_.use_gpu ? platform::ProfilerState::kAll
                                           : platform::ProfilerState::kCPU;
    platform::EnableProfiler(tracking_device);
  }

  // Set the number of CPU math library threads, whether or not MKLDNN is used.
  paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());

  if (config_.use_gpu) {
    PADDLE_ENFORCE_EQ(config_.use_xpu, false,
                      platform::errors::InvalidArgument(
                          "Only one choice can be made between GPU and XPU."));
    place_ = paddle::platform::CUDAPlace(config_.device);
  } else if (config_.use_xpu) {
    place_ = paddle::platform::XPUPlace(config_.device);
  } else if (config_.use_npu) {
    place_ = paddle::platform::NPUPlace(config_.device);
  } else {
    place_ = paddle::platform::CPUPlace();
  }
  if (parent_scope) {
    scope_ = parent_scope;
    sub_scope_ = &(parent_scope->NewScope());
    PADDLE_ENFORCE_NOT_NULL(sub_scope_,
                            platform::errors::PreconditionNotMet(
                                "The sub_scope should not be nullptr."));
  } else {
    paddle::framework::InitDevices();
    scope_.reset(new paddle::framework::Scope());
  }

  executor_.reset(new paddle::framework::Executor(place_));

  // Initialize the inference program
  if (!config_.model_dir.empty()) {
    // Parameters are saved in separate files located in
    // the specified `dirname`.
    inference_program_ = paddle::inference::Load(executor_.get(), scope_.get(),
                                                 config_.model_dir);
  } else if (!config_.prog_file.empty() && !config_.param_file.empty()) {
    // All parameters are saved in a single file.
    // The file names should be consistent with those used
    // in the Python API `fluid.io.save_inference_model`.
    inference_program_ = paddle::inference::Load(
        executor_.get(), scope_.get(), config_.prog_file, config_.param_file);
  } else {
    LOG(ERROR) << "fail to load inference model from " << config_.model_dir;
    return false;
  }

  ctx_ = executor_->Prepare(*inference_program_, 0);
  executor_->CreateVariables(*inference_program_,
                             sub_scope_ ? sub_scope_ : scope_.get(), 0);

  // Get the feed_target_names and fetch_target_names
  PrepareFeedFetch();
  return true;
}

NativePaddlePredictor::~NativePaddlePredictor() {
  if (FLAGS_profile) {
    platform::DisableProfiler(platform::EventSortingKey::kTotal,
                              "./profile.log");
  }
  if (sub_scope_) {
    scope_->DeleteScope(sub_scope_);
  }
}

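// Runs one inference pass: feeds the inputs into the scope, executes the
// prepared context (reusing the scope and variables), and copies the fetch
// results into output_data.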
bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
                                std::vector<PaddleTensor> *output_data,
                                int batch_size) {
#ifndef PADDLE_ON_INFERENCE
  LOG_FIRST_N(WARNING, 5) << "The NaiveExecutor cannot work properly if the "
                             "cmake flag ON_INFER is not set.";
  LOG_FIRST_N(WARNING, 5) << "Unlike the training phase, all the scopes and "
                             "variables will be reused to save the allocation "
                             "overhead.";
  LOG_FIRST_N(WARNING, 5) << "Please re-compile the inference library by "
                             "setting the cmake flag ON_INFER=ON if you are "
                             "running Paddle Inference.";
#endif  // PADDLE_ON_INFERENCE
  if (UNLIKELY(config_.cpu_math_library_num_threads() > 1)) {
    paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
  }
  VLOG(3) << "Predictor::predict";
  Timer timer;
  timer.tic();
  // set feed variable
  framework::Scope *scope = sub_scope_ != nullptr ? sub_scope_ : scope_.get();
  if (!SetFeed(inputs, scope)) {
    LOG(ERROR) << "fail to set feed";
    return false;
  }
  // Run the inference program.
  // Since the variables were already created in Init(), we need not create
  // them on each run.
  VLOG(4) << "Run prepared context";
  executor_->RunPreparedContext(ctx_.get(), scope,
                                false, /* don't create local scope each time*/
                                false /* don't create variable each time */);
  VLOG(4) << "Finish prepared context";
  // get fetch variable
  if (!GetFetch(output_data, scope)) {
    LOG(ERROR) << "fail to get fetches";
    return false;
  }
  VLOG(3) << "predict cost: " << timer.toc() << "ms";

  // Some other vector-like containers are not cleaned after each batch, so
  // collect and reset them here.
  tensor_array_batch_cleaner_.CollectNoTensorVars(scope_.get());
  tensor_array_batch_cleaner_.ResetNoTensorVars();
  return true;
}

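// Creates an independent predictor with the same config (it does not share
// the current scope). Guarded by clone_mutex_; see the hot-fix note below.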
std::unique_ptr<PaddlePredictor> NativePaddlePredictor::Clone() {
  std::lock_guard<std::mutex> lk(clone_mutex_);
  VLOG(3) << "Predictor::clone";
  std::unique_ptr<PaddlePredictor> cls(new NativePaddlePredictor(config_));
  // Hot fix for the bug that results differ under multi-threading.
  // TODO(Superjomn) re-implement a real clone here.
  PADDLE_ENFORCE_NOT_NULL(
      dynamic_cast<NativePaddlePredictor *>(cls.get()),
      platform::errors::PreconditionNotMet(
          "Dynamic_cast from PaddlePredictor to NativePaddlePredictor failed"));
  if (!dynamic_cast<NativePaddlePredictor *>(cls.get())->Init(nullptr)) {
    LOG(ERROR) << "fail to call Init";
    return nullptr;
  }
  return cls;
}

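// Copies the user-provided PaddleTensor inputs into the "feed" variables of
// the given scope, converting dtype/shape/LoD and handling CPU, GPU, XPU and
// NPU placements.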
bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
                                    framework::Scope *scope) {
  VLOG(3) << "Predictor::set_feed";
  if (inputs.size() != feeds_.size()) {
    LOG(ERROR) << "wrong feed input size, need " << feeds_.size()
               << " but got " << inputs.size();
    return false;
  }

  // Cache the inputs memory for better concurrency performance.
  feed_tensors_.resize(inputs.size());

  for (size_t i = 0; i < inputs.size(); ++i) {
    auto &input = feed_tensors_[i];
    framework::DDim ddim = framework::make_ddim(inputs[i].shape);
    void *input_ptr;
    if (inputs[i].dtype == PaddleDType::INT64) {
      input_ptr = input.mutable_data<int64_t>(ddim, place_);
    } else if (inputs[i].dtype == PaddleDType::FLOAT32) {
      input_ptr = input.mutable_data<float>(ddim, place_);
    } else if (inputs[i].dtype == PaddleDType::INT32) {
      input_ptr = input.mutable_data<int32_t>(ddim, place_);
    } else {
      LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
      return false;
    }

    PADDLE_ENFORCE_NOT_NULL(input_ptr,
                            platform::errors::InvalidArgument(
                                "The input_ptr should not be nullptr."));
    PADDLE_ENFORCE_NOT_NULL(
        inputs[i].data.data(),
        platform::errors::InvalidArgument(
            "The data of input tensor should not be null."));
    if (platform::is_cpu_place(place_)) {
      // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
      std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
                  inputs[i].data.length());
    } else if (platform::is_gpu_place(place_)) {
      PADDLE_ENFORCE_EQ(
          platform::is_xpu_place(place_), false,
          platform::errors::InvalidArgument(
              "Only one choice can be made between GPU and XPU."));
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
      platform::DeviceContextPool &pool =
          platform::DeviceContextPool::Instance();
      auto *dev_ctx =
          static_cast<const platform::CUDADeviceContext *>(pool.Get(place_));
      auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place_);
      memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr),
                   platform::CPUPlace(), inputs[i].data.data(),
                   inputs[i].data.length(), dev_ctx->stream());
#else
      PADDLE_THROW(platform::errors::Unavailable(
          "Not compiled with CUDA; should not reach here."));
#endif
    } else if (platform::is_xpu_place(place_)) {
#ifdef PADDLE_WITH_XPU
      auto dst_xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_);
      memory::Copy(dst_xpu_place, static_cast<void *>(input_ptr),
                   platform::CPUPlace(), inputs[i].data.data(),
                   inputs[i].data.length());
#else
      PADDLE_THROW(platform::errors::Unavailable(
          "Not compiled with XPU; should not reach here."));
#endif
    } else {
#ifdef PADDLE_WITH_ASCEND_CL
      platform::DeviceContextPool &pool =
          platform::DeviceContextPool::Instance();
      auto *dev_ctx =
          static_cast<const platform::NPUDeviceContext *>(pool.Get(place_));
      auto dst_npu_place = BOOST_GET_CONST(platform::NPUPlace, place_);
      memory::Copy(dst_npu_place, static_cast<void *>(input_ptr),
                   platform::CPUPlace(), inputs[i].data.data(),
                   inputs[i].data.length(), dev_ctx->stream());
#else
      PADDLE_THROW(platform::errors::Unavailable(
          "Not compiled with NPU; should not reach here."));
#endif
    }

    // TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
    framework::LoD lod;
    for (auto &level : inputs[i].lod) {
      lod.emplace_back(level);
    }
    input.set_lod(lod);
    int idx = -1;
    if (config_.specify_input_name) {
      idx = feed_names_[inputs[i].name];
    } else {
      idx = BOOST_GET_CONST(int, feeds_[i]->GetAttr("col"));
    }
    framework::SetFeedVariable(scope, input, "feed", idx);
  }
  return true;
}
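
// Copies one fetched LoDTensor into a PaddleTensor: shape, raw data, and LoD.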
template <typename T>
void NativePaddlePredictor::GetFetchOne(const framework::LoDTensor &fetch,
                                        PaddleTensor *output) {
  // set shape.
  auto shape = framework::vectorize(fetch.dims());
  output->shape.assign(shape.begin(), shape.end());
  // set data.
  const T *data = fetch.data<T>();
  int num_elems = inference::VecReduceToInt(shape);
  output->data.Resize(num_elems * sizeof(T));
  // The tensor produced by the fetch op should always be in CPU memory, so a
  // plain copy is enough.
  memcpy(output->data.data(), data, num_elems * sizeof(T));
  // set lod
  output->lod.clear();
  for (auto &level : fetch.lod()) {
    output->lod.emplace_back(level.begin(), level.end());
  }
}

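// Reads every "fetch" variable from the scope and converts it into the
// corresponding PaddleTensor in *outputs; float32, int64 and int32 are
// supported.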
bool NativePaddlePredictor::GetFetch(std::vector<PaddleTensor> *outputs,
                                     framework::Scope *scope) {
  VLOG(3) << "Predictor::get_fetch";
  outputs->resize(fetchs_.size());
  for (size_t i = 0; i < fetchs_.size(); ++i) {
    int idx = BOOST_GET_CONST(int, fetchs_[i]->GetAttr("col"));
    PADDLE_ENFORCE_EQ(
        static_cast<size_t>(idx), i,
        platform::errors::InvalidArgument(
            "Fetch op's col attr(%d) should be equal to the index(%d)", idx,
            i));
    framework::FetchType &fetch_var =
        framework::GetFetchVariable(*scope, "fetch", idx);
    auto fetch = BOOST_GET_CONST(framework::LoDTensor, fetch_var);
    auto type = fetch.type();
    auto output = &(outputs->at(i));
    output->name = fetchs_[idx]->Input("X")[0];
    if (type == framework::DataTypeTrait<float>::DataType()) {
      GetFetchOne<float>(fetch, output);
      output->dtype = PaddleDType::FLOAT32;
    } else if (type == framework::DataTypeTrait<int64_t>::DataType()) {
      GetFetchOne<int64_t>(fetch, output);
      output->dtype = PaddleDType::INT64;
    } else if (type == framework::DataTypeTrait<int32_t>::DataType()) {
      GetFetchOne<int32_t>(fetch, output);
      output->dtype = PaddleDType::INT32;
    } else {
      LOG(ERROR) << "unsupported type; only float32, int64 and int32 are "
                    "supported now.";
    }
  }
  return true;
}

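// Factory specialization for the native engine: validates the GPU settings,
// forwards fraction_of_gpu_memory to gflags, then constructs and initializes
// a NativePaddlePredictor.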
template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
    NativeConfig, PaddleEngineKind::kNative>(const NativeConfig &config) {
  // TODO(NHZlX): Should add the link to the doc of
  // paddle_infer::CreatePredictor<paddle_infer::Config>
  VLOG(3) << "create NativePaddlePredictor";
  if (config.use_gpu) {
    // 1. GPU memory
    PADDLE_ENFORCE_GE(config.fraction_of_gpu_memory, 0.f,
                      platform::errors::InvalidArgument(
                          "fraction_of_gpu_memory in the config should be set "
                          "to range (0., 1.]"));
    PADDLE_ENFORCE_GE(config.device, 0,
                      platform::errors::PreconditionNotMet(
                          "Invalid device id %d, the device id should be "
                          "greater than or equal to 0.",
                          config.device));
    std::vector<std::string> flags;
    if (config.fraction_of_gpu_memory >= 0.0f ||
        config.fraction_of_gpu_memory <= 0.95f) {
      flags.push_back("dummy");
      std::string flag = "--fraction_of_gpu_memory_to_use=" +
                         num2str<float>(config.fraction_of_gpu_memory);
      flags.push_back(flag);
      VLOG(3) << "set flag: " << flag;
      framework::InitGflags(flags);
    }
  }

  std::unique_ptr<PaddlePredictor> predictor(new NativePaddlePredictor(config));
  PADDLE_ENFORCE_NOT_NULL(
      dynamic_cast<NativePaddlePredictor *>(predictor.get()),
      platform::errors::PreconditionNotMet(
          "Dynamic_cast from PaddlePredictor to NativePaddlePredictor failed"));
  if (!dynamic_cast<NativePaddlePredictor *>(predictor.get())->Init(nullptr)) {
    return nullptr;
  }
  return predictor;
}

template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<NativeConfig>(
    const NativeConfig &config) {
  LOG(WARNING) << "Deprecated. Please use CreatePredictor instead.";
  return CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
}

}  // namespace paddle
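
// A minimal usage sketch (illustrative only, not part of this file): the
// model directory "./my_model" and the feed name "data" are hypothetical;
// only members referenced in this file (NativeConfig::model_dir/use_gpu,
// PaddleTensor::name/shape/data/dtype) are used.
//
//   paddle::NativeConfig config;
//   config.model_dir = "./my_model";  // hypothetical path
//   config.use_gpu = false;
//   auto predictor =
//       paddle::CreatePaddlePredictor<paddle::NativeConfig>(config);
//
//   paddle::PaddleTensor input;
//   input.name = "data";  // hypothetical feed name
//   input.shape = {1, 3, 224, 224};
//   input.data.Resize(1 * 3 * 224 * 224 * sizeof(float));
//   input.dtype = paddle::PaddleDType::FLOAT32;
//   // ... fill input.data with the actual values ...
//
//   std::vector<paddle::PaddleTensor> outputs;
//   if (predictor->Run({input}, &outputs)) {
//     // outputs[0].data now holds the first fetched tensor.
//   }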