api_impl.cc
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <glog/logging.h>
#include <memory>
#include <sstream>
#include <string>

#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"

DEFINE_bool(profile, false, "Turn on profiler for fluid");

namespace paddle {
namespace {
using paddle::inference::Timer;

template <class T>
std::string num2str(T a) {
  std::stringstream istr;
  istr << a;
  return istr.str();
}
}  // namespace

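// Scan block 0 of the inference program for "feed" and "fetch" ops and index
// them by their "col" attribute, so inputs and outputs can later be bound by
// position (and feeds also by name).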
void NativePaddlePredictor::PrepareFeedFetch() {
  for (auto *op : inference_program_->Block(0).AllOps()) {
    if (op->Type() == "feed") {
      int idx = BOOST_GET_CONST(int, op->GetAttr("col"));
      if (feeds_.size() <= static_cast<size_t>(idx)) {
        feeds_.resize(idx + 1);
      }
      feeds_[idx] = op;
      feed_names_[op->Output("Out")[0]] = idx;
    } else if (op->Type() == "fetch") {
      int idx = BOOST_GET_CONST(int, op->GetAttr("col"));
      if (fetchs_.size() <= static_cast<size_t>(idx)) {
        fetchs_.resize(idx + 1);
      }
      fetchs_[idx] = op;
    }
  }
}

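// Initialize the predictor: pick the device place, set up the scope and
// executor, load the inference program, and prepare the feed/fetch mapping.
// Returns false if the model cannot be loaded.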
bool NativePaddlePredictor::Init(
    std::shared_ptr<framework::Scope> parent_scope) {
  VLOG(3) << "Predictor::init()";
  if (FLAGS_profile) {
    LOG(WARNING) << "Profiler is activated, which might affect the performance.";
    LOG(INFO) << "You can turn it off by setting the gflag '-profile false'.";

    auto tracking_device = config_.use_gpu ? platform::ProfilerState::kAll
                                           : platform::ProfilerState::kCPU;
    platform::EnableProfiler(tracking_device);
  }

  // This applies whether or not MKLDNN is enabled.
  paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());

  if (config_.use_gpu) {
    PADDLE_ENFORCE_EQ(config_.use_xpu, false,
                      platform::errors::InvalidArgument(
                          "Only one choice can be made between GPU and XPU."));
    place_ = paddle::platform::CUDAPlace(config_.device);
  } else if (config_.use_xpu) {
    place_ = paddle::platform::XPUPlace(config_.device);
  } else if (config_.use_npu) {
    place_ = paddle::platform::NPUPlace(config_.device);
  } else {
    place_ = paddle::platform::CPUPlace();
  }
  if (parent_scope) {
    scope_ = parent_scope;
    sub_scope_ = &(parent_scope->NewScope());
    PADDLE_ENFORCE_NOT_NULL(sub_scope_,
                            platform::errors::PreconditionNotMet(
                                "The sub_scope should not be nullptr."));
  } else {
    paddle::framework::InitDevices();
    paddle::framework::InitDefaultKernelSignatureMap();
    scope_.reset(new paddle::framework::Scope());
  }

  executor_.reset(new paddle::framework::Executor(place_));

  // Initialize the inference program
  if (!config_.model_dir.empty()) {
    // Parameters are saved in separate files located in
    // the specified `dirname`.
    inference_program_ = paddle::inference::Load(executor_.get(), scope_.get(),
                                                 config_.model_dir);
  } else if (!config_.prog_file.empty() && !config_.param_file.empty()) {
    // All parameters are saved in a single file.
    // The file names should be consistent with those used
    // in the Python API `fluid.io.save_inference_model`.
    inference_program_ = paddle::inference::Load(
        executor_.get(), scope_.get(), config_.prog_file, config_.param_file);
  } else {
    LOG(ERROR) << "Failed to load the inference model from " << config_.model_dir;
    return false;
  }

  ctx_ = executor_->Prepare(*inference_program_, 0);
  executor_->CreateVariables(*inference_program_,
                             sub_scope_ ? sub_scope_ : scope_.get(), 0);

  // Get the feed_target_names and fetch_target_names
  PrepareFeedFetch();
  return true;
}

NativePaddlePredictor::~NativePaddlePredictor() {
  if (FLAGS_profile) {
    platform::DisableProfiler(platform::EventSortingKey::kTotal,
                              "./profile.log");
  }
  if (sub_scope_) {
    scope_->DeleteScope(sub_scope_);
  }
}

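// Run one inference pass: copy `inputs` into the feed variables, execute the
// prepared context, and collect the fetch variables into `output_data`. Note
// that `batch_size` is not used by this implementation.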
bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
                                std::vector<PaddleTensor> *output_data,
                                int batch_size) {
#ifndef PADDLE_ON_INFERENCE
  LOG_FIRST_N(WARNING, 5) << "The NaiveExecutor cannot work properly if the "
                             "cmake flag ON_INFER is not set.";
  LOG_FIRST_N(WARNING, 5) << "Unlike the training phase, all the scopes and "
                             "variables will be reused to save the allocation "
                             "overhead.";
  LOG_FIRST_N(WARNING, 5) << "Please re-compile the inference library by "
                             "setting the cmake flag ON_INFER=ON if you are "
                             "running Paddle Inference";
#endif  // PADDLE_ON_INFERENCE
  if (UNLIKELY(config_.cpu_math_library_num_threads() > 1)) {
    paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
  }
  VLOG(3) << "Predictor::predict";
  Timer timer;
  timer.tic();
  // set feed variable
  framework::Scope *scope = sub_scope_ != nullptr ? sub_scope_ : scope_.get();
  if (!SetFeed(inputs, scope)) {
    LOG(ERROR) << "Failed to set the feed inputs.";
    return false;
  }
  // Run the inference program
  // If variables are shared, we do not need to create them each time.
  VLOG(4) << "Run prepared context";
  executor_->RunPreparedContext(ctx_.get(), scope,
                                false, /* don't create local scope each time*/
                                false /* don't create variable each time */);
  VLOG(4) << "Finish prepared context";
  // get fetch variable
  if (!GetFetch(output_data, scope)) {
    LOG(ERROR) << "Failed to get the fetch outputs.";
    return false;
  }
  VLOG(3) << "predict cost: " << timer.toc() << "ms";

  // Reset vector-like containers that are not cleaned up after each batch.
  tensor_array_batch_cleaner_.CollectNoTensorVars(scope_.get());
  tensor_array_batch_cleaner_.ResetNoTensorVars();
  return true;
}

std::unique_ptr<PaddlePredictor> NativePaddlePredictor::Clone() {
  std::lock_guard<std::mutex> lk(clone_mutex_);
  VLOG(3) << "Predictor::clone";
  std::unique_ptr<PaddlePredictor> cls(new NativePaddlePredictor(config_));
  // Hot fix for the bug that results differ across threads.
  // TODO(Superjomn) re-implement a real clone here.
  PADDLE_ENFORCE_NOT_NULL(
      dynamic_cast<NativePaddlePredictor *>(cls.get()),
      platform::errors::PreconditionNotMet(
          "Dynamic_cast from PaddlePredictor to NativePaddlePredictor failed"));
  if (!dynamic_cast<NativePaddlePredictor *>(cls.get())->Init(nullptr)) {
    LOG(ERROR) << "Failed to call Init.";
    return nullptr;
  }
  return cls;
}

bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
                                    framework::Scope *scope) {
  VLOG(3) << "Predictor::set_feed";
  if (inputs.size() != feeds_.size()) {
    LOG(ERROR) << "Wrong feed input size: need " << feeds_.size() << " but got "
               << inputs.size();
    return false;
  }

  // Cache the inputs memory for better concurrency performance.
  feed_tensors_.resize(inputs.size());

  for (size_t i = 0; i < inputs.size(); ++i) {
    auto &input = feed_tensors_[i];
    framework::DDim ddim = phi::make_ddim(inputs[i].shape);
    void *input_ptr;
    if (inputs[i].dtype == PaddleDType::INT64) {
      input_ptr = input.mutable_data<int64_t>(ddim, place_);
    } else if (inputs[i].dtype == PaddleDType::FLOAT32) {
      input_ptr = input.mutable_data<float>(ddim, place_);
    } else if (inputs[i].dtype == PaddleDType::INT32) {
      input_ptr = input.mutable_data<int32_t>(ddim, place_);
    } else {
      LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
      return false;
    }

    PADDLE_ENFORCE_NOT_NULL(input_ptr,
                            platform::errors::InvalidArgument(
                                "The input_ptr should not be nullptr."));
    PADDLE_ENFORCE_NOT_NULL(
        inputs[i].data.data(),
        platform::errors::InvalidArgument(
            "The data of input tensor should not be null."));
    if (platform::is_cpu_place(place_)) {
      // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
      std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
                  inputs[i].data.length());
    } else if (platform::is_gpu_place(place_)) {
      PADDLE_ENFORCE_EQ(
          platform::is_xpu_place(place_), false,
          platform::errors::InvalidArgument(
              "Only one choice can be made between GPU and XPU."));
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
      platform::DeviceContextPool &pool =
          platform::DeviceContextPool::Instance();
      auto *dev_ctx =
          static_cast<const platform::CUDADeviceContext *>(pool.Get(place_));
      auto dst_gpu_place = place_;
      memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr),
                   platform::CPUPlace(), inputs[i].data.data(),
                   inputs[i].data.length(), dev_ctx->stream());
#else
      PADDLE_THROW(platform::errors::Unavailable(
          "Not compiled with CUDA; execution should not reach here."));
#endif
    } else if (platform::is_xpu_place(place_)) {
#ifdef PADDLE_WITH_XPU
      auto dst_xpu_place = place_;
      memory::Copy(dst_xpu_place, static_cast<void *>(input_ptr),
                   platform::CPUPlace(), inputs[i].data.data(),
                   inputs[i].data.length());
#else
      PADDLE_THROW(platform::errors::Unavailable(
          "Not compiled with XPU; execution should not reach here."));
#endif
    } else {
#ifdef PADDLE_WITH_ASCEND_CL
      platform::DeviceContextPool &pool =
          platform::DeviceContextPool::Instance();
      auto *dev_ctx =
          static_cast<const platform::NPUDeviceContext *>(pool.Get(place_));
      auto dst_npu_place = place_;
      memory::Copy(dst_npu_place, static_cast<void *>(input_ptr),
                   platform::CPUPlace(), inputs[i].data.data(),
                   inputs[i].data.length(), dev_ctx->stream());
#else
      PADDLE_THROW(platform::errors::Unavailable(
          "Not compiled with NPU; execution should not reach here."));
#endif
    }

    // TODO(Superjomn) Low performance, need optimization for heavy LoD copy.
    framework::LoD lod;
    for (auto &level : inputs[i].lod) {
      lod.emplace_back(level);
    }
    input.set_lod(lod);
    int idx = -1;
    if (config_.specify_input_name) {
      idx = feed_names_[inputs[i].name];
    } else {
      idx = BOOST_GET_CONST(int, feeds_[i]->GetAttr("col"));
    }
    framework::SetFeedVariable(scope, input, "feed", idx);
  }
  return true;
}
template <typename T>
void NativePaddlePredictor::GetFetchOne(const framework::LoDTensor &fetch,
                                        PaddleTensor *output) {
  // set shape.
  auto shape = phi::vectorize(fetch.dims());
  output->shape.assign(shape.begin(), shape.end());
  // set data.
  const T *data = fetch.data<T>();
  int num_elems = inference::VecReduceToInt(shape);
  output->data.Resize(num_elems * sizeof(T));
  // The tensor produced by the fetch op should always be in CPU memory, so a
  // plain copy is enough.
  memcpy(output->data.data(), data, num_elems * sizeof(T));
  // set lod
  output->lod.clear();
  for (auto &level : fetch.lod()) {
    output->lod.emplace_back(level.begin(), level.end());
  }
}

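// Convert every "fetch" variable in `scope` into an entry of `outputs`,
// dispatching on the tensor's element type.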
bool NativePaddlePredictor::GetFetch(std::vector<PaddleTensor> *outputs,
                                     framework::Scope *scope) {
  VLOG(3) << "Predictor::get_fetch";
  outputs->resize(fetchs_.size());
  for (size_t i = 0; i < fetchs_.size(); ++i) {
    int idx = BOOST_GET_CONST(int, fetchs_[i]->GetAttr("col"));
    PADDLE_ENFORCE_EQ(
        static_cast<size_t>(idx), i,
        platform::errors::InvalidArgument(
            "Fetch op's col attr(%d) should be equal to the index(%d)", idx,
            i));
    framework::FetchType &fetch_var =
        framework::GetFetchVariable(*scope, "fetch", idx);
    auto fetch = BOOST_GET_CONST(framework::LoDTensor, fetch_var);
    auto type = framework::TransToProtoVarType(fetch.dtype());
    auto output = &(outputs->at(i));
    output->name = fetchs_[idx]->Input("X")[0];
    if (type == framework::DataTypeTrait<float>::DataType()) {
      GetFetchOne<float>(fetch, output);
      output->dtype = PaddleDType::FLOAT32;
    } else if (type == framework::DataTypeTrait<int64_t>::DataType()) {
      GetFetchOne<int64_t>(fetch, output);
      output->dtype = PaddleDType::INT64;
    } else if (type == framework::DataTypeTrait<int32_t>::DataType()) {
      GetFetchOne<int32_t>(fetch, output);
      output->dtype = PaddleDType::INT32;
    } else {
      LOG(ERROR) << "Unknown fetch type; only float32, int64 and int32 are "
                    "supported now.";
    }
  }
  return true;
}

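// Build a NativePaddlePredictor from a NativeConfig, validating the GPU memory
// fraction and device id first. A minimal usage sketch (the model path is
// hypothetical; the fields shown are the ones this file actually reads):
//
//   NativeConfig config;
//   config.model_dir = "./my_model";  // hypothetical model directory
//   config.use_gpu = false;
//   auto predictor =
//       CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
//   // then call predictor->Run(inputs, &outputs) with PaddleTensor inputs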
template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
    NativeConfig, PaddleEngineKind::kNative>(const NativeConfig &config) {
  // TODO(NHZlX): Should add the link to the doc of
  // paddle_infer::CreatePredictor<paddle_infer::Config>
  VLOG(3) << "create NativePaddlePredictor";
  if (config.use_gpu) {
    // 1. GPU memory
    PADDLE_ENFORCE_GE(config.fraction_of_gpu_memory, 0.f,
                      platform::errors::InvalidArgument(
                          "fraction_of_gpu_memory in the config should be set "
                          "to range (0., 1.]"));
    PADDLE_ENFORCE_GE(config.device, 0,
                      platform::errors::PreconditionNotMet(
                          "Invalid device id %d, the device id should be "
                          "greater than or equal to 0.",
                          config.device));
    std::vector<std::string> flags;
    if (config.fraction_of_gpu_memory >= 0.0f ||
        config.fraction_of_gpu_memory <= 0.95f) {
      flags.push_back("dummy");
      std::string flag = "--fraction_of_gpu_memory_to_use=" +
                         num2str<float>(config.fraction_of_gpu_memory);
      flags.push_back(flag);
      VLOG(3) << "set flag: " << flag;
      framework::InitGflags(flags);
    }
  }

  std::unique_ptr<PaddlePredictor> predictor(new NativePaddlePredictor(config));
  PADDLE_ENFORCE_NOT_NULL(
      dynamic_cast<NativePaddlePredictor *>(predictor.get()),
      platform::errors::PreconditionNotMet(
          "Dynamic_cast from PaddlePredictor to NativePaddlePredictor failed"));
  if (!dynamic_cast<NativePaddlePredictor *>(predictor.get())->Init(nullptr)) {
    return nullptr;
  }
  return predictor;
}

template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<NativeConfig>(
    const NativeConfig &config) {
  LOG(WARNING) << "Deprecated. Please use CreatePredictor instead.";
  return CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
}

}  // namespace paddle