// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once

/*! \file paddle_api.h
 */

/*! \mainpage Paddle Inference APIs
 * \section intro_sec Introduction
 * The Paddle inference library aims to offer a high-performance inference SDK
 * for Paddle users.
 */

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <map>
#include <memory>
#include <string>
#include <vector>

#include "crypto/cipher.h"
#include "paddle_infer_declare.h"  // NOLINT
#include "paddle_tensor.h"         // NOLINT
/*! \namespace paddle
 */
namespace paddle {

38 39
using PaddleDType = paddle_infer::DataType;
using PaddlePlace = paddle_infer::PlaceType;
40
using PaddleDataLayout = paddle_infer::DataLayout;
41
using paddle_infer::Exp_OutputHookFunc;
42

43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87
/// \brief Memory manager for PaddleTensor.
///
/// The PaddleBuf holds a buffer for data input or output. The memory can be
/// allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf
/// should be reused for better performance.
///
/// For user allocated memory, the following API can be used:
/// - PaddleBuf(void* data, size_t length) to set an external memory by
/// specifying the memory address and length.
/// - Reset(void* data, size_t length) to reset the PaddleBuf with an external
/// memory.
/// ATTENTION, for user allocated memory, deallocation should be done by users
/// externally after the program finished. The PaddleBuf won't do any allocation
/// or deallocation.
///
/// To have the PaddleBuf allocate and manage the memory:
/// - PaddleBuf(size_t length) will allocate a memory of size `length`.
/// - Resize(size_t length) resize the memory to no less than `length`,
/// ATTENTION
///  if the allocated memory is larger than `length`, nothing will done.
///
/// Usage:
///
/// Let PaddleBuf manage the memory internally.
/// \code{cpp}
/// const int num_elements = 128;
/// PaddleBuf buf(num_elements/// sizeof(float));
/// \endcode
///
/// Or
/// \code{cpp}
/// PaddleBuf buf;
/// buf.Resize(num_elements/// sizeof(float));
/// \endcode
/// Works the exactly the same.
///
/// One can also make the `PaddleBuf` use the external memory.
/// \code{cpp}
/// PaddleBuf buf;
/// void* external_memory = new float[num_elements];
/// buf.Reset(external_memory, num_elements*sizeof(float));
/// ...
/// delete[] external_memory; // manage the memory lifetime outside.
/// \endcode
///
88
class PD_INFER_DECL PaddleBuf {
89
 public:
90 91 92 93 94
  ///
  /// \brief PaddleBuf allocate memory internally, and manage it.
  ///
  /// \param[in] length The length of data.
  ///
95 96
  explicit PaddleBuf(size_t length)
      : data_(new char[length]), length_(length), memory_owned_(true) {}
97 98 99 100 101 102
  ///
  /// \brief Set external memory, the PaddleBuf won't manage it.
  ///
  /// \param[in] data The start address of the external memory.
  /// \param[in] length The length of data.
  ///
103 104
  PaddleBuf(void* data, size_t length)
      : data_(data), length_(length), memory_owned_{false} {}
105 106 107 108 109 110 111 112 113 114 115
  ///
  /// \brief Copy only available when memory is managed externally.
  ///
  /// \param[in] other another `PaddleBuf`
  ///
  explicit PaddleBuf(const PaddleBuf& other);
  ///
  /// \brief Resize the memory.
  ///
  /// \param[in] length The length of data.
  ///
116
  void Resize(size_t length);
117 118 119 120 121 122
  ///
  /// \brief Reset to external memory, with address and length set.
  ///
  /// \param[in] data The start address of the external memory.
  /// \param[in] length The length of data.
  ///
123
  void Reset(void* data, size_t length);
124 125 126
  ///
  /// \brief Tell whether the buffer is empty.
  ///
127
  bool empty() const { return length_ == 0; }
128 129 130
  ///
  /// \brief Get the data's memory address.
  ///
131
  void* data() const { return data_; }
132 133 134
  ///
  /// \brief Get the memory length.
  ///
135 136 137 138 139 140 141 142 143 144
  size_t length() const { return length_; }

  ~PaddleBuf() { Free(); }
  PaddleBuf& operator=(const PaddleBuf&);
  PaddleBuf& operator=(PaddleBuf&&);
  PaddleBuf() = default;
  PaddleBuf(PaddleBuf&& other);

 private:
  void Free();
145 146
  void* data_{nullptr};  ///< pointer to the data memory.
  size_t length_{0};     ///< number of memory bytes.
147 148 149
  bool memory_owned_{true};
};

150 151 152
///
/// \brief Basic input and output data structure for PaddlePredictor.
///
153
struct PD_INFER_DECL PaddleTensor {
154
  PaddleTensor() = default;
155
  std::string name;  ///<  variable name.
156
  std::vector<int> shape;
157
  PaddleBuf data;  ///<  blob of data.
158
  PaddleDType dtype;
159
  std::vector<std::vector<size_t>> lod;  ///<  Tensor+LoD equals LoDTensor
160 161
};

162 163 164 165 166 167 168 169
/// \brief Represents an n-dimensional array of values.
/// The ZeroCopyTensor is used to store the input or output of the network.
/// Zero copy means that the tensor supports direct copy of host or device data
/// to device,
/// eliminating additional CPU copy. ZeroCopyTensor is only used in the
/// AnalysisPredictor.
/// It is obtained through PaddlePredictor::GetinputTensor()
/// and PaddlePredictor::GetOutputTensor() interface.
170

171 172
class PD_INFER_DECL ZeroCopyTensor : public paddle_infer::Tensor {
 public:
173 174 175
  /// \brief Copy the host memory to tensor data.
  /// It's usually used to set the input tensor data.
  /// \param data The pointer of the data, from which the tensor will copy.
N
nhzlx 已提交
176
  template <typename T>
177 178 179
  void copy_from_cpu(const T* data) {
    return CopyFromCpu(data);
  }
S
Steffy-zxf 已提交
180 181 182 183 184 185 186 187

  /// \brief Experimental interface.
  /// It's usually used to set the input tensor data with Strings data type.
  /// \param data The pointer of the data, from which the tensor will copy.
  void copy_strings_from_cpu(const paddle_infer::Strings* data) {
    return CopyStringsFromCpu(data);
  }

188 189 190
  /// \brief Copy the tensor data to the host memory.
  /// It's usually used to get the output tensor data.
  /// \param[out] data The tensor will copy the data to the address.
N
nhzlx 已提交
191
  template <typename T>
192 193
  void copy_to_cpu(T* data) {
    return CopyToCpu(data);
N
nhzlx 已提交
194
  }
195 196 197

 private:
  friend class AnalysisPredictor;
198
  friend class ONNXRuntimePredictor;
199 200
  explicit ZeroCopyTensor(void* scope, const void* device_contexts)
      : paddle_infer::Tensor{scope, device_contexts} {}
201 202
};

203 204
/// \brief A Predictor for executing inference on a model.
/// Base class for AnalysisPredictor and NativePaddlePredictor.
205
class PD_INFER_DECL PaddlePredictor {
206 207 208 209 210 211
 public:
  struct Config;
  PaddlePredictor() = default;
  PaddlePredictor(const PaddlePredictor&) = delete;
  PaddlePredictor& operator=(const PaddlePredictor&) = delete;

212 213 214 215 216 217 218 219
  /// \brief This interface takes input and runs the network.
  /// There are redundant copies of data between hosts in this operation,
  /// so it is more recommended to use the zecopyrun interface
  /// \param[in] inputs An list of PaddleTensor as the input to the network.
  /// \param[out] output_data Pointer to the tensor list, which holds the output
  /// paddletensor
  /// \param[in] batch_size This setting has been discarded and can be ignored.
  /// \return Whether the run is successful
220 221 222 223
  virtual bool Run(const std::vector<PaddleTensor>& inputs,
                   std::vector<PaddleTensor>* output_data,
                   int batch_size = -1) = 0;

224 225 226 227 228 229 230 231 232 233
  /// \brief This interface takes input and runs the network (Recommended).
  /// \param[in] inputs An list of Tensor as the input to the network.
  /// \param[out] output_data Pointer to the tensor list, which holds the output
  /// Tensor
  /// \return Whether the run is successful
  virtual bool Run(const std::vector<paddle::Tensor>& inputs,
                   std::vector<paddle::Tensor>* outputs) {
    return false;
  }

234 235 236
  /// \brief  Used to get the name of the network input.
  /// Be inherited by AnalysisPredictor, Only used in ZeroCopy scenarios.
  /// \return Input tensor names.
N
nhzlx 已提交
237 238
  virtual std::vector<std::string> GetInputNames() { return {}; }

239 240
  /// \brief Get the input shape of the model.
  /// \return A map contains all the input names and shape defined in the model.
241 242 243 244
  virtual std::map<std::string, std::vector<int64_t>> GetInputTensorShape() {
    return {};
  }

245 246 247 248 249 250
  /// \brief Get the input type of the model.
  /// \return A map contains all the input names and type defined in the model.
  virtual std::map<std::string, paddle_infer::DataType> GetInputTypes() {
    return {};
  }

251 252 253
  /// \brief Used to get the name of the network output.
  /// Be inherited by AnalysisPredictor, Only used in ZeroCopy scenarios.
  /// \return Output tensor names.
N
nhzlx 已提交
254 255
  virtual std::vector<std::string> GetOutputNames() { return {}; }

256 257 258 259 260 261 262 263 264 265 266 267 268
  /// \brief Get the output shape of the model.
  /// \return A map contains all the output names and shape defined in the
  /// model.
  virtual std::map<std::string, std::vector<int64_t>> GetOutputTensorShape() {
    return {};
  }

  /// \brief Get the output type of the model.
  /// \return A map contains all the output names and type defined in the model.
  virtual std::map<std::string, paddle_infer::DataType> GetOutputTypes() {
    return {};
  }

269 270 271 272 273
  /// \brief Get the input ZeroCopyTensor by name.
  /// Be inherited by AnalysisPredictor, Only used in ZeroCopy scenarios.
  /// The name is obtained from the GetInputNames() interface.
  /// \param name The input tensor name.
  /// \return Return the corresponding input ZeroCopyTensor.
274 275 276 277
  virtual std::unique_ptr<ZeroCopyTensor> GetInputTensor(
      const std::string& name) {
    return nullptr;
  }
278 279 280 281 282 283

  /// \brief Get the output ZeroCopyTensor by name.
  /// Be inherited by AnalysisPredictor, Only used in ZeroCopy scenarios.
  /// The name is obtained from the GetOutputNames() interface.
  /// \param name The output tensor name.
  /// \return Return the corresponding output ZeroCopyTensor.
284 285 286 287
  virtual std::unique_ptr<ZeroCopyTensor> GetOutputTensor(
      const std::string& name) {
    return nullptr;
  }
288 289 290 291 292
  /// \brief Run the network with zero-copied inputs and outputs.
  /// Be inherited by AnalysisPredictor and only used in ZeroCopy scenarios.
  /// This will save the IO copy for transfering inputs and outputs to predictor
  /// workspace
  /// and get some performance improvement.
P
Pei Yang 已提交
293
  /// To use it, one should call the AnalysisConfig.SwitchUseFeedFetchOp(false)
294 295 296
  /// and then use the `GetInputTensor` and `GetOutputTensor`
  /// to directly write or read the input/output tensors.
  /// \return Whether the run is successful
297 298
  virtual bool ZeroCopyRun() { return false; }

299 300 301 302 303 304
  ///
  /// \brief Clear the intermediate tensors of the predictor
  ///
  ///
  virtual void ClearIntermediateTensor() {}

305 306 307 308 309 310 311 312 313 314 315
  ///
  /// \brief Release all tmp tensor to compress the size of the memory pool.
  /// The memory pool is considered to be composed of a list of chunks, if
  /// the chunk is not occupied, it can be released.
  ///
  /// \return Number of bytes released. It may be smaller than the actual
  /// released memory, because part of the memory is not managed by the
  /// MemoryPool.
  ///
  virtual uint64_t TryShrinkMemory() { return 0; }

316 317 318 319 320 321 322 323 324 325
  ///
  /// \brief Register a output hook function to operate the intermediate tensor
  /// of op output. when using this function, memory reuse should be tured off.
  /// The hook function signature is void(const std::string&, const
  /// std::string&, const Tensor&>). Here, the first parameter is op's
  /// type, the second param is output var name of the op, and the third
  /// parameter is output tensor with the var name.
  ///
  virtual void RegisterOutputHook(const Exp_OutputHookFunc& hookfunc) {}

326 327 328 329
  /// \brief Clone an existing predictor
  /// When using clone, the same network will be created,
  /// and the parameters between them are shared.
  /// \return unique_ptr which contains the pointer of predictor
330
  virtual std::unique_ptr<PaddlePredictor> Clone(void* stream = nullptr) = 0;
331

332
  /// \brief Destroy the Predictor.
333 334
  virtual ~PaddlePredictor() = default;

335
  virtual std::string GetSerializedProgram() const {
Y
Yan Chunwei 已提交
336 337
    assert(false);  // Force raise error.
    return "NotImplemented";
338
  }
Y
Yan Chunwei 已提交
339

340
  /// \brief Base class for NativeConfig and AnalysisConfig.
341
  struct Config {
342
    std::string model_dir; /*!< path to the model directory. */
343
  };
344 345 346 347 348

  virtual void* GetExecStream() const { return nullptr; }

 protected:
  virtual const void* GetDeviceContexts() const { return nullptr; }
349 350
};

351 352 353 354 355 356 357
///
/// \brief configuration manager for `NativePredictor`.
///
/// `AnalysisConfig` manages configurations of `NativePredictor`.
/// During inference procedure, there are many parameters(model/params path,
/// place of inference, etc.)
///
358
struct PD_INFER_DECL NativeConfig : public PaddlePredictor::Config {
W
Wilber 已提交
359
  NativeConfig();
360
  /// GPU related fields.
361
  bool use_xpu{false};
362 363
  bool use_gpu{false};
  int device{0};
364
  float fraction_of_gpu_memory{
365
      -1.f};  ///< Change to a float in (0,1] if needed.
366 367

  std::string prog_file;
368 369
  std::string
      param_file;  ///< Specify the exact path of program and parameter files.
370

371 372 373 374
  bool specify_input_name{false};  ///< Specify the variable's name of each
                                   ///< input if input tensors don't follow the
                                   ///< `feeds` and `fetches` of the phase
                                   ///< `save_inference_model`.
L
luotao1 已提交
375

376
  /// Set and get the number of cpu math library threads.
L
luotao1 已提交
377 378 379 380 381
  void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads) {
    cpu_math_library_num_threads_ = cpu_math_library_num_threads;
  }
  int cpu_math_library_num_threads() const {
    return cpu_math_library_num_threads_;
L
luotao1 已提交
382 383 384
  }

 protected:
385 386 387
  int cpu_math_library_num_threads_{1};  ///< number of cpu math library (such
                                         ///< as MKL, OpenBlas) threads for each
                                         ///< instance.
388 389
};

390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405
///
/// \brief A factory to help create different predictors.
///
/// Usage:
///
/// \code{.cpp}
/// NativeConfig config;
/// ... // change the configs.
/// auto native_predictor = CreatePaddlePredictor(config);
/// \endcode
///
/// FOR EXTENSION DEVELOPER:
/// Different predictors are designated by config type. Similar configs can be
/// merged, but there shouldn't be a huge config containing different fields for
/// more than one kind of predictors.
////
406 407 408
template <typename ConfigT>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);

// Config types for which CreatePaddlePredictor is explicitly specialized.
struct AnalysisConfig;
struct NativeConfig;
struct DemoConfig;

// Explicit specializations of the single-parameter factory, one per config
// type; only declared here.
template <>
PD_INFER_DECL std::unique_ptr<PaddlePredictor>
CreatePaddlePredictor<AnalysisConfig>(const AnalysisConfig& config);

template <>
PD_INFER_DECL std::unique_ptr<PaddlePredictor>
CreatePaddlePredictor<NativeConfig>(const NativeConfig& config);

template <>
PD_INFER_DECL std::unique_ptr<PaddlePredictor>
CreatePaddlePredictor<DemoConfig>(const DemoConfig& config);

/// NOTE: The following APIs are too trivial; they will be discarded in
/// following versions.
///
/// Selects the engine for the two-parameter CreatePaddlePredictor factory.
/// Enumerator values are consecutive, starting at 0.
enum class PaddleEngineKind {
  kNative = 0,         ///< Use the native Fluid facility.
  kAutoMixedTensorRT,  ///< Automatically mix Fluid with TensorRT.
  kAnalysis,           ///< More optimization.
  kONNXRuntime,        ///< Use ONNXRuntime
};

template <typename ConfigT, PaddleEngineKind engine>
W
Wilber 已提交
436 437
PD_INFER_DECL std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(
    const ConfigT& config);
438

439
template <>
440 441 442
PD_INFER_DECL std::unique_ptr<PaddlePredictor>
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(
    const NativeConfig& config);
443 444

template <>
445 446 447
PD_INFER_DECL std::unique_ptr<PaddlePredictor>
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
    const AnalysisConfig& config);
448

449 450 451 452 453
template <>
PD_INFER_DECL std::unique_ptr<PaddlePredictor>
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kONNXRuntime>(
    const AnalysisConfig& config);

// NOTE(review): presumably returns the size in bytes of one element of
// `dtype` -- confirm against the definition.
PD_INFER_DECL int PaddleDtypeSize(PaddleDType dtype);

// NOTE(review): presumably returns the library version string -- confirm
// against the definition.
PD_INFER_DECL std::string get_version();

// NOTE(review): presumably updates the flag `name` to `value`; the meaning of
// the returned string is not visible here -- confirm against the definition.
PD_INFER_DECL std::string UpdateDllFlag(const char* name, const char* value);

// Creates a shared framework::Cipher from the given configuration file path.
PD_INFER_DECL std::shared_ptr<framework::Cipher> MakeCipher(
    const std::string& config_file);

}  // namespace paddle

// Forward declarations of the CUDA and HIP stream handle types, so they can
// be named in the declarations below.
using cudaStream_t = struct CUstream_st*;
using hipStream_t = struct ihipStream_t*;

namespace paddle_infer {
class Predictor;
W
Wilber 已提交
471
class Tensor;
472 473
using Config = paddle::AnalysisConfig;
namespace experimental {
// Runtime configuration for XPU execution.
// NOTE(review): sizes are presumably in bytes -- confirm against the users of
// this struct.
struct XpuRuntimeConfig {
  void* stream{nullptr};       // opaque stream handle; null by default.
  size_t l3_size{16773120};    // default equals 16 * 1024 * 1024 - 4096.
  void* l3_ptr{nullptr};       // opaque L3 buffer pointer; null by default.
  size_t l3_autotune_size{0};  // zero by default.
};

W
Wilber 已提交
481
// Unstable interface, may be modified or deleted in the future.
482 483 484 485 486 487 488
class PD_INFER_DECL InternalUtils {
 public:
  // Note: Can only be used under thread_local semantics.
  static bool RunWithExternalStream(paddle_infer::Predictor* pred,
                                    cudaStream_t stream);
  static bool RunWithExternalStream(paddle_infer::Predictor* pred,
                                    hipStream_t stream);
489 490
  static bool RunWithRuntimeConfig(paddle_infer::Predictor* pred, void* config);

491 492
  static void UpdateConfigInterleaved(paddle_infer::Config* c,
                                      bool with_interleaved);
W
Wilber 已提交
493

494 495 496 497 498 499
  static void SetTransformerPosid(
      paddle_infer::Config* c, const std::string& tensorrt_transformer_posid);

  static void SetTransformerMaskid(
      paddle_infer::Config* c, const std::string& tensorrt_transformer_maskid);

W
Wilber 已提交
500 501 502
  static void SyncStream(paddle_infer::Predictor* pred);
  static void SyncStream(cudaStream_t stream);
  template <typename T>
503 504
  static void CopyFromCpuWithIoStream(paddle_infer::Tensor* t,
                                      const T* data,
W
Wilber 已提交
505 506
                                      cudaStream_t stream);
  template <typename T>
507 508
  static void CopyToCpuWithIoStream(paddle_infer::Tensor* t,
                                    T* data,
W
Wilber 已提交
509
                                    cudaStream_t stream);
510 511 512
};
}  // namespace experimental
}  // namespace paddle_infer