/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

/*
 * This file contains the definition of a simple Inference API for Paddle.
 *
 * ATTENTION: It requires some C++11 features. For lower C++ versions or for C,
 * we might release another API.
 */

#pragma once

#include <cassert>
#include <memory>
#include <string>
#include <vector>

namespace paddle {

// Data type.
enum PaddleDType {
  FLOAT32,
  INT64,
  // TODO(Superjomn) support more data types if needed.
};

/*
 * Memory management for PaddleTensor.
 * The PaddleBuf holds a buffer for data input or output. The memory can be
 * allocated by the user or by the PaddleBuf itself, but in any case, the
 * PaddleBuf should be reused for better performance.
 *
 * For user-allocated memory, the following APIs can be used:
 * - PaddleBuf(void* data, size_t length) to set an external memory by
 *   specifying the memory address and length.
 * - Reset(void* data, size_t length) to reset the PaddleBuf with an external
 *   memory.
 * ATTENTION, for user-allocated memory, deallocation should be done by the
 * user externally after the program finishes. The PaddleBuf won't do any
 * allocation or deallocation.
 *
 * To have the PaddleBuf allocate and manage the memory:
 * - PaddleBuf(size_t length) will allocate a memory of size `length`.
 * - Resize(size_t length) resizes the memory to no less than `length`.
 *   ATTENTION: if the allocated memory is larger than `length`, nothing will
 *   be done.
 */
class PaddleBuf {
 public:
  // PaddleBuf allocates memory internally and manages it.
  explicit PaddleBuf(size_t length)
      : data_(new char[length]), length_(length), memory_owned_(true) {}
  // Set external memory, the PaddleBuf won't manage it.
  PaddleBuf(void* data, size_t length)
      : data_(data), length_(length), memory_owned_{false} {}
  // Copy only available when memory is managed externally.
  explicit PaddleBuf(const PaddleBuf&);

  // Resize the memory.
  void Resize(size_t length);
  // Reset to external memory, with address and length set.
  void Reset(void* data, size_t length);
  // Tell whether the buffer is empty.
  bool empty() const { return length_ == 0; }
  // Get the memory address.
  void* data() const { return data_; }
  // Get the memory length.
  size_t length() const { return length_; }

  ~PaddleBuf() { Free(); }
  PaddleBuf& operator=(const PaddleBuf&);
  PaddleBuf& operator=(PaddleBuf&&);
  PaddleBuf() = default;
  PaddleBuf(PaddleBuf&& other);

 private:
  void Free();
  void* data_{nullptr};  // pointer to the data memory.
  size_t length_{0};     // number of memory bytes.
  bool memory_owned_{true};
};
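// A minimal usage sketch (illustrative only; the sizes below are made up):
//
//   PaddleBuf owned(1024);   // PaddleBuf allocates and owns 1024 bytes.
//   owned.Resize(4096);      // grow the buffer to at least 4096 bytes.
//
//   std::vector<float> raw(256);  // user-owned memory.
//   PaddleBuf external(raw.data(), raw.size() * sizeof(float));
//   // `raw` must stay alive until inference finishes and must be freed by the
//   // user; PaddleBuf does not allocate or free external memory.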

// Basic input and output data structure for PaddlePredictor.
struct PaddleTensor {
  PaddleTensor() = default;
  std::string name;  // variable name.
  std::vector<int> shape;
  PaddleBuf data;  // blob of data.
  PaddleDType dtype;
  std::vector<std::vector<size_t>> lod;  // Tensor+LoD equals LoDTensor
};
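// A minimal sketch of filling a PaddleTensor for a float input; the variable
// name "x" and the shape are hypothetical and must match the model:
//
//   std::vector<float> input(1 * 3 * 224 * 224, 0.f);
//   PaddleTensor t;
//   t.name = "x";
//   t.shape = {1, 3, 224, 224};
//   t.dtype = PaddleDType::FLOAT32;
//   t.data = PaddleBuf(input.data(), input.size() * sizeof(float));
//   // t.lod stays empty unless the model consumes LoDTensor inputs.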

enum class PaddlePlace { kUNK = -1, kCPU, kGPU };
// Tensor without copy, currently only supports AnalysisPredictor.
class ZeroCopyTensor {
 public:
  void Reshape(const std::vector<int>& shape);

  // Get the memory in CPU or GPU with the specific data type; call Reshape
  // first to set the data size.
  // The returned pointer can be used directly to feed the input data.
  // This is for writing the input tensor.
  template <typename T>
  T* mutable_data(PaddlePlace place);
  // Get the memory directly; the place and memory size are returned through
  // the pointer arguments.
  // This is for reading the output tensor.
  template <typename T>
  T* data(PaddlePlace* place, int* size);

  std::vector<int64_t> shape();

  void SetLoD(const std::vector<std::vector<size_t>>& x);
  std::vector<std::vector<size_t>> lod() const;

 protected:
  ZeroCopyTensor(void* scope) : scope_{scope} {}
  void SetName(const std::string& name) { name_ = name; }
  void* FindTensor() const;

 private:
  std::string name_;
  bool input_or_output_;
  friend class AnalysisPredictor;
  void* scope_{nullptr};
};
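// A rough sketch of the zero-copy workflow, assuming a predictor that
// supports it (e.g. the AnalysisPredictor mentioned above) obtained through
// the PaddlePredictor interface declared below; the tensor names "x" and
// "out" are placeholders:
//
//   auto input = predictor->GetInputTensor("x");
//   input->Reshape({1, 128});
//   float* in_data = input->mutable_data<float>(PaddlePlace::kCPU);
//   // ... fill in_data with 1 * 128 floats ...
//   predictor->ZeroCopyRun();
//
//   auto output = predictor->GetOutputTensor("out");
//   PaddlePlace place;
//   int size = 0;
//   float* out_data = output->data<float>(&place, &size);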

/*
 * A simple Inference API for Paddle.
 */
class PaddlePredictor {
 public:
  struct Config;
  PaddlePredictor() = default;
  PaddlePredictor(const PaddlePredictor&) = delete;
  PaddlePredictor& operator=(const PaddlePredictor&) = delete;

  // Predict a record.
  // The caller should be responsible for allocating and releasing the memory
  // of `inputs`. `inputs` should be available until Run returns. The caller
  // should also be responsible for the output tensor's buffer, either
  // allocated or passed from outside.
  virtual bool Run(const std::vector<PaddleTensor>& inputs,
                   std::vector<PaddleTensor>* output_data,
                   int batch_size = -1) = 0;

  // Zero copy input and output optimization.
  // Get the input or output tensors, and operate on their memory directly,
  // without copy.
  virtual std::unique_ptr<ZeroCopyTensor> GetInputTensor(
      const std::string& name) {
    return nullptr;
  }
  virtual std::unique_ptr<ZeroCopyTensor> GetOutputTensor(
      const std::string& name) {
    return nullptr;
  }
  virtual bool ZeroCopyRun() { return false; }

  // Clone a predictor that shares the model weights; the cloned predictor
  // should be thread-safe.
  virtual std::unique_ptr<PaddlePredictor> Clone() = 0;

  // Destroy the Predictor.
  virtual ~PaddlePredictor() = default;

  // The common configs for all the predictors.
  struct Config {
    std::string model_dir;  // path to the model directory.
  };
};

struct NativeConfig : public PaddlePredictor::Config {
  // GPU related fields.
  bool use_gpu{false};
  int device{0};
  float fraction_of_gpu_memory{-1.f};  // Change to a float in (0,1] if needed.

  // Specify the exact path of the program and parameter files.
  std::string prog_file;
  std::string param_file;

  // Specify the variable name of each input if the input tensors don't follow
  // the `feeds` and `fetches` of the phase `save_inference_model`.
  bool specify_input_name{false};
};

// A factory to help create different predictors.
//
// Usage:
//
// NativeConfig config;
// ... // change the configs.
// auto native_predictor = CreatePaddlePredictor(config);
//
// FOR EXTENSION DEVELOPER:
// Different predictors are designated by config type. Similar configs can be
// merged, but there shouldn't be a huge config containing different fields for
// more than one kind of predictor.
template <typename ConfigT>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
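// A slightly fuller sketch than the one above (the model path is a
// placeholder, and the inputs are filled as sketched for PaddleTensor):
//
//   NativeConfig config;
//   config.model_dir = "./my_model";  // hypothetical path.
//   config.use_gpu = false;
//
//   auto predictor = CreatePaddlePredictor(config);
//
//   std::vector<PaddleTensor> inputs;   // fill with the model's input tensors.
//   std::vector<PaddleTensor> outputs;
//   if (predictor->Run(inputs, &outputs)) {
//     // Read outputs[i].data, outputs[i].shape and outputs[i].dtype.
//   }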

// NOTE The following APIs are too trivial; we will discard them in future
// versions.
enum class PaddleEngineKind {
  kNative = 0,         // Use the native Fluid facility.
  kAutoMixedTensorRT,  // Automatically mix Fluid with TensorRT.
  kAnalysis,           // More optimization.
  kAnakin              // Use Anakin for inference, not mature yet.
};

template <typename ConfigT, PaddleEngineKind engine>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);

// ==
//
// -----------------------------------------------------------------------------------
// NOTE: The following APIs are not mature yet, we are still working on them.

namespace contrib {

// Accelerate GPU computation with TensorRT engine.
struct MixedRTConfig : public NativeConfig {
  // Determine whether a subgraph will be executed by TRT.
  int min_subgraph_size{1};
  // While TensorRT allows an engine optimized for a given max batch size
  // to run at any smaller size, the performance for those smaller
  // sizes may not be as well-optimized. Therefore, max_batch_size is best
  // set equal to the runtime batch size.
  int max_batch_size{1};
  // For workspace_size, refer to:
  // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting
  int workspace_size{1 << 30};
  //  We transform the Ops that can be converted into TRT layers in the model,
  //  and aggregate these Ops into subgraphs for TRT execution.
  //  We set this variable to control the minimum number of nodes in the
  //  subgraph, 3 by default.
  int minimum_subgraph_size = 3;
  // Reserved configuration.
  // Only "FP32" is supported now; "FP16" and "INT8" will be supported later.
  std::string precision_mode = "FP32";
};
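// A hedged configuration sketch (values are illustrative; the pairing with
// PaddleEngineKind::kAutoMixedTensorRT through the two-parameter
// CreatePaddlePredictor above is an assumption of this sketch):
//
//   paddle::contrib::MixedRTConfig trt_config;
//   trt_config.model_dir = "./my_model";  // hypothetical path.
//   trt_config.use_gpu = true;
//   trt_config.fraction_of_gpu_memory = 0.2f;
//   trt_config.max_batch_size = 1;        // match the runtime batch size.
//   trt_config.workspace_size = 1 << 28;
//   auto predictor = paddle::CreatePaddlePredictor<
//       paddle::contrib::MixedRTConfig,
//       paddle::PaddleEngineKind::kAutoMixedTensorRT>(trt_config);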

// NOTE WIP, not stable yet.
struct AnalysisConfig : public NativeConfig {
  enum class IrPassMode {
    kSystem,   // Use the system default passes, without customization.
    kInclude,  // Specify the passes in `ir_passes`.
    kExclude   // Specify the disabled passes in `ir_passes`.
  };

  void SetIncludeMode() {
    ir_mode = IrPassMode::kInclude;
    // This pass has to be run at the beginning of all fuse passes.
    ir_passes = {"infer_clean_graph_pass"};
  }

  // Determine whether to perform graph optimization.
  bool enable_ir_optim = true;
  // Manually determine the IR passes to run.
  IrPassMode ir_mode{IrPassMode::kExclude};
  // Passes to be excluded/included.
  std::vector<std::string> ir_passes{"embedding_fc_lstm_fuse_pass"};

  // NOT stable yet.
  bool use_feed_fetch_ops{true};

  // NOTE this is just for internal development; please do not use it.
  // NOT stable yet.
  bool _use_mkldnn{false};
};
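// A hedged setup sketch (field values are illustrative, and it assumes a
// CreatePaddlePredictor specialization exists for AnalysisConfig):
//
//   paddle::contrib::AnalysisConfig cfg;
//   cfg.model_dir = "./my_model";   // hypothetical path.
//   cfg.enable_ir_optim = true;     // run the IR optimization passes.
//   cfg.SetIncludeMode();           // or keep the default kExclude mode.
//   cfg.use_feed_fetch_ops = false; // commonly disabled for the zero-copy path.
//   auto predictor = paddle::CreatePaddlePredictor(cfg);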

// Configurations for Anakin engine.
struct AnakinConfig : public PaddlePredictor::Config {
  enum TargetType { NVGPU = 0, X86 };
  int device;
  std::string model_file;
  int max_batch_size{-1};
  TargetType target_type;
};

}  // namespace contrib

int PaddleDtypeSize(PaddleDType dtype);

}  // namespace paddle