/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

/*
 * This file contains the definition of a simple Inference API for Paddle.
 *
 * ATTENTION: It requires some C++11 features; for older C++ standards or C,
 * we might release another API.
 */

#pragma once

#include <cassert>
#include <cstddef>
#include <memory>
#include <string>
#include <vector>

namespace paddle {

// Data type tag, carried by PaddleTensor::dtype.
// The enumerators take the implicit values FLOAT32 == 0 and INT64 == 1.
enum PaddleDType {
  FLOAT32,
  INT64,
  // TODO(Superjomn) support more data types if needed.
};

/*
 * Memory management for PaddleTensor.
 * The PaddleBuf holds a buffer for data input or output. The memory can be
 * allocated by the user or by the PaddleBuf itself, but in any case the
 * PaddleBuf should be reused for better performance.
 *
 * For user-allocated memory, the following API can be used:
 * - PaddleBuf(void* data, size_t length) to set an external memory by
 *   specifying the memory address and length.
 * - Reset(void* data, size_t length) to reset the PaddleBuf with an external
 *   memory.
 * ATTENTION: for user-allocated memory, deallocation should be done by the
 * user externally after the program finished. The PaddleBuf won't do any
 * allocation or deallocation.
 *
 * To have the PaddleBuf allocate and manage the memory:
 * - PaddleBuf(size_t length) will allocate a memory of size `length`.
 * - Resize(size_t length) resizes the memory to no less than `length`.
 *   ATTENTION: if the allocated memory is already larger than `length`,
 *   nothing will be done.
 */
class PaddleBuf {
 public:
  // Allocate `length` bytes internally; the PaddleBuf manages this memory.
  explicit PaddleBuf(size_t length)
      : data_(new char[length]), length_(length), memory_owned_(true) {}
  // Wrap external memory; the PaddleBuf won't manage it.
  PaddleBuf(void* data, size_t length)
      : data_(data), length_(length), memory_owned_(false) {}
  // Copy only available when memory is managed externally.
  explicit PaddleBuf(const PaddleBuf&);

  // Resize the memory.
  void Resize(size_t length);
  // Reset to external memory, with address and length set.
  void Reset(void* data, size_t length);
  // Tell whether the buffer is empty (its length is zero).
  bool empty() const { return length_ == 0; }
  // Get the memory address.
  void* data() const { return data_; }
  // Get the memory length, in bytes.
  size_t length() const { return length_; }

  // Releases the buffer through Free().
  ~PaddleBuf() { Free(); }
  PaddleBuf& operator=(const PaddleBuf&);
  // NOTE(review): the move operations are not declared noexcept; their
  // out-of-line definitions are not visible from this header, so the
  // signatures are left untouched.
  PaddleBuf& operator=(PaddleBuf&&);
  // Default state: null data pointer and zero length.
  PaddleBuf() = default;
  PaddleBuf(PaddleBuf&& other);

 private:
  // Defined out of line; invoked by the destructor.
  void Free();
  void* data_{nullptr};  // pointer to the data memory.
  size_t length_{0};     // number of memory bytes.
  bool memory_owned_{true};  // false when wrapping user-provided memory.
};

Y
Yan Chunwei 已提交
94
// Basic input and output data structure for PaddlePredictor.
95
struct PaddleTensor {
96
  PaddleTensor() = default;
97 98
  std::string name;  // variable name.
  std::vector<int> shape;
X
Xin Pan 已提交
99 100
  PaddleBuf data;  // blob of data.
  PaddleDType dtype;
T
Tao Luo 已提交
101
  std::vector<std::vector<size_t>> lod;  // Tensor+LoD equals LoDTensor
102 103 104
};

/*
Y
Yan Chunwei 已提交
105
 * A simple Inference API for Paddle.
Y
Yan Chunwei 已提交
106
 */
107
class PaddlePredictor {
W
Wu Yi 已提交
108
 public:
109 110 111
  struct Config;
  PaddlePredictor() = default;
  PaddlePredictor(const PaddlePredictor&) = delete;
112
  PaddlePredictor& operator=(const PaddlePredictor&) = delete;
Y
Yan Chunwei 已提交
113 114

  // Predict an record.
X
Xin Pan 已提交
115
  // The caller should be responsible for allocating and releasing the memory of
116 117 118
  // `inputs`. `inputs` should be available until Run returns. Caller should be
  // responsible for the output tensor's buffer, either allocated or passed from
  // outside.
119
  virtual bool Run(const std::vector<PaddleTensor>& inputs,
120 121
                   std::vector<PaddleTensor>* output_data,
                   int batch_size = -1) = 0;
122 123 124 125

  // Clone a predictor that share the model weights, the Cloned predictor should
  // be thread-safe.
  virtual std::unique_ptr<PaddlePredictor> Clone() = 0;
Y
Yan Chunwei 已提交
126 127

  // Destroy the Predictor.
128
  virtual ~PaddlePredictor() = default;
129 130 131

  // The common configs for all the predictors.
  struct Config {
Y
Yan Chunwei 已提交
132
    std::string model_dir;  // path to the model directory.
Y
Yan Chunwei 已提交
133 134 135
  };
};

Y
Yan Chunwei 已提交
136
struct NativeConfig : public PaddlePredictor::Config {
Y
Yan Chunwei 已提交
137
  // GPU related fields.
Y
Yan Chunwei 已提交
138
  bool use_gpu{false};
Y
Yan Chunwei 已提交
139
  int device{0};
Y
Yan Chunwei 已提交
140
  float fraction_of_gpu_memory{-1.f};  // Change to a float in (0,1] if needed.
Y
Yan Chunwei 已提交
141

Y
Yan Chunwei 已提交
142
  // Specify the exact path of program and parameter files.
Y
Yan Chunwei 已提交
143 144
  std::string prog_file;
  std::string param_file;
Y
Yan Chunwei 已提交
145 146 147 148

  // Specify the variable's name of each input if input tensors don't follow the
  // `feeds` and `fetches` of the phase `save_inference_model`.
  bool specify_input_name{false};
Y
Yan Chunwei 已提交
149 150
};

Y
Yan Chunwei 已提交
151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172
// A factory to help create different predictors.
//
// Usage:
//
// NativeConfig config;
// ... // change the configs.
// auto native_predictor = CreatePaddlePredictor(config);
//
// FOR EXTENSION DEVELOPER:
// Different predictors are designated by config type. Similar configs can be
// merged, but there shouldn't be a huge config containing different fields for
// more than one kind of predictors.
template <typename ConfigT>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);

// NOTE The following APIs are too trivial; we will discard them in future
// versions.
enum class PaddleEngineKind {
  kNative = 0,         // Use the native Fluid facility.
  kAutoMixedTensorRT,  // Automatically mix Fluid with TensorRT.
  kAnalysis,           // More optimization.
  kAnakin              // Use Anakin for inference, not mature yet.
};

Y
Yan Chunwei 已提交
175 176 177 178 179 180 181 182 183 184 185 186
template <typename ConfigT, PaddleEngineKind engine>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);

// ==
//
// -----------------------------------------------------------------------------------
// NOTE: The following APIs are not mature yet, we are still working on them.

namespace contrib {

// Accelerate GPU computation with TensorRT engine.
struct MixedRTConfig : public NativeConfig {
  // Determine whether a subgraph will be executed by TRT.
  // NOTE(review): this looks like it overlaps with `minimum_subgraph_size`
  // below (different defaults: 1 vs 3) — confirm which one consumers read.
  int min_subgraph_size{1};
  // While TensorRT allows an engine optimized for a given max batch size
  // to run at any smaller size, the performance for those smaller
  // sizes may not be as well-optimized. Therefore, Max batch is best
  // equivalent to the runtime batch size.
  int max_batch_size{1};
  // For workspace_size, refer it from here:
  // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting
  int workspace_size{1 << 30};
  //  We transform the Ops that can be converted into TRT layer in the model,
  //  and aggregate these Ops into subgraphs for TRT execution.
  //  We set this variable to control the minimum number of nodes in the
  //  subgraph, 3 as default value.
  int minimum_subgraph_size = 3;
  // Reserved configuration
  // We just support "FP32" now, "FP16" and "INT8" will be supported.
  std::string precision_mode = "FP32";
};

207 208 209 210 211 212 213 214 215 216 217 218
// NOTE WIP, not stable yet.
struct AnalysisConfig : public NativeConfig {
  enum class IrPassMode {
    kSystem,   // Use system default passes, not customize.
    kInclude,  // Specify the passes in `ir_passes`.
    kExclude   // Specify the disabled passes in `ir_passes`.
  };

  bool enable_ir_optim = true;
  IrPassMode ir_mode{IrPassMode::kExclude};
  // attention lstm fuse works only on some specific models, disable as default.
  std::vector<std::string> ir_passes{"attention_lstm_fuse_pass"};
Y
Yan Chunwei 已提交
219 220 221

  // NOTE this is just for internal development, please not use it.
  bool _use_mkldnn{false};
222 223
};

Y
Yan Chunwei 已提交
224 225 226 227 228 229 230 231 232 233
// Configurations for Anakin engine.
struct AnakinConfig : public PaddlePredictor::Config {
  enum TargetType { NVGPU = 0, X86 };
  int device;
  std::string model_file;
  int max_batch_size{-1};
  TargetType target_type;
};

}  // namespace contrib
234 235 236

int PaddleDtypeSize(PaddleDType dtype);

}  // namespace paddle