Unverified commit 5de14c6b authored by Yan Chunwei, committed by GitHub

refine inference api (#13518)

Parent 79463ae7
......@@ -71,7 +71,7 @@ bool AnalysisPredictor::Init(
inference_program_ = paddle::inference::Load(
executor_.get(), scope_.get(), config_.prog_file, config_.param_file);
} else {
LOG(ERROR) << "fail to load inference model.";
LOG(ERROR) << "fail to load inference model from " << config_.model_dir;
return false;
}
......@@ -109,7 +109,8 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
}
argument_.origin_program_desc.reset(
new ProgramDesc(*inference_program_->Proto()));
PADDLE_ENFORCE(config_.ir_mode == AnalysisConfig::IrPassMode::kExclude,
PADDLE_ENFORCE(
config_.ir_mode == contrib::AnalysisConfig::IrPassMode::kExclude,
"Only kExclude is supported yet.");
Analyzer().DisableIrPasses(config_.ir_passes).Run(&argument_);
......@@ -126,8 +127,9 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
}
template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig& config) {
std::unique_ptr<PaddlePredictor>
CreatePaddlePredictor<contrib::AnalysisConfig, PaddleEngineKind::kAnalysis>(
const contrib::AnalysisConfig& config) {
VLOG(3) << "create AnalysisConfig";
if (config.use_gpu) {
// 1. GPU memory
......@@ -154,4 +156,11 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
return predictor;
}
template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<contrib::AnalysisConfig>(
const contrib::AnalysisConfig& config) {
return CreatePaddlePredictor<contrib::AnalysisConfig,
PaddleEngineKind::kAnalysis>(config);
}
} // namespace paddle
......@@ -30,7 +30,7 @@ using framework::proto::ProgramDesc;
*/
class AnalysisPredictor : public NativePaddlePredictor {
public:
explicit AnalysisPredictor(const AnalysisConfig& config)
explicit AnalysisPredictor(const contrib::AnalysisConfig& config)
: NativePaddlePredictor(config), config_(config) {}
bool Init(const std::shared_ptr<framework::Scope>& parent_scope);
......@@ -46,7 +46,7 @@ class AnalysisPredictor : public NativePaddlePredictor {
Argument& analysis_argument() { return argument_; }
private:
AnalysisConfig config_;
contrib::AnalysisConfig config_;
Argument argument_;
};
......
......@@ -31,21 +31,24 @@
namespace paddle {
using paddle::contrib::AnakinConfig;
template <typename Target>
PaddleInferenceAnakinPredictor<Target>::PaddleInferenceAnakinPredictor(
const AnakinConfig &config) {
const contrib::AnakinConfig &config) {
CHECK(Init(config));
}
template <>
PaddleInferenceAnakinPredictor<anakin::X86>::PaddleInferenceAnakinPredictor(
const AnakinConfig &config) {
const contrib::AnakinConfig &config) {
omp_set_dynamic(0);
omp_set_num_threads(1);
mkl_set_num_threads(1);
CHECK(Init(config));
}
template <typename Target>
bool PaddleInferenceAnakinPredictor<Target>::Init(const AnakinConfig &config) {
bool PaddleInferenceAnakinPredictor<Target>::Init(
const contrib::AnakinConfig &config) {
if (!(graph_.load(config.model_file))) {
VLOG(3) << "fail to load graph from " << config.model_file;
return false;
......@@ -200,10 +203,11 @@ template class PaddleInferenceAnakinPredictor<anakin::X86>;
// A factory to help create different predictors.
template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
AnakinConfig, PaddleEngineKind::kAnakin>(const AnakinConfig &config) {
std::unique_ptr<PaddlePredictor>
CreatePaddlePredictor<contrib::AnakinConfig, PaddleEngineKind::kAnakin>(
const contrib::AnakinConfig &config) {
VLOG(3) << "Anakin Predictor create.";
if (config.target_type == AnakinConfig::NVGPU) {
if (config.target_type == contrib::AnakinConfig::NVGPU) {
#ifdef PADDLE_WITH_CUDA
VLOG(3) << "Anakin Predictor create on [ NVIDIA GPU ].";
std::unique_ptr<PaddlePredictor> x(
......@@ -213,7 +217,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
LOG(ERROR) << "AnakinConfig::NVGPU could not used in ONLY-CPU environment";
return nullptr;
#endif
} else if (config.target_type == AnakinConfig::X86) {
} else if (config.target_type == contrib::AnakinConfig::X86) {
VLOG(3) << "Anakin Predictor create on [ Intel X86 ].";
std::unique_ptr<PaddlePredictor> x(
new PaddleInferenceAnakinPredictor<anakin::X86>(config));
......
......@@ -29,6 +29,8 @@ limitations under the License. */
namespace paddle {
using contrib::AnakinConfig;
template <typename Target>
class PaddleInferenceAnakinPredictor : public PaddlePredictor {
public:
......
......@@ -101,14 +101,11 @@ bool NativePaddlePredictor::Init(
inference_program_ = paddle::inference::Load(
executor_.get(), scope_.get(), config_.prog_file, config_.param_file);
} else {
LOG(ERROR) << "fail to load inference model.";
LOG(ERROR) << "fail to load inference model from " << config_.model_dir;
return false;
}
ctx_ = executor_->Prepare(*inference_program_, 0);
if (config_._use_mkldnn) {
executor_->EnableMKLDNN(*inference_program_);
}
executor_->CreateVariables(*inference_program_,
sub_scope_ ? sub_scope_ : scope_.get(), 0);
......@@ -330,4 +327,10 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
#endif
}
template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<NativeConfig>(
const NativeConfig &config) {
return CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
}
} // namespace paddle
......@@ -25,10 +25,11 @@ using inference::analysis::Argument;
using inference::Singleton;
using inference::analysis::Analyzer;
using framework::proto::ProgramDesc;
using paddle::contrib::MixedRTConfig;
class TensorRTSubgraphPredictor : public NativePaddlePredictor {
public:
explicit TensorRTSubgraphPredictor(const TensorRTConfig& config)
explicit TensorRTSubgraphPredictor(const MixedRTConfig& config)
: NativePaddlePredictor(config), config_(config) {}
bool Init(const std::shared_ptr<framework::Scope>& parent_scope) {
......@@ -115,13 +116,13 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
}
private:
TensorRTConfig config_;
MixedRTConfig config_;
};
template <>
std::unique_ptr<PaddlePredictor>
CreatePaddlePredictor<TensorRTConfig, PaddleEngineKind::kAutoMixedTensorRT>(
const TensorRTConfig& config) {
CreatePaddlePredictor<MixedRTConfig, PaddleEngineKind::kAutoMixedTensorRT>(
const MixedRTConfig& config) {
VLOG(3) << "create TensorRTSubgraphPredictor";
if (config.use_gpu) {
// 1. GPU memory
......@@ -150,6 +151,13 @@ CreatePaddlePredictor<TensorRTConfig, PaddleEngineKind::kAutoMixedTensorRT>(
return std::move(predictor);
}
template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<MixedRTConfig>(
const MixedRTConfig& config) {
return CreatePaddlePredictor<MixedRTConfig,
PaddleEngineKind::kAutoMixedTensorRT>(config);
}
} // namespace paddle
USE_TRT_CONVERTER(elementwise_add_weight);
......
......@@ -20,6 +20,8 @@
namespace paddle {
using contrib::MixedRTConfig;
DEFINE_string(dirname, "", "Directory of the inference model.");
void CompareTensorRTWithFluid(bool enable_tensorrt) {
......@@ -32,7 +34,7 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) {
config0.fraction_of_gpu_memory = 0.3;
config0.device = 0;
TensorRTConfig config1;
MixedRTConfig config1;
config1.model_dir = FLAGS_dirname + "word2vec.inference.model";
config1.use_gpu = true;
config1.fraction_of_gpu_memory = 0.3;
......@@ -42,7 +44,7 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) {
auto predictor0 =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config0);
auto predictor1 =
CreatePaddlePredictor<TensorRTConfig,
CreatePaddlePredictor<MixedRTConfig,
PaddleEngineKind::kAutoMixedTensorRT>(config1);
for (int batch_id = 0; batch_id < 1; batch_id++) {
......
......@@ -28,34 +28,61 @@ limitations under the License. */
namespace paddle {
// Data type.
enum PaddleDType {
FLOAT32,
INT64,
// TODO(Superjomn) support more data types if needed.
};
/*
 * Memory management for PaddleTensor.
 * The PaddleBuf holds a buffer for data input or output. The memory can be
 * allocated by the user or by PaddleBuf itself, but in either case the
 * PaddleBuf should be reused for better performance.
 *
 * For user-allocated memory, the following APIs can be used:
 * - PaddleBuf(void* data, size_t length) to set an external memory by
 *   specifying the memory address and length.
 * - Reset(void* data, size_t length) to reset the PaddleBuf with an external
 *   memory.
 * ATTENTION: for user-allocated memory, deallocation should be done by the
 * user externally after the program finishes. The PaddleBuf won't do any
 * allocation or deallocation.
 *
 * To have the PaddleBuf allocate and manage the memory:
 * - PaddleBuf(size_t length) will allocate a memory of size `length`.
 * - Resize(size_t length) resizes the memory to no less than `length`.
 *   ATTENTION: if the allocated memory is larger than `length`, nothing will
 *   be done.
 */
class PaddleBuf {
public:
PaddleBuf() = default;
PaddleBuf(PaddleBuf&& other);
// Copy only available when memory is managed externally.
explicit PaddleBuf(const PaddleBuf&);
PaddleBuf& operator=(const PaddleBuf&);
PaddleBuf& operator=(PaddleBuf&&);
// Do not own the memory.
PaddleBuf(void* data, size_t length)
: data_(data), length_(length), memory_owned_{false} {}
// Own memory.
// PaddleBuf allocate memory internally, and manage it.
explicit PaddleBuf(size_t length)
: data_(new char[length]), length_(length), memory_owned_(true) {}
// Resize to `length` bytes.
// Set external memory, the PaddleBuf won't manage it.
PaddleBuf(void* data, size_t length)
: data_(data), length_(length), memory_owned_{false} {}
// Copy only available when memory is managed externally.
explicit PaddleBuf(const PaddleBuf&);
// Resize the memory.
void Resize(size_t length);
// Reset to external memory.
// Reset to external memory, with address and length set.
void Reset(void* data, size_t length);
// Tell whether the buffer is empty.
bool empty() const { return length_ == 0; }
// Get the memory address.
void* data() const { return data_; }
// Get the memory length.
size_t length() const { return length_; }
~PaddleBuf() { Free(); }
PaddleBuf& operator=(const PaddleBuf&);
PaddleBuf& operator=(PaddleBuf&&);
PaddleBuf() = default;
PaddleBuf(PaddleBuf&& other);
private:
void Free();
......@@ -64,6 +91,7 @@ class PaddleBuf {
bool memory_owned_{true};
};
// Basic input and output data structure for PaddlePredictor.
struct PaddleTensor {
PaddleTensor() = default;
std::string name; // variable name.
......@@ -73,19 +101,8 @@ struct PaddleTensor {
std::vector<std::vector<size_t>> lod; // Tensor+LoD equals LoDTensor
};
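// A minimal usage sketch (illustrative only, not part of this patch): the two
// PaddleBuf memory modes described above, fed through a PaddleTensor. The
// tensor name and sizes are hypothetical, and the `shape`, `data` and `dtype`
// members are the ones elided by the fold above.
//
//   // Mode 1: PaddleBuf owns the memory, allocated and freed internally.
//   PaddleBuf owned(1024);  // allocates 1024 bytes
//   owned.Resize(2048);     // grows to no less than 2048 bytes
//
//   // Mode 2: wrap user-allocated memory; PaddleBuf never frees it.
//   std::vector<float> input(1 * 3 * 224 * 224, 1.0f);
//   PaddleBuf external(input.data(), input.size() * sizeof(float));
//
//   PaddleTensor tensor;
//   tensor.name = "image";   // hypothetical input variable name
//   tensor.shape = {1, 3, 224, 224};
//   tensor.data = external;  // copy is fine: the memory is managed externally
//   tensor.dtype = PaddleDType::FLOAT32;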
enum class PaddleEngineKind {
kNative = 0, // Use the native Fluid facility.
kAnakin, // Use Anakin for inference.
kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT.
kAnalysis
// TODO(Superjomn) support the following engines later.
// kTensorRT, // Use TensorRT for inference.
// kAutoMixedAnakin, // Automatically mix Fluid with Anakin.
};
/*
* A simple Inference API for Paddle. Currently this API can be used in
* non-sequence scenarios.
* A simple Inference API for Paddle.
*/
class PaddlePredictor {
public:
......@@ -120,26 +137,53 @@ struct NativeConfig : public PaddlePredictor::Config {
// GPU related fields.
bool use_gpu{false};
int device{0};
float fraction_of_gpu_memory{-1.f}; // Negative to notify initialization.
// NOTE: do NOT use it; it is only for internal tests and will be discarded later.
bool _use_mkldnn{false};
// Specify the variable's name of each input.
bool specify_input_name{false};
float fraction_of_gpu_memory{-1.f}; // Change to a float in (0,1] if needed.
// Specify the exact path of program and parameter files.
std::string prog_file;
std::string param_file;
// Specify the variable name of each input if the input tensors don't follow
// the `feeds` and `fetches` of the `save_inference_model` phase.
bool specify_input_name{false};
};
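// A sketch (not part of this patch) of filling a NativeConfig; the paths are
// placeholders borrowed from the tests later in this diff.
//
//   NativeConfig config;
//   config.model_dir = FLAGS_dirname + "word2vec.inference.model";
//   // Or point at the program and parameter files directly:
//   //   config.prog_file = FLAGS_infer_model + "/__model__";
//   //   config.param_file = FLAGS_infer_model + "/param";
//   config.use_gpu = true;
//   config.fraction_of_gpu_memory = 0.3;  // a float in (0,1]
//   config.device = 0;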
// Configurations for Anakin engine.
struct AnakinConfig : public PaddlePredictor::Config {
enum TargetType { NVGPU = 0, X86 };
int device;
std::string model_file;
int max_batch_size{-1};
TargetType target_type;
// A factory to help create different predictors.
//
// Usage:
//
// NativeConfig config;
// ... // change the configs.
// auto native_predictor = CreatePaddlePredictor(config);
//
// FOR EXTENSION DEVELOPER:
// Different predictors are designated by config type. Similar configs can be
// merged, but there shouldn't be a huge config containing different fields for
// more than one kind of predictor.
template <typename ConfigT>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
// NOTE: The following APIs are too trivial and will be discarded in future
// versions.
enum class PaddleEngineKind {
kNative = 0, // Use the native Fluid facility.
kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT.
kAnalysis, // More optimization.
kAnakin // Use Anakin for inference, not mature yet.
};
struct TensorRTConfig : public NativeConfig {
template <typename ConfigT, PaddleEngineKind engine>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
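// For example (a sketch; the config-type-only overload above forwards to the
// engine that matches the config, as the definitions in this patch show):
//
//   NativeConfig config;
//   // ... fill the config ...
//   auto p0 = CreatePaddlePredictor<NativeConfig,
//                                   PaddleEngineKind::kNative>(config);
//   auto p1 = CreatePaddlePredictor(config);  // equivalent: kNative is
//                                             // deduced from NativeConfig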
// -----------------------------------------------------------------------------------
// NOTE: The following APIs are not mature yet; we are still working on them.
namespace contrib {
// Accelerate GPU computation with TensorRT engine.
struct MixedRTConfig : public NativeConfig {
// Determine whether a subgraph will be executed by TRT.
int min_subgraph_size{1};
// While TensorRT allows an engine optimized for a given max batch size
......@@ -154,7 +198,6 @@ struct TensorRTConfig : public NativeConfig {
// NOTE WIP, not stable yet.
struct AnalysisConfig : public NativeConfig {
//
enum class IrPassMode {
kSystem, // Use the system default passes, without customization.
kInclude, // Specify the passes in `ir_passes`.
......@@ -165,18 +208,21 @@ struct AnalysisConfig : public NativeConfig {
IrPassMode ir_mode{IrPassMode::kExclude};
// attention lstm fuse works only on some specific models; disabled by default.
std::vector<std::string> ir_passes{"attention_lstm_fuse_pass"};
// NOTE: this is just for internal development; please do not use it.
bool _use_mkldnn{false};
};
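// A sketch (illustrative only) of configuring the analysis predictor with the
// fields declared above; the model path is a placeholder.
//
//   contrib::AnalysisConfig cfg;
//   cfg.model_dir = FLAGS_infer_model;
//   cfg.ir_mode = contrib::AnalysisConfig::IrPassMode::kExclude;
//   cfg.ir_passes = {"attention_lstm_fuse_pass"};  // passes to exclude
//   auto predictor = CreatePaddlePredictor(cfg);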
// A factory to help create different predictors.
//
// FOR EXTENSION DEVELOPER:
// Different predictors are designated by config type and engine kind. Similar
// configs can be merged, but there shouldn't be a huge config containing
// different fields for more than one kind of predictor.
//
// Similarly, each engine kind should map to a unique predictor implementation.
template <typename ConfigT, PaddleEngineKind engine = PaddleEngineKind::kNative>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
// Configurations for Anakin engine.
struct AnakinConfig : public PaddlePredictor::Config {
enum TargetType { NVGPU = 0, X86 };
int device;
std::string model_file;
int max_batch_size{-1};
TargetType target_type;
};
} // namespace contrib
int PaddleDtypeSize(PaddleDType dtype);
......
......@@ -22,10 +22,10 @@ DEFINE_string(model, "", "Directory of the inference model(mobile_v2).");
namespace paddle {
AnakinConfig GetConfig() {
AnakinConfig config;
contrib::AnakinConfig GetConfig() {
contrib::AnakinConfig config;
// use AnakinConfig::X86 if you need to run inference on CPU
config.target_type = AnakinConfig::NVGPU;
config.target_type = contrib::AnakinConfig::NVGPU;
config.model_file = FLAGS_model;
config.device = 0;
config.max_batch_size = 1;
......@@ -33,9 +33,10 @@ AnakinConfig GetConfig() {
}
TEST(inference, anakin) {
AnakinConfig config = GetConfig();
auto config = GetConfig();
auto predictor =
CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>(config);
CreatePaddlePredictor<contrib::AnakinConfig, PaddleEngineKind::kAnakin>(
config);
float data[1 * 3 * 224 * 224] = {1.0f};
PaddleTensor tensor;
......
......@@ -97,10 +97,10 @@ void Data::get_batch_data(
namespace paddle {
AnakinConfig GetConfig() {
AnakinConfig config;
contrib::AnakinConfig GetConfig() {
contrib::AnakinConfig config;
// use AnakinConfig::X86 if you need to run inference on CPU
config.target_type = AnakinConfig::X86;
config.target_type = contrib::AnakinConfig::X86;
config.model_file = FLAGS_model;
config.device = 0;
config.max_batch_size = 1000; // the max number of tokens
......@@ -121,9 +121,10 @@ void set_tensor(std::string name, std::vector<int> shape,
}
void single_test() {
AnakinConfig config = GetConfig();
auto config = GetConfig();
auto predictor =
CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>(config);
CreatePaddlePredictor<contrib::AnakinConfig, PaddleEngineKind::kAnakin>(
config);
int max_batch_size = 1000;
std::string feature_file = FLAGS_datapath;
......
......@@ -95,7 +95,7 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
}
}
void SetConfig(AnalysisConfig *cfg) {
void SetConfig(contrib::AnalysisConfig *cfg) {
cfg->prog_file = FLAGS_infer_model + "/__model__";
cfg->param_file = FLAGS_infer_model + "/param";
cfg->use_gpu = false;
......@@ -117,7 +117,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
// Easy for profiling independently.
TEST(Analyzer_Chinese_ner, profile) {
AnalysisConfig cfg;
contrib::AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<PaddleTensor> outputs;
......@@ -141,7 +141,7 @@ TEST(Analyzer_Chinese_ner, profile) {
// Check the fuse status
TEST(Analyzer_Chinese_ner, fuse_statis) {
AnalysisConfig cfg;
contrib::AnalysisConfig cfg;
SetConfig(&cfg);
int num_ops;
......@@ -155,7 +155,7 @@ TEST(Analyzer_Chinese_ner, fuse_statis) {
// Compare result of NativeConfig and AnalysisConfig
TEST(Analyzer_Chinese_ner, compare) {
AnalysisConfig cfg;
contrib::AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all;
......
......@@ -149,7 +149,7 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
}
}
void SetConfig(AnalysisConfig *cfg) {
void SetConfig(contrib::AnalysisConfig *cfg) {
cfg->prog_file = FLAGS_infer_model + "/__model__";
cfg->param_file = FLAGS_infer_model + "/param";
cfg->use_gpu = false;
......@@ -172,7 +172,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
// Easy for profiling independently.
TEST(Analyzer_rnn1, profile) {
AnalysisConfig cfg;
contrib::AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<PaddleTensor> outputs;
......@@ -183,7 +183,7 @@ TEST(Analyzer_rnn1, profile) {
// Check the fuse status
TEST(Analyzer_rnn1, fuse_statis) {
AnalysisConfig cfg;
contrib::AnalysisConfig cfg;
SetConfig(&cfg);
int num_ops;
......@@ -198,7 +198,7 @@ TEST(Analyzer_rnn1, fuse_statis) {
// Compare result of NativeConfig and AnalysisConfig
TEST(Analyzer_rnn1, compare) {
AnalysisConfig cfg;
contrib::AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all;
......@@ -208,7 +208,7 @@ TEST(Analyzer_rnn1, compare) {
// Test Multi-Thread.
TEST(Analyzer_rnn1, multi_thread) {
AnalysisConfig cfg;
contrib::AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<PaddleTensor> outputs;
......
......@@ -38,6 +38,8 @@ DEFINE_bool(use_analysis, true,
namespace paddle {
namespace inference {
using contrib::AnalysisConfig;
void CompareResult(const std::vector<PaddleTensor> &outputs,
const std::vector<PaddleTensor> &ref_outputs) {
EXPECT_GT(outputs.size(), 0UL);
......@@ -77,8 +79,8 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
std::unique_ptr<PaddlePredictor> CreateTestPredictor(
const AnalysisConfig &config, bool use_analysis = true) {
if (use_analysis) {
return CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config);
return CreatePaddlePredictor<contrib::AnalysisConfig,
PaddleEngineKind::kAnalysis>(config);
} else {
return CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(
config);
......