Unverified  Commit 0129f4b5 authored by W Wilber, committed by GitHub

Add some inference API comments for AnalysisPredictor (#23242)

* add inference api doc. test=develop
Parent c8f9e66b
@@ -30,6 +30,18 @@
#include <gtest/gtest.h>
#include <gtest/gtest_prod.h>
#endif
///
/// \file analysis_predictor.h
///
/// \brief Compared to NativePredictor, AnalysisPredictor is a high-performance
/// predictor that includes many optimizations
///
/// \author paddle-infer@baidu.com
/// \date 2020-01-01
/// \since 1.7.0
///
namespace paddle {
using inference::analysis::Argument;
@@ -37,95 +49,298 @@ using inference::analysis::Analyzer;
using framework::proto::ProgramDesc;
using framework::NaiveExecutor;
///
/// \class AnalysisPredictor
///
/// \brief The analysis predictor is based on the original native predictor with
/// IR and Analysis support. It will optimize IR and Parameters in the runtime.
///
/// The predictor has the following typical uses:
///
/// Get predictor
/// \code{cpp}
/// auto predictor = CreatePaddlePredictor(config);
/// \endcode
///
/// Get input or output names
/// \code{cpp}
/// auto input_names = predictor->GetInputNames();
/// auto output_names = predictor->GetOutputNames();
/// \endcode
///
/// Get input or output tensors
/// \code{cpp}
/// auto input_t = predictor->GetInputTensor(input_names[0]);
/// auto output_t = predictor->GetOutputTensor(output_names[0]);
/// \endcode
///
/// Run predictor
/// \code{cpp}
/// predictor->ZeroCopyRun();
/// \endcode
///
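///
/// A fuller end-to-end sketch of the zero-copy workflow (illustrative only:
/// the model directory, the input shape, and the float data below are
/// assumptions, not part of this API):
/// \code{cpp}
/// AnalysisConfig config;
/// config.SetModel("./mobilenet");        // assumed model directory
/// config.SwitchUseFeedFetchOps(false);   // required by the zero-copy API
/// auto predictor = CreatePaddlePredictor(config);
///
/// std::vector<float> input(1 * 3 * 318 * 318, 1.0f);
/// auto input_names = predictor->GetInputNames();
/// auto input_t = predictor->GetInputTensor(input_names[0]);
/// input_t->Reshape({1, 3, 318, 318});
/// input_t->copy_from_cpu(input.data());
///
/// predictor->ZeroCopyRun();
///
/// auto output_names = predictor->GetOutputNames();
/// auto output_t = predictor->GetOutputTensor(output_names[0]);
/// auto output_shape = output_t->shape();
/// int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1,
///                               std::multiplies<int>());
/// std::vector<float> out_data(out_num);
/// output_t->copy_to_cpu(out_data.data());
/// \endcode
///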
class AnalysisPredictor : public PaddlePredictor {
public:
///
/// \brief Construct a new Analysis Predictor object
///
/// \param[in] config the AnalysisConfig used to create the predictor
///
explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) {
predictor_id_ = inference::GetUniqueId();
}
///
/// \brief Destroy the Analysis Predictor object
///
~AnalysisPredictor();
///
/// \brief Initialize predictor
///
/// Initializing predictor mainly includes the following tasks:
/// preparing scope, creating executor, preparing program, initializing the
/// variables required by the executor, getting the feed_target_names and
/// fetch_target_names, etc.
///
/// \param[in] parent_scope parent scope
/// \param[in] program program
/// \return Whether the init function executed successfully
///
bool Init(const std::shared_ptr<framework::Scope> &parent_scope,
const std::shared_ptr<framework::ProgramDesc> &program = nullptr);
///
/// \brief Run the prediction engine. Deprecated. Please refer to ZeroCopyRun
///
/// \param[in] inputs input tensors
/// \param[out] output_data output tensors
/// \param[in] batch_size data's batch size
/// \return Whether the function executed successfully
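///
/// A minimal sketch of this deprecated interface (the input name, shape, and
/// float dtype below are assumptions):
/// \code{cpp}
/// PaddleTensor t;
/// t.name = "x";                          // assumed input name
/// t.shape = {1, 3, 224, 224};            // assumed input shape
/// t.dtype = PaddleDType::FLOAT32;
/// t.data.Resize(1 * 3 * 224 * 224 * sizeof(float));
/// // ... fill t.data.data() with the real input here ...
/// std::vector<PaddleTensor> outputs;
/// predictor->Run({t}, &outputs);
/// \endcode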
///
bool Run(const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data,
int batch_size = -1) override;
///
/// \brief Get the input names
///
/// \return input names
///
std::vector<std::string> GetInputNames();
///
/// \brief Get the output names
///
/// \return output names
///
std::vector<std::string> GetOutputNames();
///
/// \brief Get the Input Tensor object
///
/// \param[in] name input name
/// \return input tensor
///
std::unique_ptr<ZeroCopyTensor> GetInputTensor(
const std::string &name) override;
///
/// \brief Get the Output Tensor object
///
/// \param[in] name output name
/// \return output tensor
///
std::unique_ptr<ZeroCopyTensor> GetOutputTensor(
const std::string &name) override;
///
/// \brief Get all input names and their corresponding shapes
///
/// \return the map of input names and shapes
///
std::map<std::string, std::vector<int64_t>> GetInputTensorShape() override;
///
/// \brief Run the prediction engine
///
/// \return Whether the function executed successfully
///
bool ZeroCopyRun() override;
///
/// \brief Create feed fetch variables
///
/// \param[in] scope Scope needed to create variables
///
void CreateFeedFetchVar(framework::Scope *scope);
///
/// \brief Determine the model's inputs and outputs based on the program's
/// feed fetch op
///
void PrepareFeedFetch();
///
/// \brief Set predictor's argument according to config, which mainly includes
/// execution information and graph optimization related pass information
///
void PrepareArgument();
///
/// \brief According to argument information, execute the relevant pass
/// to get the optimized model program
///
void OptimizeInferenceProgram();
///
/// \brief Get the argument used by predictor
///
/// \return the argument obtained by config
///
Argument &analysis_argument() { return argument_; }
///
/// \brief Clone to get a new predictor. Thread safe.
///
/// \return get a new predictor
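///
/// A sketch of multi-threaded use via Clone (the thread count below is
/// arbitrary):
/// \code{cpp}
/// std::vector<std::thread> threads;
/// for (int i = 0; i < 4; ++i) {
///   threads.emplace_back([&predictor] {
///     auto local_predictor = predictor->Clone();
///     // feed inputs and call local_predictor->ZeroCopyRun() here
///   });
/// }
/// for (auto &t : threads) t.join();
/// \endcode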
///
std::unique_ptr<PaddlePredictor> Clone() override;
///
/// \brief Get the scope used by predictor
///
/// \return scope
///
framework::Scope *scope() { return scope_.get(); }
///
/// \brief Get the inference program
///
/// \return the inference program
///
framework::ProgramDesc &program() { return *inference_program_; }
///
/// \brief Get the serialized program
///
/// \return the serialized program
///
std::string GetSerializedProgram() const override;
///
/// \brief Initialize mkldnn quantizer and execute mkldnn quantization pass
///
/// \return Whether the function executed successfully
///
bool MkldnnQuantize();
///
/// \brief Save program to model and save parameters to params
///
/// \param[in] dir path to save the model
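///
/// For example (the directory below is an assumption):
/// \code{cpp}
/// predictor->SaveOptimModel("./optimized_model");
/// \endcode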
///
void SaveOptimModel(const std::string &dir);
protected:
///
/// \brief Prepare the predictor's required programs, including loading model
/// information, optimizing the graph, creating variables for the executor, etc.
///
/// \param[in] program paddle program
/// \return Whether the function executed successfully
///
bool PrepareProgram(const std::shared_ptr<framework::ProgramDesc> &program);
///
/// \brief Prepare scope environment, each predictor has its own scope
///
/// \param[in] parent_scope The scope of the predictor to be cloned, or null
/// \return Whether the function executed successfully
///
bool PrepareScope(const std::shared_ptr<framework::Scope> &parent_scope);
///
/// \brief Create an Executor object
///
/// \return Whether the function executed successfully
///
bool CreateExecutor();
///
/// \brief According to the model's program, the executor creates ops
///
/// \return Whether the function executed successfully
///
bool PrepareExecutor();
///
/// \brief Load model program.
///
/// \return Whether the function executed successfully
///
bool LoadProgramDesc();
///
/// \brief Load model parameters.
///
/// \return Whether the function executed successfully
///
bool LoadParameters();
///
/// \brief Prepare input data, only used in Run()
///
/// \param[in] input_datas input tensors
/// \param[in] scope the scope used by predictor
/// \return Whether the function executed successfully
///
bool SetFeed(const std::vector<PaddleTensor> &input_datas,
framework::Scope *scope);
///
/// \brief Get the output data, only used in Run()
///
/// \param[out] output_data output tensors
/// \param[in] scope the scope used by predictor
/// \return Whether the function executed successfully
///
bool GetFetch(std::vector<PaddleTensor> *output_data,
framework::Scope *scope);
///
/// \brief Get the output data, only used in GetFetch()
///
/// \param[in] fetchs the LoDTensor produced by the fetch op
/// \param[out] output_data output tensor
///
template <typename T>
void GetFetchOne(const framework::LoDTensor &fetchs,
PaddleTensor *output_data);
///
/// \brief PreSet for Mkldnn multi-thread and dynamic shape input.
///
/// Used in AnalysisPredictor::Run(); AnalysisPredictor::ZeroCopyRun() is not
/// supported yet.
///
/// \param[in] inputs input tensors
///
void MkldnnPreSet(const std::vector<PaddleTensor> &inputs);
///
/// \brief PostReset for Mkldnn multi-thread and dynamic shape input.
///
/// Used in AnalysisPredictor::Run(); AnalysisPredictor::ZeroCopyRun() is not
/// supported yet.
///
void MkldnnPostReset();
///
/// \brief Compute compatibility based on model version information and
/// operator version information
///
/// \return Whether the model and operator versions are compatible
///
bool CheckOperatorCompatible();
#if PADDLE_WITH_TENSORRT
///
/// \brief Save the calibration table
///
/// When we use the Paddle-TRT INT8 engine, we need to generate calibration
/// table data first. The calibration table contains the range for each op's
/// input and output. The whole process can be divided into several steps:
/// 1. Build a 32-bit engine, run it on the calibration set, and record a
/// histogram of the distribution of activation values for each tensor.
/// 2. Build a calibration table from the histograms.
/// After step 2, we need to store the calibration table on disk.
///
/// \return Whether the function executed successfully
///
bool SaveTrtCalibToDisk();
#endif
......
@@ -11,6 +11,17 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///
/// \file paddle_mkldnn_quantizer_config.h
///
/// \brief Mkldnn quantizer config.
///
/// \author paddle-infer@baidu.com
/// \date 2020-01-01
/// \since 1.7.0
///
#pragma once
#include <cassert>
@@ -24,75 +35,155 @@
namespace paddle {
///
/// \brief Algorithms for finding scale of quantized Tensors.
///
enum class ScaleAlgo {
NONE,      ///< Do not compute scale
MAX,       ///< Find scale based on the max absolute value
MAX_CH,    ///< Find scale based on the max absolute value per output channel
MAX_CH_T,  ///< Find scale based on the max absolute value per output channel
           ///< of a transposed tensor
KL,        ///< Find scale based on KL Divergence
};
///
/// \class MkldnnQuantizerConfig
///
/// \brief Config for mkldnn quantize.
///
/// The MkldnnQuantizerConfig is used to configure Mkldnn's quantization
/// parameters, including scale algorithm, warmup data, warmup batch size,
/// quantized op list, etc.
///
/// It is not recommended to use this config directly; please refer to
/// AnalysisConfig::mkldnn_quantizer_config().
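///
/// A sketch of typical use through AnalysisConfig (the model directory,
/// warmup batch size, and op list below are assumptions):
/// \code{cpp}
/// AnalysisConfig config;
/// config.SetModel("./model_dir");        // assumed model directory
/// config.EnableMKLDNN();
/// config.EnableMkldnnQuantizer();
/// auto *q_cfg = config.mkldnn_quantizer_config();
/// q_cfg->SetWarmupData(warmup_data);     // shared_ptr<vector<PaddleTensor>>
/// q_cfg->SetWarmupBatchSize(10);
/// q_cfg->SetEnabledOpTypes({"conv2d", "pool2d"});
/// auto predictor = CreatePaddlePredictor(config);
/// \endcode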
///
struct MkldnnQuantizerConfig {
///
/// \brief Construct a new Mkldnn Quantizer Config object
///
MkldnnQuantizerConfig();
///
/// \brief Set the scale algo
///
/// Specify a quantization algorithm for a connection (input/output) of the
/// operator type.
/// \param[in] op_type_name the operator's name.
/// \param[in] conn_name name of the connection (input/output) of the
/// operator.
/// \param[in] algo the algorithm for computing scale.
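///
/// For example (the op name, connection name, and config variable below are
/// illustrative):
/// \code{cpp}
/// q_cfg->SetScaleAlgo("conv2d", "Input", ScaleAlgo::KL);
/// \endcode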
///
void SetScaleAlgo(std::string op_type_name, std::string conn_name,
ScaleAlgo algo) {
rules_[op_type_name][conn_name] = algo;
}
///
/// \brief Get the scale algo
///
/// Get the quantization algorithm for a connection (input/output) of the
/// operator type.
///
/// \param[in] op_type_name the operator's name.
/// \param[in] conn_name name of the connection (input/output) of the
/// operator.
/// \return the scale algo.
///
ScaleAlgo scale_algo(const std::string& op_type_name,
const std::string& conn_name) const;
///
/// \brief Set the warmup data
///
/// Set the batch of data to be used for warm-up iteration.
///
/// \param[in] data batch of data.
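///
/// A minimal sketch of building warmup data (the tensor name, shape, and
/// dtype below are assumptions):
/// \code{cpp}
/// PaddleTensor t;
/// t.name = "x";
/// t.shape = {10, 3, 224, 224};
/// t.dtype = PaddleDType::FLOAT32;
/// t.data.Resize(10 * 3 * 224 * 224 * sizeof(float));
/// // ... fill t.data.data() with representative samples ...
/// auto warmup = std::make_shared<std::vector<PaddleTensor>>(1, t);
/// q_cfg->SetWarmupData(warmup);
/// \endcode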
///
void SetWarmupData(std::shared_ptr<std::vector<PaddleTensor>> data) {
warmup_data_ = data;
}
///
/// \brief Get the warmup data
///
/// Get the batch of data used for warm-up iteration.
///
/// \return the warm up data
///
std::shared_ptr<std::vector<PaddleTensor>> warmup_data() const {
return warmup_data_;
}
///
/// \brief Set the warmup batch size
///
/// Set the batch size for warm-up iteration.
///
/// \param[in] batch_size warm-up batch size
///
void SetWarmupBatchSize(int batch_size) { warmup_bs_ = batch_size; }
///
/// \brief Get the warmup batch size
///
/// Get the batch size for warm-up iteration.
///
/// \return the warm up batch size
int warmup_batch_size() const { return warmup_bs_; }
///
/// \brief Set quantized op list
///
/// In the quantization process, set the op list that supports quantization
///
/// \param[in] op_list List of quantized ops
///
void SetEnabledOpTypes(std::unordered_set<std::string> op_list) {
enabled_op_types_ = op_list;
}
///
/// \brief Get quantized op list
///
/// \return list of quantized ops
///
const std::unordered_set<std::string>& enabled_op_types() const {
return enabled_op_types_;
}
///
/// \brief Set the excluded op ids
///
/// \param[in] op_ids_list excluded op ids
///
void SetExcludedOpIds(std::unordered_set<int> op_ids_list) {
excluded_op_ids_ = op_ids_list;
}
///
/// \brief Get the excluded op ids
///
/// \return excluded op ids
///
const std::unordered_set<int>& excluded_op_ids() const {
return excluded_op_ids_;
}
///
/// \brief Set default scale algorithm
///
/// \param[in] algo Method for calculating scale in quantization process
///
void SetDefaultScaleAlgo(ScaleAlgo algo) { default_scale_algo_ = algo; }
///
/// \brief Get default scale algorithm
///
/// \return Method for calculating scale in quantization
/// process
///
ScaleAlgo default_scale_algo() const { return default_scale_algo_; }
protected:
......