diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index 3f192bc0545a2678e76a4bbfb1b24cad6fcae37f..267817829ec4598808486fd3ea5df241a1466e22 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -30,6 +30,18 @@
 #include <gtest/gtest.h>
 #include <gtest/gtest_prod.h>
 #endif
+
+///
+/// \file analysis_predictor.h
+///
+/// \brief Compared to NativePredictor, AnalysisPredictor is a high-performance
+/// predictor that includes many optimizations.
+///
+/// \author paddle-infer@baidu.com
+/// \date 2020-01-01
+/// \since 1.7.0
+///
+
 namespace paddle {
 
 using inference::analysis::Argument;
@@ -37,95 +49,298 @@ using inference::analysis::Analyzer;
 using framework::proto::ProgramDesc;
 using framework::NaiveExecutor;
 
-/** \brief This predictor is based on the original native predictor with IR and
- * Analysis support.
- *
- * It will optimize IR and Parameters in the runtime.
- *
- * TODO(Superjomn) Replace the Navive predictor?
- */
+///
+/// \class AnalysisPredictor
+///
+/// \brief The analysis predictor is based on the original native predictor
+/// with IR and Analysis support. It will optimize IR and Parameters in the
+/// runtime.
+///
+/// The predictor has the following typical uses:
+///
+/// Get predictor
+/// \code{cpp}
+///   auto predictor = CreatePaddlePredictor(config);
+/// \endcode
+///
+/// Get input or output names
+/// \code{cpp}
+///   auto input_names = predictor->GetInputNames();
+///   auto output_names = predictor->GetOutputNames();
+/// \endcode
+///
+/// Get input or output tensors
+/// \code{cpp}
+///   auto input_t = predictor->GetInputTensor(input_names[0]);
+///   auto output_t = predictor->GetOutputTensor(output_names[0]);
+/// \endcode
+///
+/// Run predictor
+/// \code{cpp}
+///   predictor->ZeroCopyRun();
+/// \endcode
+///
 class AnalysisPredictor : public PaddlePredictor {
  public:
+  ///
+  /// \brief Construct a new Analysis Predictor object
+  ///
+  /// \param[in] config the AnalysisConfig used to build the predictor
+  ///
   explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) {
     predictor_id_ = inference::GetUniqueId();
   }
+  ///
+  /// \brief Destroy the Analysis Predictor object
+  ///
   ~AnalysisPredictor();
+  ///
+  /// \brief Initialize predictor
+  ///
+  /// Initializing the predictor mainly includes the following tasks:
+  /// preparing the scope, creating the executor, preparing the program,
+  /// initializing the variables required by the executor, and getting the
+  /// feed_target_names and fetch_target_names.
+  ///
+  /// \param[in] parent_scope parent scope
+  /// \param[in] program program
+  /// \return Whether the init function executed successfully
+  ///
   bool Init(const std::shared_ptr<framework::Scope> &parent_scope,
             const std::shared_ptr<framework::ProgramDesc> &program = nullptr);
+  ///
+  /// \brief Run the prediction engine. Deprecated, please refer to
+  /// ZeroCopyRun.
+  ///
+  /// \param[in] inputs input tensors
+  /// \param[out] output_data output tensors
+  /// \param[in] batch_size data's batch size
+  /// \return Whether the function executed successfully
+  ///
   bool Run(const std::vector<PaddleTensor> &inputs,
            std::vector<PaddleTensor> *output_data,
           int batch_size = -1) override;
+  ///
+  /// \brief Get the input names
+  ///
+  /// \return input names
+  ///
   std::vector<std::string> GetInputNames();
+  ///
+  /// \brief Get the output names
+  ///
+  /// \return output names
+  ///
   std::vector<std::string> GetOutputNames();
+  ///
+  /// \brief Get the Input Tensor object
+  ///
+  /// \param[in] name input name
+  /// \return input tensor
+  ///
   std::unique_ptr<ZeroCopyTensor> GetInputTensor(
       const std::string &name) override;
+  ///
+  /// \brief Get the Output Tensor object
+  ///
+  /// \param[in] name output name
+  /// \return output tensor
+  ///
   std::unique_ptr<ZeroCopyTensor> GetOutputTensor(
       const std::string &name) override;
-
+  ///
+  /// \brief Get all input names and their corresponding shapes
+  ///
+  /// \return the map of input names and shapes
+  ///
   std::map<std::string, std::vector<int64_t>> GetInputTensorShape() override;
+  ///
+  /// \brief Run the prediction engine
+  ///
+  /// \return Whether the function executed successfully
+  ///
   bool ZeroCopyRun() override;
+  ///
+  /// \brief Create the feed and fetch variables
+  ///
+  /// \param[in] scope Scope needed to create the variables
+  ///
   void CreateFeedFetchVar(framework::Scope *scope);
+  ///
+  /// \brief Determine the model's inputs and outputs based on the program's
+  /// feed and fetch ops
+  ///
   void PrepareFeedFetch();
+  ///
+  /// \brief Set the predictor's argument according to the config; this mainly
+  /// includes execution information and the passes used for graph optimization
+  ///
   void PrepareArgument();
+  ///
+  /// \brief Execute the relevant passes according to the argument to get the
+  /// optimized model program
+  ///
   void OptimizeInferenceProgram();
+  ///
+  /// \brief Get the argument used by predictor
+  ///
+  /// \return the argument obtained from the config
+  ///
   Argument &analysis_argument() { return argument_; }
-
+  ///
+  /// \brief Clone to get a new predictor. Thread safe.
+  ///
+  /// \return a new predictor
+  ///
   std::unique_ptr<PaddlePredictor> Clone() override;
-
+  ///
+  /// \brief Get the scope used by predictor
+  ///
+  /// \return scope
+  ///
   framework::Scope *scope() { return scope_.get(); }
+  ///
+  /// \brief Get the inference program
+  ///
+  /// \return the inference program
+  ///
   framework::ProgramDesc &program() { return *inference_program_; }
 
+  ///
+  /// \brief Get the serialized program
+  ///
+  /// \return the serialized program
+  ///
   std::string GetSerializedProgram() const override;
 
+  ///
+  /// \brief Initialize mkldnn quantizer and execute mkldnn quantization pass
+  ///
+  /// \return Whether the function executed successfully
+  ///
   bool MkldnnQuantize();
 
-  // save program to model
-  // save parameters to params
+  ///
+  /// \brief Save the program to model and the parameters to params
+  ///
+  /// \param[in] dir path to save the model
+  ///
   void SaveOptimModel(const std::string &dir);
 
  protected:
+  ///
+  /// \brief Prepare the program the predictor needs, including loading the
+  /// model information, optimizing the graph, and creating the variables
+  /// required by the executor
+  ///
+  /// \param[in] program paddle program
+  /// \return Whether the function executed successfully
+  ///
   bool PrepareProgram(const std::shared_ptr<framework::ProgramDesc> &program);
+  ///
+  /// \brief Prepare the scope environment; each predictor has its own scope
+  ///
+  /// \param[in] parent_scope The scope of the predictor to be cloned, or null
+  /// \return Whether the function executed successfully
+  ///
   bool PrepareScope(const std::shared_ptr<framework::Scope> &parent_scope);
+  ///
+  /// \brief Create an Executor object
+  ///
+  /// \return Whether the function executed successfully
+  ///
   bool CreateExecutor();
+  ///
+  /// \brief Create the ops in the executor according to the model's program
+  ///
+  /// \return Whether the function executed successfully
+  ///
   bool PrepareExecutor();
+  ///
+  /// \brief Load the model program.
+  ///
+  /// \return Whether the function executed successfully
+  ///
   bool LoadProgramDesc();
+  ///
+  /// \brief Load the model parameters.
+  ///
+  /// \return Whether the function executed successfully
+  ///
   bool LoadParameters();
+  ///
+  /// \brief Prepare input data, only used in Run()
+  ///
+  /// \param[in] input_datas input tensors
+  /// \param[in] scope the scope used by predictor
+  /// \return Whether the function executed successfully
+  ///
   bool SetFeed(const std::vector<PaddleTensor> &input_datas,
                framework::Scope *scope);
+  ///
+  /// \brief Get the output data, only used in Run()
+  ///
+  /// \param[out] output_data output tensors
+  /// \param[in] scope the scope used by predictor
+  /// \return Whether the function executed successfully
+  ///
   bool GetFetch(std::vector<PaddleTensor> *output_data,
                 framework::Scope *scope);
+  ///
+  /// \brief Get the output data, only used in GetFetch()
+  ///
+  /// \param[in] fetchs the LoDTensor produced by the fetch op
+  /// \param[out] output_data output tensor
+  ///
  template <typename T>
   void GetFetchOne(const framework::LoDTensor &fetchs,
                    PaddleTensor *output_data);
-  // PreSet and PostReset for Mkldnn multi-thread and dynamic shape input.
-  // Used in AnalysisPredictor::Run(), do not support
-  // AnalysisPredictor::ZeroRun() now.
+  ///
+  /// \brief PreSet for Mkldnn multi-thread and dynamic shape input.
+  ///
+  /// Used in AnalysisPredictor::Run(); AnalysisPredictor::ZeroCopyRun() is not
+  /// supported yet.
+  ///
+  /// \param[in] inputs input tensors
+  ///
   void MkldnnPreSet(const std::vector<PaddleTensor> &inputs);
+  ///
+  /// \brief PostReset for Mkldnn multi-thread and dynamic shape input.
+  ///
+  /// Used in AnalysisPredictor::Run(); AnalysisPredictor::ZeroCopyRun() is not
+  /// supported yet.
+  ///
   void MkldnnPostReset();
+  ///
+  /// \brief Check operator compatibility based on the version information of
+  /// the model and the operators
+  ///
+  /// \return Whether the operators are compatible
+  ///
   bool CheckOperatorCompatible();
 
 #if PADDLE_WITH_TENSORRT
-  // When we use Paddle-TRT INT8 engine, we need to generate calibration table
-  // data first,
-  // the calibration table contains the range for each op's input and output,
-  // this whole process can be divided into several steps:
-  //
-  // 1. Builds a 32-bit engine, runs it on the calibration set, and records a
-  // histogram for each
-  // tensor of the distribution of activation values.
-  // 2. Builds a calibration table from the histograms.
-  //
-  // After step 2, we need to store the calibration table on disk
+  ///
+  /// \brief Save the TRT calibration table to disk.
+  ///
+  /// When we use the Paddle-TRT INT8 engine, we need to generate the
+  /// calibration table data first. The calibration table contains the range of
+  /// each op's input and output. The whole process can be divided into several
+  /// steps:
+  /// 1. Build a 32-bit engine, run it on the calibration set, and record a
+  /// histogram of the distribution of activation values for each tensor.
+  /// 2. Build a calibration table from the histograms.
+  /// After step 2, we need to store the calibration table on disk.
+  ///
+  /// \return Whether the function executed successfully
+  ///
   bool SaveTrtCalibToDisk();
 #endif
diff --git a/paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h b/paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h
index 2ac09b82138ecaf3663a3783633c9cbf50da16bd..6ddbef78f9d4cbd0ddd282cbb7e82fd4fcb444e4 100644
--- a/paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h
+++ b/paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h
@@ -11,6 +11,17 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+
+///
+/// \file paddle_mkldnn_quantizer_config.h
+///
+/// \brief Mkldnn quantizer config.
+///
+/// \author paddle-infer@baidu.com
+/// \date 2020-01-01
+/// \since 1.7.0
+///
+
 #pragma once
 
 #include <cfloat>
@@ -24,75 +35,155 @@
 
 namespace paddle {
 
-// Algorithms for finding scale of quantized Tensors.
+///
+/// \brief Algorithms for finding scale of quantized Tensors.
+///
 enum class ScaleAlgo {
-  NONE,      // Do not compute scale
-  MAX,       // Find scale based on the max absolute value
-  MAX_CH,    // Find scale based on the max absolute value per output channel
-  MAX_CH_T,  // Find scale based on the max absolute value per output channel
-             // of a transposed tensor
-  KL,        // Find scale based on KL Divergence
+  NONE,      ///< Do not compute scale
+  MAX,       ///< Find scale based on the max absolute value
+  MAX_CH,    ///< Find scale based on the max absolute value per output channel
+  MAX_CH_T,  ///< Find scale based on the max absolute value per output channel
+             ///< of a transposed tensor
+  KL,        ///< Find scale based on KL Divergence
 };
 
+///
+/// \class MkldnnQuantizerConfig
+///
+/// \brief Config for mkldnn quantization.
+///
+/// The MkldnnQuantizerConfig is used to configure Mkldnn's quantization
+/// parameters, including the scale algorithm, warmup data, warmup batch size,
+/// quantized op list, etc.
+///
+/// It is not recommended to use this config directly; please refer to
+/// AnalysisConfig::mkldnn_quantizer_config().
+///
 struct MkldnnQuantizerConfig {
+  ///
+  /// \brief Construct a new Mkldnn Quantizer Config object
+  ///
   MkldnnQuantizerConfig();
 
-  /** Specify a quantization algorithm for a connection (input/output) of the
-   * operator type.
-   * @param op_type_name the operator's name.
-   * @param conn_name name of the connection (input/output) of the operator.
-   * @param algo the algorithm for computing scale.
-   */
+  ///
+  /// \brief Set the scale algo
+  ///
+  /// Specify a quantization algorithm for a connection (input/output) of the
+  /// operator type.
+  ///
+  /// \param[in] op_type_name the operator's name.
+  /// \param[in] conn_name name of the connection (input/output) of the
+  /// operator.
+  /// \param[in] algo the algorithm for computing scale.
+  ///
   void SetScaleAlgo(std::string op_type_name, std::string conn_name,
                     ScaleAlgo algo) {
     rules_[op_type_name][conn_name] = algo;
   }
 
-  /** Get the quantization algorithm for a connection (input/output) of the
-   * operator type.
-   * @param op_type_name the operator's name.
-   * @param conn_name name of the connection (input/output) of the operator.
-   * @return the algorithm for computing scale.
-   */
+  ///
+  /// \brief Get the scale algo
+  ///
+  /// Get the quantization algorithm for a connection (input/output) of the
+  /// operator type.
+  ///
+  /// \param[in] op_type_name the operator's name.
+  /// \param[in] conn_name name of the connection (input/output) of the
+  /// operator.
+  /// \return the scale algo.
+  ///
   ScaleAlgo scale_algo(const std::string& op_type_name,
                        const std::string& conn_name) const;
 
-  /** Set the batch of data to be used for warm-up iteration.
-   * @param data batch of data.
-   */
+  ///
+  /// \brief Set the warmup data
+  ///
+  /// Set the batch of data to be used for the warm-up iteration.
+  ///
+  /// \param[in] data batch of data.
+  ///
   void SetWarmupData(std::shared_ptr<std::vector<PaddleTensor>> data) {
     warmup_data_ = data;
   }
 
-  /** Get the batch of data used for warm-up iteration.
-   * @return batch of data.
-   */
+  ///
+  /// \brief Get the warmup data
+  ///
+  /// Get the batch of data used for the warm-up iteration.
+  ///
+  /// \return the warm-up data
+  ///
   std::shared_ptr<std::vector<PaddleTensor>> warmup_data() const {
     return warmup_data_;
   }
 
+  ///
+  /// \brief Set the warmup batch size
+  ///
+  /// Set the batch size for the warm-up iteration.
+  ///
+  /// \param[in] batch_size warm-up batch size
+  ///
   void SetWarmupBatchSize(int batch_size) { warmup_bs_ = batch_size; }
 
+  ///
+  /// \brief Get the warmup batch size
+  ///
+  /// Get the batch size for the warm-up iteration.
+  ///
+  /// \return the warm-up batch size
+  ///
   int warmup_batch_size() const { return warmup_bs_; }
 
+  ///
+  /// \brief Set the quantized op list
+  ///
+  /// Set the list of op types to be quantized during the quantization process.
+  ///
+  /// \param[in] op_list list of quantized ops
+  ///
   void SetEnabledOpTypes(std::unordered_set<std::string> op_list) {
     enabled_op_types_ = op_list;
   }
 
+  ///
+  /// \brief Get the quantized op list
+  ///
+  /// \return list of quantized ops
+  ///
   const std::unordered_set<std::string>& enabled_op_types() const {
     return enabled_op_types_;
   }
 
+  ///
+  /// \brief Set the excluded op ids
+  ///
+  /// \param[in] op_ids_list excluded op ids
+  ///
   void SetExcludedOpIds(std::unordered_set<int> op_ids_list) {
     excluded_op_ids_ = op_ids_list;
   }
 
+  ///
+  /// \brief Get the excluded op ids
+  ///
+  /// \return excluded op ids
+  ///
   const std::unordered_set<int>& excluded_op_ids() const {
     return excluded_op_ids_;
   }
 
+  ///
+  /// \brief Set the default scale algorithm
+  ///
+  /// \param[in] algo the method for calculating scale in the quantization
+  /// process
+  ///
   void SetDefaultScaleAlgo(ScaleAlgo algo) { default_scale_algo_ = algo; }
 
+  ///
+  /// \brief Get the default scale algorithm
+  ///
+  /// \return the method for calculating scale in the quantization process
+  ///
   ScaleAlgo default_scale_algo() const { return default_scale_algo_; }
 
  protected:
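
Below is a minimal, hedged usage sketch tying the two documented headers together: an AnalysisConfig with MKLDNN enabled (and optional MkldnnQuantizerConfig tuning) driving zero-copy inference through AnalysisPredictor. The model path, input shape, and header include path are placeholders, and a real INT8 run would also need warmup data supplied through SetWarmupData(); this is not the authoritative API reference, only an illustration of the calls documented above.

// Usage sketch only; adjust the placeholder paths and shapes to your model.
#include <functional>
#include <numeric>
#include <vector>

#include "paddle_inference_api.h"  // header name/path depends on the install layout

int main() {
  paddle::AnalysisConfig config;
  config.SetModel("./mobilenet");       // placeholder model directory
  config.SwitchUseFeedFetchOps(false);  // required for the ZeroCopy API
  config.EnableMKLDNN();

  // Optional INT8 quantization: tune the MkldnnQuantizerConfig owned by the
  // AnalysisConfig (warmup data should also be provided via SetWarmupData()).
  config.EnableMkldnnQuantizer();
  config.mkldnn_quantizer_config()->SetDefaultScaleAlgo(paddle::ScaleAlgo::KL);
  config.mkldnn_quantizer_config()->SetWarmupBatchSize(1);

  auto predictor = paddle::CreatePaddlePredictor(config);

  // Feed one input through the zero-copy interface.
  auto input_names = predictor->GetInputNames();
  auto input_t = predictor->GetInputTensor(input_names[0]);
  std::vector<int> shape{1, 3, 224, 224};  // placeholder input shape
  std::vector<float> input(1 * 3 * 224 * 224, 0.f);
  input_t->Reshape(shape);
  input_t->copy_from_cpu(input.data());

  predictor->ZeroCopyRun();

  // Fetch the output back to host memory.
  auto output_names = predictor->GetOutputNames();
  auto output_t = predictor->GetOutputTensor(output_names[0]);
  auto out_shape = output_t->shape();
  int out_num = std::accumulate(out_shape.begin(), out_shape.end(), 1,
                                std::multiplies<int>());
  std::vector<float> out_data(out_num);
  output_t->copy_to_cpu(out_data.data());
  return 0;
}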