From 5ef1ac75e6a022449e21eac306d0816524933d7c Mon Sep 17 00:00:00 2001
From: Megvii Engine Team
Date: Thu, 26 May 2022 17:09:18 +0800
Subject: [PATCH] docs(api/lite): add lite network api doc

GitOrigin-RevId: 5d416cc5af9595240dbf71bdc819d989ec2d5dbc
---
 lite/include/lite/network.h | 367 +++++++++++++++++++++++++-----------
 1 file changed, 256 insertions(+), 111 deletions(-)

diff --git a/lite/include/lite/network.h b/lite/include/lite/network.h
index 7a20c58d4..3f65aea48 100644
--- a/lite/include/lite/network.h
+++ b/lite/include/lite/network.h
@@ -18,56 +18,56 @@ LITE_API inline LiteAlgoSelectStrategy operator|(
 }
 
 /*!
- * \brief the inference options which will be translated to megenine
+ * @brief the inference options which can optimize the network forwarding
+ * performance
  *
- * \param weight_preprocess is the option wich optimize the inferece performance
- * with preprocess the const weights
+ * @param weight_preprocess is the option which optimizes the inference performance
+ * by preprocessing the weights of the network ahead of time
  *
- * \param fuse_preprocess fuse preprocess patten, like astype + pad_channel +
+ * @param fuse_preprocess fuse the preprocess pattern, like astype + pad_channel +
  * dimshuffle
  *
- * \param fake_next_exec whether only to perform non-computing tasks (like
- * memory allocation and queue initialization) for next exec. This would be
+ * @param fake_next_exec whether only to perform non-computing tasks (like
+ * memory allocation and queue initialization) for the next exec. This will be
  * reset to false when the graph is executed.
  *
- * \param var_sanity_check_first_run Disable var sanity check on the first run.
+ * @param var_sanity_check_first_run Disable var sanity check on the first run.
  * Var sanity check is enabled on the first-time execution by default, and can
  * be used to find some potential memory access errors in the operator
- * implementation.
  *
- * \param const_shape This can be used to reduce memory usage since some
- * static inference data structures can be omitted.
+ * @param const_shape used to reduce memory usage and improve performance, since some
+ * static inference data structures can be omitted and some operators can be
+ * computed before forwarding
  *
- * \param force_dynamic_alloc force dynamic memory alloc for all vars
+ * @param force_dynamic_alloc force dynamic memory allocation for all vars
  *
- * \param force_output_dynamic_alloc force dynamic memory alloc for output vars
- * which are used as CallbackCaller input when call compile() function
+ * @param force_output_dynamic_alloc force dynamic memory allocation for output tensors
+ * which are used as the input of the CallbackCaller operator
  *
- * \param no_profiling_on_shape_change do not re-profile to select best impl
+ * @param no_profiling_on_shape_change do not re-profile to select the best implementation
  * algo when input shape changes (use previous algo)
  *
- * \param jit_level Execute supported operators with JIT (support MLIR,
- * NVRTC). Can only be used on Nvidia GPUs, this value indicates JIT level:
- * 1 for basic elemwise opr;
- * 2 for including reduce operator
+ * @param jit_level Execute supported operators with JIT (support MLIR,
+ * NVRTC). Can only be used on NVIDIA GPUs and x86 CPUs; this value indicates the JIT level:
+ * level 1: JIT execution of basic elemwise operators
+ * level 2: JIT execution of elemwise and reduce operators
  *
- * \param record_level flag optimize the inference performace with record the
- * kernel tasks in first run, hereafter the inference all need to execute the
+ * @param record_level flag to optimize the inference performance by recording the
+ * kernel tasks in the first run; afterwards the inference only needs to execute the
  * recorded tasks.
  * level = 0 means the normal inference,
  * level = 1 means use record inference,
  * level = 2 means record inference with free the extra memory
  *
- * \param graph_opt_level optimization level:
+ * @param graph_opt_level network optimization level:
  * 0: disable
  * 1: level-1: inplace arith transformations during graph
  * construction
  * 2: level-2: level-1, plus global optimization before graph
  * compiling
  * 3: also enable JIT
- * <0: corresponding level, with result check for debug
  *
- * \param async_exec_level exec: dispatch on separate threads for different
+ * @param async_exec_level level of dispatch on separate threads for different
  * comp_node.
  * 0: do not perform async dispatch
  * 1: dispatch async if there are more than one comp node with limited queue
@@ -99,14 +99,21 @@ struct LITE_API Options {
     bool enable_nchw64 = false;
 };
 
-/*!
- * \brief Configuration when load and compile the graph
+/**
+ * @brief Configuration when loading and compiling a network
+ *
+ * @param has_compression flag indicating whether the model is compressed; the
+ * compression method is stored in the model
+ *
+ * @param device_id configure the device id of a network
+ * @param device_type configure the device type of a network
+ * @param backend configure the inference backend of a network, currently only
+ * megengine is supported
  *
- * \param bare_model_cryption_name is the bare model cryption method name, bare
- *model is not pack json info inside
+ * @param bare_model_cryption_name is the bare model encryption method name; a bare
+ * model does not pack JSON information inside
  *
- *\param has_compression flag whether the model is compressed, the compress
- *method will read form the model
+ * @param options the Options configuration of the network
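+ *
+ * A minimal usage sketch (illustrative only; the model path and option values are
+ * placeholders, and the fields are assumed to follow the parameter names documented
+ * above):
+ *
+ * \verbatim embed:rst:leading-asterisk
+ *
+ *  .. code-block:: cpp
+ *
+ *     lite::Config config;
+ *     config.device_type = LiteDeviceType::LITE_CPU;
+ *     config.options.weight_preprocess = true;   // preprocess weights ahead of time
+ *     config.options.graph_opt_level = 2;        // level-2 graph optimization
+ *
+ *     lite::Network network(config);
+ *     network.load_model("./model.mge");         // placeholder model path
+ *
+ * \endverbatim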
  */
 struct LITE_API Config {
     bool has_compression = false;
@@ -118,9 +125,9 @@ struct LITE_API Config {
 };
 
 /*!
- * \brief Extra Configuration for a network
+ * @brief Extra Configuration for a network
  *
- * \param disable_configure_by_model_info disable the configuration dumped with model,
+ * @param disable_configure_by_model_info disable the configuration dumped with the model;
  * if set true, all configuration in the model will not apply, users should configure
  * the network.
  */
@@ -128,90 +135,136 @@ struct LITE_API ExtraConfig {
     bool disable_configure_by_model_info = false;
 };
 
-/*!
- * \brief config the network input and output item
+
+/**
+ * @brief config the network input and output items; the input and output tensor
+ * information is described here
+ *
+ * @param name the input/output tensor name
+ *
+ * @param is_host used to mark where the input tensor comes from and where the output
+ * tensor will be copied to: if is_host is true, the input comes from the host and the
+ * output is copied to the host, otherwise they are on the device. Sometimes the input
+ * comes from the device and the output does not need to be copied to the host; default is true.
+ *
+ * @param io_type The IO type, it can be SHAPE or VALUE; when SHAPE is set, the input or
+ * output tensor value is invalid, only the shape will be set, default is VALUE
+ *
+ * @param config_layout The layout of the input or output tensor
  *
+ * \verbatim embed:rst:leading-asterisk
+ *
+ *  .. note::
+ *
+ *     * if another layout is set to the input tensor before forwarding, this layout
+ *       will not work
+ *     * if no layout is set before forwarding, the model will forward with its
+ *       original layout
+ *     * if a layout is set in the output tensor, it will be used to check whether the
+ *       layout computed from the network is correct
+ *
+ * \endverbatim
  */
 struct LITE_API IO {
-    //! the tensor name in the graph corresponding to the IO
     std::string name;
 
-    //! Used to mark where the input tensor comes from and the output where copy
-    //! to, if is_host is true, the input is from host and output copy to host,
-    //! otherwise device. Sometimes The input is from device and output no need
-    //! copy to host, default is true.
     bool is_host = true;
 
-    //! The IO type, it can be SHAPE or VALUE, when SHAPE is set, the input or
-    //! output tensor value is invaid, only shape will be set, default is VALUE
     LiteIOType io_type = LiteIOType::LITE_IO_VALUE;
 
-    //! The layout of the config from user, if other layout is set before
-    //! forward or get after forward by input tensor reset, this layout will by
-    //! pass. if no other layout is set before forward, this layout will work.
-    //! if this layout is no set, the model will forward with its origin layout.
-    //! if in output, it will used to check.
     Layout config_layout = {};
 };
 
-/*!
- * \brief the input and output information when load the network
- * the NetworkIO will remain in the network until the network is destroyed
+/**
+ * @brief the input and output information when loading the network;
+ * the NetworkIO will remain in the network until the network is destroyed.
+ *
+ * @param inputs all the input tensor information that will be configured to the network
+ * @param outputs all the output tensor information that will be configured to the network
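+ *
+ * A minimal configuration sketch (illustrative only; the tensor name "data" is a
+ * placeholder for a real input tensor name in the model):
+ *
+ * \verbatim embed:rst:leading-asterisk
+ *
+ *  .. code-block:: cpp
+ *
+ *     lite::IO data_io;
+ *     data_io.name = "data";
+ *     data_io.is_host = true;
+ *     data_io.io_type = LiteIOType::LITE_IO_VALUE;
+ *
+ *     lite::NetworkIO network_io;
+ *     network_io.inputs.push_back(data_io);
+ *
+ *     lite::Network network(lite::Config{}, network_io);
+ *
+ * \endverbatim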
  */
 struct LITE_API NetworkIO {
     std::vector<IO> inputs = {};
     std::vector<IO> outputs = {};
 };
 
-/*!
- * \brief A user-implemented allocator interface
+/**
+ * @brief A user-implemented allocator interface; the user can register an allocator
+ * to MegEngine, then all the runtime memory will be allocated by this allocator
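+ *
+ * A minimal CPU-only sketch (illustrative only; it relies on posix_memalign from
+ * <cstdlib>, so a real implementation would need platform-specific handling and
+ * device memory support):
+ *
+ * \verbatim embed:rst:leading-asterisk
+ *
+ *  .. code-block:: cpp
+ *
+ *     class CpuAlignedAllocator : public lite::Allocator {
+ *     public:
+ *         // align is expected to be a power of two and a multiple of sizeof(void*)
+ *         void* allocate(LiteDeviceType, int, size_t size, size_t align) override {
+ *             void* ptr = nullptr;
+ *             return posix_memalign(&ptr, align, size) == 0 ? ptr : nullptr;
+ *         }
+ *         void free(LiteDeviceType, int, void* ptr) override { ::free(ptr); }
+ *     };
+ *
+ *     // assuming `network` is a std::shared_ptr<lite::Network>, register the
+ *     // allocator before load_model so runtime memory goes through it
+ *     lite::Runtime::set_memory_allocator(network, std::make_shared<CpuAlignedAllocator>());
+ *
+ * \endverbatim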
  */
 class LITE_API Allocator {
 public:
     virtual ~Allocator() = default;
 
-    //! allocate memory of size in the given device with the given align
+    /** @brief allocate memory of size in the given device with the given align
+     *
+     * @param device_type the device type the memory will be allocated from
+     * @param device_id the device id the memory will be allocated from
+     * @param size the byte size of the memory to be allocated
+     * @param align the alignment required when allocating the memory
+     */
     virtual void* allocate(
             LiteDeviceType device_type, int device_id, size_t size, size_t align) = 0;
 
-    //! free the memory pointed by ptr in the given device
+    /** @brief free the memory pointed by ptr in the given device
+     *
+     * @param device_type the device type the memory was allocated from
+     * @param device_id the device id the memory was allocated from
+     * @param ptr the memory pointer to be freed
+     */
     virtual void free(LiteDeviceType device_type, int device_id, void* ptr) = 0;
 };
 
-/*!
- * \brief the thread affinith callback type
- * \param thread_id thread_id is the a number begin from 0 to (nr_threads - 1),
- * thread_id of (nr_threads - 1) is the main worker thread.
+/**
+ * @brief the thread affinity callback function type
+ *
+ * @param thread_id the id of the current thread; the id is a number beginning from 0 to
+ * (nr_threads - 1), and the thread with id (nr_threads - 1) is the main worker thread.
  */
 using ThreadAffinityCallback = std::function<void(int thread_id)>;
 
+/**
+ * @brief the network async callback function type
+ */
 using AsyncCallback = std::function<void(void)>;
 
-/*!
- * \brief the start/finish callback function
- * \param unordered_map map from the io tensor name to the pair of which is the
- * corresponding IO of user config and the realy input or output tensor.
+/**
+ * @brief the start/finish callback function type
+ *
+ * @param unordered_map map from the io tensor name to the pair of the
+ * user configuration information and the real input or output tensor.
  */
+//@{
 using StartCallback = std::function<void(
         const std::unordered_map<std::string, std::pair<IO, std::shared_ptr<Tensor>>>&)>;
 using FinishCallback = std::function<void(
        const std::unordered_map<std::string, std::pair<IO, std::shared_ptr<Tensor>>>&)>;
+//@}
 
-/*!
- * \brief The network is construct form a model, implement model load, init,
- * forward, and display some model information
+/**
+ * @brief The network is the main class to perform forwarding; it is constructed from a
+ * model, and implements model loading, initialization, forwarding, and information display
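+ *
+ * A minimal load-and-forward sketch (illustrative only; the model path is a
+ * placeholder, and filling the input tensor is elided):
+ *
+ * \verbatim embed:rst:leading-asterisk
+ *
+ *  .. code-block:: cpp
+ *
+ *     auto network = std::make_shared<lite::Network>();
+ *     network->load_model("./model.mge");
+ *
+ *     auto input = network->get_io_tensor(network->get_input_name(0));
+ *     // ... fill `input` with data through the lite::Tensor interface ...
+ *
+ *     network->forward();
+ *     network->wait();
+ *
+ *     auto output = network->get_io_tensor(network->get_output_name(0));
+ *
+ * \endverbatim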
  */
 class LITE_API Network {
 public:
     class NetworkImplBase;
+    friend class NetworkHelper;
 
     ~Network();
 
+    /*! @brief Construct a network with the given configuration and IO information
+     *
+     * @name Constructor
+     *
+     * @param config The configuration used to create the network
+     * @param networkio The NetworkIO describing the input and output
+     * tensors of the network
+     */
+    //@{
     Network(const Config& config = {}, const NetworkIO& networkio = {});
-    Network(const NetworkIO& networkio, const Config& config = {});
+    //@}
 
     //! load the model form memory
     void load_model(void* model_mem, size_t size);
 
@@ -219,32 +272,37 @@ public:
     //! load the model from a model path
     void load_model(std::string model_path);
 
-    //! only compute the output tensor in user configured
+    //! only compute the output tensors configured by the IO information
     void compute_only_configured_output();
 
-    //! get the network input and output tensor, the layout of which is
-    //! sync from mge tensor, when the name of input and output tensor are the
-    //! same, use LiteTensorPhase to separate
+    /** @brief get the network input or output tensor, the layout of which is
+     * synced from the megengine tensor; when the names of an input and an output
+     * tensor are the same, use LiteTensorPhase to separate them
+     *
+     * @param io_name the name of the tensor
+     * @param phase indicates whether the tensor is an input or output tensor, since
+     * the input tensor name may be the same as the output tensor name
+     */
     std::shared_ptr<Tensor> get_io_tensor(
             std::string io_name, LiteTensorPhase phase = LiteTensorPhase::LITE_IO);
 
-    //! get the network input by index
+    //! get the network input tensor by index
    std::shared_ptr<Tensor> get_input_tensor(size_t index);
 
     //! get the network output tensor by index
     std::shared_ptr<Tensor> get_output_tensor(size_t index);
 
-    //! set the network forward in async mode and set the async callback
+    //! set the network forwarding in async mode and set the AsyncCallback callback
     //! function
     Network& set_async_callback(const AsyncCallback& async_callback);
 
-    //! set the start forward callback function, which will be execute before
-    //! forward. this can be used to check network input or dump model inputs
-    //! for debug
+    //! set the start forwarding callback function of type StartCallback, which will be
+    //! executed before forwarding. this can be used to check network inputs or dump
+    //! model inputs for debugging
     Network& set_start_callback(const StartCallback& start_callback);
 
-    //! set the finish forward callback function, which will be execute after
-    //! forward. this can be used to dump model outputs for debug
+    //! set the finish forwarding callback function of type FinishCallback, which will
+    //! be executed after forwarding. this can be used to dump model outputs for debugging
     Network& set_finish_callback(const FinishCallback& finish_callback);
 
     //! forward the network with filled input data and fill the output data
     void forward();
 
     //! waite until forward finish in sync model
     void wait();
 
-    //! get the input tensor name in the order in load return
+    //! get the input tensor name by index
     std::string get_input_name(size_t index) const;
 
-    //! get the output tensor name in the order in load return
+    //! get the output tensor name by index
     std::string get_output_name(size_t index) const;
 
-    //! get all the input tensor name in the order in load return
+    //! get all the input tensor names
     std::vector<std::string> get_all_input_name() const;
 
-    //! get all the output tensor name in the order in load return
+    //! get all the output tensor names
     std::vector<std::string> get_all_output_name() const;
 
-    //! set/get device id, default device id = 0
+    //! set the network forwarding device id, default device id = 0
     Network& set_device_id(int device_id);
+
+    //! get the network forwarding device id
     int get_device_id() const;
 
-    //! set/get stream id, default stream id = 0
+    //! set the network stream id, default stream id = 0
     Network& set_stream_id(int stream_id);
+
+    //! get the network stream id
     int get_stream_id() const;
 
-    //! enable profile the network, a file will be generated
+    //! enable profiling the network, a profiling data file will be generated at the given path
     void enable_profile_performance(std::string profile_file_path);
 
-    //! get model extra info
+    //! get the model extra info, the extra information is packed into the model by the user
     const std::string& get_model_extra_info();
 
-    //! get device type
+    //! get the network device type
     LiteDeviceType get_device_type() const;
 
     //! get static peak memory info showed by Graph visualization
@@ -312,80 +374,163 @@ private:
 };
 
 /*********************** MGE special network function ***************/
+/*!
+ * @brief All the runtime configuration functions are defined in the Runtime class as
+ * static member functions
+ */
 class LITE_API Runtime {
 public:
-    //! When device is CPU, this interface will set the to be loaded model
-    //! run in multi thread mode with the given thread number.
+    /** @brief The multithread number setter and getter interface.
+     * When the device is CPU, this interface will set the network to run
+     * in multithread mode with the given thread number.
+     *
+     * @param dst_network the target network to set/get the thread number
+     * @param nr_threads the number of threads to set to the target network
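+     *
+     * A minimal sketch (illustrative only; the thread number, model path, and the
+     * affinity callback shown together with the callback documented below are
+     * placeholders):
+     *
+     * \verbatim embed:rst:leading-asterisk
+     *
+     *  .. code-block:: cpp
+     *
+     *     auto network = std::make_shared<lite::Network>();
+     *     lite::Runtime::set_cpu_threads_number(network, 4);  // before load_model
+     *     network->load_model("./model.mge");
+     *     lite::Runtime::set_runtime_thread_affinity(network, [](int thread_id) {
+     *         // bind thread_id to a specific core here (platform specific, not shown)
+     *     });
+     *
+     * \endverbatim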
+     */
+    //@{
     static void set_cpu_threads_number(
             std::shared_ptr<Network> dst_network, size_t nr_threads);
     static size_t get_cpu_threads_number(std::shared_ptr<Network> dst_network);
+    //@}
 
-    //! set threads affinity callback;
+    /** @brief set the threads affinity callback
+     *
+     * @param dst_network the target network to set the thread affinity callback
+     * @param thread_affinity_callback the ThreadAffinityCallback callback used to set
+     * the thread affinity
+     */
     static void set_runtime_thread_affinity(
             std::shared_ptr<Network> network,
             const ThreadAffinityCallback& thread_affinity_callback);
 
-    //! Set cpu default mode when device is CPU, in some low computation
-    //! device or single core device, this mode will get good performace
+    /** @brief Set cpu inplace mode when the device is CPU; on some low-computation
+     * or single-core devices, this mode will get good performance
+     *
+     * @param dst_network the target network to set/get cpu inplace mode
+     */
+    //@{
     static void set_cpu_inplace_mode(std::shared_ptr<Network> dst_network);
     static bool is_cpu_inplace_mode(std::shared_ptr<Network> dst_network);
+    //@}
 
-    //! Set use tensorrt forward
+    //! Set the network forwarding to use TensorRT
     static void use_tensorrt(std::shared_ptr<Network> dst_network);
 
-    //! set opr algorithm selection strategy in the network
-    //! shared_batch_size: the batch size used by fastrun,
-    //!                    Non-zero value means that fastrun use this batch size
-    //!                    regardless of the batch size of the model. Zero means
-    //!                    fastrun use batch size of the model
-    //! binary_equal_between_batch: if the content of each input batch is binary
-    //!                             equal,whether the content of each output
-    //!                             batch is promised to be equal
+    /** @brief set opr algorithm selection strategy in the target network
+     *
+     * @param dst_network the target network to set the algorithm strategy
+     * @param strategy the algorithm strategy to set to the network; if multiple
+     * strategies should be set, the | operator can pack them together
+     * @param shared_batch_size the batch size used by fast-run. A non-zero value means
+     * that fast-run uses this batch size regardless of the batch size of the model,
+     * while zero means fast-run uses the batch size of the model
+     *
+     * @param binary_equal_between_batch if set to true, when the content of each input
+     * batch is binary equal, the content of each output batch is promised to be
+     * equal as well
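+     *
+     * A minimal sketch (illustrative only; it assumes the LITE_ALGO_PROFILE and
+     * LITE_ALGO_REPRODUCIBLE enumerators of LiteAlgoSelectStrategy, combined through
+     * the operator| overload declared at the top of this header, and a placeholder
+     * model path):
+     *
+     * \verbatim embed:rst:leading-asterisk
+     *
+     *  .. code-block:: cpp
+     *
+     *     auto network = std::make_shared<lite::Network>();
+     *     lite::Runtime::set_network_algo_policy(
+     *             network,
+     *             LiteAlgoSelectStrategy::LITE_ALGO_PROFILE |
+     *                     LiteAlgoSelectStrategy::LITE_ALGO_REPRODUCIBLE);
+     *     network->load_model("./model.mge");
+     *
+     * \endverbatim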
+     */
     static void set_network_algo_policy(
             std::shared_ptr<Network> dst_network, LiteAlgoSelectStrategy strategy,
             uint32_t shared_batch_size = 0, bool binary_equal_between_batch = false);
 
-    //! set workspace_limit for oprs with multiple algorithms, set
-    //! workspace limitation can save memory but may influence the performance
+    /** @brief set the opr workspace limitation in the target network; some operators
+     * may use a large workspace to get good performance, and setting a workspace
+     * limitation can save memory but may influence the performance
+     *
+     * @param dst_network the target network to set/get workspace limitation
+     * @param workspace_limit the byte size of the workspace limitation
+     */
     static void set_network_algo_workspace_limit(
             std::shared_ptr<Network> dst_network, size_t workspace_limit);
 
-    //! set the network memroy allocator, the allocator is defined by user
+    /** @brief set the network runtime memory Allocator; the Allocator is defined by the
+     * user, and through this method the user can implement a memory pool for network
+     * forwarding
+     *
+     * @param dst_network the target network
+     * @param user_allocator the user defined Allocator
+     */
     static void set_memory_allocator(
             std::shared_ptr<Network> dst_network,
             std::shared_ptr<Allocator> user_allocator);
 
-    //! share the runtime memory with other network, the weights is not shared
+    /** @brief share the runtime memory with another network; the weights are not shared
+     *
+     * \verbatim embed:rst:leading-asterisk
+     *
+     *  .. warning::
+     *
+     *     the src network and the dst network can not execute simultaneously
+     *
+     * \endverbatim
+     *
+     * @param dst_network the target network to share the runtime memory from
+     * src_network
+     * @param src_network the source network to share the runtime memory to dst_network
+     */
     static void share_runtime_memory_with(
             std::shared_ptr<Network> dst_network, std::shared_ptr<Network> src_network);
 
-    //! Dump input/output values of all internal variables to output
-    //! file, in txt format
+    /** @brief dump all input/output tensors of all operators to the output file, in txt
+     * format; users can use this function to debug compute errors
+     *
+     * @param dst_network the target network to dump its tensors
+     * @param io_txt_out_file the txt output file
+     */
     static void enable_io_txt_dump(
             std::shared_ptr<Network> dst_network, std::string io_txt_out_file);
 
-    //! Dump input/output values of all internal variables to output
-    //! directory, in binary format
+    /** @brief dump all input/output tensors of all operators to the output directory,
+     * in binary format; users can use this function to debug compute errors
+     *
+     * @param dst_network the target network to dump its tensors
+     * @param io_bin_out_dir the binary file output directory
+     */
     static void enable_io_bin_dump(
             std::shared_ptr<Network> dst_network, std::string io_bin_out_dir);
 
-    //! load a new network which will share weights with src network
+    /** @brief load a new network which will share weights with the src network;
+     * this can reduce memory usage when the user wants to load the same model
+     * multiple times
+     *
+     * @param dst_network the target network to share weights from src_network
+     * @param src_network the source network to share weights to dst_network
+     */
     static void shared_weight_with_network(
             std::shared_ptr<Network> dst_network,
            const std::shared_ptr<Network> src_network);
 
-    //! set global layout transform optimization for network
+    /** @brief set global layout transform optimization for the network; global
+     * layout optimization can automatically determine the layout of every operator
+     * in the network by profiling, thus improving the performance of network
+     * forwarding
+     */
     static void enable_global_layout_transform(std::shared_ptr<Network> network);
 
-    //! dump network after global layout transform optimization
+    /** @brief dump the network after the global layout transform optimization to the
+     * specified path
+     */
     static void dump_layout_transform_model(
             std::shared_ptr<Network> network, std::string optimized_model_path);
 
-    //! get the model io information before model loaded by model path.
+    /** @brief get the model IO information before the model is loaded, by model path
+     *
+     * @param model_path the model path to get the model IO information from
+     * @param config the model configuration
+     *
+     * @return the model NetworkIO information
+     */
     static NetworkIO get_model_io_info(
             const std::string& model_path, const Config& config = {});
 
-    //! get the model io information before model loaded by model memory.
+    /** @brief get the model IO information before the model is loaded, by model memory
+     *
+     * @param model_mem the model memory to get the model IO information from
+     * @param size the model memory size in bytes
+     * @param config the model configuration
+     *
+     * @return the model NetworkIO information
+     */
     static NetworkIO get_model_io_info(
             const void* model_mem, size_t size, const Config& config = {});
 };
-- 
GitLab