Unverified commit 4dbe441c authored by cyberslack_lee, committed by GitHub

[xdoctest] reformat example code with google style in No. 250-260 (#56541)

* test=docs_preview

* test=docs_preview

* test=docs_preview

* test=docs_preview

* test=docs_preview

* test=docs_preview

* fix

* test=docs_preview

* test=docs_preview

* fix

* move stmts under imports

---------
Co-authored-by: SigureMo <sigure.qaq@gmail.com>
Parent 7314cf69
@@ -67,8 +67,8 @@ class PSLib(Fleet):
        should call init_worker() to initialize global information about worker and connect
        worker with pserver. You should run startup program before init_worker.

        Args:
            executor (Executor): The executor to run for init server.
            programs (Program|None): The program that need to run.
        """
        if len(self._main_programs) == 0:
@@ -167,16 +167,24 @@ class PSLib(Fleet):
    def init_server(self, model_dir=None, **kwargs):
        """
        Called by user. It will load model from model_dir.

        Args:
            model_dir (str, optional): Load model path, can be local or hdfs/afs path. Default is None.
            kwargs: User-defined attributes, currently support following:

                - model (int): Load model mode.
                  0 is for load whole model,
                  1 is for load delta model (load diff).
                  Default is 0.

        Examples:

            .. code-block:: text

                fleet.init_server("/you/path/to/model", mode = 0)

        """
        mode = kwargs.get("mode", 0)
        if isinstance(self._role_maker, HeterRoleMaker):
@@ -192,8 +200,7 @@ class PSLib(Fleet):
    def run_server(self):
        """
        Called by user. After a user initializes the server, he/she should call run_server() to start it.
        """
        if self._opt_info:
            if "fleet_desc" in self._opt_info:
@@ -296,8 +303,8 @@ class PSLib(Fleet):
    def stop_worker(self):
        """
        Will be called after a user finishes his/her training task. Fleet instance will be
        destroyed when stop_worker() is called.
        """
        self._role_maker._barrier_worker()
        # all worker should be finalize first
@@ -315,14 +322,20 @@ class PSLib(Fleet):
    def distributed_optimizer(self, optimizer, strategy={}):
        """
        distributed_optimizer

        Args:
            optimizer (Optimizer): Optimizer.
            strategy (dict): Strategy.

        Returns:
            optimizer (DownpourOptimizer): Downpour optimizer.

        Examples:

            .. code-block:: text

                fleet.distributed_optimizer(optimizer)

        """
        self._optimizer = DownpourOptimizer(optimizer, strategy)
        return self._optimizer
@@ -337,29 +350,42 @@ class PSLib(Fleet):
        export_for_deployment=True,
    ):
        """
        Save pserver model called from a worker.

        Args:
            executor (Executor): Fluid executor.
            dirname (str): Save model path.
            feeded_var_names (list, optional): Default None.
            target_vars (list, optional): Default None.
            main_program (Program, optional): Default None.
            export_for_deployment (bool, optional): Default None.

        Examples:

            .. code-block:: text

                fleet.save_inference_model(dirname="hdfs:/my/path")

        """
        self._fleet_ptr.save_model(dirname, 0)

    def print_table_stat(self, table_id, pass_id, threshold):
        """
        Print stat info of table_id, format: tableid, feasign size, mf size.

        Args:
            table_id (int): The id of table.
            pass_id (int): The id of pass.
            threshold (float): The threshold of print.

        Examples:

            .. code-block:: text

                fleet.print_table_stat(0)

        """
        self._role_maker._barrier_worker()
        if self._role_maker.is_first_worker():
@@ -368,13 +394,19 @@ class PSLib(Fleet):
    def set_file_num_one_shard(self, table_id, file_num):
        """
        Set file_num in one shard.

        Args:
            table_id (int): The id of table.
            file_num (int): File num in one shard.

        Examples:

            .. code-block:: text

                fleet.set_file_num_one_shard(0, 5)

        """
        self._role_maker._barrier_worker()
        if self._role_maker.is_first_worker():
@@ -383,20 +415,28 @@ class PSLib(Fleet):
    def save_persistables(self, executor, dirname, main_program=None, **kwargs):
        """
        Save persistable parameters,
        when using fleet, it will save sparse and dense feature.

        Args:
            executor (Executor): Fluid executor.
            dirname (str): Save path. It can be hdfs/afs path or local path.
            main_program (Program, optional): Fluid program, default None.
            kwargs: User-defined properties, currently support following:

                - mode (int):
                  0 means save all pserver model,
                  1 means save delta pserver model (save diff),
                  2 means save xbox base,
                  3 means save batch model.

        Examples:

            .. code-block:: text

                fleet.save_persistables(dirname="/you/path/to/model", mode = 0)

        """
        mode = kwargs.get("mode", 0)
        self._fleet_ptr.client_flush()
@@ -409,23 +449,28 @@ class PSLib(Fleet):
        self, executor, dirname, whitelist_path, main_program=None, **kwargs
    ):
        """
        Save whitelist, mode is consistent with fleet.save_persistables,
        when using fleet, it will save sparse and dense feature.

        Args:
            executor (Executor): Fluid executor.
            dirname (str): Save path. It can be hdfs/afs path or local path.
            whitelist_path (str): Whitelist path. It can be hdfs/afs path or local path.
            main_program (Program, optional): Fluid program, default None.
            kwargs: User-defined properties, currently support following:

                - mode (int):
                  0 means save all pserver model,
                  1 means save delta pserver model (save diff),
                  2 means save xbox base,
                  3 means save batch model.

        Examples:

            .. code-block:: text

                fleet.save_persistables(dirname="/you/path/to/model", mode = 0)

        """
        mode = kwargs.get("mode", 0)
@@ -440,18 +485,23 @@ class PSLib(Fleet):
    def save_multi_table_one_path(self, table_ids, model_dir, **kwargs):
        """
        Save pslib multi sparse table in one path.

        Args:
            table_ids (list): Table ids.
            model_dir (str): If you use hdfs, model_dir should start with 'hdfs:', otherwise means local dir.
            kwargs (dict): User-defined properties.

                - mode (int): The modes illustrated above, default 0.
                - prefix (str): The parts to save can have prefix, for example, part-prefix-000-00000.

        Examples:

            .. code-block:: text

                fleet.save_multi_table_one_path("[0, 1]", "afs:/user/path/")

        """
        mode = kwargs.get("mode", 0)
        self._role_maker._barrier_worker()
@@ -463,21 +513,30 @@ class PSLib(Fleet):
    def save_cache_model(self, executor, dirname, main_program=None, **kwargs):
        """
        Save sparse cache table,
        when using fleet, it will save sparse cache table.

        Args:
            executor (Executor): Fluid executor.
            dirname (str): Save path. It can be hdfs/afs path or local path.
            main_program (Program, optional): Fluid program, default None.
            kwargs: User-defined properties, currently support following:

                - mode (int): Define for feature extension in the future,
                  currently no use, will pass a default value 0.
                - table_id (int): Which table to save cache, default is 0.

        Returns:
            feasign_num (int): Cache feasign num.

        Examples:

            .. code-block:: text

                fleet.save_cache_model(None, dirname="/you/path/to/model", mode = 0)

        """
        mode = kwargs.get("mode", 0)
        table_id = kwargs.get("table_id", 0)
@@ -506,10 +565,15 @@ class PSLib(Fleet):
    def shrink_sparse_table(self):
        """
        Shrink cvm of all sparse embedding in pserver, the decay rate
        is defined as "show_click_decay_rate" in fleet_desc.prototxt.

        Examples:

            .. code-block:: text

                fleet.shrink_sparse_table()

        """
        self._role_maker._barrier_worker()
        if self._role_maker.is_first_worker():
@@ -523,18 +587,22 @@ class PSLib(Fleet):
    def shrink_dense_table(self, decay, emb_dim=11, scope=None, table_id=None):
        """
        Shrink batch_sum in pserver by multiplying by decay.

        Args:
            decay (float): The decay rate, usually range in (0, 1).
            emb_dim (int, optional): One element's length in datanorm layer. Default is 11.
            scope (Scope, optional): Scope object, default is fluid.global_scope(). Default is None.
            table_id (int, optional): Table id of shrinking dense table. None means shrink all,
                you should specify it when using multiple scopes, default is None.

        Examples:

            .. code-block:: text

                fleet.shrink_dense_table(0.98, 11, myscope1, 1)
                fleet.shrink_dense_table(0.98, 11, myscope1, 2)
                fleet.shrink_dense_table(0.98, 11, myscope2, 3)

        """
        if scope is None:
            scope = paddle.static.global_scope()
@@ -559,12 +627,17 @@ class PSLib(Fleet):
    def clear_one_table(self, table_id):
        """
        This function will be called by user. It will clear one table.

        Args:
            table_id (int): Table id.

        Examples:

            .. code-block:: text

                fleet.clear_one_table(0)

        """
        self._role_maker._barrier_worker()
        if self._role_maker.is_first_worker():
@@ -573,10 +646,13 @@ class PSLib(Fleet):
    def clear_model(self):
        """
        This function will be called by user. It will clear sparse model.

        Examples:

            .. code-block:: text

                fleet.clear_model()

        """
        self._role_maker._barrier_worker()
        if self._role_maker.is_first_worker():
@@ -585,40 +661,39 @@ class PSLib(Fleet):
    def load_pslib_whitelist(self, table_id, model_path, **kwargs):
        """
        Load pslib model for one table with whitelist.

        Args:
            table_id (int): Load table id.
            model_path (str): Load model path, can be local or hdfs/afs path.
            kwargs (dict): User defined params, currently support following:

                - only for load pslib model for one table:
                  mode (int): load model mode. 0 is for load whole model, 1 is for load delta model (load diff), default is 0.
                - only for load params from paddle model:
                  scope (Scope): Scope object.
                  model_proto_file (str): Path of program desc proto binary file, can be local or hdfs/afs file.
                  var_names (list): Var name list.
                  load_combine (bool): Load from a file or split param files, default False.

        Examples:

            .. code-block:: text

                # load pslib model for one table
                fleet.load_one_table(0, "hdfs:/my_fleet_model/20190714/0/")
                fleet.load_one_table(1, "hdfs:/xx/xxx", mode = 0)
                # load params from paddle model
                fleet.load_one_table(2, "hdfs:/my_paddle_model/",
                                     scope = my_scope,
                                     model_proto_file = "./my_program.bin",
                                     load_combine = False)
                # below is how to save proto binary file
                with open("my_program.bin", "wb") as fout:
                    my_program = fluid.default_main_program()
                    fout.write(my_program.desc.serialize_to_string())

        """
        self._role_maker._barrier_worker()
@@ -631,35 +706,39 @@ class PSLib(Fleet):
    def load_one_table(self, table_id, model_path, **kwargs):
        """
        Load pslib model for one table or load params from paddle model.

        Args:
            table_id (int): Load table id.
            model_path (str): Load model path, can be local or hdfs/afs path.
            kwargs (dict): User defined params, currently support following:

                - only for load pslib model for one table:
                  mode (int): load model mode. 0 is for load whole model, 1 is for load delta model (load diff), default is 0.
                - only for load params from paddle model:
                  scope (Scope): Scope object.
                  model_proto_file (str): Path of program desc proto binary file, can be local or hdfs/afs file.
                  var_names (list): Var name list.
                  load_combine (bool): Load from a file or split param files, default False.

        Examples:

            .. code-block:: text

                # load pslib model for one table
                fleet.load_one_table(0, "hdfs:/my_fleet_model/20190714/0/")
                fleet.load_one_table(1, "hdfs:/xx/xxx", mode = 0)
                # load params from paddle model
                fleet.load_one_table(2, "hdfs:/my_paddle_model/",
                                     scope = my_scope,
                                     model_proto_file = "./my_program.bin",
                                     load_combine = False)
                # below is how to save proto binary file
                with open("my_program.bin", "wb") as fout:
                    my_program = fluid.default_main_program()
                    fout.write(my_program.desc.serialize_to_string())

        """
        self._role_maker._barrier_worker()
        mode = kwargs.get("mode", 0)
@@ -691,15 +770,16 @@ class PSLib(Fleet):
        load_combine=False,
    ):
        """
        Load params from paddle model, and push params to pserver.

        Args:
            scope (Scope): Scope object.
            table_id (int): The id of table to load.
            model_path (str): Path of paddle model, can be local or hdfs/afs file.
            model_proto_file (str): Path of program desc proto binary file, can be local or hdfs/afs file.
            var_names (list, optional): Load var names. Default is None.
            load_combine (bool, optional): Load from a file or split param files. Default is False.
        """
        self._role_maker._barrier_worker()
        if self._role_maker.is_first_worker():
@@ -800,14 +880,18 @@ class PSLib(Fleet):
           usually for online predict)
        3: load batch model (do some statistic works in checkpoint, such as
           calculate unseen days of each feasign)

        Args:
            model_dir (str, optional): If you use hdfs, model_dir should start with
                'hdfs:', otherwise means local dir. Default is None.
            kwargs (dict): User-defined properties.

                - mode (int): The modes illustrated above, default 0.

        Examples:

            .. code-block:: text

                fleet.load_model("afs:/user/path/")

        """
        mode = kwargs.get("mode", 0)
        self._role_maker._barrier_worker()
@@ -818,14 +902,19 @@ class PSLib(Fleet):
    def save_model(self, model_dir=None, **kwargs):
        """
        Save pslib model, the modes are same with load model.

        Args:
            model_dir (str, optional): If you use hdfs, model_dir should start with
                'hdfs:', otherwise means local dir. Default is None.
            kwargs (dict): User-defined properties.

                - mode (int): The modes illustrated above, default 0.

        Examples:

            .. code-block:: text

                fleet.save_model("afs:/user/path/")

        """
        mode = kwargs.get("mode", 0)
        prefix = kwargs.get("prefix", None)
@@ -836,18 +925,21 @@ class PSLib(Fleet):
    def save_one_table(self, table_id, model_dir, **kwargs):
        """
        Save pslib model's one table, the modes are same with load model.

        Args:
            table_id (int): Table id.
            model_dir (str): If you use hdfs, model_dir should start with
                'hdfs:', otherwise means local dir.
            kwargs (dict): User-defined properties.

                - mode (int): The modes illustrated above, default 0.
                - prefix (str): The parts to save can have prefix, for example, part-prefix-000-00000.

        Examples:

            .. code-block:: text

                fleet.save_one_table("afs:/user/path/")

        """
        mode = kwargs.get("mode", 0)
        prefix = kwargs.get("prefix", None)
@@ -890,15 +982,17 @@ def _prepare_params(
    dtype='float32',
):
    """
    Preprocess params, this interface is not for users.

    Args:
        input (Variable|list of Variable): Input is a Tensor<int64> Variable.
        size (list of int): The embedding dim.
        is_sparse (bool, optional): Whether input is sparse ids. Default is False.
        is_distributed (bool, optional): Whether in distributed mode. Default is False.
        padding_idx (int, optional): Padding idx of input. Default is None.
        param_attr (ParamAttr, optional): To specify the weight parameter property. Default is None.
        dtype (str, optional): Data type of output. Default is 'float32'.
    """
    if param_attr is None:
        raise ValueError("param_attr must be set")
@@ -953,15 +1047,16 @@ def _fleet_embedding(
    dtype='float32',
):
    """
    Add fleet embedding, this interface is not for users.

    Args:
        input (Variable|list of Variable): Input is a Tensor<int64> Variable.
        size (list[int]): The embedding dim.
        is_sparse (bool, optional): Whether input is sparse ids. Default is False.
        is_distributed (bool, optional): Whether in distributed mode. Default is False.
        padding_idx (int, optional): Padding idx of input. Default is None.
        param_attr (ParamAttr, optional): To specify the weight parameter property. Default is None.
        dtype (str, optional): Data type of output. Default is 'float32'.
    """

    def _pull_sparse(
@@ -1041,15 +1136,16 @@ def _fleet_embedding_v2(
    dtype='float32',
):
    """
    Add fleet embedding v2, this interface is not for users.

    Args:
        input (Variable|list of Variable): Input is a Tensor<int64> Variable.
        size (list[int]): The embedding dim.
        is_sparse (bool, optional): Whether input is sparse ids. Default is False.
        is_distributed (bool, optional): Whether in distributed mode. Default is False.
        padding_idx (int, optional): Padding idx of input. Default is None.
        param_attr (ParamAttr, optional): To specify the weight parameter property. Default is None.
        dtype (str, optional): Data type of output. Default is 'float32'.
    """

    def _pull_sparse_v2(
@@ -1120,16 +1216,19 @@ def _fleet_embedding_v2(
class fleet_embedding:
    """
    Fleet embedding class, it is used as a wrapper.

    Examples:

        .. code-block:: text

            with fleet_embedding(click_name=label.name):
                emb = paddle.static.nn.embedding(
                    input=var,
                    size=[-1, 11],
                    is_sparse=True,
                    is_distributed=True,
                    param_attr=fluid.ParamAttr(name="embedding"))
    """

    def __init__(self, click_name, scale_sparse_grad=True):
@@ -1165,9 +1264,11 @@ class DownpourOptimizer(DistributedOptimizer):
    run distributed training. The optimized information will be stored in
    Fleet() instance who holds the global information about current distributed
    training.

    Args:
        optimizer (Optimizer): subclass of Optimizer.
        strategy (any): config for DownpourOptimizer.

    Returns:
        None
    """
@@ -1270,22 +1371,24 @@ class DownpourOptimizer(DistributedOptimizer):
        program_mode="all_reduce",
    ):
        """
        Minimize a program through loss, loss can be a list in DistributedOptimizer.
        Note that in parameter server mode, a worker will not get anything about optimize_ops
        because optimizer algorithms run on pserver side. We will make this usable in pserver
        process, but currently the optimization part is written into Fleet(). A user does not
        need to care about how to startup a pserver node.

        Args:
            losses (Variable|Variable List): Loss variable or loss variable list to run optimization.
            scopes (Scope|Scope List, optional): Scope instance. Default is None.
            startup_programs (Program|Program List, optional): Startup_program for initializing parameters
                in `parameter_list`. Default is None.
            parameter_list (list, optional): List of Variables to update. Default is None.
            no_grad_set (set, optional): Set of Variables should be ignored. Default is None.
            program_mode (str, optional): Grad action for program when use_ps_gpu. Default is "all_reduce".

        Returns:
            tuple: (optimize_ops, params_grads) which are, list of operators appended;
            and list of (param, grad) Variables pair for optimization.
        """
        if not isinstance(losses, list):
...
@@ -22,7 +22,7 @@ from paddle.nn.clip import ClipGradBase, _squared_l2_norm
class ClipGradForMOEByGlobalNorm(ClipGradBase):
    r"""
    The algorithm is the same as paddle.nn.ClipGradByGlobalNorm.
    Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in
    :math:`t\_list` , and limit it to ``clip_norm`` .
@@ -50,7 +50,7 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
    Note:
        ``need_clip`` of ``ClipGradByGlobalNorm`` HAS BEEN DEPRECATED since 2.0.
        Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.

    Reference:
        https://github.com/laekov/fastmoe/blob/master/examples/megatron/clip-grad-v2.2.patch
@@ -64,22 +64,22 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
        group_name (str, optional): The group name for this clip. Default value is ``default_moe_group``.

    Examples:
        .. code-block:: python

            >>> import paddle
            >>> x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
            >>> linear = paddle.nn.Linear(in_features=10, out_features=10,
            ...                           weight_attr=paddle.ParamAttr(need_clip=True),
            ...                           bias_attr=paddle.ParamAttr(need_clip=False))
            >>> out = linear(x)
            >>> loss = paddle.mean(out)
            >>> loss.backward()
            >>> # paddle.nn does not expose this class, so ClipGradByGlobalNorm is used here instead.
            >>> clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
            >>> sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
            >>> sdg.step()
    """

    def __init__(
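For reference, the clipping rule behind both classes is the standard global-norm formula: compute one norm over all gradients, then rescale every gradient by clip_norm / max(global_norm, clip_norm). A minimal NumPy sketch of just that rule follows (the MoE-specific aggregation of expert gradients across the moe_group is deliberately omitted):

    import numpy as np

    def clip_by_global_norm(grads, clip_norm=1.0):
        # global norm over every gradient tensor in the list
        global_norm = np.sqrt(sum(float((g ** 2).sum()) for g in grads))
        # shrink all gradients by the same factor when the norm exceeds clip_norm
        scale = clip_norm / max(global_norm, clip_norm)
        return [g * scale for g in grads]

    grads = [np.random.randn(10, 10), np.random.randn(10)]
    clipped = clip_by_global_norm(grads, clip_norm=1.0)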
@@ -124,7 +124,7 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
            else:
                sum_square_list.append(sum_square)

        # all parameters have been filtered out
        if (
            len(sum_square_list)
            + len(sum_square_list_fp16)
...
@@ -263,63 +263,68 @@ def prepare_forward(gate, num_expert, world_size, moe_group):
class MoELayer(nn.Layer):
    """MoE Layer

    Args:
        d_model (int): Model dimension.
        experts (nn.LayerList): Expert networks list.
        gate (dict|NaiveGate|SwitchGate|NaiveGate):

            - If gate is a dict:
              gate is a gate network config, containing 2 keys:
              `type` (str) value can be: "naive", "gshard", "switch" or None, default is "gshard".
              `top_k` (int) Default value is 2.
            - Else gate is an instance of NaiveGate|SwitchGate|NaiveGate.

        moe_group: moe group for experts communication.
        mp_group: mp group for mp communication.
        recompute_interval (int, optional): Whether to use recompute, default 0, means to disable recompute.
        recompute_ctx (dict, optional): The context for recompute, if recompute_interval > 1, recompute_ctx must be given.

    Examples:
        .. code-block:: python

            >>> # doctest: +SKIP('Until Distributed move successfully, just skip it')
            >>> from paddle.nn import layer, LayerList
            >>> from paddle.distributed.moe import MoELayer
            >>> from paddle.distributed.collective import Group
            >>> from paddle.distributed import fleet

            >>> moe_group = Group(fleet.worker_index(),
            ...                   0,
            ...                   list(range(fleet.worker_num())))
            >>> mp_group = None

            >>> num_experts = 8
            >>> dim_feedforward = 512
            >>> d_model = 8
            >>> top_k = 2

            >>> class ExpertLayer(Layer):
            ...     def __init__(self, d_model, d_hidden, name=None, rank=0, windex=0, num_expert=1):
            ...         super().__init__()
            ...         self.htoh4 = nn.Linear(d_model, d_hidden)
            ...         self.h4toh = nn.Linear(d_hidden, d_model)
            ...     def forward(self, x):
            ...         x = self.htoh4(x)
            ...         x = self.h4toh(x)
            ...         return x

            >>> gate_config = {
            ...     "type": "gshard",
            ...     "top_k": top_k,
            ... }

            >>> experts_list = LayerList()
            >>> for expi in range(num_experts):
            ...     exp_layer = ExpertLayer(d_model, dim_feedforward // top_k, windex=expi, num_expert=num_experts)
            ...     experts_list.append(exp_layer)

            >>> moeLayer = MoELayer(d_model=d_model,
            ...                     experts=experts_list,
            ...                     gate=gate_config,
            ...                     moe_group=moe_group,
            ...                     mp_group=mp_group,
            ...                     recompute_interval=0)
    """
...
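The gate is what decides which `top_k` experts each token is routed to. A small NumPy-only sketch of the top-k routing idea behind a gshard-style gate (illustrative only, not the MoELayer implementation):

    import numpy as np

    tokens, num_experts, top_k = 4, 8, 2
    gate_logits = np.random.randn(tokens, num_experts)   # gate network output
    # keep the top_k experts per token and softmax-normalize their scores
    topk_idx = np.argsort(gate_logits, axis=1)[:, -top_k:]
    topk_val = np.take_along_axis(gate_logits, topk_idx, axis=1)
    topk_weight = np.exp(topk_val) / np.exp(topk_val).sum(axis=1, keepdims=True)
    print(topk_idx.shape, topk_weight.shape)   # (4, 2) (4, 2)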
@@ -54,37 +54,40 @@ def fused_embedding_seq_pool(
        size (tuple|list): The shape of the lookup_table parameter. It should
            have two elements which indicate the size of the dictionary of
            embedding and the size of each embedding vector respectively.
        is_sparse (bool, optional): The flag indicating whether to use sparse update.
            Default: False.
        padding_idx (int|long|None, optional): It will output all-zero padding data whenever
            lookup encounters :math:`padding\_idx` in Ids. If set :attr:`None`, it makes
            no effect to output. If :math:`padding\_idx < 0`, the :math:`padding\_idx`
            will automatically be converted to :math:`size[0] + padding\_idx` to use.
            Default: None.
        combiner (str, optional): The pooling type of sequence_pool, and only support `sum`.
            Default: sum.
        param_attr (ParamAttr, optional): Parameters for this layer. Default: None.
        dtype (np.dtype|core.VarDesc.VarType|str, optional): The dtype refers to the data type of output
            tensor. It can be float32, float_16, int etc. Default: float32.

    Returns:
        The Tensor of sequence pooling.

    Examples:
        .. code-block:: python

            >>> import numpy as np
            >>> import paddle
            >>> paddle.enable_static()
            >>> dict_size = 20
            >>> data_t = paddle.static.data(
            ...     name='word', shape=[-1, 1], dtype='int64', lod_level=1)
            >>> padding_idx = np.random.randint(1, 10)
            >>> out = paddle.incubate.layers.fused_embedding_seq_pool(
            ...     input=data_t,
            ...     size=[dict_size, 32],
            ...     param_attr='w',
            ...     padding_idx=padding_idx,
            ...     is_sparse=False)
    """
    helper = LayerHelper('fused_embedding_seq_pool', **locals())
    w = helper.create_parameter(
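Functionally this is an embedding lookup followed by a sum pool over each sequence, fused into one op. A NumPy sketch of the equivalent computation (shapes are illustrative):

    import numpy as np

    table = np.random.rand(20, 32).astype("float32")   # [dict_size, emb_dim] lookup table
    ids = [np.array([1, 4, 7]), np.array([2, 9])]       # two id sequences of different length
    # look each id up in the table, then sum-pool per sequence
    pooled = np.stack([table[seq].sum(axis=0) for seq in ids])
    print(pooled.shape)   # (2, 32)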
@@ -130,27 +133,25 @@ def fused_seqpool_cvm(
        cvm_offset (int, optional): cvm offset. Default: 2, which means cvm contains show, click.

    Returns:
        Tensor: The tensor storing sequence pool and cvm of input.

    Examples:
        .. code-block:: python

            >>> import paddle
            >>> paddle.enable_static()

            >>> data = paddle.static.data(name='x', shape=[-1, 1], dtype='int64', lod_level=1)
            >>> data2 = paddle.static.data(name='y', shape=[-1, 1], dtype='int64', lod_level=1)
            >>> inputs = [data, data2]
            >>> embs = paddle.incubate.layers.nn._pull_box_sparse(input=inputs, size=11, is_distributed=True, is_sparse=True)
            >>> label = paddle.static.data(name="label", shape=[-1, 1], dtype="int64", lod_level=1)
            >>> ones = paddle.static.data(name="ones", shape=[-1, 1], dtype="int64", lod_level=1)
            >>> show_clk = paddle.cast(paddle.concat([ones, label], axis=1), dtype='float32')
            >>> show_clk.stop_gradient = True
            >>> cvms = paddle.incubate.layers.fused_seqpool_cvm(embs, 'sum', show_clk)
    """
@@ -212,10 +213,10 @@ def multiclass_nms2(
    In the NMS step, this operator greedily selects a subset of detection bounding
    boxes that have high scores larger than score_threshold, if providing this
    threshold, then selects the largest nms_top_k confidences scores if nms_top_k
    is larger than -1. Then this operator prunes away boxes that have high IOU
    (intersection over union) overlap with already selected boxes by adaptive
    threshold NMS based on parameters of nms_threshold and nms_eta.
    After NMS step, at most keep_top_k number of total bboxes are to be kept
    per image if keep_top_k is larger than -1.

    Args:
@@ -228,7 +229,7 @@ def multiclass_nms2(
               [xmin, ymin, xmax, ymax], when box size equals to 4.
            2. (LoDTensor) A 3-D Tensor with shape [M, C, 4]
               M is the number of bounding boxes, C is the
               class number.
        scores (Tensor): Two types of scores are supported:
            1. (Tensor) A 3-D Tensor with shape [N, C, M]
               represents the predicted confidence predictions.
@@ -241,22 +242,22 @@ def multiclass_nms2(
               M is the number of bbox, C is the class number.
               In this case, input BBoxes should be the second
               case with shape [M, C, 4].
        score_threshold (float): Threshold to filter out bounding boxes with
            low confidence score. If not provided,
            consider all boxes.
        nms_top_k (int): Maximum number of detections to be kept according to
            the confidences after the filtering detections based
            on score_threshold.
        keep_top_k (int): Number of total bboxes to be kept per image after NMS
            step. -1 means keeping all bboxes after NMS step.
        nms_threshold (float, optional): The threshold to be used in NMS. Default: 0.3.
        normalized (bool, optional): Whether detections are normalized. Default: True.
        nms_eta (float, optional): The threshold to be used in NMS. Default: 1.0.
        background_label (int, optional): The index of background label, the background
            label will be ignored. If set to -1, then all
            categories will be considered. Default: 0.
        return_index (bool, optional): Whether return selected index. Default: False.
        name (str, optional): Name of the multiclass nms op. Default: None.

    Returns:
        A tuple with two dimensions of the tensor: (Out, Index) if return_index is True,
@@ -279,23 +280,21 @@ def multiclass_nms2(
    Examples:
        .. code-block:: python

            >>> import paddle
            >>> paddle.enable_static()
            >>> boxes = paddle.static.data(name='bboxes', shape=[-1, 81, 4],
            ...                            dtype='float32', lod_level=1)
            >>> scores = paddle.static.data(name='scores', shape=[-1, 81],
            ...                             dtype='float32', lod_level=1)
            >>> out, index = paddle.incubate.layers.multiclass_nms2(bboxes=boxes,
            ...                                                     scores=scores,
            ...                                                     background_label=0,
            ...                                                     score_threshold=0.5,
            ...                                                     nms_top_k=400,
            ...                                                     nms_threshold=0.3,
            ...                                                     keep_top_k=200,
            ...                                                     normalized=False,
            ...                                                     return_index=True)
    """
    helper = LayerHelper('multiclass_nms2', **locals())
@@ -353,26 +352,27 @@ def search_pyramid_hash(
        pyramid_layer (int): The number of pyramid layers. It should be greater than 2.
        rand_len (int): The minimum length of pyramid hash cell.
        drop_out_percent (float): The probability of dropping out the input token randomly.
            It should satisfy: [0., 1.].
        is_training (bool): Whether in training or testing phase.
        use_filter (bool): If set True, the white filter and black filter should be given by
            :attr:`param_attr_wl` and :attr:`param_attr_bl` .
        white_list_len (int): If set :math:`white_list_len>0` , white filter with shape [white_list_len, 1]
            should be provided by param_attr_wl.
        black_list_len (int): If set :math:`black_list_len>0` , black filter with shape [black_list_len, 1]
            should be provided by param_attr_bl.
        seed (int): The number of random seed.
        lr (float): The learning rate of weight created by :attr:`param_attr` with shape [space_len+rand_len, 1]
            in this layer.
        param_attr (ParamAttr, optional): To specify the weight parameter property. Default: None, which means the
            default weight parameter property is used. See usage for details in :ref:`api_fluid_ParamAttr` .
        param_attr_wl (ParamAttr, optional): Specified parameters of white filter. Default: None.
        param_attr_bl (ParamAttr, optional): Specified parameters of black filter. Default: None.
        distribute_update_vars (list[ParamAttr.name], optional): Decided which params should be updated in distribute training.
            Used in Distribute Transpiler to create a trainer/server program. Default: None.
        name (str, optional): The default value is None. Normally there is no need for user to set this property.
            For more information, please refer to :ref:`api_guide_Name` . Default: None.
        dtype (str, optional): The data type of output Tensor, float32. Default: float32.

    Returns:
        Tensor: LoDTensor of pyramid hash embedding.
    """
@@ -451,25 +451,25 @@ def shuffle_batch(x, seed=None):
    :attr:`x` is a LoDTensor to be shuffled with shape :math:`[N_1, N_2, ..., N_k, D]` . Note that the last dim of input will not be shuffled.
    :math:`N_1 * N_2 * ... * N_k` numbers of elements with length :math:`D` will be shuffled randomly.

    Examples:

        .. code-block:: text

            Input:
              x.data = [[1, 2], [3, 4], [5, 6], [7, 8]]
              x.dims = [4, 2]
            Attrs:
              seed = 2019
            Output:
              Out.data = [[7, 8], [1, 2], [3, 4], [5, 6]]
              Out.dims = [4, 2]

    Args:
        x (Tensor): The input Tensor. The input Tensor is a N-D LoDTensor with type int, float32 or float64.
        seed (None|int|Tensor, optional): The start up seed. If set, seed will be set as the start up seed of shuffle engine.
            If not set (Default), start up seed of shuffle engine will be generated randomly. Default: None.

    Returns:
        Tensor: The shuffled LoDTensor with the same shape and lod as input.
@@ -478,11 +478,10 @@ def shuffle_batch(x, seed=None):
        .. code-block:: python

            >>> import paddle
            >>> paddle.enable_static()
            >>> x = paddle.static.data(name="x", shape=[-1, 4])
            >>> out = paddle.incubate.layers.shuffle_batch(x)
    """
    helper = LayerHelper('shuffle_batch', **locals())
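The semantics in the text block above can be mimicked with plain NumPy: flatten everything except the last dim, permute those rows, and restore the shape. A sketch of the behaviour, not the operator (the NumPy seed will not reproduce the shuffle engine's exact order):

    import numpy as np

    x = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    rng = np.random.default_rng(2019)
    rows = x.reshape(-1, x.shape[-1])                     # N_1*...*N_k rows of length D
    shuffled = rows[rng.permutation(len(rows))].reshape(x.shape)
    print(shuffled)   # some row permutation of x, e.g. [[7 8] [1 2] [3 4] [5 6]]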
@@ -526,7 +525,7 @@ def partial_concat(input, start_index=0, length=-1):
                 [9, 10, 11]]
            output = partial_concat([x, y], start_index=0, length=2)

            We get:

            output = [[0, 1, 6, 7],
                      [3, 4, 9, 10]]
...@@ -534,20 +533,22 @@ def partial_concat(input, start_index=0, length=-1): ...@@ -534,20 +533,22 @@ def partial_concat(input, start_index=0, length=-1):
Args: Args:
input(list): List of input Tensors with data type float32, float64, int32, input(list): List of input Tensors with data type float32, float64, int32,
int64. int64.
start_index(int32): The start index of each instance for partial concatenation. start_index(int32, optional): The start index of each instance for partial concatenation.
Default is 0. Default is 0.
length(int32): The length of each instance for partial concatenation. Default is -1. length(int32, optional): The length of each instance for partial concatenation. Default is -1.
Negative values for all elements after start_index. A negative value means all elements after start_index are included.
Returns: Returns:
Tensor: A Tensor with the same data type as input's. Tensor: A Tensor with the same data type as input's.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.fluid as fluid
import paddle >>> import paddle
x = paddle.randn(name="x", shape=[1,3], dtype="float32") >>> x = paddle.randn(name="x", shape=[1,3], dtype="float32")
y = paddle.randn(name="y", shape=[1,3], dtype="float32") >>> y = paddle.randn(name="y", shape=[1,3], dtype="float32")
concat = paddle.incubate.layers.partial_concat( >>> concat = paddle.incubate.layers.partial_concat(
[x, y], start_index=0, length=2) ... [x, y], start_index=0, length=2)
""" """
if not isinstance(input, list): if not isinstance(input, list):
warnings.warn( warnings.warn(
...@@ -584,6 +585,7 @@ def partial_sum(input, start_index=0, length=-1): ...@@ -584,6 +585,7 @@ def partial_sum(input, start_index=0, length=-1):
This Op exists in incubate layers, which means that it is not shown to the public. This Op exists in incubate layers, which means that it is not shown to the public.
Only 2-D Tensor or LodTensor input is supported. Slice and concat can only be Only 2-D Tensor or LodTensor input is supported. Slice and concat can only be
performed along the second dimension. performed along the second dimension.
.. code-block:: text .. code-block:: text
Given: Given:
...@@ -592,30 +594,29 @@ def partial_sum(input, start_index=0, length=-1): ...@@ -592,30 +594,29 @@ def partial_sum(input, start_index=0, length=-1):
y = [[6, 7 ,8], y = [[6, 7 ,8],
[9, 10, 11]] [9, 10, 11]]
output = partial_sum([x, y], start_index=0, length=2) output = partial_sum([x, y], start_index=0, length=2)
we get:
We get:
output = [[6, 8], output = [[6, 8],
[12, 14]] [12, 14]]
Args: Args:
input(list): List of input Tensors with data type float32, float64, int32, input (list): List of input Tensors with data type float32, float64, int32,
int64. int64.
start_index (int32, optional): The start index of each instance for partial sum. Default is 0.
length (int32, optional): The length of each instance for partial sum. Default is -1.
Returns: Returns:
Tensor: A Tensor with the same data type as input's. Tensor: A Tensor with the same data type as input's.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.fluid as fluid
import numpy as np >>> import paddle
import paddle >>> paddle.enable_static()
paddle.enable_static()
>>> x = paddle.static.data(name="x", shape=[2, 3], dtype="float32")
x = paddle.static.data(name="x", shape=[2, 3], dtype="float32") >>> y = paddle.static.data(name="y", shape=[2, 3], dtype="float32")
y = paddle.static.data(name="y", shape=[2, 3], dtype="float32") >>> sum = paddle.incubate.layers.partial_sum([x,y], start_index=0, length=2)
sum = paddle.incubate.layers.partial_sum([x,y], start_index=0, length=2)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
xx = np.array([1,2,3,4,5,6]).reshape((2,3)).astype("float32")
yy = np.array([6,5,4,4,5,6]).reshape((2,3)).astype("float32")
out = exe.run(feed={"x":xx, "y":yy}, fetch_list=[sum])
""" """
for id, x in enumerate(input): for id, x in enumerate(input):
check_variable_and_dtype( check_variable_and_dtype(
...@@ -642,6 +643,7 @@ def tdm_child(x, node_nums, child_nums, param_attr=None, dtype='int32'): ...@@ -642,6 +643,7 @@ def tdm_child(x, node_nums, child_nums, param_attr=None, dtype='int32'):
**Tdm Child** **Tdm Child**
According to the input node_id on the given tree, return the corresponding child node_id and According to the input node_id on the given tree, return the corresponding child node_id and
whether child is a leaf node by leaf_mask value. whether child is a leaf node by leaf_mask value.
.. code-block:: text .. code-block:: text
Given: Given:
...@@ -650,25 +652,26 @@ def tdm_child(x, node_nums, child_nums, param_attr=None, dtype='int32'): ...@@ -650,25 +652,26 @@ def tdm_child(x, node_nums, child_nums, param_attr=None, dtype='int32'):
node_nums = 7 node_nums = 7
child_nums = 2 child_nums = 2
we get: We get:
child = [[5, 6], child = [[5, 6],
[0, 0]] [0, 0]]
leaf_mask = [[1, 1], leaf_mask = [[1, 1],
[0, 0]] [0, 0]]
Args: Args:
x(Tensor): Tensor contained the node_id information, dtype support int32/int64. x (Tensor): Tensor contained the node_id information, dtype support int32/int64.
node_nums(int): Number of total nodes. node_nums (int): Number of total nodes.
child_nums(int): Maximum number of child nodes per node. child_nums (int): Maximum number of child nodes per node.
param_attr(ParamAttr): To specify the tdm-tree-info parameter property. Default: None, which means the param_attr (ParamAttr, optional): To specify the tdm-tree-info parameter property. Default: None, which means the
default weight parameter property is used. See usage for details in: ref: `api_fluid_ParamAttr`, should default weight parameter property is used. See usage for details in: ref: `api_fluid_ParamAttr`, should
has shape(node_nums, 3 + child_nums), dtype support int32/int64. has shape (node_nums, 3 + child_nums), dtype support int32/int64.
The dimension[1] of tdm-tree-info contains the following: The dimension[1] of tdm-tree-info contains the following:
1. Item_id(int, shape(1)), if node is a leaf node, give its item_id corresponding to node_id, else give 0. 1. Item_id (int, shape(1)), if node is a leaf node, give its item_id corresponding to node_id, else give 0.
2. Layer_id(int, shape(1)), indicates which layer the node is on. 2. Layer_id (int, shape(1)), indicates which layer the node is on.
3. Parent_id(int, shape(1)), node's parent node. 3. Parent_id (int, shape(1)), node's parent node.
4. Child_id(int, shape(child_nums)), all child node's node_id of this node should be given. 4. Child_id (int, shape(child_nums)), all child node's node_id of this node should be given.
If the number of child nodes is insufficient, padding 0 until child nums equal to child_nums If the number of child nodes is insufficient, pad with 0 until the number of children equals child_nums.
dtype(str): The data type of output child and leaf_mask, support int32/int64. dtype (str, optional): The data type of output child and leaf_mask, support int32/int64. Default: int32.
Returns: Returns:
tuple: A tuple including input node's child(Tensor) and leaf_mask(Tensor). tuple: A tuple including input node's child(Tensor) and leaf_mask(Tensor).
...@@ -676,27 +679,23 @@ def tdm_child(x, node_nums, child_nums, param_attr=None, dtype='int32'): ...@@ -676,27 +679,23 @@ def tdm_child(x, node_nums, child_nums, param_attr=None, dtype='int32'):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle
import paddle.fluid as fluid >>> import paddle
import numpy as np >>> import numpy as np
paddle.enable_static() >>> paddle.enable_static()
x = paddle.static.data(name="x", shape=[None, 1], dtype="int32", lod_level=1)
tree_info = [[0,0,0,1,2], >>> x = paddle.static.data(name="x", shape=[None, 1], dtype="int32", lod_level=1)
[0,1,0,3,4],[0,1,0,5,6], >>> tree_info = [[0,0,0,1,2],
[0,2,1,0,0],[1,2,1,0,0],[2,2,2,0,0],[3,2,2,0,0]] ... [0,1,0,3,4],[0,1,0,5,6],
tree_info_np = np.array(tree_info) ... [0,2,1,0,0],[1,2,1,0,0],[2,2,2,0,0],[3,2,2,0,0]]
tree_info_np = np.reshape(tree_info_np, (7,5)) >>> tree_info_np = np.array(tree_info)
node_nums = 7 >>> tree_info_np = np.reshape(tree_info_np, (7,5))
child_nums = 2 >>> node_nums = 7
child, leaf_mask = paddle.incubate.layers.tdm_child(x, node_nums, child_nums, >>> child_nums = 2
param_attr=fluid.ParamAttr( >>> child, leaf_mask = paddle.incubate.layers.tdm_child(x, node_nums, child_nums,
initializer=paddle.nn.initializer.Assign( ... param_attr=paddle.ParamAttr(
tree_info_np))) ... initializer=paddle.nn.initializer.Assign(tree_info_np)))
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
xx = np.array([[2],[3]]).reshape((2,1)).astype("int32")
child_res, leaf_mask_res = exe.run(feed={"x":xx}, fetch_list=[child, leaf_mask])
""" """
helper = LayerHelper("tdm_child", **locals()) helper = LayerHelper("tdm_child", **locals())
check_dtype( check_dtype(
...@@ -740,6 +739,7 @@ def tdm_sampler( ...@@ -740,6 +739,7 @@ def tdm_sampler(
""" """
**Tdm Sampler** **Tdm Sampler**
According to the input positive samples at leaf node(x), do negative sampling layer by layer on the given tree. According to the input positive samples at leaf node(x), do negative sampling layer by layer on the given tree.
.. code-block:: text .. code-block:: text
Given: Given:
...@@ -753,7 +753,7 @@ def tdm_sampler( ...@@ -753,7 +753,7 @@ def tdm_sampler(
leaf_node_num = 4 leaf_node_num = 4
output_list = False output_list = False
we get: We get:
out = [[1, 3], [1, 4], [2, 5], [2, 6]] out = [[1, 3], [1, 4], [2, 5], [2, 6]]
labels = [[1, 1], [1, 1], [1, 1], [1, 1]] labels = [[1, 1], [1, 1], [1, 1], [1, 1]]
mask = [[1, 1], [1, 1], [1, 1], [1, 1]] mask = [[1, 1], [1, 1], [1, 1], [1, 1]]
...@@ -763,21 +763,21 @@ def tdm_sampler( ...@@ -763,21 +763,21 @@ def tdm_sampler(
neg_samples_num_list (list(int)): Number of negative samples per layer. neg_samples_num_list (list(int)): Number of negative samples per layer.
layer_node_num_list (list(int)): Number of nodes per layer, must has same shape with neg_samples_num_list. layer_node_num_list (list(int)): Number of nodes per layer, must have the same length as neg_samples_num_list.
leaf_node_num (int): Number of leaf nodes. leaf_node_num (int): Number of leaf nodes.
tree_travel_attr (ParamAttr): To specify the tdm-travel parameter property. Default: None, which means the tree_travel_attr (ParamAttr, optional): To specify the tdm-travel parameter property. Default: None, which means the
default weight parameter property is used. See usage for details in :ref:`api_fluid_ParamAttr`, should default weight parameter property is used. See usage for details in :ref:`api_fluid_ParamAttr`, should
has shape (leaf_node_num, len(layer_node_num_list)), dtype support int32/int64. has shape (leaf_node_num, len(layer_node_num_list)), dtype support int32/int64.
tree_layer_attr (ParamAttr): To specify the tdm-layer parameter property. Default: None, which means the tree_layer_attr (ParamAttr, optional): To specify the tdm-layer parameter property. Default: None, which means the
default weight parameter property is used. See usage for details in :ref:`api_fluid_ParamAttr`, should default weight parameter property is used. See usage for details in :ref:`api_fluid_ParamAttr`, should
has shape (node_num, 1), dtype support int32/int64. has shape (node_num, 1), dtype support int32/int64.
output_positive (bool): Whether to output positive samples (includ label and mask )at the same time. output_positive (bool, optional): Whether to output positive samples (include label and mask) at the same time. Default: True.
output_list (bool): Whether to divide the output into layers and organize it into list format. output_list (bool, optional): Whether to divide the output into layers and organize it into list format. Default: True.
seed (int): The number of random seed. seed (int, optional): The number of random seed. Default: 0.
tree_dtype(np.dtype|core.VarDesc.VarType|str): The dtype of tdm-travel and tdm-layer, support int32/int64 tree_dtype (np.dtype|core.VarDesc.VarType|str, optional): The dtype of tdm-travel and tdm-layer, support int32/int64. Default: int32.
dtype(np.dtype|core.VarDesc.VarType|str): The dtype of output(sampling results, labels and masks) dtype (np.dtype|core.VarDesc.VarType|str, optional): The dtype of output(sampling results, labels and masks). Default: int32.
Returns: Returns:
tuple: A tuple including sampling results, corresponding labels and masks. if output_positive = True, sampling tuple: A tuple including sampling results, corresponding labels and masks. if output_positive = True, sampling
result will include both positive and negative samples. If sampling reseult is a positive sample, the label is 1, result will include both positive and negative samples. If sampling result is a positive sample, the label is 1,
and if it is a negative sample, it is 0. If the tree is unbalanced, in order to ensure the consistency of the and if it is a negative sample, it is 0. If the tree is unbalanced, in order to ensure the consistency of the
sampling result shape, the padding sample's mask = 0, the real sample's mask value = 1. sampling result shape, the padding sample's mask = 0, the real sample's mask value = 1.
If output_list = True, the result will organize into list format specified by layer information. If output_list = True, the result will be organized into the list format specified by layer information.
...@@ -785,43 +785,37 @@ def tdm_sampler( ...@@ -785,43 +785,37 @@ def tdm_sampler(
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle
import paddle.fluid as fluid >>> import paddle
import numpy as np >>> import numpy as np
paddle.enable_static() >>> paddle.enable_static()
x = paddle.static.data(name="x", shape=[None, 1], dtype="int32", lod_level=1)
travel_list = [[1, 3], [1, 4], [2, 5], [2, 6]] # leaf node's travel path, shape(leaf_node_num, layer_num) >>> x = paddle.static.data(name="x", shape=[None, 1], dtype="int32", lod_level=1)
layer_list_flat = [[1], [2], [3], [4], [5], [6]] # shape(node_nums, 1) >>> travel_list = [[1, 3], [1, 4], [2, 5], [2, 6]] # leaf node's travel path, shape(leaf_node_num, layer_num)
>>> layer_list_flat = [[1], [2], [3], [4], [5], [6]] # shape(node_nums, 1)
neg_samples_num_list = [0, 0] # negative sample nums = 0
layer_node_num_list = [2, 4] #two layer (exclude root node) >>> neg_samples_num_list = [0, 0] # negative sample nums = 0
leaf_node_num = 4 >>> layer_node_num_list = [2, 4] #two layer (exclude root node)
>>> leaf_node_num = 4
travel_array = np.array(travel_list)
layer_array = np.array(layer_list_flat) >>> travel_array = np.array(travel_list)
>>> layer_array = np.array(layer_list_flat)
sample, label, mask = paddle.incubate.layers.tdm_sampler(
x, >>> sample, label, mask = paddle.incubate.layers.tdm_sampler(
neg_samples_num_list, ... x,
layer_node_num_list, ... neg_samples_num_list,
leaf_node_num, ... layer_node_num_list,
tree_travel_attr=fluid.ParamAttr( ... leaf_node_num,
initializer=paddle.nn.initializer.Assign( ... tree_travel_attr=paddle.ParamAttr(
travel_array)), ... initializer=paddle.nn.initializer.Assign(
tree_layer_attr=fluid.ParamAttr( ... travel_array)),
initializer=paddle.nn.initializer.Assign( ... tree_layer_attr=paddle.ParamAttr(
layer_array)), ... initializer=paddle.nn.initializer.Assign(
output_positive=True, ... layer_array)),
output_list=True, ... output_positive=True,
seed=0, ... output_list=True,
tree_dtype='int32') ... seed=0,
... tree_dtype='int32')
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
xx = np.array([[0],[1]]).reshape((2,1)).astype("int32")
exe.run(feed={"x":xx})
""" """
helper = LayerHelper("tdm_sampler", **locals()) helper = LayerHelper("tdm_sampler", **locals())
...@@ -968,30 +962,33 @@ def rank_attention( ...@@ -968,30 +962,33 @@ def rank_attention(
rank_param gives the organization of data. Notice: It currently supports rank_param gives the organization of data. Notice: It currently supports
GPU device. GPU device.
This Op exists in incubate layers, which means that it is not shown to the public. This Op exists in incubate layers, which means that it is not shown to the public.
Args: Args:
input: Tensor with data type float32, float64. input (Tensor): Tensor with data type float32, float64.
rank_offset: Tensor with data type int32. rank_offset (Tensor): Tensor with data type int32.
rank_para_shape: The shape of rank_param. rank_para_shape (list[int]): The shape of rank_param.
rank_param_attr: Attribute initializer of rank_param. rank_param_attr (ParamAttr): Attribute initializer of rank_param.
max_rank: The max rank of input's ranks. max_rank (int, optional): The max rank of input's ranks. Default is 3.
max_size (int, optional): The max size of input's ranks. Default is 0.
Returns: Returns:
Tensor: A Tensor with the same data type as input's. Tensor: A Tensor with the same data type as input's.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.fluid as fluid
import paddle >>> import paddle
paddle.enable_static() >>> paddle.enable_static()
input = paddle.static.data(name="input", shape=[None, 2], dtype="float32") >>> input = paddle.static.data(name="input", shape=[None, 2], dtype="float32")
rank_offset = paddle.static.data(name="rank_offset", shape=[None, 7], dtype="int32") >>> rank_offset = paddle.static.data(name="rank_offset", shape=[None, 7], dtype="int32")
out = paddle.incubate.layers.rank_attention(input=input, >>> out = paddle.incubate.layers.rank_attention(input=input,
rank_offset=rank_offset, ... rank_offset=rank_offset,
rank_param_shape=[18,3], ... rank_param_shape=[18,3],
rank_param_attr= ... rank_param_attr=
paddle.ParamAttr(learning_rate=1.0, ... paddle.ParamAttr(learning_rate=1.0,
name="ubm_rank_param.w_0"), ... name="ubm_rank_param.w_0"),
max_rank=3, ... max_rank=3,
max_size=0) ... max_size=0)
""" """
helper = LayerHelper('rank_attention', **locals()) helper = LayerHelper('rank_attention', **locals())
dtype = helper.input_dtype(input_param_name='input') dtype = helper.input_dtype(input_param_name='input')
...@@ -1027,34 +1024,35 @@ def batch_fc(input, param_size, param_attr, bias_size, bias_attr, act=None): ...@@ -1027,34 +1024,35 @@ def batch_fc(input, param_size, param_attr, bias_size, bias_attr, act=None):
except that the bias and relu activation layers are added. except that the bias and relu activation layers are added.
Notice: It currently supports GPU device. Notice: It currently supports GPU device.
This Op exists in incubate layers, which means that it is not shown to the public. This Op exists in incubate layers, which means that it is not shown to the public.
Args: Args:
input: Tensor with data type float32, float64. input (Tensor): Tensor with data type float32, float64.
param_size: The size of w. param_size (list[int]): The size of w.
param_attr: Attribute initializer of w. param_attr (ParamAttr): Attribute initializer of w.
bias_size: The size of bias. bias_size (list[int]): The size of bias.
bias_attr: Attribute initializer of bias. bias_attr (ParamAttr): Attribute initializer of bias.
act: Activation to be applied to the output of this layer. act (str, optional): Activation to be applied to the output of this layer. Default is None.
Returns: Returns:
Tensor: A Tensor with the same data type as input's. Tensor: A Tensor with the same data type as input's.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.fluid as fluid
import paddle >>> import paddle
>>> paddle.enable_static()
paddle.enable_static()
>>> input = paddle.static.data(name="input", shape=[16, 2, 3], dtype="float32")
input = paddle.static.data(name="input", shape=[16, 2, 3], dtype="float32") >>> out = paddle.incubate.layers.batch_fc(input=input,
out = paddle.incubate.layers.batch_fc(input=input, ... param_size=[16, 3, 10],
param_size=[16, 3, 10], ... param_attr=
param_attr= ... paddle.ParamAttr(learning_rate=1.0,
paddle.ParamAttr(learning_rate=1.0, ... name="w_0"),
name="w_0"), ... bias_size=[16, 10],
bias_size=[16, 10], ... bias_attr=
bias_attr= ... paddle.ParamAttr(learning_rate=1.0,
paddle.ParamAttr(learning_rate=1.0, ... name="b_0"),
name="b_0"), ... act="relu")
act="relu")
""" """
helper = LayerHelper("batch_fc", **locals()) helper = LayerHelper("batch_fc", **locals())
...@@ -1089,23 +1087,26 @@ def _pull_box_extended_sparse(input, size, extend_size=64, dtype='float32'): ...@@ -1089,23 +1087,26 @@ def _pull_box_extended_sparse(input, size, extend_size=64, dtype='float32'):
This layer is used to lookup embeddings of IDs, provided by :attr:`input`, in This layer is used to lookup embeddings of IDs, provided by :attr:`input`, in
BoxPS lookup table. The result of this lookup is the embedding of each ID in the BoxPS lookup table. The result of this lookup is the embedding of each ID in the
:attr:`input`. :attr:`input`.
Args: Args:
input(Tensor): Input is a Tensor<int64>, which input (Tensor): Input is a Tensor<int64>, which contains the IDs information.
contains the IDs information. size (int): The embedding size parameter, which indicates the size of
size(int): The embedding size parameter, which indicates the size of
each embedding vector respectively. each embedding vector respectively.
extend_size(int): The embedding size parameter in extended dim, extend_size (int, optional): The embedding size parameter in extended dim,
which indicates the size of each embedding vector respectively. which indicates the size of each embedding vector respectively. Default is 64.
dtype(str): The dtype refers to the data type of output tensor. Only supports dtype (str, optional): The dtype refers to the data type of output tensor. Only supports float32 now. Default is float32.
float32 now.
Returns: Returns:
Tensor: The tensor storing the embeddings of the \ Tensor: The tensor storing the embeddings of the supplied inputs.
supplied inputs.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.fluid as fluid
data = paddle.static.data(name='sequence', shape=[-1, 1], dtype='int64', lod_level=1) >>> import paddle
emb, emb_ex = paddle.incubate.layers._pull_box_extended_sparse(input=data, size=8, extend_size=128) >>> paddle.enable_static()
>>> data = paddle.static.data(name='sequence', shape=[-1, 1], dtype='int64', lod_level=1)
>>> emb, emb_ex = paddle.incubate.layers._pull_box_extended_sparse(input=data, size=8, extend_size=128)
""" """
helper = LayerHelper('pull_box_extended_sparse', **locals()) helper = LayerHelper('pull_box_extended_sparse', **locals())
helper.input_dtype() helper.input_dtype()
...@@ -1139,16 +1140,16 @@ def bilateral_slice(x, guide, grid, has_offset, name=None): ...@@ -1139,16 +1140,16 @@ def bilateral_slice(x, guide, grid, has_offset, name=None):
For more information of bilateral slicing, please refer to Deep Bilateral Learning for Real-Time Image Enhancement <https://groups.csail.mit.edu/graphics/hdrnet/data/hdrnet.pdf>_ For more information of bilateral slicing, please refer to Deep Bilateral Learning for Real-Time Image Enhancement <https://groups.csail.mit.edu/graphics/hdrnet/data/hdrnet.pdf>_
Args: Args:
x(Tensor): The input tensor, which is a 4-D tensor with shape x (Tensor): The input tensor, which is a 4-D tensor with shape
[N, C, H, W], N is the batch size, C is the channel [N, C, H, W], N is the batch size, C is the channel
number, H and W is the feature height and width. number, H and W is the feature height and width.
The data type is float32 and float64. The data type is float32 and float64.
guide(Tensor): Input grid tensor of shape [N, H, W]. The guide (Tensor): Input grid tensor of shape [N, H, W]. The
data type is float32 and float64. data type is float32 and float64.
grid(Tensor): Input grid tensor of shape [N, C, D, H, W]. The grid (Tensor): Input grid tensor of shape [N, C, D, H, W]. The
data type is float32 and float64. data type is float32 and float64.
has_offset(bool): Whether to slice with affine offset. has_offset (bool): Whether to slice with affine offset.
name(str, optional): For detailed information, please refer name (str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and to :ref:`api_guide_Name`. Usually name is no need to set and
None by default. None by default.
...@@ -1159,19 +1160,18 @@ def bilateral_slice(x, guide, grid, has_offset, name=None): ...@@ -1159,19 +1160,18 @@ def bilateral_slice(x, guide, grid, has_offset, name=None):
.. code-block:: python .. code-block:: python
import paddle.fluid as fluid >>> import paddle
import paddle >>> paddle.enable_static()
paddle.enable_static()
x = paddle.randn(name='x', shape=[1, 3, 101, 60], dtype='float32') >>> x = paddle.randn(name='x', shape=[1, 3, 101, 60], dtype='float32')
guide = paddle.randn(name='guide', shape=[1, 101, 60], dtype='float32') >>> guide = paddle.randn(name='guide', shape=[1, 101, 60], dtype='float32')
grid = paddle.randn(name='grid', shape=[1, 12, 8, 10, 6], dtype='float32') >>> grid = paddle.randn(name='grid', shape=[1, 12, 8, 10, 6], dtype='float32')
# without offset >>> # without offset
output = paddle.incubate.layers.bilateral_slice(x, guide, grid, has_offset=False) >>> output = paddle.incubate.layers.bilateral_slice(x, guide, grid, has_offset=False)
# has offset >>> # has offset
output = paddle.incubate.layers.bilateral_slice(x, guide, grid, has_offset=True) >>> output = paddle.incubate.layers.bilateral_slice(x, guide, grid, has_offset=True)
""" """
if paddle.in_dynamic_mode(): if paddle.in_dynamic_mode():
...@@ -1215,13 +1215,13 @@ def correlation( ...@@ -1215,13 +1215,13 @@ def correlation(
<https://arxiv.org/pdf/1709.02371.pdf>_ <https://arxiv.org/pdf/1709.02371.pdf>_
Args: Args:
x(Tensor): The input x is 4-D Tensor with shape [N, C, H, W]. The data type is float32 and float64. x (Tensor): The input x is 4-D Tensor with shape [N, C, H, W]. The data type is float32 and float64.
y(Tensor): The input y is 4-D Tensor with shape [N, C, H, W]. The data type is float32 and float64. y (Tensor): The input y is 4-D Tensor with shape [N, C, H, W]. The data type is float32 and float64.
pad_size(int): Pad size. The data type is int. pad_size (int): Pad size. The data type is int.
max_displacement(int): Max displacement. The data type is int. max_displacement (int): Max displacement. The data type is int.
stride1(int): stride size of x. The data type is int. stride1 (int): stride size of x. The data type is int.
stride2(int): stride size of y. The data type is int. stride2 (int): stride size of y. The data type is int.
corr_type_multiply(int, optional): The type of multiply. The data type is int. Default: 1. corr_type_multiply (int, optional): The type of multiply. The data type is int. Default: 1.
Returns: Returns:
Tensor: The data type is same as input tensor. Tensor: The data type is same as input tensor.
...@@ -1230,25 +1230,24 @@ def correlation( ...@@ -1230,25 +1230,24 @@ def correlation(
.. code-block:: python .. code-block:: python
import paddle.fluid as fluid >>> import paddle
import paddle >>> paddle.enable_static()
paddle.enable_static() >>> x1 = paddle.static.data(name='x1',
x1 = paddle.static.data(name='x1', ... shape=[2, 3, 4, 5],
shape=[2,3,4,5], ... dtype="float32")
dtype="float32") >>> x2 = paddle.static.data(name='x2',
x2 = paddle.static.data(name='x2', ... shape=[2, 3, 4, 5],
shape=[2,3,4,5], ... dtype="float32")
dtype="float32")
>>> out = paddle.incubate.layers.correlation(
out = paddle.incubate.layers.correlation( ... x1,
x1, ... x2,
x2, ... pad_size=4,
pad_size=4, ... kernel_size=1,
kernel_size=1, ... max_displacement=4,
max_displacement=4, ... stride1=1,
stride1=1, ... stride2=1)
stride2=1)
""" """
...@@ -1305,105 +1304,97 @@ def fused_bn_add_act( ...@@ -1305,105 +1304,97 @@ def fused_bn_add_act(
`[batch, in_height, in_width, in_channels]`. `[batch, in_height, in_width, in_channels]`.
Args: Args:
x(Tensor): The rank of input tensor can be 2, 3, 4, 5. The data type x (Tensor): The rank of input tensor can be 2, 3, 4, 5. The data type
is float16. is float16.
y(Tensor): The rank of input tensor can be 2, 3, 4, 5. The data type y (Tensor): The rank of input tensor can be 2, 3, 4, 5. The data type
is float16. is float16.
momentum(float|Tensor, optional): The value used for the moving_mean and momentum (float|Tensor, optional): The value used for the moving_mean and
moving_var computation. This should be a float number or a tensor with moving_var computation. This should be a float number or a tensor with
shape [1] and data type as float32. The updated formula is: shape [1] and data type as float32. The updated formula is:
:math:`moving\_mean = moving\_mean * momentum + new\_mean * (1. - momentum)` :math:`moving\_mean = moving\_mean * momentum + new\_mean * (1. - momentum)`
:math:`moving\_var = moving\_var * momentum + new\_var * (1. - momentum)` :math:`moving\_var = moving\_var * momentum + new\_var * (1. - momentum)`
Default is 0.9. Default is 0.9.
epsilon(float, optional): A value added to the denominator for epsilon (float, optional): A value added to the denominator for
numerical stability. Default is 1e-5. numerical stability. Default is 1e-05.
param_attr(ParamAttr, optional): The parameter attribute for Parameter `scale` param_attr (ParamAttr, optional): The parameter attribute for Parameter `scale`
of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
will create ParamAttr as param_attr, the name of scale can be set in ParamAttr. will create ParamAttr as param_attr, the name of scale can be set in ParamAttr.
If the Initializer of the param_attr is not set, the parameter is initialized If the Initializer of the param_attr is not set, the parameter is initialized
with Xavier. Default: None. with Xavier. Default: None.
bias_attr(ParamAttr, optional): The parameter attribute for the bias of batch_norm. bias_attr (ParamAttr, optional): The parameter attribute for the bias of batch_norm.
If it is set to None or one attribute of ParamAttr, batch_norm If it is set to None or one attribute of ParamAttr, batch_norm
will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr.
If the Initializer of the bias_attr is not set, the bias is initialized zero. If the Initializer of the bias_attr is not set, the bias is initialized zero.
Default: None. Default: None.
moving_mean_name(str, optional): The name of moving_mean which store the global Mean. If it moving_mean_name (str, optional): The name of moving_mean which store the global Mean. If it
is set to None, batch_norm will save global mean with a random name, otherwise, batch_norm is set to None, batch_norm will save global mean with a random name, otherwise, batch_norm
will save global mean with the string. will save global mean with the string. Default: None.
moving_variance_name(str, optional): The name of the moving_variance which store the global Variance. moving_variance_name (str, optional): The name of the moving_variance which store the global Variance.
If it is set to None, batch_norm will save global variance with a random name, otherwise, batch_norm If it is set to None, batch_norm will save global variance with a random name, otherwise, batch_norm
will save global variance with the string. will save global variance with the string. Default: None.
act(string, optional): Activation type, linear|relu|prelu|... act (string, optional): Activation type, linear|relu|prelu|... Default: None.
name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. name (str, optional): For detailed information, please refer to :ref:`api_guide_Name`.
Usually name is no need to set and None by default. Usually name is no need to set and None by default. Default: None.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> # doctest: +REQUIRES(env:GPU)
import paddle.fluid as fluid >>> import paddle
>>> paddle.enable_static()
paddle.enable_static()
# required: gpu >>> def build_program(main_program, startup_program):
def build_program(main_program, startup_program): ... with paddle.static.program_guard(main_program, startup_program):
with fluid.program_guard(main_program, startup_program): ... x = paddle.static.data(name='x', shape=[-1, 1, 28, 28], dtype='float32')
x = paddle.static.data(name='x', shape=[-1, 1, 28, 28], dtype='float32') ... y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64')
y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64') ... conv1_1 = paddle.static.nn.conv2d(
conv1_1 = paddle.static.nn.conv2d( ... input=x,
input=x, ... filter_size=3,
filter_size=3, ... num_filters=32,
num_filters=32, ... stride=1,
stride=1, ... padding=1,
padding=1, ... act=None,
act=None, ... bias_attr=False,
bias_attr=False, ... data_format='NHWC')
data_format='NHWC') ... conv1_2 = paddle.static.nn.conv2d(
conv1_2 = paddle.static.nn.conv2d( ... input=x,
input=x, ... filter_size=3,
filter_size=3, ... num_filters=32,
num_filters=32, ... stride=1,
stride=1, ... padding=1,
padding=1, ... act=None,
act=None, ... bias_attr=False,
bias_attr=False, ... data_format='NHWC')
data_format='NHWC') ... bn = paddle.static.nn.batch_norm(
bn = paddle.static.nn.batch_norm( ... input=conv1_1,
input=conv1_1, ... act=None,
act=None, ... data_layout='NHWC')
data_layout='NHWC') ... fused_bn_add_act = paddle.incubate.layers.fused_bn_add_act(conv1_2, bn)
fused_bn_add_act = paddle.incubate.layers.fused_bn_add_act(conv1_2, bn) ... prediction = paddle.static.nn.fc(x=fused_bn_add_act, size=10, activation='softmax')
prediction = paddle.static.nn.fc(x=fused_bn_add_act, size=10, activation='softmax') ... loss = paddle.nn.functional.cross_entropy(
loss = paddle.nn.functional.cross_entropy( ... input=prediction, label=y,
input=prediction, label=y, ... reduction='none', use_softmax=False
reduction='none', use_softmax=False ... )
) ... loss = paddle.mean(loss)
loss = paddle.mean(loss) ... sgd = paddle.optimizer.SGD(learning_rate=0.001)
sgd = fluid.optimizer.SGD(learning_rate=0.001) ... sgd = paddle.static.amp.decorate(
sgd = paddle.static.amp.decorate( ... sgd, use_dynamic_loss_scaling=True, init_loss_scaling=128.0)
sgd, use_dynamic_loss_scaling=True, init_loss_scaling=128.0) ... sgd.minimize(loss)
sgd.minimize(loss) ...
... return x, y, loss
return x, y, loss
>>> iters = 5
iters = 5 >>> batch_size = 16
batch_size = 16 >>> support_gpu = paddle.is_compiled_with_cuda()
support_gpu = fluid.is_compiled_with_cuda() >>> if support_gpu:
if support_gpu: ... main_program = paddle.static.Program()
main_program = fluid.Program() ... startup_program = paddle.static.Program()
startup_program = fluid.Program() ... place = paddle.CUDAPlace(0)
place = fluid.CUDAPlace(0) ... x, y, loss = build_program(main_program, startup_program)
x, y, loss = build_program(main_program, startup_program) ...
... feeder = paddle.DataFeeder(feed_list=[x, y], place=place)
feeder = fluid.DataFeeder(feed_list=[x, y], place=place) ... train_reader = paddle.batch(
train_reader = paddle.batch( ... paddle.dataset.mnist.train(), batch_size=batch_size)
paddle.dataset.mnist.train(), batch_size=batch_size)
exe = fluid.Executor(place)
scope = fluid.Scope()
with fluid.scope_guard(scope):
exe.run(startup_program)
for _ in range(iters):
data = next(train_reader())
loss_v = exe.run(main_program, feed=feeder.feed(data), fetch_list=[loss])
""" """
helper = LayerHelper('fused_bn_add_act', **locals()) helper = LayerHelper('fused_bn_add_act', **locals())
...@@ -1550,27 +1541,29 @@ def _pull_gpups_sparse( ...@@ -1550,27 +1541,29 @@ def _pull_gpups_sparse(
:attr:`input`. :attr:`input`.
Args: Args:
input(Tensor): Input is a Tensor<int64>, which input (Tensor): Input is a Tensor<int64>, which contains the IDs information.
contains the IDs information. size (int|list of int): The embedding size parameter of each input, which indicates the size of
size(int|list of int): The embedding size parameter of each input, which indicates the size of
each embedding vector respectively. each embedding vector respectively.
dtype(str): The dtype refers to the data type of output tensor. Only supports dtype (str, optional): The dtype refers to the data type of output tensor. Only supports float32 now. Default is float32.
float32 now. is_distributed (bool, optional): Whether to use distributed mode. Default is False.
is_sparse (bool, optional): Whether to use sparse mode. Default is False.
Returns: Returns:
Tensor: The tensor storing the embeddings of the \ Tensor: The tensor storing the embeddings of the supplied inputs, whose sizes are indicated by size respectively.
supplied inputs, whose size are indicated by size respectively.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.incubate as incubate >>> import paddle.incubate as incubate
slots = [] >>> import paddle
data_1 = paddle.static.data(name='sequence', shape=[-1,1], dtype='int64', lod_level=1) >>> paddle.enable_static()
slots.append(data_1)
data_2 = paddle.static.data(name='sequence', shape=[-1,1], dtype='int64', lod_level=1) >>> slots = []
slots.append(data_2) >>> data_1 = paddle.static.data(name='sequence', shape=[-1,1], dtype='int64', lod_level=1)
embs = incubate.layers.pull_gpups_sparse(input=slots, size=[11, 35]) >>> slots.append(data_1)
>>> data_2 = paddle.static.data(name='sequence', shape=[-1,1], dtype='int64', lod_level=1)
>>> slots.append(data_2)
>>> embs = incubate.layers.pull_gpups_sparse(input=slots, size=[11, 35])
""" """
helper = LayerHelper('pull_gpups_sparse', **locals()) helper = LayerHelper('pull_gpups_sparse', **locals())
if dtype != 'float32': if dtype != 'float32':
...@@ -1613,23 +1606,26 @@ def _pull_box_sparse( ...@@ -1613,23 +1606,26 @@ def _pull_box_sparse(
:attr:`input`. :attr:`input`.
Args: Args:
input(Tensor): Input is a Tensor<int64>, which input (Tensor): Input is a Tensor<int64>, which contains the IDs information.
contains the IDs information. size (int): The embedding size parameter, which indicates the size of
size(int): The embedding size parameter, which indicates the size of
each embedding vector respectively. each embedding vector respectively.
dtype(str): The dtype refers to the data type of output tensor. Only supports dtype (str, optional): The dtype refers to the data type of output tensor. Only supports float32 now. Default is float32.
float32 now. is_distributed (bool, optional): Whether to use distributed mode. Default is False.
is_sparse (bool, optional): Whether to use sparse mode. Default is False.
Returns: Returns:
Tensor: The tensor storing the embeddings of the \ Tensor: The tensor storing the embeddings of the supplied inputs.
supplied inputs.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle.incubate as incubate >>> import paddle.incubate as incubate
data = paddle.static.data(name='sequence', shape=[-1,1], dtype='int64', lod_level=1) >>> import paddle
emb = incubate.layers.pull_box_sparse(input=data, size=[11]) >>> paddle.enable_static()
>>> x = paddle.static.data(name='x', shape=[-1, 1], dtype='int64', lod_level=1)
>>> y = paddle.static.data(name='y', shape=[-1, 1], dtype='int64', lod_level=1)
>>> emb_x, emb_y = incubate.layers._pull_box_sparse([x, y], size=1)
""" """
helper = LayerHelper('pull_box_sparse', **locals()) helper = LayerHelper('pull_box_sparse', **locals())
if dtype != 'float32': if dtype != 'float32':
......
...@@ -51,15 +51,27 @@ def fused_dropout_add( ...@@ -51,15 +51,27 @@ def fused_dropout_add(
Examples: Examples:
.. code-block:: python .. code-block:: python
# required: gpu >>> # doctest: +REQUIRES(env:GPU)
import paddle >>> import paddle
from paddle.incubate.nn.functional import fused_dropout_add >>> from paddle.incubate.nn.functional import fused_dropout_add
x = paddle.randn([4, 10], dtype='float16') >>> paddle.set_device('gpu')
y = paddle.randn([4, 10], dtype='float16') >>> paddle.seed(2023)
out = fused_dropout_add(x, y, p=0.5) >>> x = paddle.randn([4, 10], dtype="float32")
>>> y = paddle.randn([4, 10], dtype="float32")
>>> out = fused_dropout_add(x, y, p=0.5)
>>> print(out)
Tensor(shape=[4, 10], dtype=float32, place=Place(gpu:0), stop_gradient=True,
[[-0.49133155, 0.53819323, -2.58393312, 0.06336236, -1.09908366,
0.22085167, 2.19751787, 0.05034769, 0.53417486, 0.84864247],
[ 0.78248203, -1.59652555, -0.14399840, -0.77985179, -0.17006736,
-0.30991879, -0.36593807, -0.51025450, 1.46401680, 0.61627960],
[ 4.50472546, -0.48472026, 0.60729283, 0.33509624, -0.25593102,
-1.45173049, 1.06727099, 0.00440830, -0.77340341, 0.67393088],
[ 1.29453969, 0.07568165, 0.71947742, -0.71768606, -2.57172823,
1.89179027, 3.26482797, 1.10493207, -1.04569530, -1.04862499]])
""" """
if isinstance(p, (int, float)): if isinstance(p, (int, float)):
# fast return for p == 0 # fast return for p == 0
......
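One consequence of the fast path for p == 0 noted in the code above: with a zero drop probability nothing is dropped and the 1/(1-p) rescaling is a no-op, so the result should equal a plain x + y. A minimal check, assuming standard dropout semantics (a sketch, not part of the documented example):

.. code-block:: python

    >>> # doctest: +REQUIRES(env:GPU)
    >>> import paddle
    >>> from paddle.incubate.nn.functional import fused_dropout_add
    >>> paddle.set_device('gpu')
    >>> x = paddle.randn([4, 10], dtype="float32")
    >>> y = paddle.randn([4, 10], dtype="float32")
    >>> out = fused_dropout_add(x, y, p=0.0)  # p=0: nothing dropped, no rescaling
    >>> print(paddle.allclose(out, x + y).item())
    True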
...@@ -37,25 +37,20 @@ def fused_ec_moe( ...@@ -37,25 +37,20 @@ def fused_ec_moe(
Examples: Examples:
.. code-block:: python .. code-block:: python
# required: gpu >>> # doctest: +REQUIRES(env:GPU)
import paddle >>> import paddle
from paddle.incubate.nn.functional import fused_ec_moe >>> from paddle.incubate.nn.functional import fused_ec_moe
batch = 10 >>> paddle.set_device('gpu')
seq_len = 128 >>> x = paddle.randn([10, 128, 1024])
d_model = 1024 >>> gate = paddle.randn([10, 128, 8])
d_feed_forward = d_model * 4 >>> bmm0_weight = paddle.randn([8, 1024, 4096])
num_expert = 8 >>> bmm0_bias = paddle.randn([8, 1024, 4096])
>>> bmm1_weight = paddle.randn([8, 1024, 4096])
x = paddle.randn([batch, seq_len, d_model]) >>> bmm1_bias = paddle.randn([8, 1024, 4096])
gate = paddle.randn([batch, seq_len, num_expert]) >>> out = fused_ec_moe(x, gate, bmm0_weight, bmm0_bias, bmm1_weight, bmm1_bias, act_type="gelu")
bmm0_weight = paddle.randn([num_expert, d_model, d_feed_forward]) >>> print(out.shape)
bmm0_bias = paddle.randn([num_expert, d_model, d_feed_forward]) [10, 128, 1024]
bmm1_weight = paddle.randn([num_expert, d_model, d_feed_forward])
bmm1_bias = paddle.randn([num_expert, d_model, d_feed_forward])
out = fused_ec_moe(x, gate, bmm0_weight, bmm0_bias, bmm1_weight, bmm1_bias, act_type="gelu")
print(out.shape) # [batch, seq_len, num_expert]
""" """
helper = LayerHelper('fused_moe', **locals()) helper = LayerHelper('fused_moe', **locals())
out = helper.create_variable_for_type_inference(dtype=x.dtype) out = helper.create_variable_for_type_inference(dtype=x.dtype)
......
...@@ -39,7 +39,7 @@ def fused_gate_attention( ...@@ -39,7 +39,7 @@ def fused_gate_attention(
to information from different representation subspaces. This API only to information from different representation subspaces. This API only
support self_attention. The pseudo code is as follows: support self_attention. The pseudo code is as follows:
.. code-block:: python .. code-block:: text
c = c ** (-0.5) c = c ** (-0.5)
q = paddle.einsum('nbqa,ahc->nbqhc', q_data, query_w) * c q = paddle.einsum('nbqa,ahc->nbqhc', q_data, query_w) * c
...@@ -64,20 +64,20 @@ def fused_gate_attention( ...@@ -64,20 +64,20 @@ def fused_gate_attention(
Args: Args:
query (Tensor): The input query tensor. The shape is [batch_size, msa_len, res_len, q_dim]. query (Tensor): The input query tensor. The shape is [batch_size, msa_len, res_len, q_dim].
key (Tensor, optional): The input key tensor, which can be set when key (Tensor, optional): The input key tensor, which can be set when
merge_qkv is False. The shape is [batch_size, msa_len, m_size, kv_dim]. merge_qkv is False. The shape is [batch_size, msa_len, m_size, kv_dim]. Default None.
query_weight (Tensor, optional): The weight of query linear, which query_weight (Tensor, optional): The weight of query linear, which should be set when input
should be set when input key is not None. The shape is [q_dim, num_heads, head_dim]. key is not None. The shape is [q_dim, num_heads, head_dim]. Default None.
key_weight (Tensor, optional): The weight of key linear, which should key_weight (Tensor, optional): The weight of key linear, which should be set when input key
be set when input key is not None. The shape is [kv_dim, num_heads, head_dim]. is not None. The shape is [kv_dim, num_heads, head_dim]. Default None.
value_weight (Tensor, optional): The weight of value linear, which should value_weight (Tensor, optional): The weight of value linear, which should be set when input
be set when input key is not None. The shape is [kv_dim, num_heads, head_dim]. key is not None. The shape is [kv_dim, num_heads, head_dim]. Default None.
qkv_weight (Tensor, optional): The weight of qkv linear, which should qkv_weight (Tensor, optional): The weight of qkv linear, which should be set when merge_qkv
be set when merge_qkv is True. The shape is [3, num_heads, head_dim, q_dim]. is True. The shape is [3, num_heads, head_dim, q_dim]. Default None.
gate_linear_weight (Tensor, optional): The weight of gating linear, gate_linear_weight (Tensor, optional): The weight of gating linear, which should be set when
which should be set when has_gating is True. The shape is [q_dim, num_heads, head_dim]. has_gating is True. The shape is [q_dim, num_heads, head_dim]. Default None.
gate_linear_bias (Tensor, optional): The bias of gating linear, which gate_linear_bias (Tensor, optional): The bias of gating linear, which should be set when
should be set when has_gating is True. The shape is [num_heads, head_dim]. Default None. has_gating is True. The shape is [num_heads, head_dim]. Default None.
out_linear_weight (Tensor, optional): The weight of output linear. The shape is [num_heads, head_dim, q_dim]. out_linear_weight (Tensor, optional): The weight of output linear. The shape is [num_heads, head_dim, q_dim]. Default None.
out_linear_bias (Tensor): The bias of output linear, the shape is [q_dim]. Default None. out_linear_bias (Tensor): The bias of output linear, the shape is [q_dim]. Default None.
nonbatched_bias (Tensor, optional): The extra bias. The shape is [batch_size, 1, num_heads, res_len, m_size]. Default None. nonbatched_bias (Tensor, optional): The extra bias. The shape is [batch_size, 1, num_heads, res_len, m_size]. Default None.
attn_mask (Tensor, optional): The attention mask. The shape is [batch_size, msa_len, 1, 1, res_len]. Default None. attn_mask (Tensor, optional): The attention mask. The shape is [batch_size, msa_len, 1, 1, res_len]. Default None.
...@@ -92,54 +92,54 @@ def fused_gate_attention( ...@@ -92,54 +92,54 @@ def fused_gate_attention(
.. code-block:: python .. code-block:: python
# required: gpu >>> # doctest: +REQUIRES(env:GPU)
import paddle >>> import paddle
import paddle.incubate.nn.functional as F >>> import paddle.incubate.nn.functional as F
# batch_size = 2 >>> # batch_size = 2
# msa_len = 4 >>> # msa_len = 4
# res_len = 2 >>> # res_len = 2
# q_dim = 4 >>> # q_dim = 4
# num_heads = 8 >>> # num_heads = 8
# head_dim = 4 >>> # head_dim = 4
# m_size = res_len (when merge_qkv is True) >>> # m_size = res_len (when merge_qkv is True)
# query: [batch_size, msa_len, res_len, q_dim] >>> # query: [batch_size, msa_len, res_len, q_dim]
query = paddle.rand(shape=[2, 4, 2, 4], dtype="float32") >>> query = paddle.rand(shape=[2, 4, 2, 4], dtype="float32")
# qkv_weight: [3, n_heads, head_dim, q_dim] >>> # qkv_weight: [3, n_heads, head_dim, q_dim]
qkv_weight = paddle.rand(shape=[3, 8, 4, 4], dtype="float32") >>> qkv_weight = paddle.rand(shape=[3, 8, 4, 4], dtype="float32")
# nonbatched_bias: [batch_size, 1, num_heads, res_len, m_size] >>> # nonbatched_bias: [batch_size, 1, num_heads, res_len, m_size]
nonbatched_bias = paddle.rand(shape=[2, 1, 8, 2, 2], dtype="float32") >>> nonbatched_bias = paddle.rand(shape=[2, 1, 8, 2, 2], dtype="float32")
# attn_mask: [batch_size, msa_len, 1, 1, m_size] >>> # attn_mask: [batch_size, msa_len, 1, 1, m_size]
attn_mask = paddle.rand(shape=[2, 4, 1, 1, 2], dtype="float32") >>> attn_mask = paddle.rand(shape=[2, 4, 1, 1, 2], dtype="float32")
# gate_linear_weight: [q_dim, num_heads, head_dim] >>> # gate_linear_weight: [q_dim, num_heads, head_dim]
gate_linear_weight = paddle.rand(shape=[4, 8, 4], dtype="float32") >>> gate_linear_weight = paddle.rand(shape=[4, 8, 4], dtype="float32")
# gate_bias: [num_heads, head_dim] >>> # gate_bias: [num_heads, head_dim]
gate_linear_bias = paddle.rand(shape=[8, 4], dtype="float32") >>> gate_linear_bias = paddle.rand(shape=[8, 4], dtype="float32")
# out_linear_weight: [num_heads, head_dim, q_dim] >>> # out_linear_weight: [num_heads, head_dim, q_dim]
out_linear_weight = paddle.rand(shape=[8, 4, 4], dtype="float32") >>> out_linear_weight = paddle.rand(shape=[8, 4, 4], dtype="float32")
# out_linear_bias: [q_dim] >>> # out_linear_bias: [q_dim]
out_linear_bias = paddle.rand(shape=[4], dtype="float32") >>> out_linear_bias = paddle.rand(shape=[4], dtype="float32")
# output: [batch_size, msa_len, res_len, q_dim] >>> # output: [batch_size, msa_len, res_len, q_dim]
output = F.fused_gate_attention( >>> output = F.fused_gate_attention(
query=query, ... query=query,
qkv_weight=qkv_weight, ... qkv_weight=qkv_weight,
gate_linear_weight=gate_linear_weight, ... gate_linear_weight=gate_linear_weight,
gate_linear_bias=gate_linear_bias, ... gate_linear_bias=gate_linear_bias,
out_linear_weight=out_linear_weight, ... out_linear_weight=out_linear_weight,
out_linear_bias=out_linear_bias, ... out_linear_bias=out_linear_bias,
nonbatched_bias=nonbatched_bias, ... nonbatched_bias=nonbatched_bias,
attn_mask=attn_mask, ... attn_mask=attn_mask,
has_gating=True, ... has_gating=True,
merge_qkv=True) ... merge_qkv=True)
print(output.shape) >>> print(output.shape)
# [2, 4, 2, 4] [2, 4, 2, 4]
""" """
if in_dynamic_mode(): if in_dynamic_mode():
......
...@@ -28,11 +28,11 @@ def fused_matmul_bias( ...@@ -28,11 +28,11 @@ def fused_matmul_bias(
Args: Args:
x (Tensor): the first input Tensor to be multiplied. x (Tensor): the first input Tensor to be multiplied.
y (Tensor): the second input Tensor to be multiplied. Its rank must be 2. y (Tensor): the second input Tensor to be multiplied. Its rank must be 2.
bias (Tensor|None): the input bias Tensor. If it is None, no bias addition would bias (Tensor, optional): the input bias Tensor. If it is None, no bias addition would
be performed. Otherwise, the bias is added to the matrix multiplication result. be performed. Otherwise, the bias is added to the matrix multiplication result. Default: None.
transpose_x (bool): Whether to transpose :math:`x` before multiplication. transpose_x (bool, optional): Whether to transpose :math:`x` before multiplication. Default: False.
transpose_y (bool): Whether to transpose :math:`y` before multiplication. transpose_y (bool, optional): Whether to transpose :math:`y` before multiplication. Default: False.
name(str|None): For detailed information, please refer to name (str, optional): For detailed information, please refer to
:ref:`api_guide_Name` . Usually name is no need to set and None by default. :ref:`api_guide_Name` . Usually name is no need to set and None by default.
Returns: Returns:
...@@ -41,15 +41,18 @@ def fused_matmul_bias( ...@@ -41,15 +41,18 @@ def fused_matmul_bias(
Examples: Examples:
.. code-block:: python .. code-block:: python
# required: gpu >>> # doctest: +SKIP('fused_gemm_epilogue is only supported when CUDA version >= 11.6')
import paddle >>> # doctest: +REQUIRES(env:GPU)
from paddle.incubate.nn.functional import fused_matmul_bias >>> import paddle
>>> from paddle.incubate.nn.functional import fused_matmul_bias
x = paddle.randn([3, 4])
y = paddle.randn([4, 5]) >>> paddle.set_device('gpu')
bias = paddle.randn([5]) >>> x = paddle.randn([3, 5])
out = fused_matmul_bias(x, y, bias) >>> y = paddle.randn([4, 5])
print(out.shape) # [3, 5] >>> bias = paddle.randn([5])
>>> out = fused_matmul_bias(x, y, bias)
>>> print(out.shape)
[3, 5]
""" """
if bias is None: if bias is None:
return matmul(x, y, transpose_x, transpose_y, name) return matmul(x, y, transpose_x, transpose_y, name)
...@@ -76,10 +79,10 @@ def fused_linear(x, weight, bias=None, transpose_weight=False, name=None): ...@@ -76,10 +79,10 @@ def fused_linear(x, weight, bias=None, transpose_weight=False, name=None):
Args: Args:
x (Tensor): the input Tensor to be multiplied. x (Tensor): the input Tensor to be multiplied.
weight (Tensor): the weight Tensor to be multiplied. Its rank must be 2. weight (Tensor): the weight Tensor to be multiplied. Its rank must be 2.
bias (Tensor|None): the input bias Tensor. If it is None, no bias addition would bias (Tensor, optional): the input bias Tensor. If it is None, no bias addition would
be performed. Otherwise, the bias is added to the matrix multiplication result. be performed. Otherwise, the bias is added to the matrix multiplication result. Default: None.
transpose_weight (bool): Whether to transpose :math:`weight` before multiplication. transpose_weight (bool, optional): Whether to transpose :math:`weight` before multiplication. Default: False.
name(str|None): For detailed information, please refer to name (str, optional): For detailed information, please refer to
:ref:`api_guide_Name` . Usually name is no need to set and None by default. :ref:`api_guide_Name` . Usually name is no need to set and None by default.
Returns: Returns:
...@@ -88,15 +91,18 @@ def fused_linear(x, weight, bias=None, transpose_weight=False, name=None): ...@@ -88,15 +91,18 @@ def fused_linear(x, weight, bias=None, transpose_weight=False, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
# required: gpu >>> # doctest: +SKIP('fused_gemm_epilogue is only supported when CUDA version >= 11.6')
import paddle >>> # doctest: +REQUIRES(env:GPU)
from paddle.incubate.nn.functional import fused_linear >>> import paddle
>>> from paddle.incubate.nn.functional import fused_linear
x = paddle.randn([3, 4])
weight = paddle.randn([4, 5]) >>> paddle.set_device('gpu')
bias = paddle.randn([5]) >>> x = paddle.randn([3, 4])
out = fused_linear(x, weight, bias) >>> weight = paddle.randn([4, 5])
print(out.shape) # [3, 5] >>> bias = paddle.randn([5])
>>> out = fused_linear(x, weight, bias)
>>> print(out.shape)
[3, 5]
""" """
return fused_matmul_bias(x, weight, bias, False, transpose_weight, name) return fused_matmul_bias(x, weight, bias, False, transpose_weight, name)
...@@ -109,25 +115,32 @@ def fused_linear_activation( ...@@ -109,25 +115,32 @@ def fused_linear_activation(
Args: Args:
x (Tensor): the input Tensor to be multiplied. x (Tensor): the input Tensor to be multiplied.
weight (Tensor): the weight Tensor to be multiplied. Its rank must be 2. y (Tensor): the weight Tensor to be multiplied. Its rank must be 2.
bias (Tensor): the input bias Tensor, the bias is added to the matrix multiplication result. bias (Tensor): the input bias Tensor, the bias is added to the matrix multiplication result.
transpose_weight (bool): Whether to transpose :math:`weight` before multiplication. trans_x (bool, optional): Whether to transpose :math:`x` before multiplication. Default: False.
activation(str|None): Activation function, Currently, the available activation functions are limited to "gelu" (Gaussian Error Linear Unit) and "relu" (Rectified Linear Unit). These activation functions are applied to the output of the bias add. trans_y (bool, optional): Whether to transpose :math:`y` before multiplication. Default: False.
activation (str, optional): Activation function, Currently, the available activation functions are
limited to "gelu" (Gaussian Error Linear Unit) and "relu" (Rectified Linear Unit).
These activation functions are applied to the output of the bias add. Default: None.
Returns: Returns:
Tensor: the output Tensor. Tensor: the output Tensor.
Examples: Examples:
.. code-block:: python .. code-block:: python
# required: gpu >>> # doctest: +SKIP('fused_gemm_epilogue is only supported when CUDA version >= 11.6')
import paddle >>> # doctest: +REQUIRES(env:GPU)
from paddle.incubate.nn.functional import fused_linear_activation >>> import paddle
>>> from paddle.incubate.nn.functional import fused_linear_activation
x = paddle.randn([3, 4])
weight = paddle.randn([4, 5]) >>> paddle.set_device('gpu')
bias = paddle.randn([5]) >>> x = paddle.randn([3, 4])
out = fused_linear_activation(x, weight, bias) >>> weight = paddle.randn([4, 5])
print(out.shape) # [3, 5] >>> bias = paddle.randn([5])
>>> out = fused_linear_activation(x, weight, bias)
>>> print(out.shape)
[3, 5]
""" """
if activation is None: if activation is None:
activation = "none" activation = "none"
......
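Since the docstring states that the activation is applied to the output of the bias add, a rough reference check is possible with ordinary ops. The sketch below assumes standard matmul + bias semantics and the default trans_x/trans_y; it is an illustration, not part of the documented example:

.. code-block:: python

    >>> # doctest: +SKIP('fused_gemm_epilogue is only supported when CUDA version >= 11.6')
    >>> # doctest: +REQUIRES(env:GPU)
    >>> import paddle
    >>> import paddle.nn.functional as F
    >>> from paddle.incubate.nn.functional import fused_linear_activation
    >>> paddle.set_device('gpu')
    >>> x = paddle.randn([3, 4])
    >>> weight = paddle.randn([4, 5])
    >>> bias = paddle.randn([5])
    >>> out = fused_linear_activation(x, weight, bias, activation="relu")
    >>> ref = F.relu(paddle.matmul(x, weight) + bias)  # expected to be numerically close to out
    >>> print(paddle.allclose(out, ref, atol=1e-5).item())
    True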
...@@ -44,14 +44,13 @@ def fused_rotary_position_embedding( ...@@ -44,14 +44,13 @@ def fused_rotary_position_embedding(
Examples: Examples:
.. code-block:: python .. code-block:: python
>>> # required: gpu
>>> # doctest: +REQUIRES(env:GPU) >>> # doctest: +REQUIRES(env:GPU)
>>> import paddle >>> import paddle
>>> from paddle.incubate.nn.functional import fused_rotary_position_embedding >>> from paddle.incubate.nn.functional import fused_rotary_position_embedding
>>> paddle.device.set_device('gpu') >>> paddle.set_device('gpu')
>>> # batch_size = 2 >>> # batch_size = 2
>>> # seq_len = 2 >>> # seq_len = 2
......
...@@ -45,44 +45,44 @@ def transpose(x, perm, name=None): ...@@ -45,44 +45,44 @@ def transpose(x, perm, name=None):
Args: Args:
x (Tensor): The input Tensor. It is a N-D Tensor of data types bool, float32, float64, int32. x (Tensor): The input Tensor. It is a N-D Tensor of data types bool, float32, float64, int32.
perm (list|tuple): Permute the input according to the data of perm. perm (list|tuple): Permute the input according to the data of perm.
name (str): The name of this layer. It is optional. name (str, optional): The name of this layer. For more information, please refer to :ref:`api_guide_Name`. Default is None.
Returns: Returns:
Tensor: A transposed n-D Tensor, with data type being bool, float32, float64, int32, int64. Tensor: A transposed n-D Tensor, with data type being bool, float32, float64, int32, int64.
For Example: Examples:
.. code-block:: text .. code-block:: text
x = [[[ 1 2 3 4] [ 5 6 7 8] [ 9 10 11 12]] x = [[[ 1 2 3 4] [ 5 6 7 8] [ 9 10 11 12]]
[[13 14 15 16] [17 18 19 20] [21 22 23 24]]] [[13 14 15 16] [17 18 19 20] [21 22 23 24]]]
shape(x) = [2,3,4] shape(x) = [2,3,4]
# Example 1 # Example 1
perm0 = [1,0,2] perm0 = [1,0,2]
y_perm0 = [[[ 1 2 3 4] [13 14 15 16]] y_perm0 = [[[ 1 2 3 4] [13 14 15 16]]
[[ 5 6 7 8] [17 18 19 20]] [[ 5 6 7 8] [17 18 19 20]]
[[ 9 10 11 12] [21 22 23 24]]] [[ 9 10 11 12] [21 22 23 24]]]
shape(y_perm0) = [3,2,4] shape(y_perm0) = [3,2,4]
# Example 2 # Example 2
perm1 = [2,1,0] perm1 = [2,1,0]
y_perm1 = [[[ 1 13] [ 5 17] [ 9 21]] y_perm1 = [[[ 1 13] [ 5 17] [ 9 21]]
[[ 2 14] [ 6 18] [10 22]] [[ 2 14] [ 6 18] [10 22]]
[[ 3 15] [ 7 19] [11 23]] [[ 3 15] [ 7 19] [11 23]]
[[ 4 16] [ 8 20] [12 24]]] [[ 4 16] [ 8 20] [12 24]]]
shape(y_perm1) = [4,3,2] shape(y_perm1) = [4,3,2]
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
x = paddle.randn([2, 3, 4]) >>> x = paddle.randn([2, 3, 4])
x_transposed = paddle.transpose(x, perm=[1, 0, 2]) >>> x_transposed = paddle.transpose(x, perm=[1, 0, 2])
print(x_transposed.shape) >>> print(x_transposed.shape)
# [3L, 2L, 4L] [3, 2, 4]
""" """
if in_dynamic_mode(): if in_dynamic_mode():
...@@ -180,10 +180,9 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None): ...@@ -180,10 +180,9 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None):
Args: Args:
x (Tensor): The input tensor which is a Tensor. x (Tensor): The input tensor which is a Tensor.
y (Tensor): The input tensor which is a Tensor. y (Tensor): The input tensor which is a Tensor.
transpose_x (bool, optional): Whether to transpose :math:`x` before multiplication. transpose_x (bool, optional): Whether to transpose :math:`x` before multiplication. Default is False.
transpose_y (bool, optional): Whether to transpose :math:`y` before multiplication. transpose_y (bool, optional): Whether to transpose :math:`y` before multiplication. Default is False.
name(str, optional): A name for this layer(optional). If set None, the layer name (str, optional): If set None, the layer will be named automatically. For more information, please refer to :ref:`api_guide_Name`. Default is None.
will be named automatically.
Returns: Returns:
Tensor: The output Tensor. Tensor: The output Tensor.
...@@ -192,42 +191,42 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None): ...@@ -192,42 +191,42 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None):
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
# vector * vector >>> # vector * vector
x = paddle.rand([10]) >>> x = paddle.rand([10])
y = paddle.rand([10]) >>> y = paddle.rand([10])
z = paddle.matmul(x, y) >>> z = paddle.matmul(x, y)
print(z.shape) >>> print(z.shape)
# () []
# matrix * vector >>> # matrix * vector
x = paddle.rand([10, 5]) >>> x = paddle.rand([10, 5])
y = paddle.rand([5]) >>> y = paddle.rand([5])
z = paddle.matmul(x, y) >>> z = paddle.matmul(x, y)
print(z.shape) >>> print(z.shape)
# (10,) [10]
# batched matrix * broadcasted vector >>> # batched matrix * broadcasted vector
x = paddle.rand([10, 5, 2]) >>> x = paddle.rand([10, 5, 2])
y = paddle.rand([2]) >>> y = paddle.rand([2])
z = paddle.matmul(x, y) >>> z = paddle.matmul(x, y)
print(z.shape) >>> print(z.shape)
# (10, 5) [10, 5]
# batched matrix * batched matrix >>> # batched matrix * batched matrix
x = paddle.rand([10, 5, 2]) >>> x = paddle.rand([10, 5, 2])
y = paddle.rand([10, 2, 5]) >>> y = paddle.rand([10, 2, 5])
z = paddle.matmul(x, y) >>> z = paddle.matmul(x, y)
print(z.shape) >>> print(z.shape)
# (10, 5, 5) [10, 5, 5]
# batched matrix * broadcasted matrix >>> # batched matrix * broadcasted matrix
x = paddle.rand([10, 1, 5, 2]) >>> x = paddle.rand([10, 1, 5, 2])
y = paddle.rand([1, 3, 2, 5]) >>> y = paddle.rand([1, 3, 2, 5])
z = paddle.matmul(x, y) >>> z = paddle.matmul(x, y)
print(z.shape) >>> print(z.shape)
# (10, 3, 5, 5) [10, 3, 5, 5]
""" """
if in_dynamic_mode(): if in_dynamic_mode():
...@@ -305,54 +304,61 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None): ...@@ -305,54 +304,61 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
x = paddle.arange(24, dtype="float32").reshape([2, 3, 4]) - 12 >>> x = paddle.arange(24, dtype="float32").reshape([2, 3, 4]) - 12
# x: Tensor(shape=[2, 3, 4], dtype=float32, place=Place(cpu), stop_gradient=True, >>> print(x)
# [[[-12., -11., -10., -9. ], Tensor(shape=[2, 3, 4], dtype=float32, place=Place(cpu), stop_gradient=True,
# [-8. , -7. , -6. , -5. ], [[[-12., -11., -10., -9. ],
# [-4. , -3. , -2. , -1. ]], [-8. , -7. , -6. , -5. ],
[-4. , -3. , -2. , -1. ]],
# [[ 0. , 1. , 2. , 3. ], [[ 0. , 1. , 2. , 3. ],
# [ 4. , 5. , 6. , 7. ], [ 4. , 5. , 6. , 7. ],
# [ 8. , 9. , 10., 11.]]]) [ 8. , 9. , 10., 11.]]])
# compute frobenius norm along last two dimensions. >>> # compute frobenius norm along last two dimensions.
out_fro = paddle.linalg.norm(x, p='fro', axis=[0,1]) >>> out_fro = paddle.linalg.norm(x, p='fro', axis=[0,1])
# out_fro: Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True, >>> print(out_fro)
# [17.43559647, 16.91153526, 16.73320007, 16.91153526]) Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True,
[17.43559647, 16.91153526, 16.73320007, 16.91153526])
# compute 2-order vector norm along last dimension.
out_pnorm = paddle.linalg.norm(x, p=2, axis=-1) >>> # compute 2-order vector norm along last dimension.
# out_pnorm: Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, >>> out_pnorm = paddle.linalg.norm(x, p=2, axis=-1)
# [[21.11871147, 13.19090557, 5.47722578 ], >>> print(out_pnorm)
# [3.74165750 , 11.22497177, 19.13112640]]) Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
[[21.11871147, 13.19090557, 5.47722578 ],
# compute 2-order norm along [0,1] dimension. [3.74165750 , 11.22497177, 19.13112640]])
out_pnorm = paddle.linalg.norm(x, p=2, axis=[0,1])
# out_pnorm: Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True, >>> # compute 2-order norm along [0,1] dimension.
# [17.43559647, 16.91153526, 16.73320007, 16.91153526]) >>> out_pnorm = paddle.linalg.norm(x, p=2, axis=[0,1])
>>> print(out_pnorm)
# compute inf-order norm Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True,
out_pnorm = paddle.linalg.norm(x, p=float("inf")) [17.43559647, 16.91153526, 16.73320007, 16.91153526])
# out_pnorm = Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# 12.) >>> # compute inf-order norm
>>> out_pnorm = paddle.linalg.norm(x, p=float("inf"))
out_pnorm = paddle.linalg.norm(x, p=float("inf"), axis=0) >>> print(out_pnorm)
# out_pnorm: Tensor(shape=[3, 4], dtype=float32, place=Place(cpu), stop_gradient=True, Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# [[12., 11., 10., 9. ], 12.)
# [8. , 7. , 6. , 7. ],
# [8. , 9. , 10., 11.]]) >>> out_pnorm = paddle.linalg.norm(x, p=float("inf"), axis=0)
>>> print(out_pnorm)
# compute -inf-order norm Tensor(shape=[3, 4], dtype=float32, place=Place(cpu), stop_gradient=True,
out_pnorm = paddle.linalg.norm(x, p=-float("inf")) [[12., 11., 10., 9. ],
# out_pnorm: Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True, [8. , 7. , 6. , 7. ],
# 0.) [8. , 9. , 10., 11.]])
out_pnorm = paddle.linalg.norm(x, p=-float("inf"), axis=0) >>> # compute -inf-order norm
# out_pnorm: Tensor(shape=[3, 4], dtype=float32, place=Place(cpu), stop_gradient=True, >>> out_pnorm = paddle.linalg.norm(x, p=-float("inf"))
# [[0., 1., 2., 3.], >>> print(out_pnorm)
# [4., 5., 6., 5.], Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# [4., 3., 2., 1.]]) 0.)
>>> out_pnorm = paddle.linalg.norm(x, p=-float("inf"), axis=0)
>>> print(out_pnorm)
Tensor(shape=[3, 4], dtype=float32, place=Place(cpu), stop_gradient=True,
[[0., 1., 2., 3.],
[4., 5., 6., 5.],
[4., 3., 2., 1.]])
""" """
def frobenius_norm(input, dim=None, keepdim=False, name=None): def frobenius_norm(input, dim=None, keepdim=False, name=None):
...@@ -360,8 +366,10 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None): ...@@ -360,8 +366,10 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None):
The frobenius norm OP is to calculate the frobenius norm of certain two dimensions of Tensor `input`. The frobenius norm OP is to calculate the frobenius norm of certain two dimensions of Tensor `input`.
Args: Args:
input (Variable): Tensor, data type float32, float64. input (Variable): Tensor, data type float32, float64.
dim (list, optional): None for last two dimensions. dim (list, optional): None for last two dimensions. Default None.
keepdim (bool, optional): Whether keep the dimensions as the `input`, Default False. keepdim (bool, optional): Whether to keep the dimensions as the `input`, Default False.
name (str, optional): The default value is None. Normally there is no need for
user to set this property. For more information, please refer to :ref:`api_guide_Name`.
""" """
if dim is not None and not (isinstance(dim, list) and len(dim) == 2): if dim is not None and not (isinstance(dim, list) and len(dim) == 2):
raise ValueError( raise ValueError(
...@@ -400,9 +408,12 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None): ...@@ -400,9 +408,12 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None):
Calculate the p-order vector norm for certain dimension of Tensor `input`. Calculate the p-order vector norm for certain dimension of Tensor `input`.
Args: Args:
input (Variable): Tensor, data type float32, float64. input (Variable): Tensor, data type float32, float64.
porder (float, optional): None for porder=2.0. porder (float, optional): None for porder=2.0. Default None.
axis (int, optional): None for last dimension. axis (int, optional): None for last dimension. Default None.
keepdim (bool, optional): Whether keep the dimensions as the `input`, Default False. keepdim (bool, optional): Whether to keep the dimensions as the `input`, Default False.
asvector (bool, optional): Whether to keep the result as a vector, Default False.
name (str, optional): The default value is None. Normally there is no need for
user to set this property. For more information, please refer to :ref:`api_guide_Name`.
""" """
if in_dynamic_mode(): if in_dynamic_mode():
if axis is None: if axis is None:
...@@ -682,21 +693,29 @@ def dist(x, y, p=2, name=None): ...@@ -682,21 +693,29 @@ def dist(x, y, p=2, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
x = paddle.to_tensor([[3, 3],[3, 3]], dtype="float32") >>> x = paddle.to_tensor([[3, 3],[3, 3]], dtype="float32")
y = paddle.to_tensor([[3, 3],[3, 1]], dtype="float32") >>> y = paddle.to_tensor([[3, 3],[3, 1]], dtype="float32")
out = paddle.dist(x, y, 0) >>> out = paddle.dist(x, y, 0)
print(out) # out = 1. >>> print(out)
Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
out = paddle.dist(x, y, 2) 1.)
print(out) # out = 2.
>>> out = paddle.dist(x, y, 2)
out = paddle.dist(x, y, float("inf")) >>> print(out)
print(out) # out = 2. Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
2.)
out = paddle.dist(x, y, float("-inf"))
print(out) # out = 0. >>> out = paddle.dist(x, y, float("inf"))
>>> print(out)
Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
2.)
>>> out = paddle.dist(x, y, float("-inf"))
>>> print(out)
Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
0.)
""" """
if in_dynamic_mode(): if in_dynamic_mode():
return _C_ops.dist(x, y, p) return _C_ops.dist(x, y, p)
...@@ -740,83 +759,95 @@ def cond(x, p=None, name=None): ...@@ -740,83 +759,95 @@ def cond(x, p=None, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
>>> paddle.seed(2023)
x = paddle.to_tensor([[1., 0, -1], [0, 1, 0], [1, 0, 1]]) >>> x = paddle.to_tensor([[1., 0, -1], [0, 1, 0], [1, 0, 1]])
# compute conditional number when p is None >>> # compute conditional number when p is None
out = paddle.linalg.cond(x) >>> out = paddle.linalg.cond(x)
# Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True, >>> print(out)
# 1.41421342) Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
1.41421378)
# compute conditional number when order of the norm is 'fro'
out_fro = paddle.linalg.cond(x, p='fro') >>> # compute conditional number when order of the norm is 'fro'
# Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True, >>> out_fro = paddle.linalg.cond(x, p='fro')
# 3.16227770) >>> print(out_fro)
Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# compute conditional number when order of the norm is 'nuc' 3.16227770)
out_nuc = paddle.linalg.cond(x, p='nuc')
# Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True, >>> # compute conditional number when order of the norm is 'nuc'
# 9.24263859) >>> out_nuc = paddle.linalg.cond(x, p='nuc')
>>> print(out_nuc)
# compute conditional number when order of the norm is 1 Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
out_1 = paddle.linalg.cond(x, p=1) 9.24264145)
# Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# 2.) >>> # compute conditional number when order of the norm is 1
>>> out_1 = paddle.linalg.cond(x, p=1)
# compute conditional number when order of the norm is -1 >>> print(out_1)
out_minus_1 = paddle.linalg.cond(x, p=-1) Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True, 2.)
# 1.)
>>> # compute conditional number when order of the norm is -1
# compute conditional number when order of the norm is 2 >>> out_minus_1 = paddle.linalg.cond(x, p=-1)
out_2 = paddle.linalg.cond(x, p=2) >>> print(out_minus_1)
# Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True, Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# 1.41421342) 1.)
# compute conditional number when order of the norm is -1 >>> # compute conditional number when order of the norm is 2
out_minus_2 = paddle.linalg.cond(x, p=-2) >>> out_2 = paddle.linalg.cond(x, p=2)
# Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True, >>> print(out_2)
# 0.70710683) Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
1.41421378)
# compute conditional number when order of the norm is inf >>> # compute conditional number when order of the norm is -2
out_inf = paddle.linalg.cond(x, p=float("inf")) >>> # compute conditional number when order of the norm is -1
# Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True, >>> out_minus_2 = paddle.linalg.cond(x, p=-2)
# 2.) >>> print(out_minus_2)
Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# compute conditional number when order of the norm is -inf 0.70710671)
out_minus_inf = paddle.linalg.cond(x, p=-float("inf"))
# Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True, >>> # compute conditional number when order of the norm is inf
# 1.) >>> out_inf = paddle.linalg.cond(x, p=float("inf"))
>>> print(out_inf)
a = paddle.randn([2, 4, 4]) Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# Tensor(shape=[2, 4, 4], dtype=float32, place=Place(gpu:0), stop_gradient=True, 2.)
# [[[-0.06784091, -0.07095790, 1.31792855, -0.58959651],
# [ 0.20818676, -0.85640615, -0.89998871, -1.47439921], >>> # compute conditional number when order of the norm is -inf
# [-0.49132481, 0.42250812, -0.77383220, -2.19794774], >>> out_minus_inf = paddle.linalg.cond(x, p=-float("inf"))
# [-0.33551720, -1.70003879, -1.09795380, -0.63737559]], >>> print(out_minus_inf)
Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
# [[ 1.12026262, -0.16119350, -1.21157813, 2.74383283], 1.)
# [-0.15999718, 0.18798758, -0.69392562, 1.35720372],
# [-0.53013402, -2.26304483, 1.40843511, -1.02288902], >>> a = paddle.randn([2, 4, 4])
# [ 0.69533503, 2.05261683, -0.02251151, -1.43127477]]]) >>> print(a)
Tensor(shape=[2, 4, 4], dtype=float32, place=Place(cpu), stop_gradient=True,
a_cond_fro = paddle.linalg.cond(a, p='fro') [[[ 0.06132207, 1.11349595, 0.41906244, -0.24858207],
# Tensor(shape=[2], dtype=float32, place=Place(gpu:0), stop_gradient=True, [-1.85169315, -1.50370061, 1.73954511, 0.13331604],
# [8.86691189 , 75.23817444]) [ 1.66359663, -0.55764782, -0.59911072, -0.57773495],
[-1.03176904, -0.33741450, -0.29695082, -1.50258386]],
b = paddle.randn([2, 3, 4]) [[ 0.67233968, -1.07747352, 0.80170447, -0.06695852],
# Tensor(shape=[2, 3, 4], dtype=float32, place=Place(gpu:0), stop_gradient=True, [-1.85003340, -0.23008066, 0.65083790, 0.75387722],
# [[[-0.43754861, 1.80796063, -0.78729683, -1.82264030], [ 0.61212337, -0.52664012, 0.19209868, -0.18707706],
# [-0.27670753, 0.06620564, 0.29072434, -0.31155765], [-0.00711021, 0.35236868, -0.40404350, 1.28656745]]])
# [ 0.34123746, -0.05444612, 0.05001324, -1.46877074]],
>>> a_cond_fro = paddle.linalg.cond(a, p='fro')
# [[-0.64331555, -1.51103854, -1.26277697, -0.68024760], >>> print(a_cond_fro)
# [ 2.59375715, -1.06665540, 0.96575671, -0.73330832], Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
# [-0.47064447, -0.23945692, -0.95150250, -1.07125998]]]) [6.37173700 , 35.15114594])
b_cond_2 = paddle.linalg.cond(b, p=2)
# Tensor(shape=[2], dtype=float32, place=Place(gpu:0), stop_gradient=True, >>> b = paddle.randn([2, 3, 4])
# [6.64228773, 3.89068866]) >>> print(b)
Tensor(shape=[2, 3, 4], dtype=float32, place=Place(cpu), stop_gradient=True,
[[[ 0.03306439, 0.70149767, 0.77064633, -0.55978841],
[-0.84461296, 0.99335045, -1.23486686, 0.59551388],
[-0.63035583, -0.98797107, 0.09410731, 0.47007179]],
[[ 0.85850012, -0.98949534, -1.63086998, 1.07340240],
[-0.05492965, 1.04750168, -2.33754158, 1.16518629],
[ 0.66847134, -1.05326962, -0.05703246, -0.48190674]]])
>>> b_cond_2 = paddle.linalg.cond(b, p=2)
>>> print(b_cond_2)
Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
[2.86566353, 6.85834455])
""" """
...@@ -1081,7 +1112,7 @@ def dot(x, y, name=None): ...@@ -1081,7 +1112,7 @@ def dot(x, y, name=None):
Parameters: Parameters:
x(Tensor): 1-D or 2-D ``Tensor``. Its dtype should be ``float32``, ``float64``, ``int32``, ``int64``, ``complex64``, ``complex128`` x(Tensor): 1-D or 2-D ``Tensor``. Its dtype should be ``float32``, ``float64``, ``int32``, ``int64``, ``complex64``, ``complex128``
y(Tensor): 1-D or 2-D ``Tensor``. Its dtype soulde be ``float32``, ``float64``, ``int32``, ``int64``, ``complex64``, ``complex128`` y(Tensor): 1-D or 2-D ``Tensor``. Its dtype should be ``float32``, ``float64``, ``int32``, ``int64``, ``complex64``, ``complex128``
name(str, optional): Name of the output. Default is None. It's used to print debug info for developers. Details: :ref:`api_guide_Name` name(str, optional): Name of the output. Default is None. It's used to print debug info for developers. Details: :ref:`api_guide_Name`
Returns: Returns:
...@@ -1089,21 +1120,25 @@ def dot(x, y, name=None): ...@@ -1089,21 +1120,25 @@ def dot(x, y, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
# 1-D Tensor * 1-D Tensor >>> # 1-D Tensor * 1-D Tensor
x = paddle.to_tensor([1, 2, 3]) >>> x = paddle.to_tensor([1, 2, 3])
y = paddle.to_tensor([4, 5, 6]) >>> y = paddle.to_tensor([4, 5, 6])
z = paddle.dot(x, y) >>> z = paddle.dot(x, y)
print(z) # 32 >>> print(z)
Tensor(shape=[], dtype=int64, place=Place(cpu), stop_gradient=True,
32)
# 2-D Tensor * 2-D Tensor >>> # 2-D Tensor * 2-D Tensor
x = paddle.to_tensor([[1, 2, 3], [2, 4, 6]]) >>> x = paddle.to_tensor([[1, 2, 3], [2, 4, 6]])
y = paddle.to_tensor([[4, 5, 6], [4, 5, 6]]) >>> y = paddle.to_tensor([[4, 5, 6], [4, 5, 6]])
z = paddle.dot(x, y) >>> z = paddle.dot(x, y)
print(z) # [32, 64] >>> print(z)
Tensor(shape=[2], dtype=int64, place=Place(cpu), stop_gradient=True,
[32, 64])
""" """
if in_dynamic_mode(): if in_dynamic_mode():
...@@ -1167,31 +1202,30 @@ def cov(x, rowvar=True, ddof=True, fweights=None, aweights=None, name=None): ...@@ -1167,31 +1202,30 @@ def cov(x, rowvar=True, ddof=True, fweights=None, aweights=None, name=None):
element Cij is the covariance of xi and xj. The element Cii is the variance of xi itself. element Cij is the covariance of xi and xj. The element Cii is the variance of xi itself.
Parameters: Parameters:
x(Tensor): A N-D(N<=2) Tensor containing multiple variables and observations. By default, each row of x represents a variable. Also see rowvar below. x (Tensor): A N-D(N<=2) Tensor containing multiple variables and observations. By default, each row of x represents a variable. Also see rowvar below.
rowvar(Bool, optional): If rowvar is True (default), then each row represents a variable, with observations in the columns. Default: True rowvar (Bool, optional): If rowvar is True (default), then each row represents a variable, with observations in the columns. Default: True.
ddof(Bool, optional): If ddof=True will return the unbiased estimate, and ddof=False will return the simple average. Default: True ddof (Bool, optional): If ddof=True will return the unbiased estimate, and ddof=False will return the simple average. Default: True.
fweights(Tensor, optional): 1-D Tensor of integer frequency weights; The number of times each observation vector should be repeated. Default: None fweights (Tensor, optional): 1-D Tensor of integer frequency weights; The number of times each observation vector should be repeated. Default: None.
aweights(Tensor, optional): 1-D Tensor of observation vector weights. How important of the observation vector, larger data means this element is more important. Default: None aweights (Tensor, optional): 1-D Tensor of observation vector weights. How important of the observation vector, larger data means this element is more important. Default: None.
name(str, optional): Name of the output. Default is None. It's used to print debug info for developers. Details: :ref:`api_guide_Name` name (str, optional): Name of the output. Default is None. It's used to print debug info for developers. Details: :ref:`api_guide_Name` .
Returns: Returns:
Tensor: The covariance matrix Tensor of the variables. Tensor: The covariance matrix Tensor of the variables.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle
xt = paddle.rand((3, 4)) >>> import paddle
paddle.linalg.cov(xt) >>> paddle.seed(2023)
''' >>> xt = paddle.rand((3, 4))
Tensor(shape=[3, 3], dtype=float64, place=CUDAPlace(0), stop_gradient=True, >>> paddle.linalg.cov(xt)
[[0.07918842, 0.06127326, 0.01493049], >>> print(xt)
[0.06127326, 0.06166256, 0.00302668], Tensor(shape=[3, 4], dtype=float32, place=Place(cpu), stop_gradient=True,
[0.01493049, 0.00302668, 0.01632146]]) [[0.86583614, 0.52014720, 0.25960937, 0.90525323],
''' [0.42400089, 0.40641287, 0.97020894, 0.74437362],
[0.51785129, 0.73292869, 0.97786582, 0.04315904]])
""" """
op_type = 'cov' op_type = 'cov'
if len(x.shape) > 2 or len(x.shape) < 1: if len(x.shape) > 2 or len(x.shape) < 1:
...@@ -1289,35 +1323,48 @@ def t(input, name=None): ...@@ -1289,35 +1323,48 @@ def t(input, name=None):
Args: Args:
input (Tensor): The input Tensor. It is a N-D (N<=2) Tensor of data types float32, float64, int32, int64. input (Tensor): The input Tensor. It is a N-D (N<=2) Tensor of data types float32, float64, int32, int64.
name(str, optional): The default value is None. Normally there is no need for name (str, optional): The default value is None. Normally there is no need for
user to set this property. For more information, please refer to :ref:`api_guide_Name` user to set this property. For more information, please refer to :ref:`api_guide_Name` .
Returns: Returns:
Tensor: A transposed n-D Tensor, with data type being float16, float32, float64, int32, int64. Tensor: A transposed n-D Tensor, with data type being float16, float32, float64, int32, int64.
Examples: Examples:
.. code-block:: python .. code-block:: python
:name: code-example :name: code-example
import paddle
>>> import paddle
# Example 1 (0-D tensor)
x = paddle.to_tensor([0.79]) >>> # Example 1 (0-D tensor)
paddle.t(x) # [0.79] >>> x = paddle.to_tensor([0.79])
>>> out = paddle.t(x)
# Example 2 (1-D tensor) >>> print(out)
x = paddle.to_tensor([0.79, 0.84, 0.32]) Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
paddle.t(x) # [0.79000002, 0.83999997, 0.31999999] [0.79000002])
paddle.t(x).shape # [3]
>>> # Example 2 (1-D tensor)
# Example 3 (2-D tensor) >>> x = paddle.to_tensor([0.79, 0.84, 0.32])
x = paddle.to_tensor([[0.79, 0.84, 0.32], >>> out2 = paddle.t(x)
[0.64, 0.14, 0.57]]) >>> print(out2)
x.shape # [2, 3] Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True,
paddle.t(x) [0.79000002, 0.83999997, 0.31999999])
# [[0.79000002, 0.63999999], >>> print(paddle.t(x).shape)
# [0.83999997, 0.14000000], [3]
# [0.31999999, 0.56999999]]
paddle.t(x).shape # [3, 2] >>> # Example 3 (2-D tensor)
>>> x = paddle.to_tensor([[0.79, 0.84, 0.32],
... [0.64, 0.14, 0.57]])
>>> print(x.shape)
[2, 3]
>>> out3 = paddle.t(x)
>>> print(out3)
Tensor(shape=[3, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
[[0.79000002, 0.63999999],
[0.83999997, 0.14000000],
[0.31999999, 0.56999999]])
>>> print(paddle.t(x).shape)
[3, 2]
""" """
if len(input.shape) > 2: if len(input.shape) > 2:
...@@ -1375,24 +1422,28 @@ def cross(x, y, axis=9, name=None): ...@@ -1375,24 +1422,28 @@ def cross(x, y, axis=9, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
x = paddle.to_tensor([[1.0, 1.0, 1.0], >>> x = paddle.to_tensor([[1.0, 1.0, 1.0],
[2.0, 2.0, 2.0], ... [2.0, 2.0, 2.0],
[3.0, 3.0, 3.0]]) ... [3.0, 3.0, 3.0]])
y = paddle.to_tensor([[1.0, 1.0, 1.0], >>> y = paddle.to_tensor([[1.0, 1.0, 1.0],
[1.0, 1.0, 1.0], ... [1.0, 1.0, 1.0],
[1.0, 1.0, 1.0]]) ... [1.0, 1.0, 1.0]])
...
z1 = paddle.cross(x, y) >>> z1 = paddle.cross(x, y)
# [[-1. -1. -1.] >>> print(z1)
# [ 2. 2. 2.] Tensor(shape=[3, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
# [-1. -1. -1.]] [[-1., -1., -1.],
[ 2., 2., 2.],
z2 = paddle.cross(x, y, axis=1) [-1., -1., -1.]])
# [[0. 0. 0.]
# [0. 0. 0.] >>> z2 = paddle.cross(x, y, axis=1)
# [0. 0. 0.]] >>> print(z2)
Tensor(shape=[3, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
[[0., 0., 0.],
[0., 0., 0.],
[0., 0., 0.]])
""" """
if in_dynamic_mode(): if in_dynamic_mode():
axis = K_DEFAULT_DIM if axis is None else axis axis = K_DEFAULT_DIM if axis is None else axis
...@@ -1439,7 +1490,7 @@ def cholesky(x, upper=False, name=None): ...@@ -1439,7 +1490,7 @@ def cholesky(x, upper=False, name=None):
where * is zero or more batch dimensions, and matrices on the where * is zero or more batch dimensions, and matrices on the
inner-most 2 dimensions all should be symmetric positive-definite. inner-most 2 dimensions all should be symmetric positive-definite.
Its data type should be float32 or float64. Its data type should be float32 or float64.
upper (bool): The flag indicating whether to return upper or lower upper (bool, optional): The flag indicating whether to return upper or lower
triangular matrices. Default: False. triangular matrices. Default: False.
name (str, optional): Name for the operation (optional, default is None). name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
...@@ -1451,14 +1502,19 @@ def cholesky(x, upper=False, name=None): ...@@ -1451,14 +1502,19 @@ def cholesky(x, upper=False, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
>>> paddle.seed(2023)
a = paddle.rand([3, 3], dtype="float32") >>> a = paddle.rand([3, 3], dtype="float32")
a_t = paddle.transpose(a, [1, 0]) >>> a_t = paddle.transpose(a, [1, 0])
x = paddle.matmul(a, a_t) + 1e-03 >>> x = paddle.matmul(a, a_t) + 1e-03
out = paddle.linalg.cholesky(x, upper=False) >>> out = paddle.linalg.cholesky(x, upper=False)
print(out) >>> print(out)
Tensor(shape=[3, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
[[1.04337072, 0. , 0. ],
[1.06467664, 0.17859250, 0. ],
[1.30602181, 0.08326444, 0.22790681]])
""" """
if in_dynamic_mode(): if in_dynamic_mode():
return _C_ops.cholesky(x, upper) return _C_ops.cholesky(x, upper)
...@@ -1486,12 +1542,12 @@ def matrix_rank(x, tol=None, hermitian=False, name=None): ...@@ -1486,12 +1542,12 @@ def matrix_rank(x, tol=None, hermitian=False, name=None):
Args: Args:
x (Tensor): The input tensor. Its shape should be `[..., m, n]`, where `...` is zero or more batch dimensions. If `x` is a batch x (Tensor): The input tensor. Its shape should be `[..., m, n]`, where `...` is zero or more batch dimensions. If `x` is a batch
of matrices then the output has the same batch dimensions. The data type of `x` should be float32 or float64. of matrices then the output has the same batch dimensions. The data type of `x` should be float32 or float64.
tol (float,Tensor,optional): the tolerance value. Default: None. If `tol` is not specified, and `sigma` is the largest tol (float|Tensor, optional): the tolerance value. If `tol` is not specified, and `sigma` is the largest singular value
singular value (or eigenvalues in absolute value), and `eps` is the epsilon value for the dtype of `x`, then `tol` is computed (or eigenvalues in absolute value), and `eps` is the epsilon value for the dtype of `x`, then `tol` is computed with formula
with formula `tol=sigma * max(m,n) * eps`. Note that if `x` is a batch of matrices, `tol` is computed this way for every batch. `tol=sigma * max(m,n) * eps`. Note that if `x` is a batch of matrices, `tol` is computed this way for every batch. Default: None.
hermitian (bool,optional): indicates whether `x` is Hermitian. Default: False. When hermitian=True, `x` is assumed to be Hermitian, hermitian (bool, optional): indicates whether `x` is Hermitian. Default: False. When hermitian=True, `x` is assumed to be Hermitian,
enabling a more efficient method for finding eigenvalues, but `x` is not checked inside the function. Instead, We just use enabling a more efficient method for finding eigenvalues, but `x` is not checked inside the function. Instead, We just use
the lower triangular of the matrix to compute. the lower triangular of the matrix to compute. Default: False.
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
Returns: Returns:
...@@ -1500,19 +1556,21 @@ def matrix_rank(x, tol=None, hermitian=False, name=None): ...@@ -1500,19 +1556,21 @@ def matrix_rank(x, tol=None, hermitian=False, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
a = paddle.eye(10) >>> a = paddle.eye(10)
b = paddle.linalg.matrix_rank(a) >>> b = paddle.linalg.matrix_rank(a)
print(b) >>> print(b)
# b = 10 Tensor(shape=[], dtype=int32, place=Place(cpu), stop_gradient=True,
10)
c = paddle.ones(shape=[3, 4, 5, 5]) >>> c = paddle.ones(shape=[3, 4, 5, 5])
d = paddle.linalg.matrix_rank(c, tol=0.01, hermitian=True) >>> d = paddle.linalg.matrix_rank(c, tol=0.01, hermitian=True)
print(d) >>> print(d)
# d = [[1, 1, 1, 1], Tensor(shape=[3, 4], dtype=int32, place=Place(cpu), stop_gradient=True,
# [1, 1, 1, 1], [[1, 1, 1, 1],
# [1, 1, 1, 1]] [1, 1, 1, 1],
[1, 1, 1, 1]])
""" """
if in_dynamic_mode(): if in_dynamic_mode():
...@@ -1567,13 +1625,13 @@ def bmm(x, y, name=None): ...@@ -1567,13 +1625,13 @@ def bmm(x, y, name=None):
Both of the two input tensors must be three-dementional and share the same batch size. Both of the two input tensors must be three-dimensional and share the same batch size.
if x is a (b, m, k) tensor, y is a (b, k, n) tensor, the output will be a (b, m, n) tensor. If x is a (b, m, k) tensor, y is a (b, k, n) tensor, the output will be a (b, m, n) tensor.
Args: Args:
x (Tensor): The input Tensor. x (Tensor): The input Tensor.
y (Tensor): The input Tensor. y (Tensor): The input Tensor.
name(str|None): A name for this layer(optional). If set None, the layer name (str|None): A name for this layer(optional). If set None, the layer
will be named automatically. will be named automatically. Default: None.
Returns: Returns:
Tensor: The product Tensor. Tensor: The product Tensor.
...@@ -1581,23 +1639,23 @@ def bmm(x, y, name=None): ...@@ -1581,23 +1639,23 @@ def bmm(x, y, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
# In imperative mode: >>> # In imperative mode:
# size x: (2, 2, 3) and y: (2, 3, 2) >>> # size x: (2, 2, 3) and y: (2, 3, 2)
x = paddle.to_tensor([[[1.0, 1.0, 1.0], >>> x = paddle.to_tensor([[[1.0, 1.0, 1.0],
[2.0, 2.0, 2.0]], ... [2.0, 2.0, 2.0]],
[[3.0, 3.0, 3.0], ... [[3.0, 3.0, 3.0],
[4.0, 4.0, 4.0]]]) ... [4.0, 4.0, 4.0]]])
y = paddle.to_tensor([[[1.0, 1.0],[2.0, 2.0],[3.0, 3.0]], >>> y = paddle.to_tensor([[[1.0, 1.0],[2.0, 2.0],[3.0, 3.0]],
[[4.0, 4.0],[5.0, 5.0],[6.0, 6.0]]]) ... [[4.0, 4.0],[5.0, 5.0],[6.0, 6.0]]])
out = paddle.bmm(x, y) >>> out = paddle.bmm(x, y)
# Tensor(shape=[2, 2, 2], dtype=float32, place=Place(cpu), stop_gradient=True, >>> print(out)
# [[[6. , 6. ], Tensor(shape=[2, 2, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
# [12., 12.]], [[[6. , 6. ],
[12., 12.]],
# [[45., 45.], [[45., 45.],
# [60., 60.]]]) [60., 60.]]])
""" """
if in_dynamic_mode(): if in_dynamic_mode():
...@@ -1639,9 +1697,9 @@ def histogram(input, bins=100, min=0, max=0, name=None): ...@@ -1639,9 +1697,9 @@ def histogram(input, bins=100, min=0, max=0, name=None):
Args: Args:
input (Tensor): A Tensor(or LoDTensor) with shape :math:`[N_1, N_2,..., N_k]` . The data type of the input Tensor input (Tensor): A Tensor(or LoDTensor) with shape :math:`[N_1, N_2,..., N_k]` . The data type of the input Tensor
should be float32, float64, int32, int64. should be float32, float64, int32, int64.
bins (int, optional): number of histogram bins. bins (int, optional): number of histogram bins. Default: 100.
min (int, optional): lower end of the range (inclusive). min (int, optional): lower end of the range (inclusive). Default: 0.
max (int, optional): upper end of the range (inclusive). max (int, optional): upper end of the range (inclusive). Default: 0.
name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
Returns: Returns:
...@@ -1650,11 +1708,13 @@ def histogram(input, bins=100, min=0, max=0, name=None): ...@@ -1650,11 +1708,13 @@ def histogram(input, bins=100, min=0, max=0, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
inputs = paddle.to_tensor([1, 2, 1]) >>> inputs = paddle.to_tensor([1, 2, 1])
result = paddle.histogram(inputs, bins=4, min=0, max=3) >>> result = paddle.histogram(inputs, bins=4, min=0, max=3)
print(result) # [0, 2, 1, 0] >>> print(result)
Tensor(shape=[4], dtype=int64, place=Place(cpu), stop_gradient=True,
[0, 2, 1, 0])
""" """
if in_dynamic_mode(): if in_dynamic_mode():
return _C_ops.histogram(input, bins, min, max) return _C_ops.histogram(input, bins, min, max)
...@@ -1681,8 +1741,8 @@ def bincount(x, weights=None, minlength=0, name=None): ...@@ -1681,8 +1741,8 @@ def bincount(x, weights=None, minlength=0, name=None):
x (Tensor): A Tensor with non-negative integer. Should be 1-D tensor. x (Tensor): A Tensor with non-negative integer. Should be 1-D tensor.
weights (Tensor, optional): Weight for each value in the input tensor. Should have the same shape as input. Default is None. weights (Tensor, optional): Weight for each value in the input tensor. Should have the same shape as input. Default is None.
minlength (int, optional): Minimum number of bins. Should be non-negative integer. Default is 0. minlength (int, optional): Minimum number of bins. Should be non-negative integer. Default is 0.
name(str, optional): The default value is None. Normally there is no need for user to set this name (str, optional): Normally there is no need for user to set this property.
property. For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`. Default is None.
Returns: Returns:
Tensor: The tensor of frequency. Tensor: The tensor of frequency.
...@@ -1690,15 +1750,19 @@ def bincount(x, weights=None, minlength=0, name=None): ...@@ -1690,15 +1750,19 @@ def bincount(x, weights=None, minlength=0, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
x = paddle.to_tensor([1, 2, 1, 4, 5]) >>> x = paddle.to_tensor([1, 2, 1, 4, 5])
result1 = paddle.bincount(x) >>> result1 = paddle.bincount(x)
print(result1) # [0, 2, 1, 0, 1, 1] >>> print(result1)
Tensor(shape=[6], dtype=int64, place=Place(cpu), stop_gradient=True,
[0, 2, 1, 0, 1, 1])
w = paddle.to_tensor([2.1, 0.4, 0.1, 0.5, 0.5]) >>> w = paddle.to_tensor([2.1, 0.4, 0.1, 0.5, 0.5])
result2 = paddle.bincount(x, weights=w) >>> result2 = paddle.bincount(x, weights=w)
print(result2) # [0., 2.19999981, 0.40000001, 0., 0.50000000, 0.50000000] >>> print(result2)
Tensor(shape=[6], dtype=float32, place=Place(cpu), stop_gradient=True,
[0. , 2.19999981, 0.40000001, 0. , 0.50000000, 0.50000000])
""" """
if x.dtype not in [paddle.int32, paddle.int64]: if x.dtype not in [paddle.int32, paddle.int64]:
raise TypeError("Elements in Input(x) should all be integers") raise TypeError("Elements in Input(x) should all be integers")
...@@ -1738,8 +1802,8 @@ def mv(x, vec, name=None): ...@@ -1738,8 +1802,8 @@ def mv(x, vec, name=None):
should be one of float32, float64. should be one of float32, float64.
vec (Tensor): A tensor with shape :math:`[N]` , The data type of the input Tensor x vec (Tensor): A tensor with shape :math:`[N]` , The data type of the input Tensor x
should be one of float32, float64. should be one of float32, float64.
name(str, optional): The default value is None. Normally there is no need for user to set this name (str, optional): Normally there is no need for user to set this property.
property. For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`. Default is None.
Returns: Returns:
Tensor: The tensor which is producted by x and vec. Tensor: The tensor which is producted by x and vec.
...@@ -1747,17 +1811,17 @@ def mv(x, vec, name=None): ...@@ -1747,17 +1811,17 @@ def mv(x, vec, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
# x: [M, N], vec: [N] >>> # x: [M, N], vec: [N]
# paddle.mv(x, vec) # out: [M] >>> # paddle.mv(x, vec) # out: [M]
import paddle >>> import paddle
x = paddle.to_tensor([[2, 1, 3], [3, 0, 1]]).astype("float64") >>> x = paddle.to_tensor([[2, 1, 3], [3, 0, 1]]).astype("float64")
vec = paddle.to_tensor([3, 5, 1]).astype("float64") >>> vec = paddle.to_tensor([3, 5, 1]).astype("float64")
out = paddle.mv(x, vec) >>> out = paddle.mv(x, vec)
print(out) >>> print(out)
# Tensor(shape=[2], dtype=float64, place=Place(cpu), stop_gradient=True, Tensor(shape=[2], dtype=float64, place=Place(cpu), stop_gradient=True,
# [14., 10.]) [14., 10.])
""" """
if in_dynamic_mode(): if in_dynamic_mode():
return _C_ops.mv(x, vec) return _C_ops.mv(x, vec)
...@@ -1803,8 +1867,8 @@ def det(x, name=None): ...@@ -1803,8 +1867,8 @@ def det(x, name=None):
x (Tensor): the input matrix of size `(n, n)` or the x (Tensor): the input matrix of size `(n, n)` or the
batch of matrices of size `(*, n, n)` where `*` is one or more batch of matrices of size `(*, n, n)` where `*` is one or more
batch dimensions. batch dimensions.
name(str, optional): Name of the output. Default is None. It's used name (str, optional): Name of the output. It's used to print debug info for
to print debug info for developers. Details: :ref:`api_guide_Name` developers. Details: :ref:`api_guide_Name`. Default is None.
Returns: Returns:
Tensor, the determinant value of a square matrix or batches of square matrices. Tensor, the determinant value of a square matrix or batches of square matrices.
...@@ -1812,15 +1876,13 @@ def det(x, name=None): ...@@ -1812,15 +1876,13 @@ def det(x, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
>>> paddle.seed(2023)
x = paddle.randn([3,3,3]) >>> x = paddle.randn([3,3,3])
>>> A = paddle.linalg.det(x)
A = paddle.linalg.det(x) >>> print(A)
Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True,
print(A) [-1.29280925, 0.77832544, 0.89754158])
# [ 0.02547996, 2.52317095, -6.15900707])
""" """
...@@ -1854,15 +1916,17 @@ def slogdet(x, name=None): ...@@ -1854,15 +1916,17 @@ def slogdet(x, name=None):
""" """
Calculates the sign and natural logarithm of the absolute value of a square matrix's or batches square matrices' determinant. Calculates the sign and natural logarithm of the absolute value of a square matrix's or batches square matrices' determinant.
The determinant can be computed with ``sign * exp`` (logabsdet) The determinant can be computed with ``sign * exp(logabsdet)``.
Supports input of float, double Supports input of float, double.
Note that for matrices that have zero determinant, this returns ``(0, -inf)`` Note that for matrices that have zero determinant, this returns ``(0, -inf)``.
Args: Args:
x (Tensor): the batch of matrices of size :math:`(*, n, n)` x (Tensor): the batch of matrices of size :math:`(*, n, n)`
where math:`*` is one or more batch dimensions. where :math:`*` is one or more batch dimensions.
name (str, optional): Name of the output. It's used to print debug info for
developers. Details: :ref:`api_guide_Name`. Default is None.
Returns: Returns:
y (Tensor), A tensor containing the sign of the determinant and the natural logarithm y (Tensor), A tensor containing the sign of the determinant and the natural logarithm
...@@ -1871,16 +1935,16 @@ def slogdet(x, name=None): ...@@ -1871,16 +1935,16 @@ def slogdet(x, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
>>> paddle.seed(2023)
x = paddle.randn([3,3,3]) >>> x = paddle.randn([3,3,3])
>>> A = paddle.linalg.slogdet(x)
A = paddle.linalg.slogdet(x) >>> print(A)
>>> # doctest: +SKIP
print(A) Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
[[-1. , 1. , 1. ],
# [[ 1. , 1. , -1. ], [ 0.25681755, -0.25061053, -0.10809582]])
# [-0.98610914, -0.43010661, -0.10872950]]) >>> # doctest: -SKIP
""" """
if in_dynamic_mode(): if in_dynamic_mode():
...@@ -1931,8 +1995,8 @@ def svd(x, full_matrices=False, name=None): ...@@ -1931,8 +1995,8 @@ def svd(x, full_matrices=False, name=None):
If full_matrices = False, svd op will use a economic method to store U and V. If full_matrices = False, svd op will use an economic method to store U and V.
which means shape of U is `[..., N, K]`, shape of V is `[..., M, K]`. K = min(M, N). which means shape of U is `[..., N, K]`, shape of V is `[..., M, K]`. K = min(M, N).
Default value is False. Default value is False.
name (str, optional): Name for the operation (optional, default is None). name (str, optional): Name for the operation. For more information,
For more information, please refer to :ref:`api_guide_Name`. please refer to :ref:`api_guide_Name`. Default value is None.
Returns: Returns:
- U (Tensor), is the singular value decomposition result U. - U (Tensor), is the singular value decomposition result U.
...@@ -1944,25 +2008,29 @@ def svd(x, full_matrices=False, name=None): ...@@ -1944,25 +2008,29 @@ def svd(x, full_matrices=False, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
x = paddle.to_tensor([[1.0, 2.0], [1.0, 3.0], [4.0, 6.0]]).astype('float64') >>> x = paddle.to_tensor([[1.0, 2.0], [1.0, 3.0], [4.0, 6.0]]).astype('float64')
x = x.reshape([3, 2]) >>> x = x.reshape([3, 2])
u, s, vh = paddle.linalg.svd(x) >>> u, s, vh = paddle.linalg.svd(x)
print (u) >>> print (u)
#U = [[ 0.27364809, -0.21695147 ], Tensor(shape=[3, 2], dtype=float64, place=Place(cpu), stop_gradient=True,
# [ 0.37892198, -0.87112408 ], [[-0.27364809, -0.21695147],
# [ 0.8840446 , 0.44053933 ]] [-0.37892198, -0.87112408],
[-0.88404460, 0.44053933]])
print (s)
#S = [8.14753743, 0.78589688] >>> print (s)
print (vh) Tensor(shape=[2], dtype=float64, place=Place(cpu), stop_gradient=True,
#VT= [[ 0.51411221, 0.85772294], [8.14753743, 0.78589688])
# [ 0.85772294, -0.51411221]]
>>> print (vh)
# one can verify : U * S * VT == X Tensor(shape=[2, 2], dtype=float64, place=Place(cpu), stop_gradient=True,
# U * UH == I [[-0.51411221, -0.85772294],
# V * VH == I [ 0.85772294, -0.51411221]])
>>> # one can verify : U * S * VT == X
>>> # U * UH == I
>>> # V * VH == I
""" """
if in_dynamic_mode(): if in_dynamic_mode():
...@@ -2002,8 +2070,9 @@ def pca_lowrank(x, q=None, center=True, niter=2, name=None): ...@@ -2002,8 +2070,9 @@ def pca_lowrank(x, q=None, center=True, niter=2, name=None):
Default value is :math:`q=min(6,N,M)`. Default value is :math:`q=min(6,N,M)`.
center (bool, optional): if True, center the input tensor. center (bool, optional): if True, center the input tensor.
Default value is True. Default value is True.
name (str, optional): Name for the operation (optional, default is None). niter (int, optional): number of iterations to perform. Default: 2.
For more information, please refer to :ref:`api_guide_Name`. name (str, optional): Name for the operation. For more information,
please refer to :ref:`api_guide_Name`. Default: None.
Returns: Returns:
- Tensor U, is N x q matrix. - Tensor U, is N x q matrix.
...@@ -2015,29 +2084,30 @@ def pca_lowrank(x, q=None, center=True, niter=2, name=None): ...@@ -2015,29 +2084,30 @@ def pca_lowrank(x, q=None, center=True, niter=2, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
>>> paddle.seed(2023)
x = paddle.randn((5, 5), dtype='float64')
U, S, V = paddle.linalg.pca_lowrank(x) >>> x = paddle.randn((5, 5), dtype='float64')
print(U) >>> U, S, V = paddle.linalg.pca_lowrank(x)
# Tensor(shape=[5, 5], dtype=float64, place=Place(gpu:0), stop_gradient=True, >>> print(U)
# [[ 0.41057070, 0.40364287, 0.59099574, -0.34529432, 0.44721360], Tensor(shape=[5, 5], dtype=float64, place=Place(cpu), stop_gradient=True,
# [-0.30243321, 0.55670611, -0.15025419, 0.61321785, 0.44721360], [[ 0.80131563, 0.11962647, 0.27667179, -0.25891214, 0.44721360],
# [ 0.57427340, -0.15936327, -0.66414981, -0.06097905, 0.44721360], [-0.12642301, 0.69917551, -0.17899393, 0.51296394, 0.44721360],
# [-0.63897516, -0.09968973, -0.17298615, -0.59316819, 0.44721360], [ 0.08997135, -0.69821706, -0.20059228, 0.51396579, 0.44721360],
# [-0.04343573, -0.70129598, 0.39639442, 0.38622370, 0.44721360]]) [-0.23871837, -0.02815453, -0.59888153, -0.61932365, 0.44721360],
[-0.52614559, -0.09243040, 0.70179595, -0.14869394, 0.44721360]])
print(S)
# Tensor(shape=[5], dtype=float64, place=Place(gpu:0), stop_gradient=True, >>> print(S)
# [3.33724265, 2.57573259, 1.69479048, 0.68069312, 0.00000000]) Tensor(shape=[5], dtype=float64, place=Place(cpu), stop_gradient=True,
[2.60101614, 2.40554940, 1.49768346, 0.19064830, 0.00000000])
print(V)
# Tensor(shape=[5, 5], dtype=float64, place=Place(gpu:0), stop_gradient=True, >>> print(V)
# [[ 0.09800724, -0.32627008, -0.23593953, 0.81840445, 0.39810690], Tensor(shape=[5, 5], dtype=float64, place=Place(cpu), stop_gradient=True,
# [-0.60100303, 0.63741176, -0.01953663, 0.09023999, 0.47326173], [[ 0.58339481, -0.17143771, 0.00522143, 0.57976310, 0.54231640],
# [ 0.25073864, -0.21305240, -0.32662950, -0.54786156, 0.69634740], [ 0.22334335, 0.72963474, -0.30148399, -0.39388750, 0.41438019],
# [ 0.33057205, 0.48282641, -0.75998527, 0.06744040, -0.27472705], [ 0.05416913, 0.34666487, 0.93549758, 0.00063507, 0.04162998],
# [ 0.67604895, 0.45688227, 0.50959437, 0.13179682, 0.23908071]]) [-0.39519094, 0.53074980, -0.16687419, 0.71175586, -0.16638919],
[-0.67131070, -0.19071018, 0.07795789, -0.04615811, 0.71046714]])
""" """
def conjugate(x): def conjugate(x):
...@@ -2172,25 +2242,28 @@ def matrix_power(x, n, name=None): ...@@ -2172,25 +2242,28 @@ def matrix_power(x, n, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
x = paddle.to_tensor([[1, 2, 3], >>> x = paddle.to_tensor([[1, 2, 3],
[1, 4, 9], ... [1, 4, 9],
[1, 8, 27]], dtype='float64') ... [1, 8, 27]], dtype='float64')
print(paddle.linalg.matrix_power(x, 2)) >>> print(paddle.linalg.matrix_power(x, 2))
# [[6. , 34. , 102.], Tensor(shape=[3, 3], dtype=float64, place=Place(cpu), stop_gradient=True,
# [14. , 90. , 282.], [[6. , 34. , 102.],
# [36. , 250., 804.]] [14. , 90. , 282.],
[36. , 250., 804.]])
print(paddle.linalg.matrix_power(x, 0))
# [[1., 0., 0.], >>> print(paddle.linalg.matrix_power(x, 0))
# [0., 1., 0.], Tensor(shape=[3, 3], dtype=float64, place=Place(cpu), stop_gradient=True,
# [0., 0., 1.]] [[1., 0., 0.],
[0., 1., 0.],
print(paddle.linalg.matrix_power(x, -2)) [0., 0., 1.]])
# [[ 12.91666667, -12.75000000, 2.83333333 ],
# [-7.66666667 , 8. , -1.83333333 ], >>> print(paddle.linalg.matrix_power(x, -2))
# [ 1.80555556 , -1.91666667 , 0.44444444 ]] Tensor(shape=[3, 3], dtype=float64, place=Place(cpu), stop_gradient=True,
[[ 12.91666667, -12.75000000, 2.83333333 ],
[-7.66666667 , 8. , -1.83333333 ],
[ 1.80555556 , -1.91666667 , 0.44444444 ]])
""" """
if in_dynamic_mode(): if in_dynamic_mode():
return _C_ops.matrix_power(x, n) return _C_ops.matrix_power(x, n)
...@@ -2218,14 +2291,14 @@ def qr(x, mode="reduced", name=None): ...@@ -2218,14 +2291,14 @@ def qr(x, mode="reduced", name=None):
x (Tensor): The input tensor. Its shape should be `[..., M, N]`, x (Tensor): The input tensor. Its shape should be `[..., M, N]`,
where ... is zero or more batch dimensions. M and N can be arbitrary where ... is zero or more batch dimensions. M and N can be arbitrary
positive number. The data type of x should be float32 or float64. positive number. The data type of x should be float32 or float64.
mode (str, optional): A flag to control the behavior of qr, the default is "reduced". mode (str, optional): A flag to control the behavior of qr.
Suppose x's shape is `[..., M, N]` and denoting `K = min(M, N)`: Suppose x's shape is `[..., M, N]` and denoting `K = min(M, N)`:
If mode = "reduced", qr op will return reduced Q and R matrices, If mode = "reduced", qr op will return reduced Q and R matrices,
which means Q's shape is `[..., M, K]` and R's shape is `[..., K, N]`. which means Q's shape is `[..., M, K]` and R's shape is `[..., K, N]`.
If mode = "complete", qr op will return complete Q and R matrices, If mode = "complete", qr op will return complete Q and R matrices,
which means Q's shape is `[..., M, M]` and R's shape is `[..., M, N]`. which means Q's shape is `[..., M, M]` and R's shape is `[..., M, N]`.
If mode = "r", qr op will only return reduced R matrix, which means If mode = "r", qr op will only return reduced R matrix, which means
R's shape is `[..., K, N]`. R's shape is `[..., K, N]`. Default: "reduced".
name (str, optional): Name for the operation (optional, default is None). name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
...@@ -2236,21 +2309,21 @@ def qr(x, mode="reduced", name=None): ...@@ -2236,21 +2309,21 @@ def qr(x, mode="reduced", name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
x = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]).astype('float64') >>> x = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]).astype('float64')
q, r = paddle.linalg.qr(x) >>> q, r = paddle.linalg.qr(x)
print (q) >>> print (q)
print (r) Tensor(shape=[3, 2], dtype=float64, place=Place(cpu), stop_gradient=True,
[[-0.16903085, 0.89708523],
# Q = [[-0.16903085, 0.89708523], [-0.50709255, 0.27602622],
# [-0.50709255, 0.27602622], [-0.84515425, -0.34503278]])
# [-0.84515425, -0.34503278]]) >>> print (r)
Tensor(shape=[2, 2], dtype=float64, place=Place(cpu), stop_gradient=True,
# R = [[-5.91607978, -7.43735744], [[-5.91607978, -7.43735744],
# [ 0. , 0.82807867]]) [ 0. , 0.82807867]])
# one can verify : X = Q * R ; >>> # one can verify : X = Q * R ;
""" """
if in_dynamic_mode(): if in_dynamic_mode():
q, r = _C_ops.qr(x, mode) q, r = _C_ops.qr(x, mode)
...@@ -2318,42 +2391,41 @@ def lu(x, pivot=True, get_infos=False, name=None): ...@@ -2318,42 +2391,41 @@ def lu(x, pivot=True, get_infos=False, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
x = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]).astype('float64') >>> x = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]).astype('float64')
lu,p,info = paddle.linalg.lu(x, get_infos=True) >>> lu,p,info = paddle.linalg.lu(x, get_infos=True)
# >>> lu: >>> print(lu)
# Tensor(shape=[3, 2], dtype=float64, place=CUDAPlace(0), stop_gradient=True, Tensor(shape=[3, 2], dtype=float64, place=Place(cpu), stop_gradient=True,
# [[5. , 6. ], [[5. , 6. ],
# [0.20000000, 0.80000000], [0.20000000, 0.80000000],
# [0.60000000, 0.50000000]]) [0.60000000, 0.50000000]])
# >>> p >>> print(p)
# Tensor(shape=[2], dtype=int32, place=CUDAPlace(0), stop_gradient=True, Tensor(shape=[2], dtype=int32, place=Place(cpu), stop_gradient=True,
# [3, 3]) [3, 3])
# >>> info >>> print(info)
# Tensor(shape=[], dtype=int32, place=CUDAPlace(0), stop_gradient=True, Tensor(shape=[1], dtype=int32, place=Place(cpu), stop_gradient=True,
# 0) [0])
P,L,U = paddle.linalg.lu_unpack(lu,p) >>> P,L,U = paddle.linalg.lu_unpack(lu,p)
# >>> P >>> print(P)
# (Tensor(shape=[3, 3], dtype=float64, place=CUDAPlace(0), stop_gradient=True, Tensor(shape=[3, 3], dtype=float64, place=Place(cpu), stop_gradient=True,
# [[0., 1., 0.], [[0., 1., 0.],
# [0., 0., 1.], [0., 0., 1.],
# [1., 0., 0.]]), [1., 0., 0.]])
# >>> L >>> print(L)
# Tensor(shape=[3, 2], dtype=float64, place=CUDAPlace(0), stop_gradient=True, Tensor(shape=[3, 2], dtype=float64, place=Place(cpu), stop_gradient=True,
# [[1. , 0. ], [[1. , 0. ],
# [0.20000000, 1. ], [0.20000000, 1. ],
# [0.60000000, 0.50000000]]), [0.60000000, 0.50000000]])
# >>> U >>> print(U)
# Tensor(shape=[2, 2], dtype=float64, place=CUDAPlace(0), stop_gradient=True, Tensor(shape=[2, 2], dtype=float64, place=Place(cpu), stop_gradient=True,
# [[5. , 6. ], [[5. , 6. ],
# [0. , 0.80000000]])) [0. , 0.80000000]])
>>> # one can verify : X = P @ L @ U ;
# one can verify : X = P @ L @ U ;
""" """
if in_dynamic_mode(): if in_dynamic_mode():
...@@ -2397,7 +2469,7 @@ def lu_unpack(x, y, unpack_ludata=True, unpack_pivots=True, name=None): ...@@ -2397,7 +2469,7 @@ def lu_unpack(x, y, unpack_ludata=True, unpack_pivots=True, name=None):
y (Tensor): Pivots get from paddle.linalg.lu. y (Tensor): Pivots get from paddle.linalg.lu.
unpack_ludata (bool,optional): whether to unpack L and U from x. Default: True. unpack_ludata (bool, optional): whether to unpack L and U from x. Default: True.
unpack_pivots (bool, optional): whether to unpack permutation matrix P from Pivtos. Default: True. unpack_pivots (bool, optional): whether to unpack permutation matrix P from Pivots. Default: True.
...@@ -2415,41 +2487,41 @@ def lu_unpack(x, y, unpack_ludata=True, unpack_pivots=True, name=None): ...@@ -2415,41 +2487,41 @@ def lu_unpack(x, y, unpack_ludata=True, unpack_pivots=True, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
x = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]).astype('float64') >>> x = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]).astype('float64')
lu,p,info = paddle.linalg.lu(x, get_infos=True) >>> lu,p,info = paddle.linalg.lu(x, get_infos=True)
# >>> lu: >>> print(lu)
# Tensor(shape=[3, 2], dtype=float64, place=CUDAPlace(0), stop_gradient=True, Tensor(shape=[3, 2], dtype=float64, place=Place(cpu), stop_gradient=True,
# [[5. , 6. ], [[5. , 6. ],
# [0.20000000, 0.80000000], [0.20000000, 0.80000000],
# [0.60000000, 0.50000000]]) [0.60000000, 0.50000000]])
# >>> p >>> print(p)
# Tensor(shape=[2], dtype=int32, place=CUDAPlace(0), stop_gradient=True, Tensor(shape=[2], dtype=int32, place=Place(cpu), stop_gradient=True,
# [3, 3]) [3, 3])
# >>> info >>> print(info)
# Tensor(shape=[], dtype=int32, place=CUDAPlace(0), stop_gradient=True, Tensor(shape=[1], dtype=int32, place=Place(cpu), stop_gradient=True,
# 0) [0])
P,L,U = paddle.linalg.lu_unpack(lu,p) >>> P,L,U = paddle.linalg.lu_unpack(lu,p)
# >>> P >>> print(P)
# (Tensor(shape=[3, 3], dtype=float64, place=CUDAPlace(0), stop_gradient=True, Tensor(shape=[3, 3], dtype=float64, place=Place(cpu), stop_gradient=True,
# [[0., 1., 0.], [[0., 1., 0.],
# [0., 0., 1.], [0., 0., 1.],
# [1., 0., 0.]]), [1., 0., 0.]])
# >>> L >>> print(L)
# Tensor(shape=[3, 2], dtype=float64, place=CUDAPlace(0), stop_gradient=True, Tensor(shape=[3, 2], dtype=float64, place=Place(cpu), stop_gradient=True,
# [[1. , 0. ], [[1. , 0. ],
# [0.20000000, 1. ], [0.20000000, 1. ],
# [0.60000000, 0.50000000]]), [0.60000000, 0.50000000]])
# >>> U >>> print(U)
# Tensor(shape=[2, 2], dtype=float64, place=CUDAPlace(0), stop_gradient=True, Tensor(shape=[2, 2], dtype=float64, place=Place(cpu), stop_gradient=True,
# [[5. , 6. ], [[5. , 6. ],
# [0. , 0.80000000]])) [0. , 0.80000000]])
# one can verify : X = P @ L @ U ; >>> # one can verify : X = P @ L @ U ;
""" """
if x.ndim < 2: if x.ndim < 2:
raise ValueError( raise ValueError(
...@@ -2507,27 +2579,25 @@ def eig(x, name=None): ...@@ -2507,27 +2579,25 @@ def eig(x, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
paddle.device.set_device("cpu") >>> x = paddle.to_tensor([[1.6707249, 7.2249975, 6.5045543],
... [9.956216, 8.749598, 6.066444 ],
x = paddle.to_tensor([[1.6707249, 7.2249975, 6.5045543], ... [4.4251957, 1.7983172, 0.370647 ]])
[9.956216, 8.749598, 6.066444 ], >>> w, v = paddle.linalg.eig(x)
[4.4251957, 1.7983172, 0.370647 ]]) >>> print(v)
w, v = paddle.linalg.eig(x) Tensor(shape=[3, 3], dtype=complex64, place=Place(cpu), stop_gradient=True,
print(v) [[ (0.5061365365982056+0j) , (0.7971761226654053+0j) ,
# Tensor(shape=[3, 3], dtype=complex128, place=CPUPlace, stop_gradient=False, (0.1851806491613388+0j) ],
# [[(-0.5061363550800655+0j) , (-0.7971760990842826+0j) , [ (0.8308236598968506+0j) , (-0.3463813066482544+0j) ,
# (0.18518077798279986+0j)], (-0.6837005615234375+0j) ],
# [(-0.8308237755993192+0j) , (0.3463813401919749+0j) , [ (0.23142573237419128+0j), (-0.49449989199638367+0j),
# (-0.6837005269141947+0j) ], (0.7058765292167664+0j) ]])
# [(-0.23142567697893396+0j), (0.4944999840400175+0j) ,
# (0.7058765252952796+0j) ]]) >>> print(w)
Tensor(shape=[3], dtype=complex64, place=Place(cpu), stop_gradient=True,
print(w) [ (16.50470733642578+0j) , (-5.503481388092041+0j) ,
# Tensor(shape=[3], dtype=complex128, place=CPUPlace, stop_gradient=False, (-0.21026138961315155+0j)])
# [ (16.50471283351188+0j) , (-5.5034820550763515+0j) ,
# (-0.21026087843552282+0j)])
""" """
if in_dynamic_mode(): if in_dynamic_mode():
...@@ -2570,18 +2640,20 @@ def eigvals(x, name=None): ...@@ -2570,18 +2640,20 @@ def eigvals(x, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
>>> paddle.seed(2023)
paddle.set_device("cpu") >>> x = paddle.rand(shape=[3, 3], dtype='float64')
paddle.seed(1234) >>> print(x)
Tensor(shape=[3, 3], dtype=float64, place=Place(cpu), stop_gradient=True,
[[0.86583615, 0.52014721, 0.25960938],
[0.90525323, 0.42400090, 0.40641288],
[0.97020893, 0.74437359, 0.51785128]])
x = paddle.rand(shape=[3, 3], dtype='float64') >>> print(paddle.linalg.eigvals(x))
# [[0.02773777, 0.93004224, 0.06911496], Tensor(shape=[3], dtype=complex128, place=Place(cpu), stop_gradient=True,
# [0.24831591, 0.45733623, 0.07717843], [ (1.788956694280852+0j) , (0.16364484879581526+0j),
# [0.48016702, 0.14235102, 0.42620817]]) (-0.14491322408727625+0j)])
print(paddle.linalg.eigvals(x))
# [(-0.27078833542132674+0j), (0.29962280156230725+0j), (0.8824477020120244+0j)] #complex128
""" """
x_shape = list(x.shape) x_shape = list(x.shape)
...@@ -2641,33 +2713,32 @@ def multi_dot(x, name=None): ...@@ -2641,33 +2713,32 @@ def multi_dot(x, name=None):
Args: Args:
x ([Tensor]): The input tensors which is a list Tensor. x ([Tensor]): The input tensors which is a list Tensor.
name(str|None): A name for this layer(optional). If set None, the layer name (str, optional): Name for the operation (optional, default is None).
will be named automatically. For more information, please refer to :ref:`api_guide_Name`.
Returns: Returns:
Tensor: The output Tensor. Tensor: The output Tensor.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
# A * B >>> # A * B
A = paddle.rand([3, 4]) >>> A = paddle.rand([3, 4])
B = paddle.rand([4, 5]) >>> B = paddle.rand([4, 5])
out = paddle.linalg.multi_dot([A, B]) >>> out = paddle.linalg.multi_dot([A, B])
print(out.shape) >>> print(out.shape)
# [3, 5] [3, 5]
# A * B * C >>> # A * B * C
A = paddle.rand([10, 5]) >>> A = paddle.rand([10, 5])
B = paddle.rand([5, 8]) >>> B = paddle.rand([5, 8])
C = paddle.rand([8, 7]) >>> C = paddle.rand([8, 7])
out = paddle.linalg.multi_dot([A, B, C]) >>> out = paddle.linalg.multi_dot([A, B, C])
print(out.shape) >>> print(out.shape)
# [10, 7] [10, 7]
""" """
if in_dynamic_mode(): if in_dynamic_mode():
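multi_dot changes only the order in which the chain is multiplied, not its value, so it should agree with a plain sequence of matmul calls. A minimal sketch of that equivalence:

    import paddle

    A = paddle.rand([10, 5])
    B = paddle.rand([5, 8])
    C = paddle.rand([8, 7])
    # Same product, evaluated left to right instead of in the cost-optimal order.
    chained = paddle.matmul(paddle.matmul(A, B), C)
    print(paddle.allclose(paddle.linalg.multi_dot([A, B, C]), chained))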
...@@ -2703,9 +2774,9 @@ def eigh(x, UPLO='L', name=None): ...@@ -2703,9 +2774,9 @@ def eigh(x, UPLO='L', name=None):
Args: Args:
x (Tensor): A tensor with shape :math:`[*, N, N]` , The data type of the input Tensor x x (Tensor): A tensor with shape :math:`[*, N, N]` , The data type of the input Tensor x
should be one of float32, float64, complex64, complex128. should be one of float32, float64, complex64, complex128.
UPLO(str, optional): (string, default 'L'), 'L' represents the lower triangular matrix, UPLO (str, optional): (string, default 'L'), 'L' represents the lower triangular matrix,
"'U' represents the upper triangular matrix.". "'U' represents the upper triangular matrix.". Default: 'L'.
name(str, optional): The default value is None. Normally there is no need for user to set this name (str, optional): The default value is None. Normally there is no need for user to set this
property. For more information, please refer to :ref:`api_guide_Name`. property. For more information, please refer to :ref:`api_guide_Name`.
Returns: Returns:
...@@ -2717,15 +2788,17 @@ def eigh(x, UPLO='L', name=None): ...@@ -2717,15 +2788,17 @@ def eigh(x, UPLO='L', name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
x = paddle.to_tensor([[1, -2j], [2j, 5]]) >>> x = paddle.to_tensor([[1, -2j], [2j, 5]])
out_value, out_vector = paddle.linalg.eigh(x, UPLO='L') >>> out_value, out_vector = paddle.linalg.eigh(x, UPLO='L')
print(out_value) >>> print(out_value)
#[0.17157288, 5.82842712] Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
print(out_vector) [0.17157286, 5.82842731])
#[(-0.9238795325112867+0j), (-0.3826834323650898+0j)], >>> print(out_vector)
#[ 0.3826834323650898j , -0.9238795325112867j ]] Tensor(shape=[2, 2], dtype=complex64, place=Place(cpu), stop_gradient=True,
[[(-0.9238795042037964+0j), (-0.3826833963394165+0j)],
[ 0.3826833963394165j , -0.9238795042037964j ]])
""" """
if in_dynamic_mode(): if in_dynamic_mode():
...@@ -2789,21 +2862,18 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None): ...@@ -2789,21 +2862,18 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None):
If x is hermitian or symmetric matrix, svd will be replaced with eigh. If x is hermitian or symmetric matrix, svd will be replaced with eigh.
Args: Args:
x(Tensor): The input tensor. Its shape should be (*, m, n) x (Tensor): The input tensor. Its shape should be (*, m, n)
where * is zero or more batch dimensions. m and n can be where * is zero or more batch dimensions. m and n can be
arbitrary positive number. The data type of x should be arbitrary positive number. The data type of x should be
float32 or float64 or complex64 or complex128. When data float32 or float64 or complex64 or complex128. When data
type is complex64 or complex128, hermitian should be set type is complex64 or complex128, hermitian should be set
True. True.
rcond (Tensor, optional): the tolerance value to determine
rcond(Tensor, optional): the tolerance value to determine
when is a singular value zero. Default:1e-15. when is a singular value zero. Default:1e-15.
hermitian (bool, optional): indicates whether x is Hermitian
hermitian(bool, optional): indicates whether x is Hermitian
if complex or symmetric if real. Default: False. if complex or symmetric if real. Default: False.
name (str, optional): The default value is None. Normally there is no need for user to set this
name(str|None): A name for this layer(optional). If set None, property. For more information, please refer to :ref:`api_guide_Name`.
the layer will be named automatically.
Returns: Returns:
Tensor: The tensor with same data type with x. it represents Tensor: The tensor with same data type with x. it represents
...@@ -2812,25 +2882,24 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None): ...@@ -2812,25 +2882,24 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
x = paddle.arange(15).reshape((3, 5)).astype('float64') >>> x = paddle.arange(15).reshape((3, 5)).astype('float64')
input = paddle.to_tensor(x) >>> input = paddle.to_tensor(x)
out = paddle.linalg.pinv(input) >>> out = paddle.linalg.pinv(input)
print(input) >>> print(input)
print(out) Tensor(shape=[3, 5], dtype=float64, place=Place(cpu), stop_gradient=True,
[[0. , 1. , 2. , 3. , 4. ],
# input: [5. , 6. , 7. , 8. , 9. ],
# [[0. , 1. , 2. , 3. , 4. ], [10., 11., 12., 13., 14.]])
# [5. , 6. , 7. , 8. , 9. ],
# [10., 11., 12., 13., 14.]] >>> print(out)
Tensor(shape=[5, 3], dtype=float64, place=Place(cpu), stop_gradient=True,
# out: [[-0.22666667, -0.06666667, 0.09333333],
# [[-0.22666667, -0.06666667, 0.09333333], [-0.12333333, -0.03333333, 0.05666667],
# [-0.12333333, -0.03333333, 0.05666667], [-0.02000000, -0.00000000, 0.02000000],
# [-0.02000000, 0.00000000, 0.02000000], [ 0.08333333, 0.03333333, -0.01666667],
# [ 0.08333333, 0.03333333, -0.01666667], [ 0.18666667, 0.06666667, -0.05333333]])
# [ 0.18666667, 0.06666667, -0.05333333]]
# one can verify : x * out * x = x ; # one can verify : x * out * x = x ;
# or out * x * out = x ; # or out * x * out = x ;
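The trailing comments state the Moore-Penrose conditions; a minimal sketch that checks both of them on the example input, using paddle.allclose with its default tolerances:

    import paddle

    x = paddle.arange(15).reshape((3, 5)).astype('float64')
    out = paddle.linalg.pinv(x)
    # First condition: x @ pinv(x) @ x == x (up to round-off).
    print(paddle.allclose(paddle.matmul(paddle.matmul(x, out), x), x))
    # Second condition: pinv(x) @ x @ pinv(x) == pinv(x).
    print(paddle.allclose(paddle.matmul(paddle.matmul(out, x), out), out))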
...@@ -3034,7 +3103,7 @@ def solve(x, y, name=None): ...@@ -3034,7 +3103,7 @@ def solve(x, y, name=None):
more batch dimensions. Its data type should be float32 or float64. more batch dimensions. Its data type should be float32 or float64.
y (Tensor): A vector/matrix or a batch of vectors/matrices. Its shape should be ``[*, M, K]``, where ``*`` is zero or y (Tensor): A vector/matrix or a batch of vectors/matrices. Its shape should be ``[*, M, K]``, where ``*`` is zero or
more batch dimensions. Its data type should be float32 or float64. more batch dimensions. Its data type should be float32 or float64.
name(str, optional): Name for the operation (optional, default is None). name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
Returns: Returns:
...@@ -3045,18 +3114,19 @@ def solve(x, y, name=None): ...@@ -3045,18 +3114,19 @@ def solve(x, y, name=None):
.. code-block:: python .. code-block:: python
# a square system of linear equations: >>> # a square system of linear equations:
# 3*X0 + X1 = 9 >>> # 3*X0 + X1 = 9
# X0 + 2*X1 = 8 >>> # X0 + 2*X1 = 8
import paddle >>> import paddle
x = paddle.to_tensor([[3, 1],[1, 2]], dtype="float64") >>> x = paddle.to_tensor([[3, 1],[1, 2]], dtype="float64")
y = paddle.to_tensor([9, 8], dtype="float64") >>> y = paddle.to_tensor([9, 8], dtype="float64")
out = paddle.linalg.solve(x, y) >>> out = paddle.linalg.solve(x, y)
print(out) >>> print(out)
# [2., 3.]) Tensor(shape=[2], dtype=float64, place=Place(cpu), stop_gradient=True,
[2., 3.])
""" """
if in_dynamic_mode(): if in_dynamic_mode():
return _C_ops.solve(x, y) return _C_ops.solve(x, y)
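A quick back-substitution check for the example above: multiplying the coefficient matrix by the returned solution should reproduce y. A minimal sketch:

    import paddle

    x = paddle.to_tensor([[3, 1], [1, 2]], dtype="float64")
    y = paddle.to_tensor([9, 8], dtype="float64")
    out = paddle.linalg.solve(x, y)
    # 3*2 + 1*3 = 9 and 1*2 + 2*3 = 8, so x @ out should match y.
    print(paddle.allclose(paddle.matmul(x, out), y))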
...@@ -3077,7 +3147,7 @@ def triangular_solve( ...@@ -3077,7 +3147,7 @@ def triangular_solve(
x, y, upper=True, transpose=False, unitriangular=False, name=None x, y, upper=True, transpose=False, unitriangular=False, name=None
): ):
r""" r"""
Computes the solution of a system of equations with a triangular coefficient matrix. `x` is the coefficient matrix, Computes the solution of a system of equations with a triangular coefficient matrix. `x` is the coefficient matrix,
`y` holds the multiple right-hand sides of the equations. `y` holds the multiple right-hand sides of the equations.
Inputs `x` and `y` are 2D matrices or batches of 2D matrices. If the inputs are batches, the outputs are also Inputs `x` and `y` are 2D matrices or batches of 2D matrices. If the inputs are batches, the outputs are also
...@@ -3103,7 +3173,7 @@ def triangular_solve( ...@@ -3103,7 +3173,7 @@ def triangular_solve(
transpose (bool, optional): whether `x` should be transposed before calculation. Default: False. transpose (bool, optional): whether `x` should be transposed before calculation. Default: False.
unitriangular (bool, optional): whether `x` is unit triangular. If True, the diagonal elements of `x` are assumed unitriangular (bool, optional): whether `x` is unit triangular. If True, the diagonal elements of `x` are assumed
to be 1 and not referenced from `x` . Default: False. to be 1 and not referenced from `x` . Default: False.
name(str, optional): Name for the operation (optional, default is None). name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
Returns: Returns:
...@@ -3112,20 +3182,23 @@ def triangular_solve( ...@@ -3112,20 +3182,23 @@ def triangular_solve(
Examples: Examples:
.. code-block:: python .. code-block:: python
# a square system of linear equations: >>> # a square system of linear equations:
# x1 + x2 + x3 = 0 >>> # x1 + x2 + x3 = 0
# 2*x2 + x3 = -9 >>> # 2*x2 + x3 = -9
# -x3 = 5 >>> # -x3 = 5
import paddle >>> import paddle
x = paddle.to_tensor([[1, 1, 1], >>> x = paddle.to_tensor([[1, 1, 1],
[0, 2, 1], ... [0, 2, 1],
[0, 0,-1]], dtype="float64") ... [0, 0,-1]], dtype="float64")
y = paddle.to_tensor([[0], [-9], [5]], dtype="float64") >>> y = paddle.to_tensor([[0], [-9], [5]], dtype="float64")
out = paddle.linalg.triangular_solve(x, y, upper=True) >>> out = paddle.linalg.triangular_solve(x, y, upper=True)
print(out) >>> print(out)
# [7, -2, -5] Tensor(shape=[3, 1], dtype=float64, place=Place(cpu), stop_gradient=True,
[[ 7.],
[-2.],
[-5.]])
""" """
if in_dynamic_mode(): if in_dynamic_mode():
return _C_ops.triangular_solve(x, y, upper, transpose, unitriangular) return _C_ops.triangular_solve(x, y, upper, transpose, unitriangular)
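The same back-substitution check applies to the triangular system in the example. A minimal sketch:

    import paddle

    x = paddle.to_tensor([[1, 1, 1],
                          [0, 2, 1],
                          [0, 0, -1]], dtype="float64")
    y = paddle.to_tensor([[0], [-9], [5]], dtype="float64")
    out = paddle.linalg.triangular_solve(x, y, upper=True)
    # out is [[7], [-2], [-5]]; multiplying back should reproduce y.
    print(paddle.allclose(paddle.matmul(x, out), y))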
...@@ -3166,7 +3239,7 @@ def cholesky_solve(x, y, upper=False, name=None): ...@@ -3166,7 +3239,7 @@ def cholesky_solve(x, y, upper=False, name=None):
y (Tensor): The input matrix which is upper or lower triangular Cholesky factor of square matrix A. Its shape should be `[*, M, M]`, where `*` is zero or y (Tensor): The input matrix which is upper or lower triangular Cholesky factor of square matrix A. Its shape should be `[*, M, M]`, where `*` is zero or
more batch dimensions. Its data type should be float32 or float64. more batch dimensions. Its data type should be float32 or float64.
upper (bool, optional): whether to consider the Cholesky factor as a lower or upper triangular matrix. Default: False. upper (bool, optional): whether to consider the Cholesky factor as a lower or upper triangular matrix. Default: False.
name(str, optional): Name for the operation (optional, default is None). name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
Returns: Returns:
...@@ -3175,16 +3248,19 @@ def cholesky_solve(x, y, upper=False, name=None): ...@@ -3175,16 +3248,19 @@ def cholesky_solve(x, y, upper=False, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
u = paddle.to_tensor([[1, 1, 1], >>> u = paddle.to_tensor([[1, 1, 1],
[0, 2, 1], ... [0, 2, 1],
[0, 0,-1]], dtype="float64") ... [0, 0,-1]], dtype="float64")
b = paddle.to_tensor([[0], [-9], [5]], dtype="float64") >>> b = paddle.to_tensor([[0], [-9], [5]], dtype="float64")
out = paddle.linalg.cholesky_solve(b, u, upper=True) >>> out = paddle.linalg.cholesky_solve(b, u, upper=True)
print(out) >>> print(out)
# [-2.5, -7, 9.5] Tensor(shape=[3, 1], dtype=float64, place=Place(cpu), stop_gradient=True,
[[-2.50000000],
[-7. ],
[ 9.50000000]])
""" """
if in_dynamic_mode(): if in_dynamic_mode():
return _C_ops.cholesky_solve(x, y, upper) return _C_ops.cholesky_solve(x, y, upper)
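For the example values above, cholesky_solve(b, u, upper=True) returns the solution of A @ out = b with A = u^T @ u. A minimal sketch of that check (the transpose is taken with matmul's transpose_x flag):

    import paddle

    u = paddle.to_tensor([[1, 1, 1],
                          [0, 2, 1],
                          [0, 0, -1]], dtype="float64")
    b = paddle.to_tensor([[0], [-9], [5]], dtype="float64")
    out = paddle.linalg.cholesky_solve(b, u, upper=True)
    # Assumes that with upper=True the factorized matrix is A = u^T @ u,
    # which the example output [-2.5, -7, 9.5] is consistent with.
    A = paddle.matmul(u, u, transpose_x=True)
    print(paddle.allclose(paddle.matmul(A, out), b))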
...@@ -3225,13 +3301,13 @@ def eigvalsh(x, UPLO='L', name=None): ...@@ -3225,13 +3301,13 @@ def eigvalsh(x, UPLO='L', name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
x = paddle.to_tensor([[1, -2j], [2j, 5]]) >>> x = paddle.to_tensor([[1, -2j], [2j, 5]])
out_value = paddle.eigvalsh(x, UPLO='L') >>> out_value = paddle.eigvalsh(x, UPLO='L')
print(out_value) >>> print(out_value)
# Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True, Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
# [0.17157286, 5.82842731]) [0.17157286, 5.82842731])
""" """
if in_dynamic_mode(): if in_dynamic_mode():
values, _ = _C_ops.eigvalsh(x, UPLO, x.stop_gradient) values, _ = _C_ops.eigvalsh(x, UPLO, x.stop_gradient)
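The example matrix is the Hermitian matrix already used for eigh above, so the eigenvalues returned by the two APIs should coincide. A small cross-check sketch:

    import paddle

    x = paddle.to_tensor([[1, -2j], [2j, 5]])
    values_only = paddle.eigvalsh(x, UPLO='L')
    values_full, _ = paddle.linalg.eigh(x, UPLO='L')
    # Both should report [0.17157286, 5.82842731] for this matrix.
    print(paddle.allclose(values_only, values_full))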
...@@ -3312,31 +3388,36 @@ def lstsq(x, y, rcond=None, driver=None, name=None): ...@@ -3312,31 +3388,36 @@ def lstsq(x, y, rcond=None, driver=None, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
paddle.set_device("cpu") >>> x = paddle.to_tensor([[1, 3], [3, 2], [5, 6.]])
x = paddle.to_tensor([[1, 3], [3, 2], [5, 6.]]) >>> y = paddle.to_tensor([[3, 4, 6], [5, 3, 4], [1, 2, 1.]])
y = paddle.to_tensor([[3, 4, 6], [5, 3, 4], [1, 2, 1.]]) >>> results = paddle.linalg.lstsq(x, y, driver="gelsd")
results = paddle.linalg.lstsq(x, y, driver="gelsd") >>> print(results[0])
print(results[0]) Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
# [[ 0.78350395, -0.22165027, -0.62371236], [[ 0.78350395, -0.22165027, -0.62371236],
# [-0.11340097, 0.78866047, 1.14948535]] [-0.11340097, 0.78866047, 1.14948535]])
print(results[1]) >>> print(results[1])
# [19.81443405, 10.43814468, 30.56185532]) Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True,
print(results[2]) [19.81443405, 10.43814468, 30.56185532])
# 2 >>> print(results[2])
print(results[3]) Tensor(shape=[], dtype=int32, place=Place(cpu), stop_gradient=True,
# [9.03455734, 1.54167950] 2)
>>> print(results[3])
x = paddle.to_tensor([[10, 2, 3], [3, 10, 5], [5, 6, 12.]]) Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
y = paddle.to_tensor([[4, 2, 9], [2, 0, 3], [2, 5, 3.]]) [9.03455734, 1.54167950])
results = paddle.linalg.lstsq(x, y, driver="gels")
print(results[0]) >>> x = paddle.to_tensor([[10, 2, 3], [3, 10, 5], [5, 6, 12.]])
# [[ 0.39386186, 0.10230173, 0.93606132], >>> y = paddle.to_tensor([[4, 2, 9], [2, 0, 3], [2, 5, 3.]])
# [ 0.10741687, -0.29028133, 0.11892585], >>> results = paddle.linalg.lstsq(x, y, driver="gels")
# [-0.05115091, 0.51918161, -0.19948854]] >>> print(results[0])
print(results[1]) Tensor(shape=[3, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
# [] [[ 0.39386186, 0.10230169, 0.93606132],
[ 0.10741688, -0.29028130, 0.11892584],
[-0.05115093, 0.51918161, -0.19948851]])
>>> print(results[1])
Tensor(shape=[0], dtype=float32, place=Place(cpu), stop_gradient=True,
[])
""" """
device = paddle.get_device() device = paddle.get_device()
if device == "cpu": if device == "cpu":
...@@ -3456,11 +3537,11 @@ def corrcoef(x, rowvar=True, name=None): ...@@ -3456,11 +3537,11 @@ def corrcoef(x, rowvar=True, name=None):
The values of `R` are between -1 and 1. The values of `R` are between -1 and 1.
Parameters: Args:
x(Tensor): A N-D(N<=2) Tensor containing multiple variables and observations. By default, each row of x represents a variable. Also see rowvar below. x (Tensor): A N-D(N<=2) Tensor containing multiple variables and observations. By default, each row of x represents a variable. Also see rowvar below.
rowvar(Bool, optional): If rowvar is True (default), then each row represents a variable, with observations in the columns. Default: True. rowvar (bool, optional): If rowvar is True (default), then each row represents a variable, with observations in the columns. Default: True.
name(str, optional): Name of the output. Default is None. It's used to print debug info for developers. Details: :ref:`api_guide_Name`. name (str, optional): Name of the output. It's used to print debug info for developers. Details: :ref:`api_guide_Name`. Default: None.
Returns: Returns:
...@@ -3469,15 +3550,15 @@ def corrcoef(x, rowvar=True, name=None): ...@@ -3469,15 +3550,15 @@ def corrcoef(x, rowvar=True, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
>>> paddle.seed(2023)
xt = paddle.rand((3,4))
print(paddle.linalg.corrcoef(xt))
# Tensor(shape=[3, 3], dtype=float32, place=Place(cpu), stop_gradient=True, >>> xt = paddle.rand((3,4))
# [[ 1. , -0.73702252, 0.66228950], >>> print(paddle.linalg.corrcoef(xt))
# [-0.73702258, 1. , -0.77104872], Tensor(shape=[3, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
# [ 0.66228974, -0.77104825, 1. ]]) [[ 0.99999988, -0.47689581, -0.89559376],
[-0.47689593, 1. , 0.16345492],
[-0.89559382, 0.16345496, 1. ]])
""" """
if len(x.shape) > 2 or len(x.shape) < 1: if len(x.shape) > 2 or len(x.shape) < 1:
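A correlation matrix is symmetric and has ones on its diagonal, up to the float32 round-off visible in the printed values. A minimal sketch of both properties:

    import paddle

    paddle.seed(2023)
    xt = paddle.rand((3, 4))
    R = paddle.linalg.corrcoef(xt)
    # Symmetry and unit diagonal, checked with the default allclose tolerances.
    print(paddle.allclose(R, paddle.transpose(R, perm=[1, 0])))
    print(paddle.allclose(paddle.diag(R), paddle.ones([3])))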
...@@ -3545,13 +3626,15 @@ def cdist( ...@@ -3545,13 +3626,15 @@ def cdist(
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
x = paddle.to_tensor([[0.9041, 0.0196], [-0.3108, -2.4423], [-0.4821, 1.059]], dtype=paddle.float32) >>> x = paddle.to_tensor([[0.9041, 0.0196], [-0.3108, -2.4423], [-0.4821, 1.059]], dtype=paddle.float32)
y = paddle.to_tensor([[-2.1763, -0.4713], [-0.6986, 1.3702]], dtype=paddle.float32) >>> y = paddle.to_tensor([[-2.1763, -0.4713], [-0.6986, 1.3702]], dtype=paddle.float32)
distance = paddle.cdist(x, y) >>> distance = paddle.cdist(x, y)
print(distance) >>> print(distance)
# Tensor(shape=[3, 2], dtype=float32, place=Place(gpu:0), stop_gradient=True, Tensor(shape=[3, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
# [[3.1193, 2.0959], [2.7138, 3.8322], [2.2830, 0.3791]]) [[3.11927032, 2.09589314],
[2.71384072, 3.83217239],
[2.28300953, 0.37910119]])
""" """
check_variable_and_dtype(x, 'x', ('float32', 'float64'), 'cdist') check_variable_and_dtype(x, 'x', ('float32', 'float64'), 'cdist')
......
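The distances printed in the cdist example are consistent with the Euclidean (p = 2) metric, so the result can be cross-checked against an explicit norm over broadcasted differences. A minimal sketch, assuming the default metric is indeed p = 2:

    import paddle

    x = paddle.to_tensor([[0.9041, 0.0196], [-0.3108, -2.4423], [-0.4821, 1.059]], dtype=paddle.float32)
    y = paddle.to_tensor([[-2.1763, -0.4713], [-0.6986, 1.3702]], dtype=paddle.float32)
    # distance[i, j] should equal the Euclidean norm of x[i] - y[j]
    # (assumes cdist defaults to the p=2 metric).
    manual = paddle.linalg.norm(x.unsqueeze(1) - y.unsqueeze(0), p=2, axis=-1)
    print(paddle.allclose(paddle.cdist(x, y), manual))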
...@@ -129,12 +129,15 @@ def logical_and(x, y, out=None, name=None): ...@@ -129,12 +129,15 @@ def logical_and(x, y, out=None, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
>>> x = paddle.to_tensor([True])
>>> y = paddle.to_tensor([True, False, True, False])
>>> res = paddle.logical_and(x, y)
>>> print(res)
Tensor(shape=[4], dtype=bool, place=Place(cpu), stop_gradient=True,
[True , False, True , False])
x = paddle.to_tensor([True])
y = paddle.to_tensor([True, False, True, False])
res = paddle.logical_and(x, y)
print(res) # [True False True False]
""" """
if in_dynamic_mode(): if in_dynamic_mode():
return _C_ops.logical_and(x, y) return _C_ops.logical_and(x, y)
...@@ -188,15 +191,15 @@ def logical_or(x, y, out=None, name=None): ...@@ -188,15 +191,15 @@ def logical_or(x, y, out=None, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
x = paddle.to_tensor([True, False], dtype="bool").reshape([2, 1]) >>> x = paddle.to_tensor([True, False], dtype="bool").reshape([2, 1])
y = paddle.to_tensor([True, False, True, False], dtype="bool").reshape([2, 2]) >>> y = paddle.to_tensor([True, False, True, False], dtype="bool").reshape([2, 2])
res = paddle.logical_or(x, y) >>> res = paddle.logical_or(x, y)
print(res) >>> print(res)
# Tensor(shape=[2, 2], dtype=bool, place=Place(cpu), stop_gradient=True, Tensor(shape=[2, 2], dtype=bool, place=Place(cpu), stop_gradient=True,
# [[True , True ], [[True , True ],
# [True , False]]) [True , False]])
""" """
if in_dynamic_mode(): if in_dynamic_mode():
return _C_ops.logical_or(x, y) return _C_ops.logical_or(x, y)
...@@ -249,15 +252,15 @@ def logical_xor(x, y, out=None, name=None): ...@@ -249,15 +252,15 @@ def logical_xor(x, y, out=None, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
x = paddle.to_tensor([True, False], dtype="bool").reshape([2, 1]) >>> x = paddle.to_tensor([True, False], dtype="bool").reshape([2, 1])
y = paddle.to_tensor([True, False, True, False], dtype="bool").reshape([2, 2]) >>> y = paddle.to_tensor([True, False, True, False], dtype="bool").reshape([2, 2])
res = paddle.logical_xor(x, y) >>> res = paddle.logical_xor(x, y)
print(res) >>> print(res)
# Tensor(shape=[2, 2], dtype=bool, place=Place(cpu), stop_gradient=True, Tensor(shape=[2, 2], dtype=bool, place=Place(cpu), stop_gradient=True,
# [[False, True ], [[False, True ],
# [True , False]]) [True , False]])
""" """
if in_dynamic_mode(): if in_dynamic_mode():
return _C_ops.logical_xor(x, y) return _C_ops.logical_xor(x, y)
...@@ -300,6 +303,7 @@ def logical_not(x, out=None, name=None): ...@@ -300,6 +303,7 @@ def logical_not(x, out=None, name=None):
.. _Introduction to Tensor: ../../guides/beginner/tensor_en.html#chapter5-broadcasting-of-tensor .. _Introduction to Tensor: ../../guides/beginner/tensor_en.html#chapter5-broadcasting-of-tensor
Args: Args:
x(Tensor): Operand of logical_not operator. Must be a Tensor of type bool, int8, int16, int32, int64, float16, float32, float64, complex64 or complex128. x(Tensor): Operand of logical_not operator. Must be a Tensor of type bool, int8, int16, int32, int64, float16, float32, float64, complex64 or complex128.
out(Tensor): The ``Tensor`` that specifies the output of the operator, which can be any ``Tensor`` that has been created in the program. The default value is None, and a new ``Tensor`` will be created to save the output.
name(str|None): The default value is None. Normally there is no need for users to set this property. For more information, please refer to :ref:`api_guide_Name`. name(str|None): The default value is None. Normally there is no need for users to set this property. For more information, please refer to :ref:`api_guide_Name`.
...@@ -310,11 +314,13 @@ def logical_not(x, out=None, name=None): ...@@ -310,11 +314,13 @@ def logical_not(x, out=None, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
x = paddle.to_tensor([True, False, True, False]) >>> x = paddle.to_tensor([True, False, True, False])
res = paddle.logical_not(x) >>> res = paddle.logical_not(x)
print(res) # [False True False True] >>> print(res)
Tensor(shape=[4], dtype=bool, place=Place(cpu), stop_gradient=True,
[False, True , False, True ])
""" """
if in_dynamic_mode(): if in_dynamic_mode():
return _C_ops.logical_not(x) return _C_ops.logical_not(x)
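The four logical ops compose in the usual Boolean-algebra way; De Morgan's law, for instance, can be checked elementwise. A minimal sketch using only ops documented here:

    import paddle

    x = paddle.to_tensor([True, True, False, False])
    y = paddle.to_tensor([True, False, True, False])
    # De Morgan: not(x and y) == (not x) or (not y), elementwise.
    lhs = paddle.logical_not(paddle.logical_and(x, y))
    rhs = paddle.logical_or(paddle.logical_not(x), paddle.logical_not(y))
    print(paddle.all(paddle.equal(lhs, rhs)))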
...@@ -340,9 +346,7 @@ def is_empty(x, name=None): ...@@ -340,9 +346,7 @@ def is_empty(x, name=None):
Args: Args:
x (Tensor): The Tensor to be tested. x (Tensor): The Tensor to be tested.
name (str, optional): The default value is ``None`` . Normally users name (str, optional): The default value is ``None`` . Normally users don't have to set this parameter. For more information, please refer to :ref:`api_guide_Name` .
don't have to set this parameter. For more information,
please refer to :ref:`api_guide_Name` .
Returns: Returns:
Tensor: A bool scalar Tensor. True if 'x' is an empty Tensor. Tensor: A bool scalar Tensor. True if 'x' is an empty Tensor.
...@@ -350,12 +354,13 @@ def is_empty(x, name=None): ...@@ -350,12 +354,13 @@ def is_empty(x, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
input = paddle.rand(shape=[4, 32, 32], dtype='float32') >>> input = paddle.rand(shape=[4, 32, 32], dtype='float32')
res = paddle.is_empty(x=input) >>> res = paddle.is_empty(x=input)
# res: Tensor(shape=[], dtype=bool, place=Place(cpu), stop_gradient=True, >>> print(res)
# False) Tensor(shape=[], dtype=bool, place=Place(cpu), stop_gradient=True,
False)
""" """
if in_dynamic_mode(): if in_dynamic_mode():
...@@ -394,15 +399,19 @@ def equal_all(x, y, name=None): ...@@ -394,15 +399,19 @@ def equal_all(x, y, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
x = paddle.to_tensor([1, 2, 3]) >>> x = paddle.to_tensor([1, 2, 3])
y = paddle.to_tensor([1, 2, 3]) >>> y = paddle.to_tensor([1, 2, 3])
z = paddle.to_tensor([1, 4, 3]) >>> z = paddle.to_tensor([1, 4, 3])
result1 = paddle.equal_all(x, y) >>> result1 = paddle.equal_all(x, y)
print(result1) # result1 = True >>> print(result1)
result2 = paddle.equal_all(x, z) Tensor(shape=[], dtype=bool, place=Place(cpu), stop_gradient=True,
print(result2) # result2 = False True)
>>> result2 = paddle.equal_all(x, z)
>>> print(result2)
Tensor(shape=[], dtype=bool, place=Place(cpu), stop_gradient=True,
False)
""" """
if in_dynamic_mode(): if in_dynamic_mode():
return _C_ops.equal_all(x, y) return _C_ops.equal_all(x, y)
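equal_all reduces the elementwise comparison to a single boolean scalar, so it should agree with paddle.all applied to paddle.equal. A minimal sketch with the example tensors:

    import paddle

    x = paddle.to_tensor([1, 2, 3])
    y = paddle.to_tensor([1, 2, 3])
    z = paddle.to_tensor([1, 4, 3])
    # True only when every element matches, i.e. the reduction of equal().
    print(paddle.equal_all(x, y), paddle.all(paddle.equal(x, y)))
    print(paddle.equal_all(x, z), paddle.all(paddle.equal(x, z)))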
...@@ -429,11 +438,11 @@ def allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None): ...@@ -429,11 +438,11 @@ def allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None):
two tensors are elementwise equal within a tolerance. two tensors are elementwise equal within a tolerance.
Args: Args:
x(Tensor): The input tensor, its data type should be float16, float32, float64.. x (Tensor): The input tensor, its data type should be float16, float32, float64.
y(Tensor): The input tensor, its data type should be float16, float32, float64.. y (Tensor): The input tensor, its data type should be float16, float32, float64.
rtol(rtoltype, optional): The relative tolerance. Default: :math:`1e-5` . rtol (rtoltype, optional): The relative tolerance. Default: :math:`1e-5` .
atol(atoltype, optional): The absolute tolerance. Default: :math:`1e-8` . atol (atoltype, optional): The absolute tolerance. Default: :math:`1e-8` .
equal_nan(equalnantype, optional): ${equal_nan_comment}. equal_nan (equalnantype, optional): ${equal_nan_comment}. Default: False.
name (str, optional): Name for the operation. For more information, please name (str, optional): Name for the operation. For more information, please
refer to :ref:`api_guide_Name`. Default: None. refer to :ref:`api_guide_Name`. Default: None.
...@@ -443,27 +452,28 @@ def allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None): ...@@ -443,27 +452,28 @@ def allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
x = paddle.to_tensor([10000., 1e-07]) >>> x = paddle.to_tensor([10000., 1e-07])
y = paddle.to_tensor([10000.1, 1e-08]) >>> y = paddle.to_tensor([10000.1, 1e-08])
result1 = paddle.allclose(x, y, rtol=1e-05, atol=1e-08, >>> result1 = paddle.allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name="ignore_nan")
equal_nan=False, name="ignore_nan") >>> print(result1)
# False Tensor(shape=[], dtype=bool, place=Place(cpu), stop_gradient=True,
False)
result2 = paddle.allclose(x, y, rtol=1e-05, atol=1e-08, >>> result2 = paddle.allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=True, name="equal_nan")
equal_nan=True, name="equal_nan") >>> print(result2)
# False Tensor(shape=[], dtype=bool, place=Place(cpu), stop_gradient=True,
False)
x = paddle.to_tensor([1.0, float('nan')]) >>> x = paddle.to_tensor([1.0, float('nan')])
y = paddle.to_tensor([1.0, float('nan')]) >>> y = paddle.to_tensor([1.0, float('nan')])
result1 = paddle.allclose(x, y, rtol=1e-05, atol=1e-08, >>> result1 = paddle.allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name="ignore_nan")
equal_nan=False, name="ignore_nan") >>> print(result1)
# False Tensor(shape=[], dtype=bool, place=Place(cpu), stop_gradient=True,
False)
result2 = paddle.allclose(x, y, rtol=1e-05, atol=1e-08, >>> result2 = paddle.allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=True, name="equal_nan")
equal_nan=True, name="equal_nan") >>> print(result2)
# True Tensor(shape=[], dtype=bool, place=Place(cpu), stop_gradient=True,
True)
""" """
if in_dynamic_mode(): if in_dynamic_mode():
...@@ -502,9 +512,9 @@ def equal(x, y, name=None): ...@@ -502,9 +512,9 @@ def equal(x, y, name=None):
The output has no gradient. The output has no gradient.
Args: Args:
x(Tensor): Tensor, data type is bool, float16, float32, float64, int32, int64. x (Tensor): Tensor, data type is bool, float16, float32, float64, int32, int64.
y(Tensor): Tensor, data type is bool, float16, float32, float64, int32, int64. y (Tensor): Tensor, data type is bool, float16, float32, float64, int32, int64.
name(str, optional): The default value is None. Normally there is no need for name (str, optional): The default value is None. Normally there is no need for
user to set this property. For more information, please refer to :ref:`api_guide_Name`. user to set this property. For more information, please refer to :ref:`api_guide_Name`.
Returns: Returns:
...@@ -514,12 +524,14 @@ def equal(x, y, name=None): ...@@ -514,12 +524,14 @@ def equal(x, y, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
x = paddle.to_tensor([1, 2, 3]) >>> x = paddle.to_tensor([1, 2, 3])
y = paddle.to_tensor([1, 3, 2]) >>> y = paddle.to_tensor([1, 3, 2])
result1 = paddle.equal(x, y) >>> result1 = paddle.equal(x, y)
print(result1) # result1 = [True False False] >>> print(result1)
Tensor(shape=[3], dtype=bool, place=Place(cpu), stop_gradient=True,
[True , False, False])
""" """
if not isinstance(y, (int, bool, float, Variable)): if not isinstance(y, (int, bool, float, Variable)):
raise TypeError( raise TypeError(
...@@ -599,9 +611,9 @@ def greater_equal(x, y, name=None): ...@@ -599,9 +611,9 @@ def greater_equal(x, y, name=None):
The output has no gradient. The output has no gradient.
Args: Args:
x(Tensor): First input to compare which is N-D tensor. The input data type should be bool, float16, float32, float64, int32, int64. x (Tensor): First input to compare which is N-D tensor. The input data type should be bool, float16, float32, float64, int32, int64.
y(Tensor): Second input to compare which is N-D tensor. The input data type should be bool, float16, float32, float64, int32, int64. y (Tensor): Second input to compare which is N-D tensor. The input data type should be bool, float16, float32, float64, int32, int64.
name(str, optional): The default value is None. Normally there is no need for name (str, optional): The default value is None. Normally there is no need for
user to set this property. For more information, please refer to :ref:`api_guide_Name`. user to set this property. For more information, please refer to :ref:`api_guide_Name`.
Returns: Returns:
Tensor: The output shape is same as input :attr:`x`. The output data type is bool. Tensor: The output shape is same as input :attr:`x`. The output data type is bool.
...@@ -609,12 +621,14 @@ def greater_equal(x, y, name=None): ...@@ -609,12 +621,14 @@ def greater_equal(x, y, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
x = paddle.to_tensor([1, 2, 3]) >>> x = paddle.to_tensor([1, 2, 3])
y = paddle.to_tensor([1, 3, 2]) >>> y = paddle.to_tensor([1, 3, 2])
result1 = paddle.greater_equal(x, y) >>> result1 = paddle.greater_equal(x, y)
print(result1) # result1 = [True False True] >>> print(result1)
Tensor(shape=[3], dtype=bool, place=Place(cpu), stop_gradient=True,
[True , False, True ])
""" """
if in_dynamic_mode(): if in_dynamic_mode():
return _C_ops.greater_equal(x, y) return _C_ops.greater_equal(x, y)
...@@ -685,9 +699,9 @@ def greater_than(x, y, name=None): ...@@ -685,9 +699,9 @@ def greater_than(x, y, name=None):
The output has no gradient. The output has no gradient.
Args: Args:
x(Tensor): First input to compare which is N-D tensor. The input data type should be bool, float16, float32, float64, int32, int64. x (Tensor): First input to compare which is N-D tensor. The input data type should be bool, float16, float32, float64, int32, int64.
y(Tensor): Second input to compare which is N-D tensor. The input data type should be bool, float16, float32, float64, int32, int64. y (Tensor): Second input to compare which is N-D tensor. The input data type should be bool, float16, float32, float64, int32, int64.
name(str, optional): The default value is None. Normally there is no need for name (str, optional): The default value is None. Normally there is no need for
user to set this property. For more information, please refer to :ref:`api_guide_Name`. user to set this property. For more information, please refer to :ref:`api_guide_Name`.
Returns: Returns:
Tensor: The output shape is same as input :attr:`x`. The output data type is bool. Tensor: The output shape is same as input :attr:`x`. The output data type is bool.
...@@ -695,12 +709,14 @@ def greater_than(x, y, name=None): ...@@ -695,12 +709,14 @@ def greater_than(x, y, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
x = paddle.to_tensor([1, 2, 3]) >>> x = paddle.to_tensor([1, 2, 3])
y = paddle.to_tensor([1, 3, 2]) >>> y = paddle.to_tensor([1, 3, 2])
result1 = paddle.greater_than(x, y) >>> result1 = paddle.greater_than(x, y)
print(result1) # result1 = [False False True] >>> print(result1)
Tensor(shape=[3], dtype=bool, place=Place(cpu), stop_gradient=True,
[False, False, True ])
""" """
if in_dynamic_mode(): if in_dynamic_mode():
return _C_ops.greater_than(x, y) return _C_ops.greater_than(x, y)
...@@ -771,9 +787,9 @@ def less_equal(x, y, name=None): ...@@ -771,9 +787,9 @@ def less_equal(x, y, name=None):
The output has no gradient. The output has no gradient.
Args: Args:
x(Tensor): First input to compare which is N-D tensor. The input data type should be bool, float16, float32, float64, int32, int64. x (Tensor): First input to compare which is N-D tensor. The input data type should be bool, float16, float32, float64, int32, int64.
y(Tensor): Second input to compare which is N-D tensor. The input data type should be bool, float16, float32, float64, int32, int64. y (Tensor): Second input to compare which is N-D tensor. The input data type should be bool, float16, float32, float64, int32, int64.
name(str, optional): The default value is None. Normally there is no need for name (str, optional): The default value is None. Normally there is no need for
user to set this property. For more information, please refer to :ref:`api_guide_Name`. user to set this property. For more information, please refer to :ref:`api_guide_Name`.
Returns: Returns:
...@@ -782,12 +798,14 @@ def less_equal(x, y, name=None): ...@@ -782,12 +798,14 @@ def less_equal(x, y, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
x = paddle.to_tensor([1, 2, 3]) >>> x = paddle.to_tensor([1, 2, 3])
y = paddle.to_tensor([1, 3, 2]) >>> y = paddle.to_tensor([1, 3, 2])
result1 = paddle.less_equal(x, y) >>> result1 = paddle.less_equal(x, y)
print(result1) # result1 = [True True False] >>> print(result1)
Tensor(shape=[3], dtype=bool, place=Place(cpu), stop_gradient=True,
[True , True , False])
""" """
if in_dynamic_mode(): if in_dynamic_mode():
return _C_ops.less_equal(x, y) return _C_ops.less_equal(x, y)
...@@ -858,9 +876,9 @@ def less_than(x, y, name=None): ...@@ -858,9 +876,9 @@ def less_than(x, y, name=None):
The output has no gradient. The output has no gradient.
Args: Args:
x(Tensor): First input to compare which is N-D tensor. The input data type should be bool, float16, float32, float64, int32, int64. x (Tensor): First input to compare which is N-D tensor. The input data type should be bool, float16, float32, float64, int32, int64.
y(Tensor): Second input to compare which is N-D tensor. The input data type should be bool, float16, float32, float64, int32, int64. y (Tensor): Second input to compare which is N-D tensor. The input data type should be bool, float16, float32, float64, int32, int64.
name(str, optional): The default value is None. Normally there is no need for name (str, optional): The default value is None. Normally there is no need for
user to set this property. For more information, please refer to :ref:`api_guide_Name`. user to set this property. For more information, please refer to :ref:`api_guide_Name`.
Returns: Returns:
...@@ -869,12 +887,14 @@ def less_than(x, y, name=None): ...@@ -869,12 +887,14 @@ def less_than(x, y, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
x = paddle.to_tensor([1, 2, 3]) >>> x = paddle.to_tensor([1, 2, 3])
y = paddle.to_tensor([1, 3, 2]) >>> y = paddle.to_tensor([1, 3, 2])
result1 = paddle.less_than(x, y) >>> result1 = paddle.less_than(x, y)
print(result1) # result1 = [False True False] >>> print(result1)
Tensor(shape=[3], dtype=bool, place=Place(cpu), stop_gradient=True,
[False, True , False])
""" """
if in_dynamic_mode(): if in_dynamic_mode():
return _C_ops.less_than(x, y) return _C_ops.less_than(x, y)
...@@ -945,9 +965,9 @@ def not_equal(x, y, name=None): ...@@ -945,9 +965,9 @@ def not_equal(x, y, name=None):
The output has no gradient. The output has no gradient.
Args: Args:
x(Tensor): First input to compare which is N-D tensor. The input data type should be bool, float32, float64, int32, int64. x (Tensor): First input to compare which is N-D tensor. The input data type should be bool, float32, float64, int32, int64.
y(Tensor): Second input to compare which is N-D tensor. The input data type should be bool, float32, float64, int32, int64. y (Tensor): Second input to compare which is N-D tensor. The input data type should be bool, float32, float64, int32, int64.
name(str, optional): The default value is None. Normally there is no need for name (str, optional): The default value is None. Normally there is no need for
user to set this property. For more information, please refer to :ref:`api_guide_Name`. user to set this property. For more information, please refer to :ref:`api_guide_Name`.
Returns: Returns:
...@@ -956,12 +976,14 @@ def not_equal(x, y, name=None): ...@@ -956,12 +976,14 @@ def not_equal(x, y, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
x = paddle.to_tensor([1, 2, 3]) >>> x = paddle.to_tensor([1, 2, 3])
y = paddle.to_tensor([1, 3, 2]) >>> y = paddle.to_tensor([1, 3, 2])
result1 = paddle.not_equal(x, y) >>> result1 = paddle.not_equal(x, y)
print(result1) # result1 = [False True True] >>> print(result1)
Tensor(shape=[3], dtype=bool, place=Place(cpu), stop_gradient=True,
[False, True , True ])
""" """
if in_dynamic_mode(): if in_dynamic_mode():
return _C_ops.not_equal(x, y) return _C_ops.not_equal(x, y)
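The comparison ops are mutually consistent: greater_equal is the elementwise OR of greater_than and equal, and not_equal is the negation of equal. A minimal sketch with the sample tensors used throughout these examples:

    import paddle

    x = paddle.to_tensor([1, 2, 3])
    y = paddle.to_tensor([1, 3, 2])
    # x >= y holds exactly where x > y or x == y.
    ge = paddle.greater_equal(x, y)
    composed = paddle.logical_or(paddle.greater_than(x, y), paddle.equal(x, y))
    print(paddle.all(paddle.equal(ge, composed)))
    # x != y is the elementwise negation of x == y.
    ne = paddle.not_equal(x, y)
    print(paddle.all(paddle.equal(ne, paddle.logical_not(paddle.equal(x, y)))))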
...@@ -1037,15 +1059,17 @@ def is_tensor(x): ...@@ -1037,15 +1059,17 @@ def is_tensor(x):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
input1 = paddle.rand(shape=[2, 3, 5], dtype='float32') >>> input1 = paddle.rand(shape=[2, 3, 5], dtype='float32')
check = paddle.is_tensor(input1) >>> check = paddle.is_tensor(input1)
print(check) #True >>> print(check)
True
input3 = [1, 4] >>> input3 = [1, 4]
check = paddle.is_tensor(input3) >>> check = paddle.is_tensor(input3)
print(check) #False >>> print(check)
False
""" """
if in_dynamic_mode(): if in_dynamic_mode():
...@@ -1113,7 +1137,9 @@ def bitwise_and(x, y, out=None, name=None): ...@@ -1113,7 +1137,9 @@ def bitwise_and(x, y, out=None, name=None):
Args: Args:
x (Tensor): Input Tensor of ``bitwise_and`` . It is a N-D Tensor of bool, uint8, int8, int16, int32, int64. x (Tensor): Input Tensor of ``bitwise_and`` . It is a N-D Tensor of bool, uint8, int8, int16, int32, int64.
y (Tensor): Input Tensor of ``bitwise_and`` . It is a N-D Tensor of bool, uint8, int8, int16, int32, int64. y (Tensor): Input Tensor of ``bitwise_and`` . It is a N-D Tensor of bool, uint8, int8, int16, int32, int64.
out(Tensor): Result of ``bitwise_and`` . It is a N-D Tensor with the same data type of input Tensor. out (Tensor, optional): Result of ``bitwise_and`` . It is a N-D Tensor with the same data type of input Tensor. Default: None.
name (str, optional): The default value is None. Normally there is no need for
user to set this property. For more information, please refer to :ref:`api_guide_Name`.
Returns: Returns:
Tensor: Result of ``bitwise_and`` . It is a N-D Tensor with the same data type of input Tensor. Tensor: Result of ``bitwise_and`` . It is a N-D Tensor with the same data type of input Tensor.
...@@ -1121,11 +1147,13 @@ def bitwise_and(x, y, out=None, name=None): ...@@ -1121,11 +1147,13 @@ def bitwise_and(x, y, out=None, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
x = paddle.to_tensor([-5, -1, 1]) >>> x = paddle.to_tensor([-5, -1, 1])
y = paddle.to_tensor([4, 2, -3]) >>> y = paddle.to_tensor([4, 2, -3])
res = paddle.bitwise_and(x, y) >>> res = paddle.bitwise_and(x, y)
print(res) # [0, 2, 1] >>> print(res)
Tensor(shape=[3], dtype=int64, place=Place(cpu), stop_gradient=True,
[0, 2, 1])
""" """
if in_dynamic_mode() and out is None: if in_dynamic_mode() and out is None:
return _C_ops.bitwise_and(x, y) return _C_ops.bitwise_and(x, y)
...@@ -1167,7 +1195,9 @@ def bitwise_or(x, y, out=None, name=None): ...@@ -1167,7 +1195,9 @@ def bitwise_or(x, y, out=None, name=None):
Args: Args:
x (Tensor): Input Tensor of ``bitwise_or`` . It is a N-D Tensor of bool, uint8, int8, int16, int32, int64. x (Tensor): Input Tensor of ``bitwise_or`` . It is a N-D Tensor of bool, uint8, int8, int16, int32, int64.
y (Tensor): Input Tensor of ``bitwise_or`` . It is a N-D Tensor of bool, uint8, int8, int16, int32, int64. y (Tensor): Input Tensor of ``bitwise_or`` . It is a N-D Tensor of bool, uint8, int8, int16, int32, int64.
out(Tensor): Result of ``bitwise_or`` . It is a N-D Tensor with the same data type of input Tensor. out (Tensor, optional): Result of ``bitwise_or`` . It is a N-D Tensor with the same data type of input Tensor. Default: None.
name (str, optional): The default value is None. Normally there is no need for
user to set this property. For more information, please refer to :ref:`api_guide_Name`.
Returns: Returns:
Tensor: Result of ``bitwise_or`` . It is a N-D Tensor with the same data type of input Tensor. Tensor: Result of ``bitwise_or`` . It is a N-D Tensor with the same data type of input Tensor.
...@@ -1175,11 +1205,13 @@ def bitwise_or(x, y, out=None, name=None): ...@@ -1175,11 +1205,13 @@ def bitwise_or(x, y, out=None, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
x = paddle.to_tensor([-5, -1, 1]) >>> x = paddle.to_tensor([-5, -1, 1])
y = paddle.to_tensor([4, 2, -3]) >>> y = paddle.to_tensor([4, 2, -3])
res = paddle.bitwise_or(x, y) >>> res = paddle.bitwise_or(x, y)
print(res) # [-1, -1, -3] >>> print(res)
Tensor(shape=[3], dtype=int64, place=Place(cpu), stop_gradient=True,
[-1, -1, -3])
""" """
if in_dynamic_mode() and out is None: if in_dynamic_mode() and out is None:
return _C_ops.bitwise_or(x, y) return _C_ops.bitwise_or(x, y)
...@@ -1222,7 +1254,9 @@ def bitwise_xor(x, y, out=None, name=None): ...@@ -1222,7 +1254,9 @@ def bitwise_xor(x, y, out=None, name=None):
Args: Args:
x (Tensor): Input Tensor of ``bitwise_xor`` . It is a N-D Tensor of bool, uint8, int8, int16, int32, int64. x (Tensor): Input Tensor of ``bitwise_xor`` . It is a N-D Tensor of bool, uint8, int8, int16, int32, int64.
y (Tensor): Input Tensor of ``bitwise_xor`` . It is a N-D Tensor of bool, uint8, int8, int16, int32, int64. y (Tensor): Input Tensor of ``bitwise_xor`` . It is a N-D Tensor of bool, uint8, int8, int16, int32, int64.
out(Tensor): Result of ``bitwise_xor`` . It is a N-D Tensor with the same data type of input Tensor. out (Tensor, optional): Result of ``bitwise_xor`` . It is a N-D Tensor with the same data type of input Tensor. Default: None.
name (str, optional): The default value is None. Normally there is no need for
user to set this property. For more information, please refer to :ref:`api_guide_Name`.
Returns: Returns:
Tensor: Result of ``bitwise_xor`` . It is a N-D Tensor with the same data type of input Tensor. Tensor: Result of ``bitwise_xor`` . It is a N-D Tensor with the same data type of input Tensor.
...@@ -1230,11 +1264,13 @@ def bitwise_xor(x, y, out=None, name=None): ...@@ -1230,11 +1264,13 @@ def bitwise_xor(x, y, out=None, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
x = paddle.to_tensor([-5, -1, 1]) >>> x = paddle.to_tensor([-5, -1, 1])
y = paddle.to_tensor([4, 2, -3]) >>> y = paddle.to_tensor([4, 2, -3])
res = paddle.bitwise_xor(x, y) >>> res = paddle.bitwise_xor(x, y)
print(res) # [-1, -3, -4] >>> print(res)
Tensor(shape=[3], dtype=int64, place=Place(cpu), stop_gradient=True,
[-1, -3, -4])
""" """
if in_dynamic_mode() and out is None: if in_dynamic_mode() and out is None:
return _C_ops.bitwise_xor(x, y) return _C_ops.bitwise_xor(x, y)
...@@ -1275,7 +1311,9 @@ def bitwise_not(x, out=None, name=None): ...@@ -1275,7 +1311,9 @@ def bitwise_not(x, out=None, name=None):
Args: Args:
x (Tensor): Input Tensor of ``bitwise_not`` . It is a N-D Tensor of bool, uint8, int8, int16, int32, int64. x (Tensor): Input Tensor of ``bitwise_not`` . It is a N-D Tensor of bool, uint8, int8, int16, int32, int64.
out(Tensor): Result of ``bitwise_not`` . It is a N-D Tensor with the same data type of input Tensor. out (Tensor, optional): Result of ``bitwise_not`` . It is a N-D Tensor with the same data type of input Tensor. Default: None.
name (str, optional): The default value is None. Normally there is no need for
user to set this property. For more information, please refer to :ref:`api_guide_Name`.
Returns: Returns:
Tensor: Result of ``bitwise_not`` . It is a N-D Tensor with the same data type of input Tensor. Tensor: Result of ``bitwise_not`` . It is a N-D Tensor with the same data type of input Tensor.
...@@ -1283,10 +1321,12 @@ def bitwise_not(x, out=None, name=None): ...@@ -1283,10 +1321,12 @@ def bitwise_not(x, out=None, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
x = paddle.to_tensor([-5, -1, 1]) >>> x = paddle.to_tensor([-5, -1, 1])
res = paddle.bitwise_not(x) >>> res = paddle.bitwise_not(x)
print(res) # [4, 0, -2] >>> print(res)
Tensor(shape=[3], dtype=int64, place=Place(cpu), stop_gradient=True,
[ 4, 0, -2])
""" """
if in_dynamic_mode() and out is None: if in_dynamic_mode() and out is None:
return _C_ops.bitwise_not(x) return _C_ops.bitwise_not(x)
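On integer tensors the bitwise ops satisfy the usual two's-complement identities, for example x ^ y == (x | y) & ~(x & y). A minimal sketch with the sample values from the examples:

    import paddle

    x = paddle.to_tensor([-5, -1, 1])
    y = paddle.to_tensor([4, 2, -3])
    # xor rebuilt from or, and, and not; both sides should be [-1, -3, -4].
    lhs = paddle.bitwise_xor(x, y)
    rhs = paddle.bitwise_and(paddle.bitwise_or(x, y),
                             paddle.bitwise_not(paddle.bitwise_and(x, y)))
    print(paddle.equal_all(lhs, rhs))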
...@@ -1334,25 +1374,32 @@ def isclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None): ...@@ -1334,25 +1374,32 @@ def isclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle >>> import paddle
x = paddle.to_tensor([10000., 1e-07]) >>> x = paddle.to_tensor([10000., 1e-07])
y = paddle.to_tensor([10000.1, 1e-08]) >>> y = paddle.to_tensor([10000.1, 1e-08])
result1 = paddle.isclose(x, y, rtol=1e-05, atol=1e-08, >>> result1 = paddle.isclose(x, y, rtol=1e-05, atol=1e-08,
equal_nan=False, name="ignore_nan") ... equal_nan=False, name="ignore_nan")
# [True, False] >>> print(result1)
result2 = paddle.isclose(x, y, rtol=1e-05, atol=1e-08, Tensor(shape=[2], dtype=bool, place=Place(cpu), stop_gradient=True,
equal_nan=True, name="equal_nan") [True , False])
# [True, False] >>> result2 = paddle.isclose(x, y, rtol=1e-05, atol=1e-08,
... equal_nan=True, name="equal_nan")
x = paddle.to_tensor([1.0, float('nan')]) >>> print(result2)
y = paddle.to_tensor([1.0, float('nan')]) Tensor(shape=[2], dtype=bool, place=Place(cpu), stop_gradient=True,
result1 = paddle.isclose(x, y, rtol=1e-05, atol=1e-08, [True , False])
equal_nan=False, name="ignore_nan") >>> x = paddle.to_tensor([1.0, float('nan')])
# [True, False] >>> y = paddle.to_tensor([1.0, float('nan')])
result2 = paddle.isclose(x, y, rtol=1e-05, atol=1e-08, >>> result1 = paddle.isclose(x, y, rtol=1e-05, atol=1e-08,
equal_nan=True, name="equal_nan") ... equal_nan=False, name="ignore_nan")
# [True, True] >>> print(result1)
Tensor(shape=[2], dtype=bool, place=Place(cpu), stop_gradient=True,
[True , False])
>>> result2 = paddle.isclose(x, y, rtol=1e-05, atol=1e-08,
... equal_nan=True, name="equal_nan")
>>> print(result2)
Tensor(shape=[2], dtype=bool, place=Place(cpu), stop_gradient=True,
[True, True])
""" """
if in_dynamic_mode(): if in_dynamic_mode():
......
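allclose is the scalar reduction of isclose: it is True exactly when every element of isclose(x, y, rtol, atol) is True. A minimal sketch tying the two together:

    import paddle

    x = paddle.to_tensor([10000., 1e-07])
    y = paddle.to_tensor([10000.1, 1e-08])
    elementwise = paddle.isclose(x, y, rtol=1e-05, atol=1e-08)
    reduced = paddle.allclose(x, y, rtol=1e-05, atol=1e-08)
    # Here isclose gives [True, False], so allclose reduces to False.
    print(bool(reduced) == bool(paddle.all(elementwise)))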