From 4dbe441c6c497056eb942124727c160f6676dc14 Mon Sep 17 00:00:00 2001
From: cyberslack_lee <luhputu0815@gmail.com>
Date: Wed, 6 Sep 2023 10:28:13 +0800
Subject: [PATCH] [xdoctest] reformat example code with google style in No.
 250-260 (#56541)

* test=docs_preview

* test=docs_preview

* test=docs_preview

* test=docs_preview

* test=docs_preview

* test=docs_preview

* fix

* test=docs_preview

* test=docs_preview

* fix

* move stmts under imports

---------

Co-authored-by: SigureMo <sigure.qaq@gmail.com>
---
 .../fleet/parameter_server/pslib/__init__.py  |  583 ++++---
 .../distributed/models/moe/grad_clip.py       |   30 +-
 .../distributed/models/moe/moe_layer.py       |  113 +-
 python/paddle/incubate/layers/nn.py           |  798 +++++----
 .../nn/functional/fused_dropout_add.py        |   30 +-
 .../incubate/nn/functional/fused_ec_moe.py    |   31 +-
 .../nn/functional/fused_gate_attention.py     |  126 +-
 .../nn/functional/fused_matmul_bias.py        |   91 +-
 .../fused_rotary_position_embedding.py        |    5 +-
 python/paddle/tensor/linalg.py                | 1481 +++++++++--------
 python/paddle/tensor/logic.py                 |  377 +++--
 11 files changed, 1959 insertions(+), 1706 deletions(-)

diff --git a/python/paddle/incubate/distributed/fleet/parameter_server/pslib/__init__.py b/python/paddle/incubate/distributed/fleet/parameter_server/pslib/__init__.py
index d8b61aadb5c..3f36562b2ad 100644
--- a/python/paddle/incubate/distributed/fleet/parameter_server/pslib/__init__.py
+++ b/python/paddle/incubate/distributed/fleet/parameter_server/pslib/__init__.py
@@ -67,8 +67,8 @@ class PSLib(Fleet):
                     should call init_worker() to initialize global information about worker and connect
                     worker with pserver. You should run startup program before init_worker.
         Args:
-            executor(Executor): The executor to run for init server.
-            programs(Program|None): The program that need to run.
+            executor (Executor): The executor to run for init server.
+            programs (Program|None): The program that need to run.
         """
 
         if len(self._main_programs) == 0:
@@ -167,16 +167,24 @@ class PSLib(Fleet):
 
     def init_server(self, model_dir=None, **kwargs):
         """
-        init_server() will be called by user. It will load model from model_dir.
+        Called by user. It will load model from model_dir.
+
         Args:
-            model_dir(str): load model path, can be local or hdfs/afs path.
-            kwargs: user-defined attributes, currently support following:
-                model(int): load model mode.
-                            0 is for load whole model,
-                            1 is for load delta model (load diff),
-                            default is 0.
-        Example:
-            >>> fleet.init_server("/you/path/to/model", mode = 0)
+            model_dir(str, optional): Load model path, can be local or hdfs/afs path. Default is None.
+            kwargs: User-defined attributes, currently support following:
+
+                - mode(int): Load model mode.
+
+                  0 is for load whole model,
+                  1 is for load delta model (load diff).
+                  Default is 0.
+
+        Examples:
+
+            .. code-block:: text
+
+                fleet.init_server("/you/path/to/model", mode = 0)
+
         """
         mode = kwargs.get("mode", 0)
         if isinstance(self._role_maker, HeterRoleMaker):
@@ -192,8 +200,7 @@ class PSLib(Fleet):
 
     def run_server(self):
         """
-        init_pserver(): will be called by user. When a user knows current process is_worker(), he/she
-            should call init_pserver() to initialize global information about parameter server
+        Called by user. After initializing the server with init_server(), the user should call run_server() to start it.
         """
         if self._opt_info:
             if "fleet_desc" in self._opt_info:
@@ -296,8 +303,8 @@ class PSLib(Fleet):
 
     def stop_worker(self):
         """
-        stop(): will be called after a user finishes his/her training task. Fleet instance will be
-            destroyed when stop() is called.
+        Called after a user finishes the training task. The Fleet instance will be
+        destroyed when stop_worker() is called.
         """
         self._role_maker._barrier_worker()
         # all worker should be finalize first
@@ -315,14 +322,20 @@ class PSLib(Fleet):
     def distributed_optimizer(self, optimizer, strategy={}):
         """
         distributed_optimizer
+
         Args:
-            optimizer(Optimizer): optimizer
-            strategy(dict): strategy
-        Examples:
-            .. code-block:: python
-              fleet.distributed_optimizer(optimizer)
+            optimizer (Optimizer): Optimizer.
+            strategy (dict): Strategy.
+
         Returns:
-            optimizer(DownpourOptimizer): downpour optimizer
+            optimizer(DownpourOptimizer): Downpour optimizer.
+
+        Examples:
+
+            .. code-block:: text
+
+                fleet.distributed_optimizer(optimizer)
+
         """
         self._optimizer = DownpourOptimizer(optimizer, strategy)
         return self._optimizer
@@ -337,29 +350,42 @@ class PSLib(Fleet):
         export_for_deployment=True,
     ):
         """
-        save pserver model called from a worker
+        Save pserver model called from a worker.
+
         Args:
-            executor(Executor): fluid executor
-            dirname(str): save model path
-            feeded_var_names(list): default None
-            target_vars(list): default None
-            main_program(Program): default None
-            export_for_deployment(bool): default None
+
+            executor (Executor): Fluid executor.
+            dirname (str): Save model path.
+            feeded_var_names (list, optional): Default None.
+            target_vars (list, optional): Default None.
+            main_program (Program, optional): Default None.
+            export_for_deployment (bool, optional): Default True.
+
         Examples:
-            .. code-block:: python
-              fleet.save_inference_model(dirname="hdfs:/my/path")
+
+            .. code-block:: text
+
+                fleet.save_inference_model(dirname="hdfs:/my/path")
+
         """
         self._fleet_ptr.save_model(dirname, 0)
 
     def print_table_stat(self, table_id, pass_id, threshold):
         """
-        print stat info of table_id,
-        format: tableid, feasign size, mf size
+        Print stat info of table_id, format: tableid, feasign size, mf size.
+
         Args:
-            table_id(int): the id of table
-        Example:
-            .. code-block:: python
-              fleet.print_table_stat(0)
+
+            table_id (int): The id of table.
+            pass_id (int): The id of pass.
+            threshold (float): The threshold of print.
+
+        Examples:
+
+            .. code-block:: text
+
+                fleet.print_table_stat(0)
+
         """
         self._role_maker._barrier_worker()
         if self._role_maker.is_first_worker():
@@ -368,13 +394,19 @@ class PSLib(Fleet):
 
     def set_file_num_one_shard(self, table_id, file_num):
         """
-        set file_num in one shard
+        Set file_num in one shard.
+
         Args:
-            table_id(int): the id of table
-            file_num(int): file num in one shard
-        Example:
-            .. code-block:: python
-              fleet.set_file_num_one_shard(0, 5)
+
+            table_id (int): The id of table.
+            file_num (int): File num in one shard.
+
+        Examples:
+
+            .. code-block:: text
+
+                fleet.set_file_num_one_shard(0, 5)
+
         """
         self._role_maker._barrier_worker()
         if self._role_maker.is_first_worker():
@@ -383,20 +415,28 @@ class PSLib(Fleet):
 
     def save_persistables(self, executor, dirname, main_program=None, **kwargs):
         """
-        save presistable parameters,
-        when using fleet, it will save sparse and dense feature
+        Save persistable parameters,
+        when using fleet, it will save sparse and dense features.
+
         Args:
-            executor(Executor): fluid executor
-            dirname(str): save path. It can be hdfs/afs path or local path
-            main_program(Program): fluid program, default None
-            kwargs: use define property, current support following
-                mode(int): 0 means save all pserver model,
-                           1 means save delta pserver model (save diff),
-                           2 means save xbox base,
-                           3 means save batch model.
-        Example:
-            .. code-block:: python
-              fleet.save_persistables(dirname="/you/path/to/model", mode = 0)
+
+            executor (Executor): Fluid executor.
+            dirname (str): Save path. It can be hdfs/afs path or local path.
+            main_program (Program, optional): Fluid program, default None.
+            kwargs: User-defined properties. Currently the following are supported:
+
+                - mode (int):
+                  0 means save all pserver model,
+                  1 means save delta pserver model (save diff),
+                  2 means save xbox base,
+                  3 means save batch model.
+
+        Examples:
+
+            .. code-block:: text
+
+                fleet.save_persistables(dirname="/you/path/to/model", mode = 0)
+
         """
         mode = kwargs.get("mode", 0)
         self._fleet_ptr.client_flush()
@@ -409,23 +449,28 @@ class PSLib(Fleet):
         self, executor, dirname, whitelist_path, main_program=None, **kwargs
     ):
         """
-        save whitelist, mode is consistent with fleet.save_persistables,
-        when using fleet, it will save sparse and dense feature
+        Save whitelist, mode is consistent with fleet.save_persistables,
+        when using fleet, it will save sparse and dense feature.
 
         Args:
-            executor(Executor): fluid executor
-            dirname(str): save path. It can be hdfs/afs path or local path
-            main_program(Program): fluid program, default None
-            kwargs: use define property, current support following
-                mode(int): 0 means save all pserver model,
-                           1 means save delta pserver model (save diff),
-                           2 means save xbox base,
-                           3 means save batch model.
 
-        Example:
-            .. code-block:: python
+            executor (Executor): Fluid executor.
+            dirname (str): save path. It can be hdfs/afs path or local path.
+            whitelist_path (str): whitelist path. It can be hdfs/afs path or local path.
+            main_program (Program, optional): fluid program, default None.
+            kwargs: User-defined properties. Currently the following are supported:
 
-              fleet.save_persistables(dirname="/you/path/to/model", mode = 0)
+                - mode (int):
+                  0 means save all pserver model,
+                  1 means save delta pserver model (save diff),
+                  2 means save xbox base,
+                  3 means save batch model.
+
+        Examples:
+
+            .. code-block:: text
+
+                fleet.save_persistables(dirname="/you/path/to/model", mode = 0)
 
         """
         mode = kwargs.get("mode", 0)
@@ -440,18 +485,23 @@ class PSLib(Fleet):
 
     def save_multi_table_one_path(self, table_ids, model_dir, **kwargs):
         """
-        save pslib multi sparse table in one path.
+        Save pslib multi sparse table in one path.
+
         Args:
-            table_ids(list): table ids
-            model_dir(str): if you use hdfs, model_dir should starts with
-                            'hdfs:', otherwise means local dir
-            kwargs(dict): user-defined properties.
-                          mode(int): the modes illustrated above, default 0
-                          prefix(str): the parts to save can have prefix,
-                                       for example, part-prefix-000-00000
+
+            table_ids (list): Table ids.
+            model_dir (str): If you use hdfs, model_dir should starts with 'hdfs:', otherwise means local dir.
+            kwargs (dict): User-defined properties.
+
+                - mode (int): The modes illustrated above, default 0.
+                - prefix (str): the parts to save can have prefix, for example, part-prefix-000-00000.
+
         Examples:
-            .. code-block:: python
-              fleet.save_multi_table_one_path("[0, 1]", "afs:/user/path/")
+
+            .. code-block:: text
+
+                fleet.save_multi_table_one_path("[0, 1]", "afs:/user/path/")
+
         """
         mode = kwargs.get("mode", 0)
         self._role_maker._barrier_worker()
@@ -463,21 +513,30 @@ class PSLib(Fleet):
 
     def save_cache_model(self, executor, dirname, main_program=None, **kwargs):
         """
-        save sparse cache table,
-        when using fleet, it will save sparse cache table
+        Save sparse cache table,
+        when using fleet, it will save sparse cache table.
+
         Args:
-            executor(Executor): fluid executor
-            dirname(str): save path. It can be hdfs/afs path or local path
-            main_program(Program): fluid program, default None
-            kwargs: use define property, current support following
-                mode(int): define for feature extension in the future,
-                           currently no use, will pass a default value 0
-                table_id(int): which table to save cache, default is 0
+
+            executor (Executor): Fluid executor.
+            dirname (str): Save path. It can be hdfs/afs path or local path.
+            main_program (Program, optional): Fluid program, default None.
+            kwargs: User-defined properties. Currently the following are supported:
+
+                - mode (int): Define for feature extension in the future,
+                  currently no use, will pass a default value 0.
+                - table_id (int): Which table to save cache, default is 0.
+
         Returns:
-            feasign_num(int): cache feasign num
-        Example:
-            .. code-block:: python
-              fleet.save_cache_model(None, dirname="/you/path/to/model", mode = 0)
+
+            feasign_num (int): cache feasign num.
+
+        Examples:
+
+            .. code-block:: text
+
+                fleet.save_cache_model(None, dirname="/you/path/to/model", mode = 0)
+
         """
         mode = kwargs.get("mode", 0)
         table_id = kwargs.get("table_id", 0)
@@ -506,10 +565,15 @@ class PSLib(Fleet):
 
     def shrink_sparse_table(self):
         """
-        shrink cvm of all sparse embedding in pserver, the decay rate
-        is defined as "show_click_decay_rate" in fleet_desc.prototxt
-        Example:
-            >>> fleet.shrink_sparse_table()
+        Shrink cvm of all sparse embedding in pserver, the decay rate
+        is defined as "show_click_decay_rate" in fleet_desc.prototxt.
+
+        Examples:
+
+            .. code-block:: text
+
+                fleet.shrink_sparse_table()
+
         """
         self._role_maker._barrier_worker()
         if self._role_maker.is_first_worker():
@@ -523,18 +587,22 @@ class PSLib(Fleet):
 
     def shrink_dense_table(self, decay, emb_dim=11, scope=None, table_id=None):
         """
-        shrink batch_sum in pserver by multiplying by decay
+        Shrink batch_sum in pserver by multiplying by decay.
+
         Args:
-            decay(float): the decay rate, usually range in (0, 1)
-            emb_dim(int): one element's length in datanorm layer
-            scope(Scope): Scope object, default is fluid.global_scope()
-            table_id(int): table id of shrinking dense table. None means shrink all,
-                           you should specify it when using multiple scopes,
-                           default is None.
-        Example:
-            >>> fleet.shrink_dense_table(0.98, 11, myscope1, 1)
-            >>> fleet.shrink_dense_table(0.98, 11, myscope1, 2)
-            >>> fleet.shrink_dense_table(0.98, 11, myscope2, 3)
+            decay (float): The decay rate, usually range in (0, 1).
+            emb_dim (int, optional): One element's length in datanorm layer. Default is 11.
+            scope (Scope, optional): Scope object. Default is None, which means paddle.static.global_scope() is used.
+            table_id (int, optional): Table id of shrinking dense table. None means shrink all,
+                you should specify it when using multiple scopes, default is None.
+
+        Examples:
+
+            .. code-block:: text
+
+                fleet.shrink_dense_table(0.98, 11, myscope1, 1)
+                fleet.shrink_dense_table(0.98, 11, myscope1, 2)
+                fleet.shrink_dense_table(0.98, 11, myscope2, 3)
         """
         if scope is None:
             scope = paddle.static.global_scope()
@@ -559,12 +627,17 @@ class PSLib(Fleet):
 
     def clear_one_table(self, table_id):
         """
-        clear_one_table() will be called by user. It will clear one table.
+        This function will be called by user. It will clear one table.
+
         Args:
-            table_id(int): table id
+
+            table_id (int): Table id.
+
         Examples:
-            .. code-block:: python
-              fleet.clear_one_table(0)
+
+            .. code-block:: text
+
+                fleet.clear_one_table(0)
         """
         self._role_maker._barrier_worker()
         if self._role_maker.is_first_worker():
@@ -573,10 +646,13 @@ class PSLib(Fleet):
 
     def clear_model(self):
         """
-        clear_model() will be called by user. It will clear sparse model.
+        This function will be called by user. It will clear sparse model.
+
         Examples:
-            .. code-block:: python
-              fleet.clear_model()
+
+            .. code-block:: text
+
+                fleet.clear_model()
         """
         self._role_maker._barrier_worker()
         if self._role_maker.is_first_worker():
@@ -585,40 +661,39 @@ class PSLib(Fleet):
 
     def load_pslib_whitelist(self, table_id, model_path, **kwargs):
         """
-        load pslib model for one table with whitelist
+        Load pslib model for one table with whitelist.
 
         Args:
-            table_id(int): load table id
-            model_path(str): load model path, can be local or hdfs/afs path
-            kwargs(dict): user defined params, currently support following:
-                only for load pslib model for one table:
-                    mode(int): load model mode. 0 is for load whole model, 1 is
-                               for load delta model (load diff), default is 0.
-                only for load params from paddle model:
-                    scope(Scope): Scope object
-                    model_proto_file(str): path of program desc proto binary
-                                           file, can be local or hdfs/afs file
-                    var_names(list): var name list
-                    load_combine(bool): load from a file or split param files
-                                        default False.
+            table_id (int): Load table id.
+            model_path (str): Load model path, can be local or hdfs/afs path.
+            kwargs (dict): User defined params, currently support following:
+
+                - only for load pslib model for one table:
+                  mode (int): load model mode. 0 is for load whole model, 1 is for load delta model (load diff), default is 0.
+                - only for load params from paddle model:
+                  scope (Scope): Scope object.
+                  model_proto_file (str): Path of program desc proto binary file, can be local or hdfs/afs file.
+                  var_names (list): Var name list.
+                  load_combine (bool): Load from a file or split param files, default False.
 
         Examples:
-            .. code-block:: python
 
-              # load pslib model for one table
-              fleet.load_one_table(0, "hdfs:/my_fleet_model/20190714/0/")
-              fleet.load_one_table(1, "hdfs:/xx/xxx", mode = 0)
+            .. code-block:: text
+
+                # load pslib model for one table
+                fleet.load_one_table(0, "hdfs:/my_fleet_model/20190714/0/")
+                fleet.load_one_table(1, "hdfs:/xx/xxx", mode = 0)
 
-              # load params from paddle model
-              fleet.load_one_table(2, "hdfs:/my_paddle_model/",
-                                   scope = my_scope,
-                                   model_proto_file = "./my_program.bin",
-                                   load_combine = False)
+                # load params from paddle model
+                fleet.load_one_table(2, "hdfs:/my_paddle_model/",
+                                    scope = my_scope,
+                                    model_proto_file = "./my_program.bin",
+                                    load_combine = False)
 
-              # below is how to save proto binary file
-              with open("my_program.bin", "wb") as fout:
-                  my_program = fluid.default_main_program()
-                  fout.write(my_program.desc.serialize_to_string())
+                # below is how to save proto binary file
+                with open("my_program.bin", "wb") as fout:
+                    my_program = fluid.default_main_program()
+                    fout.write(my_program.desc.serialize_to_string())
 
         """
         self._role_maker._barrier_worker()
@@ -631,35 +706,39 @@ class PSLib(Fleet):
 
     def load_one_table(self, table_id, model_path, **kwargs):
         """
-        load pslib model for one table or load params from paddle model
+        Load pslib model for one table or load params from paddle model.
+
         Args:
-            table_id(int): load table id
-            model_path(str): load model path, can be local or hdfs/afs path
-            kwargs(dict): user defined params, currently support following:
-                only for load pslib model for one table:
-                    mode(int): load model mode. 0 is for load whole model, 1 is
-                               for load delta model (load diff), default is 0.
-                only for load params from paddle model:
-                    scope(Scope): Scope object
-                    model_proto_file(str): path of program desc proto binary
-                                           file, can be local or hdfs/afs file
-                    var_names(list): var name list
-                    load_combine(bool): load from a file or split param files
-                                        default False.
+
+            table_id (int): Load table id.
+            model_path (str): Load model path, can be local or hdfs/afs path.
+            kwargs (dict): user defined params, currently support following:
+
+                - only for load pslib model for one table:
+                  mode(int): load model mode. 0 is for load whole model, 1 is for load delta model (load diff), default is 0.
+                - only for load params from paddle model:
+                  scope(Scope): Scope object.
+                  model_proto_file(str): Path of program desc proto binary file, can be local or hdfs/afs file.
+                  var_names(list): var name list.
+                  load_combine(bool): load from a file or split param files, default False.
+
         Examples:
-            .. code-block:: python
-              # load pslib model for one table
-              fleet.load_one_table(0, "hdfs:/my_fleet_model/20190714/0/")
-              fleet.load_one_table(1, "hdfs:/xx/xxx", mode = 0)
-              # load params from paddle model
-              fleet.load_one_table(2, "hdfs:/my_paddle_model/",
-                                   scope = my_scope,
-                                   model_proto_file = "./my_program.bin",
-                                   load_combine = False)
-              # below is how to save proto binary file
-              with open("my_program.bin", "wb") as fout:
-                  my_program = fluid.default_main_program()
-                  fout.write(my_program.desc.serialize_to_string())
+
+            .. code-block:: text
+
+                # load pslib model for one table
+                fleet.load_one_table(0, "hdfs:/my_fleet_model/20190714/0/")
+                fleet.load_one_table(1, "hdfs:/xx/xxx", mode = 0)
+                # load params from paddle model
+                fleet.load_one_table(2, "hdfs:/my_paddle_model/",
+                                    scope = my_scope,
+                                    model_proto_file = "./my_program.bin",
+                                    load_combine = False)
+                # below is how to save proto binary file
+                with open("my_program.bin", "wb") as fout:
+                    my_program = fluid.default_main_program()
+                    fout.write(my_program.desc.serialize_to_string())
+
         """
         self._role_maker._barrier_worker()
         mode = kwargs.get("mode", 0)
@@ -691,15 +770,16 @@ class PSLib(Fleet):
         load_combine=False,
     ):
         """
-        load params from paddle model, and push params to pserver
+        Load params from paddle model, and push params to pserver.
+
         Args:
-            scope(Scope): Scope object
-            table_id(int): the id of table to load
-            model_path(str): path of paddle model, can be local or hdfs/afs file
-            model_proto_file(str): path of program desc proto binary file,
-                                   can be local or hdfs/afs file
-            var_names(list): load var names
-            load_combine(bool): load from a file or split param files
+            scope (Scope): Scope object.
+            table_id (int): The id of table to load.
+            model_path (str): Path of paddle model, can be local or hdfs/afs file.
+            model_proto_file (str): Path of program desc proto binary file, can be local or hdfs/afs file.
+            var_names (list, optional): Load var names. Default is None.
+            load_combine (bool, optional): Load from a file or split param files. Default is False.
+
         """
         self._role_maker._barrier_worker()
         if self._role_maker.is_first_worker():
@@ -800,14 +880,18 @@ class PSLib(Fleet):
            usually for online predict)
         3: load batch model (do some statistic works in checkpoint, such as
            calculate unseen days of each feasign)
+
         Args:
-            model_dir(str): if you use hdfs, model_dir should starts with
-                            'hdfs:', otherwise means local dir
-            kwargs(dict): user-defined properties.
-                          mode(int): the modes illustrated above, default 0
+            model_dir (str, optional): If you use hdfs, model_dir should starts with
+                'hdfs:', otherwise means local dir. Default is None.
+            kwargs (dict): user-defined properties.
+
+                - mode (int): The modes illustrated above, default 0.
+
         Examples:
-            .. code-block:: python
-              fleet.load_model("afs:/user/path/")
+            .. code-block:: text
+
+                fleet.load_model("afs:/user/path/")
         """
         mode = kwargs.get("mode", 0)
         self._role_maker._barrier_worker()
@@ -818,14 +902,19 @@ class PSLib(Fleet):
     def save_model(self, model_dir=None, **kwargs):
         """
         save pslib model, the modes are same with load model.
+
         Args:
-            model_dir(str): if you use hdfs, model_dir should starts with
-                            'hdfs:', otherwise means local dir
-            kwargs(dict): user-defined properties.
-                          mode(int): the modes illustrated above, default 0
+            model_dir (str, optional): If you use hdfs, model_dir should starts with
+                'hdfs:', otherwise means local dir. Default is None.
+            kwargs (dict): user-defined properties.
+
+                - mode (int): The modes illustrated above, default 0.
+
         Examples:
-            .. code-block:: python
-              fleet.save_model("afs:/user/path/")
+            .. code-block:: text
+
+                fleet.save_model("afs:/user/path/")
+
         """
         mode = kwargs.get("mode", 0)
         prefix = kwargs.get("prefix", None)
@@ -836,18 +925,21 @@ class PSLib(Fleet):
 
     def save_one_table(self, table_id, model_dir, **kwargs):
         """
-        save pslib model's one table, the modes are same with load model.
+        Save pslib model's one table, the modes are same with load model.
+
         Args:
-            table_id(int): table id
-            model_dir(str): if you use hdfs, model_dir should starts with
-                            'hdfs:', otherwise means local dir
-            kwargs(dict): user-defined properties.
-                          mode(int): the modes illustrated above, default 0
-                          prefix(str): the parts to save can have prefix,
-                                       for example, part-prefix-000-00000
+            table_id (int): Table id.
+            model_dir (str): if you use hdfs, model_dir should starts with
+                'hdfs:', otherwise means local dir.
+            kwargs (dict): user-defined properties.
+
+                - mode (int): the modes illustrated above, default 0.
+                - prefix (str): the parts to save can have prefix, for example, part-prefix-000-00000.
+
         Examples:
-            .. code-block:: python
-              fleet.save_one_table("afs:/user/path/")
+            .. code-block:: text
+
+                fleet.save_one_table(0, "afs:/user/path/")
         """
         mode = kwargs.get("mode", 0)
         prefix = kwargs.get("prefix", None)
@@ -890,15 +982,17 @@ def _prepare_params(
     dtype='float32',
 ):
     """
-    preprocess params, this interface is not for users.
+    Preprocess params, this interface is not for users.
+
     Args:
-        input(Variable|list of Variable): Input is a Tensor<int64> Variable
-        size(list of int): the embedding dim
-        is_sparse(bool): whether input is sparse ids
-        is_distributed(bool): whether in distributed mode
-        padding_idx(int): padding idx of input
-        param_attr(ParamAttr): To specify the weight parameter property
-        dtype(str): data type of output
+        input (Variable|list of Variable): Input is a Tensor<int64> Variable.
+        size (list of int): The embedding dim.
+        is_sparse (bool, optional): Whether input is sparse ids. Default is False.
+        is_distributed (bool, optional): Whether in distributed mode. Default is False.
+        padding_idx (int, optional): Padding idx of input. Default is None.
+        param_attr (ParamAttr, optional): To specify the weight parameter property. Default is None.
+        dtype (str, optional): Data type of output. Default is 'float32'.
+
     """
     if param_attr is None:
         raise ValueError("param_attr must be set")
@@ -953,15 +1047,16 @@ def _fleet_embedding(
     dtype='float32',
 ):
     """
-    add fleet embedding, this interface is not for users.
+    Add fleet embedding, this interface is not for users.
+
     Args:
-        input(Variable|list of Variable): Input is a Tensor<int64> Variable
-        size(list of int): the embedding dim
-        is_sparse(bool): whether input is sparse ids
-        is_distributed(bool): whether in distributed mode
-        padding_idx(int): padding idx of input
-        param_attr(ParamAttr): To specify the weight parameter property
-        dtype(str): data type of output
+        input (Variable|list of Variable): Input is a Tensor<int64> Variable.
+        size (list[int]): The embedding dim.
+        is_sparse (bool, optional): Whether input is sparse ids. Default is False.
+        is_distributed (bool, optional): Whether in distributed mode. Default is False.
+        padding_idx (int, optional): Padding idx of input. Default is None.
+        param_attr (ParamAttr, optional): To specify the weight parameter property. Default is None.
+        dtype (str, optional): Data type of output. Default is 'float32'.
     """
 
     def _pull_sparse(
@@ -1041,15 +1136,16 @@ def _fleet_embedding_v2(
     dtype='float32',
 ):
     """
-    add fleet embedding v2, this interface is not for users.
+    Add fleet embedding v2, this interface is not for users.
+
     Args:
-        input(Variable|list of Variable): Input is a Tensor<int64> Variable
-        size(list of int): the embedding dim
-        is_sparse(bool): whether input is sparse ids
-        is_distributed(bool): whether in distributed mode
-        padding_idx(int): padding idx of input
-        param_attr(ParamAttr): To specify the weight parameter property
-        dtype(str): data type of output
+        input (Variable|list of Variable): Input is a Tensor<int64> Variable.
+        size (list[int]): The embedding dim.
+        is_sparse (bool, optional): Whether input is sparse ids. Default is False.
+        is_distributed (bool, optional): Whether in distributed mode. Default is False.
+        padding_idx (int, optional): Padding idx of input. Default is None.
+        param_attr (ParamAttr, optional): To specify the weight parameter property. Default is None.
+        dtype (str, optional): Data type of output. Default is 'float32'.
     """
 
     def _pull_sparse_v2(
@@ -1120,16 +1216,19 @@ def _fleet_embedding_v2(
 
 class fleet_embedding:
     """
-    fleet embedding class, it is used as a wrapper
-    Example:
-        .. code-block:: python
-          with fleet_embedding(click_name=label.name):
-              emb = paddle.static.nn.embedding(
-                  input=var,
-                  size=[-1, 11],
-                  is_sparse=True,
-                  is_distributed=True,
-                  param_attr=fluid.ParamAttr(name="embedding"))
+    Fleet embedding class, it is used as a wrapper.
+
+    Examples:
+
+        .. code-block:: text
+
+            with fleet_embedding(click_name=label.name):
+                emb = paddle.static.nn.embedding(
+                    input=var,
+                    size=[-1, 11],
+                    is_sparse=True,
+                    is_distributed=True,
+                    param_attr=fluid.ParamAttr(name="embedding"))
     """
 
     def __init__(self, click_name, scale_sparse_grad=True):
@@ -1165,9 +1264,11 @@ class DownpourOptimizer(DistributedOptimizer):
     run distributed training. The optimized information will be stored in
     Fleet() instance who holds the global information about current distributed
     training.
+
     Args:
         optimizer(Optimizer): subclass of Optimizer.
         strategy(any): config for DownpourOptimizer.
+
     Returns:
         None
     """
@@ -1270,22 +1371,24 @@ class DownpourOptimizer(DistributedOptimizer):
         program_mode="all_reduce",
     ):
         """
-        minimize a program through loss, loss can be a list in DistributedOptimizer.
+        Minimize a program through loss, loss can be a list in DistributedOptimizer.
         Note that in parameter server mode, a worker will not get anything about optimize_os
         Because optimizer algorithms run on pserver side. We will make this usable in pserver
         process, but currently the optimization part is written into Fleet(). A user does not
         need to care about how to startup a pserver node.
+
         Args:
-            losses (Variable|Variable List): loss variable or loss variable list to run optimization.
-            scopes (Scope| Scope List): scope instance.
-            startup_programs (Program|Program List): startup_program for initializing parameters
-                in `parameter_list`.
-            parameter_list (list): list of Variables to update.
-            no_grad_set (set|None): set of Variables should be ignored.
-            program_mode (str|"all_reduce"): grad action for grogram when use_ps_gpu.
+            losses (Variable|Variable List): Loss variable or loss variable list to run optimization.
+            scopes (Scope|Scope List, optional): Scope instance. Default is None.
+            startup_programs (Program|Program List, optional): Startup_program for initializing parameters
+                in `parameter_list`. Default is None.
+            parameter_list (list, optional): List of Variables to update. Default is None.
+            no_grad_set (set, optional): Set of Variables that should be ignored. Default is None.
+            program_mode (str, optional): Grad action for program when use_ps_gpu. Default is "all_reduce".
+
         Returns:
             tuple: (optimize_ops, params_grads) which are, list of operators appended;
-            and list of (param, grad) Variables pair for optimization.
+                and list of (param, grad) Variables pair for optimization.
         """
 
         if not isinstance(losses, list):
diff --git a/python/paddle/incubate/distributed/models/moe/grad_clip.py b/python/paddle/incubate/distributed/models/moe/grad_clip.py
index 59ba6bb8f94..2a0dd89b77f 100644
--- a/python/paddle/incubate/distributed/models/moe/grad_clip.py
+++ b/python/paddle/incubate/distributed/models/moe/grad_clip.py
@@ -22,7 +22,7 @@ from paddle.nn.clip import ClipGradBase, _squared_l2_norm
 
 class ClipGradForMOEByGlobalNorm(ClipGradBase):
     r"""
-    The Algrithm is the same as paddle.nn.ClipGradByGlobalNorm
+    The Algorithm is the same as paddle.nn.ClipGradByGlobalNorm
     Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in
     :math:`t\_list` , and limit it to ``clip_norm`` .
 
@@ -50,7 +50,7 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
 
     Note:
         ``need_clip`` of ``ClipGradyGlobalNorm`` HAS BEEN DEPRECATED since 2.0.
-        Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope.
+        Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.
 
     Reference:
         https://github.com/laekov/fastmoe/blob/master/examples/megatron/clip-grad-v2.2.patch
@@ -64,22 +64,22 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
         group_name (str, optional): The group name for this clip. Default value is ``default_moe_group``.
 
     Examples:
+
         .. code-block:: python
 
-            import paddle
+            >>> import paddle
 
-            x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
-            linear = paddle.nn.Linear(in_features=10, out_features=10,
-                                      weight_attr=paddle.ParamAttr(need_clip=True),
-                                      bias_attr=paddle.ParamAttr(need_clip=False))
-            out = linear(x)
-            loss = paddle.mean(out)
-            loss.backward()
+            >>> x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
+            >>> linear = paddle.nn.Linear(in_features=10, out_features=10,
+            ...                           weight_attr=paddle.ParamAttr(need_clip=True),
+            ...                           bias_attr=paddle.ParamAttr(need_clip=False))
+            >>> out = linear(x)
+            >>> loss = paddle.mean(out)
+            >>> loss.backward()
 
-            is_expert_func = lambda param: "expert_" in param.name
-            clip = paddle.nn.ClipGradForMOEByGlobalNorm(clip_norm=1.0,is_expert_func, None)
-            sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
-            sdg.step()
+            >>> clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)  # paddle.nn does not expose ClipGradForMOEByGlobalNorm, so ClipGradByGlobalNorm is used here.
+            >>> sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
+            >>> sdg.step()
     """
 
     def __init__(
@@ -124,7 +124,7 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
             else:
                 sum_square_list.append(sum_square)
 
-        # all parameters have been filterd out
+        # all parameters have been filtered out
         if (
             len(sum_square_list)
             + len(sum_square_list_fp16)
diff --git a/python/paddle/incubate/distributed/models/moe/moe_layer.py b/python/paddle/incubate/distributed/models/moe/moe_layer.py
index 7c63c431bb9..023dd8fa247 100644
--- a/python/paddle/incubate/distributed/models/moe/moe_layer.py
+++ b/python/paddle/incubate/distributed/models/moe/moe_layer.py
@@ -263,63 +263,68 @@ def prepare_forward(gate, num_expert, world_size, moe_group):
 class MoELayer(nn.Layer):
     """MoE Layer
     Args:
-        d_model: (int) model dimention
-        experts: (nn.LayerList) expert networks list
-        gate: (dict|NaiveGate|SwitchGate|NaiveGate):
-                if gate is a dict:
-                    gate is a gate network config, containing 2 keys:
-                    `type`(str) value can be: "naive", "gshard", "switch" or None, default is "gshard"
-                    `top_k`(int) default value is 2
-                else gate is an instance of NaiveGate|SwitchGate|NaiveGate:
-
-        moe_group: moe group for experts communication
-        mp_group: mp group for mp commutication
-        recompute_interval(int, optional): whether to use recompute, default 0, means to disable recompute.
-        recompute_ctx(dict, optional): the context for recompute, if recompute_interval > 1, recompute_ctx must be given.
+        d_model (int): Model dimension.
+        experts (nn.LayerList): Expert networks list.
+        gate (dict|NaiveGate|SwitchGate|NaiveGate):
+
+            - If gate is a dict:
+              gate is a gate network config, containing 2 keys:
+              `type` (str) value can be: "naive", "gshard", "switch" or None, default is "gshard".
+              `top_k` (int) Default value is 2.
+            - Else gate is an instance of NaiveGate|SwitchGate|NaiveGate.
+
+        moe_group: moe group for experts communication.
+        mp_group: mp group for mp communication.
+        recompute_interval (int, optional): Whether to use recompute, default 0, means to disable recompute.
+        recompute_ctx (dict, optional): The context for recompute, if recompute_interval > 1, recompute_ctx must be given.
+
     Examples:
+
         .. code-block:: python
-        from paddle.nn import layer, LayerList
-        from paddle.distributed.moe import MoElayer
-        from paddle.distributed.collective import Group
-        from paddle.distributed import fleet
-
-        moe_group = Group(fleet.worker_index(),
-                          0,
-                          list(range(fleet.worker_num())))
-        mp_group = None
-
-        num_experts=8
-        dim_feedforward=512
-        d_model=8
-        top_k=2
-
-        class ExpertLayer(Layer):
-            def __init__(self, d_model, d_hidden, name=None,rank=0, windex = 0, num_expert=1):
-                super().__init__()
-                self.htoh4 = nn.Linear(d_model, d_hidden)
-                self.h4toh = nn.Linear(d_hidden, d_model)
-
-            def forward(self, x):
-                x = self.htoh4(x)
-                x = self.h4toh(x)
-                return x
 
-        gate_config = {
-                "type": "gshard",
-                "top_k": top_k,
-        }
-
-        experts_list = LayerList()
-        for expi in range(num_experts):
-            exp_layer = ExpertLayer(d_model, dim_feedforward // top_k, windex=expi, num_expert=num_experts)
-            experts_list.append(exp_layer)
-
-        moeLayer = MoELayer(d_model = d_model,
-                            experts=experts_list,
-                            gate=gate_config,
-                            moe_group=moe_group,
-                            mp_group=mp_group,
-                            recompute_interval=0)
+            >>> # doctest: +SKIP('Until Distributed move successfully, just skip it')
+            >>> from paddle.nn import Layer, LayerList
+            >>> from paddle.incubate.distributed.models.moe.moe_layer import MoELayer
+            >>> from paddle.distributed.collective import Group
+            >>> from paddle.distributed import fleet
+
+            >>> moe_group = Group(fleet.worker_index(),
+            ...                   0,
+            ...                   list(range(fleet.worker_num())))
+            >>> mp_group = None
+
+            >>> num_experts=8
+            >>> dim_feedforward=512
+            >>> d_model=8
+            >>> top_k=2
+
+            >>> class ExpertLayer(Layer):
+            ...     def __init__(self, d_model, d_hidden, name=None,rank=0, windex = 0, num_expert=1):
+            ...         super().__init__()
+            ...         self.htoh4 = nn.Linear(d_model, d_hidden)
+            ...         self.h4toh = nn.Linear(d_hidden, d_model)
+            ...
+            ...     def forward(self, x):
+            ...         x = self.htoh4(x)
+            ...         x = self.h4toh(x)
+            ...         return x
+
+            >>> gate_config = {
+            ...         "type": "gshard",
+            ...         "top_k": top_k,
+            ... }
+
+            >>> experts_list = LayerList()
+            >>> for expi in range(num_experts):
+            ...     exp_layer = ExpertLayer(d_model, dim_feedforward // top_k, windex=expi, num_expert=num_experts)
+            ...     experts_list.append(exp_layer)
+
+            >>> moeLayer = MoELayer(d_model = d_model,
+            ...                     experts=experts_list,
+            ...                     gate=gate_config,
+            ...                     moe_group=moe_group,
+            ...                     mp_group=mp_group,
+            ...                     recompute_interval=0)
 
     """
 
diff --git a/python/paddle/incubate/layers/nn.py b/python/paddle/incubate/layers/nn.py
index f1b2c8fbff2..1777aaa9a4b 100644
--- a/python/paddle/incubate/layers/nn.py
+++ b/python/paddle/incubate/layers/nn.py
@@ -54,37 +54,40 @@ def fused_embedding_seq_pool(
         size (tuple|list): The shape of the lookup_table parameter. It should
             have two elements which indicate the size of the dictionary of
             embedding and the size of each embedding vector respectively.
-        is_sparse (bool): The flag indicating whether to use sparse update.
+        is_sparse (bool, optional): The flag indicating whether to use sparse update.
             Default: False.
-        padding_idx (int|long|None): It will output all-zero padding data whenever
+        padding_idx (int|long|None, optional): It will output all-zero padding data whenever
             lookup encounters :math:`padding\_idx` in Ids. If set :attr:`None`, it makes
             no effect to output. If :math:`padding\_idx < 0`, the :math:`padding\_idx`
             will automatically be converted to :math:`size[0] + padding\_idx` to use.
             Default: None.
-        combiner (str): The pooling type of sequence_pool, and only support `sum`.
+        combiner (str, optional): The pooling type of sequence_pool, and only support `sum`.
             Default: sum.
-        param_attr (ParamAttr): Parameters for this layer.
-        dtype (np.dtype|core.VarDesc.VarType|str): The dtype refers to the data type of output
-            tensor. It can be float32, float_16, int etc.
+        param_attr (ParamAttr, optional): Parameters for this layer. Default: None.
+        dtype (np.dtype|core.VarDesc.VarType|str, optional): The dtype refers to the data type of output
+            tensor. It can be float32, float16, int etc. Default: float32.
+
     Returns:
         The Tensor of sequence pooling.
+
     Examples:
         .. code-block:: python
-            import numpy as np
-            import paddle.fluid as fluid
-            import paddle
-            paddle.enable_static()
-
-            dict_size = 20
-            data_t = paddle.static.data(
-                name='word', shape=[-1, 1], dtype='int64', lod_level=1)
-            padding_idx = np.random.randint(1, 10)
-            out = paddle.incubate.layers.fused_embedding_seq_pool(
-                input=data_t,
-                size=[dict_size, 32],
-                param_attr='w',
-                padding_idx=padding_idx,
-                is_sparse=False)
+
+            >>> import numpy as np
+            >>> import paddle
+            >>> paddle.enable_static()
+
+            >>> dict_size = 20
+            >>> data_t = paddle.static.data(
+            ...     name='word', shape=[-1, 1], dtype='int64', lod_level=1)
+            >>> padding_idx = np.random.randint(1, 10)
+            >>> out = paddle.incubate.layers.fused_embedding_seq_pool(
+            ...     input=data_t,
+            ...     size=[dict_size, 32],
+            ...     param_attr='w',
+            ...     padding_idx=padding_idx,
+            ...     is_sparse=False)
+
     """
     helper = LayerHelper('fused_embedding_seq_pool', **locals())
     w = helper.create_parameter(
@@ -130,27 +133,25 @@ def fused_seqpool_cvm(
         cvm_offset(int, optional): cvm offset. Default: 2, which means cvm contains show, click.
 
     Returns:
-        Tensor : The tensor storing sequence pool and cvm
-        of input.
+        Tensor : The tensor storing sequence pool and cvm of input.
 
     Examples:
         .. code-block:: python
 
-            import paddle
-            import paddle.fluid as fluid
-            paddle.enable_static()
+            >>> import paddle
+            >>> paddle.enable_static()
 
-            data = paddle.static.data(name='x', shape=[-1, 1], dtype='int64', lod_level=1)
-            data2 = paddle.static.data(name='y', shape=[-1, 1], dtype='int64', lod_level=1)
-            inputs = [data, data2]
-            embs = paddle.incubate.layers.nn._pull_box_sparse(input=inputs, size=11, is_distributed=True, is_sparse=True)
+            >>> data = paddle.static.data(name='x', shape=[-1, 1], dtype='int64', lod_level=1)
+            >>> data2 = paddle.static.data(name='y', shape=[-1, 1], dtype='int64', lod_level=1)
+            >>> inputs = [data, data2]
+            >>> embs = paddle.incubate.layers.nn._pull_box_sparse(input=inputs, size=11, is_distributed=True, is_sparse=True)
 
-            label = paddle.static.data(name="label", shape=[-1, 1], dtype="int64", lod_level=1)
-            ones = fluid.layers.fill_constant_batch_size_like(input=label, shape=[-1, 1], dtype="int64", value=1)
-            show_clk = paddle.cast(paddle.concat([ones, label], axis=1), dtype='float32')
-            show_clk.stop_gradient = True
+            >>> label = paddle.static.data(name="label", shape=[-1, 1], dtype="int64", lod_level=1)
+            >>> ones = paddle.static.data(name="ones", shape=[-1, 1], dtype="int64", lod_level=1)
+            >>> show_clk = paddle.cast(paddle.concat([ones, label], axis=1), dtype='float32')
+            >>> show_clk.stop_gradient = True
 
-            cvms = paddle.incubate.layers.fused_seqpool_cvm(embs, 'sum', show_clk)
+            >>> cvms = paddle.incubate.layers.fused_seqpool_cvm(embs, 'sum', show_clk)
 
 
     """
@@ -212,10 +213,10 @@ def multiclass_nms2(
     In the NMS step, this operator greedily selects a subset of detection bounding
     boxes that have high scores larger than score_threshold, if providing this
     threshold, then selects the largest nms_top_k confidences scores if nms_top_k
-    is larger than -1. Then this operator pruns away boxes that have high IOU
+    is larger than -1. Then this operator prunes away boxes that have high IOU
     (intersection over union) overlap with already selected boxes by adaptive
     threshold NMS based on parameters of nms_threshold and nms_eta.
-    Aftern NMS step, at most keep_top_k number of total bboxes are to be kept
+    After NMS step, at most keep_top_k number of total bboxes are to be kept
     per image if keep_top_k is larger than -1.
 
     Args:
@@ -228,7 +229,7 @@ def multiclass_nms2(
                            [xmin, ymin, xmax, ymax], when box size equals to 4.
                            2. (LoDTensor) A 3-D Tensor with shape [M, C, 4]
                            M is the number of bounding boxes, C is the
-                           class number
+                           class number.
         scores (Tensor): Two types of scores are supported:
                            1. (Tensor) A 3-D Tensor with shape [N, C, M]
                            represents the predicted confidence predictions.
@@ -241,22 +242,22 @@ def multiclass_nms2(
                            M is the number of bbox, C is the class number.
                            In this case, input BBoxes should be the second
                            case with shape [M, C, 4].
-        background_label (int): The index of background label, the background
-                                label will be ignored. If set to -1, then all
-                                categories will be considered. Default: 0
         score_threshold (float): Threshold to filter out bounding boxes with
                                  low confidence score. If not provided,
                                  consider all boxes.
         nms_top_k (int): Maximum number of detections to be kept according to
                          the confidences after the filtering detections based
                          on score_threshold.
-        nms_threshold (float): The threshold to be used in NMS. Default: 0.3
-        nms_eta (float): The threshold to be used in NMS. Default: 1.0
         keep_top_k (int): Number of total bboxes to be kept per image after NMS
                           step. -1 means keeping all bboxes after NMS step.
-        normalized (bool): Whether detections are normalized. Default: True
-        return_index(bool): Whether return selected index. Default: False
-        name(str): Name of the multiclass nms op. Default: None.
+        nms_threshold (float, optional): The threshold to be used in NMS. Default: 0.3.
+        normalized (bool, optional): Whether detections are normalized. Default: True.
+        nms_eta (float, optional): The threshold to be used in NMS. Default: 1.0.
+        background_label (int, optional): The index of background label, the background
+                                label will be ignored. If set to -1, then all
+                                categories will be considered. Default: 0.
+        return_index (bool, optional): Whether return selected index. Default: False.
+        name (str, optional): Name of the multiclass nms op. Default: None.
 
     Returns:
         A tuple with two dimensions of the tensor: (Out, Index) if return_index is True,
@@ -279,23 +280,21 @@ def multiclass_nms2(
     Examples:
         .. code-block:: python
 
-
-            import paddle.fluid as fluid
-            import paddle
-            paddle.enable_static()
-            boxes = paddle.static.data(name='bboxes', shape=[-1, 81, 4],
-                                      dtype='float32', lod_level=1)
-            scores = paddle.static.data(name='scores', shape=[-1, 81],
-                                      dtype='float32', lod_level=1)
-            out, index = paddle.incubate.layers.multiclass_nms2(bboxes=boxes,
-                                              scores=scores,
-                                              background_label=0,
-                                              score_threshold=0.5,
-                                              nms_top_k=400,
-                                              nms_threshold=0.3,
-                                              keep_top_k=200,
-                                              normalized=False,
-                                              return_index=True)
+            >>> import paddle
+            >>> paddle.enable_static()
+            >>> boxes = paddle.static.data(name='bboxes', shape=[-1, 81, 4],
+            ...                           dtype='float32', lod_level=1)
+            >>> scores = paddle.static.data(name='scores', shape=[-1, 81],
+            ...                           dtype='float32', lod_level=1)
+            >>> out, index = paddle.incubate.layers.multiclass_nms2(bboxes=boxes,
+            ...                                   scores=scores,
+            ...                                   background_label=0,
+            ...                                   score_threshold=0.5,
+            ...                                   nms_top_k=400,
+            ...                                   nms_threshold=0.3,
+            ...                                   keep_top_k=200,
+            ...                                   normalized=False,
+            ...                                   return_index=True)
     """
     helper = LayerHelper('multiclass_nms2', **locals())
 
@@ -353,26 +352,27 @@ def search_pyramid_hash(
         pyramid_layer (int): The number of pyramid layers. It should be greater than 2.
         rand_len (int): The minimum length of pyramid hash cell.
         drop_out_percent (float): The probability of dropping out the input token randomly.
-            It should satisfy: [0., 1.]
+            It should satisfy: [0., 1.].
         is_training (bool): Whether in training or testing phrase.
-        use_filter(bool): If set True, the white filter and black filter should be given by
+        use_filter (bool): If set True, the white filter and black filter should be given by
             :attr:`param_attr_wl` and :attr:`param_attr_bl` .
-        white_list_len(int): If set :math:`white_list_len>0` , white filter with shape [white_list_len, 1]
+        white_list_len (int): If set :math:`white_list_len>0` , white filter with shape [white_list_len, 1]
             should be provided by param_attr_wl.
-        black_list_len(int): If set :math:`black_list_len>0` , black filter with shape [black_list_len, 1]
+        black_list_len (int): If set :math:`black_list_len>0` , black filter with shape [black_list_len, 1]
             should be provided by param_attr_bl.
-        seed(int): The number of random seed.
-        lr(float): The learning rate of weight created by :attr:`param_attr` with shape [space_len+rand_len, 1]
+        seed (int): The number of random seed.
+        lr (float): The learning rate of weight created by :attr:`param_attr` with shape [space_len+rand_len, 1]
             in this layer.
-        param_attr(ParamAttr): To specify the weight parameter property. Default: None, which means the
+        param_attr (ParamAttr, optional): To specify the weight parameter property. Default: None, which means the
             default weight parameter property is used. See usage for details in :ref:`api_fluid_ParamAttr` .
-        param_attr_wl(ParamAttr): Specified parameters of white filter.
-        param_attr_bl(ParamAttr): Specified parameters of black filter.
-        distribute_update_vars(list[ParamAttr.name]): Decided which params should be updated in distribute training.
-            Used in Distribute Transpiler to create a trainer/server program.
-        name(str, optional): The default value is None.  Normally there is no need for user to set this property.
-            For more information, please refer to :ref:`api_guide_Name` .
-        dtype(str): The data type of output Tensor, float32.
+        param_attr_wl (ParamAttr, optional): Specified parameters of white filter. Default: None.
+        param_attr_bl (ParamAttr, optional): Specified parameters of black filter. Default: None.
+        distribute_update_vars (list[ParamAttr.name], optional): Decided which params should be updated in distribute training.
+            Used in Distribute Transpiler to create a trainer/server program. Default: None.
+        name (str, optional): The default value is None.  Normally there is no need for user to set this property.
+            For more information, please refer to :ref:`api_guide_Name` . Default: None.
+        dtype (str, optional): The data type of output Tensor, float32. Default: float32.
+
     Returns:
         Tensor: LoDTensor of pyramid hash embedding.
     """
@@ -451,25 +451,25 @@ def shuffle_batch(x, seed=None):
     :attr:`x` is a LoDTensor to be shuffled with shape :math:`[N_1, N_2, ..., N_k, D]` . Note that the last dim of input will not be shuffled.
     :math:`N_1 * N_2 * ... * N_k` numbers of elements with length :math:`D` will be shuffled randomly.
 
-    For Example:
+    Examples:
 
-    .. code-block:: text
+        .. code-block:: text
 
-      Input:
-        x.data = [[1, 2], [3, 4], [5, 6], [7, 8]]
-        x.dims = [4, 2]
+            Input:
+              x.data = [[1, 2], [3, 4], [5, 6], [7, 8]]
+              x.dims = [4, 2]
 
-      Attrs:
-        seed = 2019
+            Attrs:
+              seed = 2019
 
-      Output:
-        Out.data =[[7, 8], [1, 2], [3, 4], [5, 6]]
-        Out.dims = [4, 2]
+            Output:
+              Out.data =[[7, 8], [1, 2], [3, 4], [5, 6]]
+              Out.dims = [4, 2]
 
     Args:
         x (Tensor): The input Tensor. The input Tensor is a N-D LoDTensor with type int, float32 or float64.
-        seed (None|int|Tensor): The start up seed. If set, seed will be set as the start up seed of shuffle engine.
-                If not set(Default), start up seed of shuffle engine will be generated randomly.
+        seed (None|int|Tensor, optional): The start up seed. If set, seed will be set as the start up seed of shuffle engine.
+            If not set(Default), start up seed of shuffle engine will be generated randomly. Default: None.
 
     Returns:
         Tensor: The shuffled LoDTensor with the same shape and lod as input.
@@ -478,11 +478,10 @@ def shuffle_batch(x, seed=None):
 
         .. code-block:: python
 
-            import paddle.fluid as fluid
-            import paddle
-            paddle.enable_static()
-            x = paddle.static.data(name="x", shape=[-1, 4])
-            out = paddle.incubate.layers.shuffle_batch(x)
+            >>> import paddle
+            >>> paddle.enable_static()
+            >>> x = paddle.static.data(name="x", shape=[-1, 4])
+            >>> out = paddle.incubate.layers.shuffle_batch(x)
     """
     helper = LayerHelper('shuffle_batch', **locals())
 
@@ -526,7 +525,7 @@ def partial_concat(input, start_index=0, length=-1):
                  [9, 10, 11]]
             output = partial_concat([x, y], start_index=0, length=2)
 
-          we get:
+        We get:
 
             output = [[0, 1, 6, 7],
                       [3, 4, 9, 10]]
@@ -534,20 +533,22 @@ def partial_concat(input, start_index=0, length=-1):
     Args:
         input(list): List of input Tensors with data type float32, float64, int32,
             int64.
-        start_index(int32): The start index of each instance for partial concatenation.
+        start_index(int32, optional): The start index of each instance for partial concatenation.
             Default is 0.
-        length(int32): The length of each instance for partial concatenation. Default is -1.
+        length(int32, optional): The length of each instance for partial concatenation. Default is -1.
             Negative values for all elements after start_index.
+
     Returns:
         Tensor: A Tensor with the same data type as input's.
+
     Examples:
         .. code-block:: python
-            import paddle.fluid as fluid
-            import paddle
-            x = paddle.randn(name="x", shape=[1,3], dtype="float32")
-            y = paddle.randn(name="y", shape=[1,3], dtype="float32")
-            concat = paddle.incubate.layers.partial_concat(
-                [x, y], start_index=0, length=2)
+
+            >>> import paddle
+            >>> x = paddle.randn(name="x", shape=[1,3], dtype="float32")
+            >>> y = paddle.randn(name="y", shape=[1,3], dtype="float32")
+            >>> concat = paddle.incubate.layers.partial_concat(
+            ...     [x, y], start_index=0, length=2)
     """
     if not isinstance(input, list):
         warnings.warn(
@@ -584,6 +585,7 @@ def partial_sum(input, start_index=0, length=-1):
     This Op exists in incubate layers, which means that it is not shown to the public.
     Only 2-D Tensor or LodTensor input is supported. Slice and concat can only be
     performed along the second dimension.
+
     .. code-block:: text
 
         Given:
@@ -592,30 +594,29 @@ def partial_sum(input, start_index=0, length=-1):
             y = [[6, 7 ,8],
                  [9, 10, 11]]
             output = partial_sum([x, y], start_index=0, length=2)
-          we get:
+
+        We get:
 
             output = [[6, 8],
                       [12, 14]]
     Args:
-        input(list): List of input Tensors with data type float32, float64, int32,
+        input (list): List of input Tensors with data type float32, float64, int32,
             int64.
+        start_index (int32, optional): The start index of each instance for partial sum. Default is 0.
+        length (int32, optional): The length of each instance for partial sum. Default is -1.
+
     Returns:
         Tensor: A Tensor with the same data type as input's.
+
     Examples:
         .. code-block:: python
-        import paddle.fluid as fluid
-        import numpy as np
-        import paddle
-        paddle.enable_static()
-
-        x = paddle.static.data(name="x", shape=[2, 3], dtype="float32")
-        y = paddle.static.data(name="y", shape=[2, 3], dtype="float32")
-        sum = paddle.incubate.layers.partial_sum([x,y], start_index=0, length=2)
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        xx = np.array([1,2,3,4,5,6]).reshape((2,3)).astype("float32")
-        yy = np.array([6,5,4,4,5,6]).reshape((2,3)).astype("float32")
-        out = exe.run(feed={"x":xx, "y":yy}, fetch_list=[sum])
+
+            >>> import paddle
+            >>> paddle.enable_static()
+
+            >>> x = paddle.static.data(name="x", shape=[2, 3], dtype="float32")
+            >>> y = paddle.static.data(name="y", shape=[2, 3], dtype="float32")
+            >>> sum = paddle.incubate.layers.partial_sum([x,y], start_index=0, length=2)
     """
     for id, x in enumerate(input):
         check_variable_and_dtype(
@@ -642,6 +643,7 @@ def tdm_child(x, node_nums, child_nums, param_attr=None, dtype='int32'):
     **Tdm Child**
      According to the input node_id on the given tree, return the corresponding child node_id and
       whether child is a leaf node by leaf_mask value.
+
     .. code-block:: text
 
         Given:
@@ -650,25 +652,26 @@ def tdm_child(x, node_nums, child_nums, param_attr=None, dtype='int32'):
             node_nums = 7
             child_nums = 2
 
-          we get:
+        We get:
             child = [[5, 6],
                      [0, 0]]
             leaf_mask = [[1, 1],
                          [0, 0]]
+
     Args:
-        x(Tensor): Tensor contained the node_id information, dtype support int32/int64.
-        node_nums(int): Number of total nodes.
-        child_nums(int): Maximum number of child nodes per node.
-        param_attr(ParamAttr): To specify the tdm-tree-info parameter property. Default: None, which means the
+        x (Tensor): Tensor containing the node_id information. Supported dtypes are int32/int64.
+        node_nums (int): Number of total nodes.
+        child_nums (int): Maximum number of child nodes per node.
+        param_attr (ParamAttr, optional): To specify the tdm-tree-info parameter property. Default: None, which means the
             default weight parameter property is used. See usage for details in: ref: `api_fluid_ParamAttr`, should
-            has shape(node_nums, 3 + child_nums), dtype support int32/int64.
+            has shape (node_nums, 3 + child_nums), dtype support int32/int64.
             The dimension[1] of tdm-tree-info contains the following:
-            1. Item_id(int, shape(1)), if node is a leaf node, give its item_id corresponding to node_id, else give 0.
-            2. Layer_id(int, shape(1)), indicates which layer the node is on.
-            3. Parent_id(int, shape(1)), node's parent node.
-            4. Child_id(int, shape(child_nums)), all child node's node_id of this node should be given.
-            If the number of child nodes is insufficient, padding 0 until child nums equal to child_nums
-        dtype(str): The data type of output child and leaf_mask, support int32/int64.
+            1. Item_id (int, shape(1)), if node is a leaf node, give its item_id corresponding to node_id, else give 0.
+            2. Layer_id (int, shape(1)), indicates which layer the node is on.
+            3. Parent_id (int, shape(1)), node's parent node.
+            4. Child_id (int, shape(child_nums)), all child node's node_id of this node should be given.
+            If the number of child nodes is insufficient, pad with 0 until the number of children equals child_nums.
+        dtype (str, optional): The data type of output child and leaf_mask, support int32/int64. Default: int32.
 
     Returns:
         tuple: A tuple including input node's child(Tensor) and leaf_mask(Tensor).
@@ -676,27 +679,23 @@ def tdm_child(x, node_nums, child_nums, param_attr=None, dtype='int32'):
 
     Examples:
         .. code-block:: python
-        import paddle
-        import paddle.fluid as fluid
-        import numpy as np
-        paddle.enable_static()
-        x = paddle.static.data(name="x", shape=[None, 1], dtype="int32", lod_level=1)
-        tree_info = [[0,0,0,1,2],
-                     [0,1,0,3,4],[0,1,0,5,6],
-                     [0,2,1,0,0],[1,2,1,0,0],[2,2,2,0,0],[3,2,2,0,0]]
-        tree_info_np = np.array(tree_info)
-        tree_info_np = np.reshape(tree_info_np, (7,5))
-        node_nums = 7
-        child_nums = 2
-        child, leaf_mask  = paddle.incubate.layers.tdm_child(x, node_nums, child_nums,
-                                param_attr=fluid.ParamAttr(
-                                    initializer=paddle.nn.initializer.Assign(
-                                                                            tree_info_np)))
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        exe.run(fluid.default_startup_program())
-        xx = np.array([[2],[3]]).reshape((2,1)).astype("int32")
-        child_res, leaf_mask_res = exe.run(feed={"x":xx}, fetch_list=[child, leaf_mask])
+
+            >>> import paddle
+            >>> import numpy as np
+            >>> paddle.enable_static()
+
+            >>> x = paddle.static.data(name="x", shape=[None, 1], dtype="int32", lod_level=1)
+            >>> tree_info = [[0,0,0,1,2],
+            ...             [0,1,0,3,4],[0,1,0,5,6],
+            ...             [0,2,1,0,0],[1,2,1,0,0],[2,2,2,0,0],[3,2,2,0,0]]
+            >>> tree_info_np = np.array(tree_info)
+            >>> tree_info_np = np.reshape(tree_info_np, (7,5))
+            >>> node_nums = 7
+            >>> child_nums = 2
+            >>> child, leaf_mask  = paddle.incubate.layers.tdm_child(x, node_nums, child_nums,
+            ...                     param_attr=paddle.ParamAttr(
+            ...                     initializer=paddle.nn.initializer.Assign(tree_info_np)))
+
     """
     helper = LayerHelper("tdm_child", **locals())
     check_dtype(
@@ -740,6 +739,7 @@ def tdm_sampler(
     """
     **Tdm Sampler**
     According to the input positive samples at leaf node(x), do negative sampling layer by layer on the given tree.
+
     .. code-block:: text
 
         Given:
@@ -753,7 +753,7 @@ def tdm_sampler(
             leaf_node_num = 4
             output_list = False
 
-          we get:
+        We get:
             out = [[1, 3], [1, 4], [2, 5], [2, 6]]
             labels = [[1, 1], [1, 1], [1, 1], [1, 1]]
             mask = [[1, 1], [1, 1], [1, 1], [1, 1]]
@@ -763,21 +763,21 @@ def tdm_sampler(
         neg_samples_num_list (list(int)): Number of negative samples per layer.
         layer_node_num_list (list(int)): Number of nodes per layer, must has same shape with neg_samples_num_list.
         leaf_node_num (int): Number of leaf nodes.
-        tree_travel_attr (ParamAttr): To specify the tdm-travel parameter property. Default: None, which means the
+        tree_travel_attr (ParamAttr, optional): To specify the tdm-travel parameter property. Default: None, which means the
             default weight parameter property is used. See usage for details in :ref:`api_fluid_ParamAttr`, should
             has shape (leaf_node_num, len(layer_node_num_list)), dtype support int32/int64.
-        tree_layer_attr (ParamAttr): To specify the tdm-layer parameter property. Default: None, which means the
+        tree_layer_attr (ParamAttr, optional): To specify the tdm-layer parameter property. Default: None, which means the
             default weight parameter property is used. See usage for details in :ref:`api_fluid_ParamAttr`, should
             has shape (node_num, 1), dtype support int32/int64.
-        output_positive (bool): Whether to output positive samples (includ label and mask )at the same time.
-        output_list (bool): Whether to divide the output into layers and organize it into list format.
-        seed (int): The number of random seed.
-        tree_dtype(np.dtype|core.VarDesc.VarType|str): The dtype of tdm-travel and tdm-layer, support int32/int64
-        dtype(np.dtype|core.VarDesc.VarType|str): The dtype of output(sampling results, labels and masks)
+        output_positive (bool, optional): Whether to output positive samples (including label and mask) at the same time. Default: True.
+        output_list (bool, optional): Whether to divide the output into layers and organize it into list format. Default: True.
+        seed (int, optional): The random seed. Default: 0.
+        tree_dtype (np.dtype|core.VarDesc.VarType|str, optional): The dtype of tdm-travel and tdm-layer, support int32/int64. Default: int32.
+        dtype (np.dtype|core.VarDesc.VarType|str, optional): The dtype of output (sampling results, labels and masks). Default: int32.
 
     Returns:
         tuple: A tuple including sampling results, corresponding labels and masks. if output_positive = True, sampling
-            result  will include both positive and negative samples. If sampling reseult is a positive sample, the label is 1,
+            result will include both positive and negative samples. If the sampling result is a positive sample, the label is 1,
             and if it is a negative sample, it is 0. If the tree is unbalanced, in order to ensure the consistency of the
             sampling result shape, the padding sample's mask = 0, the real sample's mask value = 1.
             If output_list = True, the result will organize into list format specified by layer information.
@@ -785,43 +785,37 @@ def tdm_sampler(
 
     Examples:
         .. code-block:: python
-        import paddle
-        import paddle.fluid as fluid
-        import numpy as np
-        paddle.enable_static()
-        x = paddle.static.data(name="x", shape=[None, 1], dtype="int32", lod_level=1)
-        travel_list = [[1, 3], [1, 4], [2, 5], [2, 6]] # leaf node's travel path, shape(leaf_node_num, layer_num)
-        layer_list_flat = [[1], [2], [3], [4], [5], [6]] # shape(node_nums, 1)
-
-        neg_samples_num_list = [0, 0] # negative sample nums = 0
-        layer_node_num_list = [2, 4] #two layer (exclude root node)
-        leaf_node_num = 4
-
-        travel_array = np.array(travel_list)
-        layer_array = np.array(layer_list_flat)
-
-        sample, label, mask = paddle.incubate.layers.tdm_sampler(
-            x,
-            neg_samples_num_list,
-            layer_node_num_list,
-            leaf_node_num,
-            tree_travel_attr=fluid.ParamAttr(
-                initializer=paddle.nn.initializer.Assign(
-                    travel_array)),
-            tree_layer_attr=fluid.ParamAttr(
-                initializer=paddle.nn.initializer.Assign(
-                    layer_array)),
-            output_positive=True,
-            output_list=True,
-            seed=0,
-            tree_dtype='int32')
-
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        exe.run(fluid.default_startup_program())
-        xx = np.array([[0],[1]]).reshape((2,1)).astype("int32")
-
-        exe.run(feed={"x":xx})
+
+            >>> import paddle
+            >>> import numpy as np
+            >>> paddle.enable_static()
+
+            >>> x = paddle.static.data(name="x", shape=[None, 1], dtype="int32", lod_level=1)
+            >>> travel_list = [[1, 3], [1, 4], [2, 5], [2, 6]] # leaf node's travel path, shape(leaf_node_num, layer_num)
+            >>> layer_list_flat = [[1], [2], [3], [4], [5], [6]] # shape(node_nums, 1)
+
+            >>> neg_samples_num_list = [0, 0] # negative sample nums = 0
+            >>> layer_node_num_list = [2, 4] #two layer (exclude root node)
+            >>> leaf_node_num = 4
+
+            >>> travel_array = np.array(travel_list)
+            >>> layer_array = np.array(layer_list_flat)
+
+            >>> sample, label, mask = paddle.incubate.layers.tdm_sampler(
+            ...     x,
+            ...     neg_samples_num_list,
+            ...     layer_node_num_list,
+            ...     leaf_node_num,
+            ...     tree_travel_attr=paddle.ParamAttr(
+            ...         initializer=paddle.nn.initializer.Assign(
+            ...            travel_array)),
+            ...     tree_layer_attr=paddle.ParamAttr(
+            ...         initializer=paddle.nn.initializer.Assign(
+            ...             layer_array)),
+            ...     output_positive=True,
+            ...     output_list=True,
+            ...     seed=0,
+            ...     tree_dtype='int32')
 
     """
     helper = LayerHelper("tdm_sampler", **locals())
@@ -968,30 +962,33 @@ def rank_attention(
     rank_param gives the organization of data. Notice: It currently supports
     GPU device.
     This Op exists in incubate layers, which means that it is not shown to the public.
+
     Args:
-        input: Tensor with data type float32, float64.
-        rank_offset: Tensor with data type int32.
-        rank_para_shape: The shape of rank_param.
-        rank_param_attr: Attribute initializer of rank_param.
-        max_rank: The max rank of input's ranks.
+        input (Tensor): Tensor with data type float32, float64.
+        rank_offset (Tensor): Tensor with data type int32.
+        rank_param_shape (list[int]): The shape of rank_param.
+        rank_param_attr (ParamAttr): Attribute initializer of rank_param.
+        max_rank (int, optional): The max rank of input's ranks. Default is 3.
+        max_size (int, optional): The max size of input's ranks. Default is 0.
     Returns:
         Tensor: A Tensor with the same data type as input's.
+
     Examples:
         .. code-block:: python
-           import paddle.fluid as fluid
-           import paddle
-           paddle.enable_static()
-
-           input = paddle.static.data(name="input", shape=[None, 2], dtype="float32")
-           rank_offset = paddle.static.data(name="rank_offset", shape=[None, 7], dtype="int32")
-           out = paddle.incubate.layers.rank_attention(input=input,
-                                                     rank_offset=rank_offset,
-                                                     rank_param_shape=[18,3],
-                                                     rank_param_attr=
-                                                     paddle.ParamAttr(learning_rate=1.0,
-                                                                     name="ubm_rank_param.w_0"),
-                                                      max_rank=3,
-                                                      max_size=0)
+
+            >>> import paddle
+            >>> paddle.enable_static()
+
+            >>> input = paddle.static.data(name="input", shape=[None, 2], dtype="float32")
+            >>> rank_offset = paddle.static.data(name="rank_offset", shape=[None, 7], dtype="int32")
+            >>> out = paddle.incubate.layers.rank_attention(input=input,
+            ...                                             rank_offset=rank_offset,
+            ...                                             rank_param_shape=[18,3],
+            ...                                             rank_param_attr=
+            ...                                             paddle.ParamAttr(learning_rate=1.0,
+            ...                                                              name="ubm_rank_param.w_0"),
+            ...                                             max_rank=3,
+            ...                                             max_size=0)
     """
     helper = LayerHelper('rank_attention', **locals())
     dtype = helper.input_dtype(input_param_name='input')
@@ -1027,34 +1024,35 @@ def batch_fc(input, param_size, param_attr, bias_size, bias_attr, act=None):
     except that the bias and relu activation layers are added.
     Notice: It currently supports GPU device.
     This Op exists in incubate layers, which means that it is not shown to the public.
+
     Args:
-        input: Tensor with data type float32, float64.
-        param_size: The size of w.
-        param_attr: Attribute initializer of w.
-        bias_size: The size of bias.
-        bias_attr: Attribute initializer of bias.
-        act: Activation to be applied to the output of this layer.
+        input (Tensor): Tensor with data type float32, float64.
+        param_size (list[int]): The size of w.
+        param_attr (ParamAttr): Attribute initializer of w.
+        bias_size (list[int]): The size of bias.
+        bias_attr (ParamAttr): Attribute initializer of bias.
+        act (str, optional): Activation to be applied to the output of this layer. Default is None.
 
     Returns:
         Tensor: A Tensor with the same data type as input's.
+
     Examples:
         .. code-block:: python
-           import paddle.fluid as fluid
-           import paddle
-
-           paddle.enable_static()
-
-           input = paddle.static.data(name="input", shape=[16, 2, 3], dtype="float32")
-           out = paddle.incubate.layers.batch_fc(input=input,
-                                               param_size=[16, 3, 10],
-                                               param_attr=
-                                               paddle.ParamAttr(learning_rate=1.0,
-                                                               name="w_0"),
-                                               bias_size=[16, 10],
-                                               bias_attr=
-                                               paddle.ParamAttr(learning_rate=1.0,
-                                                               name="b_0"),
-                                               act="relu")
+
+            >>> import paddle
+            >>> paddle.enable_static()
+
+            >>> input = paddle.static.data(name="input", shape=[16, 2, 3], dtype="float32")
+            >>> out = paddle.incubate.layers.batch_fc(input=input,
+            ...                                     param_size=[16, 3, 10],
+            ...                                     param_attr=
+            ...                                     paddle.ParamAttr(learning_rate=1.0,
+            ...                                                      name="w_0"),
+            ...                                     bias_size=[16, 10],
+            ...                                     bias_attr=
+            ...                                     paddle.ParamAttr(learning_rate=1.0,
+            ...                                                      name="b_0"),
+            ...                                     act="relu")
     """
 
     helper = LayerHelper("batch_fc", **locals())
@@ -1089,23 +1087,26 @@ def _pull_box_extended_sparse(input, size, extend_size=64, dtype='float32'):
     This layer is used to lookup embeddings of IDs, provided by :attr:`input`, in
     BoxPS lookup table. The result of this lookup is the embedding of each ID in the
     :attr:`input`.
+
     Args:
-        input(Tensor): Input is a Tensor<int64>, which
-            contains the IDs information.
-        size(int): The embedding size parameter, which indicates the size of
+        input (Tensor): Input is a Tensor<int64>, which contains the IDs information.
+        size (int): The embedding size parameter, which indicates the size of
             each embedding vector respectively.
-        extend_size(int): The embedding size parameter in extended dim,
-            which indicates the size of each embedding vector respectively.
-        dtype(str): The dtype refers to the data type of output tensor. Only supports
-      float32 now.
+        extend_size (int, optional): The embedding size parameter in extended dim,
+            which indicates the size of each embedding vector respectively. Default is 64.
+        dtype (str, optional): The dtype refers to the data type of output tensor. Only supports float32 now. Default is float32.
+
     Returns:
-        Tensor: The tensor storing the embeddings of the \
-                  supplied inputs.
+        Tensor: The tensor storing the embeddings of the supplied inputs.
+
     Examples:
         .. code-block:: python
-          import paddle.fluid as fluid
-          data = paddle.static.data(name='sequence', shape=[-1, 1], dtype='int64', lod_level=1)
-          emb, emb_ex = paddle.incubate.layers._pull_box_extended_sparse(input=data, size=8, extend_size=128)
+
+            >>> import paddle
+            >>> paddle.enable_static()
+
+            >>> data = paddle.static.data(name='sequence', shape=[-1, 1], dtype='int64', lod_level=1)
+            >>> emb, emb_ex = paddle.incubate.layers._pull_box_extended_sparse(input=data, size=8, extend_size=128)
     """
     helper = LayerHelper('pull_box_extended_sparse', **locals())
     helper.input_dtype()
@@ -1139,16 +1140,16 @@ def bilateral_slice(x, guide, grid, has_offset, name=None):
     For more information of bilateral slicing, please refer to Deep Bilateral Learning for Real-Time Image Enhancement <https://groups.csail.mit.edu/graphics/hdrnet/data/hdrnet.pdf>_
 
     Args:
-        x(Tensor): The input tensor, which is a 4-D tensor with shape
+        x (Tensor): The input tensor, which is a 4-D tensor with shape
                      [N, C, H, W], N is the batch size, C is the channel
                      number, H and W is the feature height and width.
                      The data type is float32 and float64.
-        guide(Tensor): Input grid tensor of shape [N, H, W]. The
+        guide (Tensor): Input grid tensor of shape [N, H, W]. The
                         data type is float32 and float64.
-        grid(Tensor): Input grid tensor of shape [N, C, D, H, W]. The
+        grid (Tensor): Input grid tensor of shape [N, C, D, H, W]. The
                         data type is float32 and float64.
-        has_offset(bool): Whether to slice with affine offset.
-        name(str, optional): For detailed information, please refer
+        has_offset (bool): Whether to slice with affine offset.
+        name (str, optional): For detailed information, please refer
                              to :ref:`api_guide_Name`. Usually name is no need to set and
                              None by default.
 
@@ -1159,19 +1160,18 @@ def bilateral_slice(x, guide, grid, has_offset, name=None):
 
         .. code-block:: python
 
-            import paddle.fluid as fluid
-            import paddle
-            paddle.enable_static()
+            >>> import paddle
+            >>> paddle.enable_static()
 
-            x = paddle.randn(name='x', shape=[1, 3, 101, 60], dtype='float32')
-            guide = paddle.randn(name='guide', shape=[1, 101, 60], dtype='float32')
-            grid = paddle.randn(name='grid', shape=[1, 12, 8, 10, 6], dtype='float32')
+            >>> x = paddle.randn(name='x', shape=[1, 3, 101, 60], dtype='float32')
+            >>> guide = paddle.randn(name='guide', shape=[1, 101, 60], dtype='float32')
+            >>> grid = paddle.randn(name='grid', shape=[1, 12, 8, 10, 6], dtype='float32')
 
-            # without offset
-            output = paddle.incubate.layers.bilateral_slice(x, guide, grid, has_offset=False)
+            >>> # without offset
+            >>> output = paddle.incubate.layers.bilateral_slice(x, guide, grid, has_offset=False)
 
-            # has offset
-            output = paddle.incubate.layers.bilateral_slice(x, guide, grid, has_offset=True)
+            >>> # has offset
+            >>> output = paddle.incubate.layers.bilateral_slice(x, guide, grid, has_offset=True)
 
     """
     if paddle.in_dynamic_mode():
@@ -1215,13 +1215,13 @@ def correlation(
     <https://arxiv.org/pdf/1709.02371.pdf>_
 
     Args:
-        x(Tensor): The input x is 4-D Tensor with shape [N, C, H, W]. The data type is float32 and float64.
-        y(Tensor): The input y is 4-D Tensor with shape [N, C, H, W]. The data type is float32 and float64.
-        pad_size(int): Pad size. The data type is int.
-        max_displacement(int): Max displacement. The data type is int.
-        stride1(int): stride size of x. The data type is int.
-        stride2(int): stride size of y. The data type is int.
-        corr_type_multiply(int, optional): The type of multiply. The data type is int. Default: 1.
+        x (Tensor): The input x is 4-D Tensor with shape [N, C, H, W]. The data type is float32 and float64.
+        y (Tensor): The input y is 4-D Tensor with shape [N, C, H, W]. The data type is float32 and float64.
+        pad_size (int): Pad size. The data type is int.
+        max_displacement (int): Max displacement. The data type is int.
+        stride1 (int): Stride size of x. The data type is int.
+        stride2 (int): Stride size of y. The data type is int.
+        corr_type_multiply (int, optional): The type of multiply. The data type is int. Default: 1.
 
     Returns:
         Tensor: The data type is same as input tensor.
@@ -1230,25 +1230,24 @@ def correlation(
 
         .. code-block:: python
 
-            import paddle.fluid as fluid
-            import paddle
-            paddle.enable_static()
-            x1 = paddle.static.data(name='x1',
-                               shape=[2,3,4,5],
-                               dtype="float32")
-            x2 = paddle.static.data(name='x2',
-                                shape=[2,3,4,5],
-                                dtype="float32")
-
-
-            out = paddle.incubate.layers.correlation(
-                            x1,
-                            x2,
-                            pad_size=4,
-                            kernel_size=1,
-                            max_displacement=4,
-                            stride1=1,
-                            stride2=1)
+            >>> import paddle
+            >>> paddle.enable_static()
+            >>> x1 = paddle.static.data(name='x1',
+            ...                         shape=[2, 3, 4, 5],
+            ...                         dtype="float32")
+            >>> x2 = paddle.static.data(name='x2',
+            ...                         shape=[2, 3, 4, 5],
+            ...                         dtype="float32")
+
+
+            >>> out = paddle.incubate.layers.correlation(
+            ...                 x1,
+            ...                 x2,
+            ...                 pad_size=4,
+            ...                 kernel_size=1,
+            ...                 max_displacement=4,
+            ...                 stride1=1,
+            ...                 stride2=1)
 
     """
 
@@ -1305,105 +1304,97 @@ def fused_bn_add_act(
     `[batch, in_height, in_width, in_channels]`.
 
     Args:
-        x(Tensor): The rank of input tensor can be 2, 3, 4, 5. The data type
+        x (Tensor): The rank of input tensor can be 2, 3, 4, 5. The data type
             is float16.
-        y(Tensor): The rank of input tensor can be 2, 3, 4, 5. The data type
+        y (Tensor): The rank of input tensor can be 2, 3, 4, 5. The data type
             is float16.
-        momentum(float|Tensor, optional): The value used for the moving_mean and
+        momentum (float|Tensor, optional): The value used for the moving_mean and
             moving_var computation. This should be a float number or a tensor with
             shape [1] and data type as float32. The updated formula is:
             :math:`moving\_mean = moving\_mean * momentum + new\_mean * (1. - momentum)`
             :math:`moving\_var = moving\_var * momentum + new\_var * (1. - momentum)`
             Default is 0.9.
-        epsilon(float, optional): A value added to the denominator for
-            numerical stability. Default is 1e-5.
-        param_attr(ParamAttr, optional): The parameter attribute for Parameter `scale`
+        epsilon (float, optional): A value added to the denominator for
+            numerical stability. Default is 1e-05.
+        param_attr (ParamAttr, optional): The parameter attribute for Parameter `scale`
             of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
-                will create ParamAttr as param_attr, the name of scale can be set in ParamAttr.
-                If the Initializer of the param_attr is not set, the parameter is initialized
-                with Xavier. Default: None.
-        bias_attr(ParamAttr, optional): The parameter attribute for the bias of batch_norm.
+            will create ParamAttr as param_attr, the name of scale can be set in ParamAttr.
+            If the Initializer of the param_attr is not set, the parameter is initialized
+            with Xavier. Default: None.
+        bias_attr (ParamAttr, optional): The parameter attribute for the bias of batch_norm.
             If it is set to None or one attribute of ParamAttr, batch_norm
-                will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr.
-                If the Initializer of the bias_attr is not set, the bias is initialized zero.
-                Default: None.
-        moving_mean_name(str, optional): The name of moving_mean which store the global Mean. If it
+            will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr.
+            If the Initializer of the bias_attr is not set, the bias is initialized zero.
+            Default: None.
+        moving_mean_name (str, optional): The name of moving_mean which stores the global Mean. If it
             is set to None, batch_norm will save global mean with a random name, otherwise, batch_norm
-            will save global mean with the string.
-        moving_variance_name(str, optional): The name of the moving_variance which store the global Variance.
+            will save global mean with the string. Default: None.
+        moving_variance_name (str, optional): The name of the moving_variance which stores the global Variance.
             If it is set to None, batch_norm will save global variance with a random name, otherwise, batch_norm
-            will save global variance with the string.
-        act(string, optional): Activation type, linear|relu|prelu|...
-        name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`.
-            Usually name is no need to set and None by default.
+            will save global variance with the string. Default: None.
+        act (string, optional): Activation type, linear|relu|prelu|... Default: None.
+        name (str, optional): For detailed information, please refer to :ref:`api_guide_Name`.
+            Usually there is no need to set name. Default: None.
 
     Examples:
-            .. code-block:: python
-
-            import paddle
-            import paddle.fluid as fluid
-
-            paddle.enable_static()
-            # required: gpu
-            def build_program(main_program, startup_program):
-                with fluid.program_guard(main_program, startup_program):
-                    x = paddle.static.data(name='x', shape=[-1, 1, 28, 28], dtype='float32')
-                    y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64')
-                    conv1_1 = paddle.static.nn.conv2d(
-                        input=x,
-                        filter_size=3,
-                        num_filters=32,
-                        stride=1,
-                        padding=1,
-                        act=None,
-                        bias_attr=False,
-                        data_format='NHWC')
-                    conv1_2 = paddle.static.nn.conv2d(
-                        input=x,
-                        filter_size=3,
-                        num_filters=32,
-                        stride=1,
-                        padding=1,
-                        act=None,
-                        bias_attr=False,
-                        data_format='NHWC')
-                    bn = paddle.static.nn.batch_norm(
-                        input=conv1_1,
-                        act=None,
-                        data_layout='NHWC')
-                    fused_bn_add_act = paddle.incubate.layers.fused_bn_add_act(conv1_2, bn)
-                    prediction = paddle.static.nn.fc(x=fused_bn_add_act, size=10, activation='softmax')
-                    loss = paddle.nn.functional.cross_entropy(
-                        input=prediction, label=y,
-                        reduction='none', use_softmax=False
-                    )
-                    loss = paddle.mean(loss)
-                    sgd = fluid.optimizer.SGD(learning_rate=0.001)
-                    sgd = paddle.static.amp.decorate(
-                        sgd, use_dynamic_loss_scaling=True, init_loss_scaling=128.0)
-                    sgd.minimize(loss)
-
-                return x, y, loss
-
-            iters = 5
-            batch_size = 16
-            support_gpu = fluid.is_compiled_with_cuda()
-            if support_gpu:
-                main_program = fluid.Program()
-                startup_program = fluid.Program()
-                place = fluid.CUDAPlace(0)
-                x, y, loss = build_program(main_program, startup_program)
-
-                feeder = fluid.DataFeeder(feed_list=[x, y], place=place)
-                train_reader = paddle.batch(
-                    paddle.dataset.mnist.train(), batch_size=batch_size)
-                exe = fluid.Executor(place)
-                scope = fluid.Scope()
-                with fluid.scope_guard(scope):
-                    exe.run(startup_program)
-                    for _ in range(iters):
-                        data = next(train_reader())
-                        loss_v = exe.run(main_program, feed=feeder.feed(data), fetch_list=[loss])
+        .. code-block:: python
+
+            >>> # doctest: +REQUIRES(env:GPU)
+            >>> import paddle
+            >>> paddle.enable_static()
+
+            >>> def build_program(main_program, startup_program):
+            ...     with paddle.static.program_guard(main_program, startup_program):
+            ...         x = paddle.static.data(name='x', shape=[-1, 1, 28, 28], dtype='float32')
+            ...         y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64')
+            ...         conv1_1 = paddle.static.nn.conv2d(
+            ...             input=x,
+            ...             filter_size=3,
+            ...             num_filters=32,
+            ...             stride=1,
+            ...             padding=1,
+            ...             act=None,
+            ...             bias_attr=False,
+            ...             data_format='NHWC')
+            ...         conv1_2 = paddle.static.nn.conv2d(
+            ...             input=x,
+            ...             filter_size=3,
+            ...             num_filters=32,
+            ...             stride=1,
+            ...             padding=1,
+            ...             act=None,
+            ...             bias_attr=False,
+            ...             data_format='NHWC')
+            ...         bn = paddle.static.nn.batch_norm(
+            ...             input=conv1_1,
+            ...             act=None,
+            ...             data_layout='NHWC')
+            ...         fused_bn_add_act = paddle.incubate.layers.fused_bn_add_act(conv1_2, bn)
+            ...         prediction = paddle.static.nn.fc(x=fused_bn_add_act, size=10, activation='softmax')
+            ...         loss = paddle.nn.functional.cross_entropy(
+            ...             input=prediction, label=y,
+            ...             reduction='none', use_softmax=False
+            ...         )
+            ...         loss = paddle.mean(loss)
+            ...         sgd = paddle.optimizer.SGD(learning_rate=0.001)
+            ...         sgd = paddle.static.amp.decorate(
+            ...             sgd, use_dynamic_loss_scaling=True, init_loss_scaling=128.0)
+            ...         sgd.minimize(loss)
+            ...
+            ...     return x, y, loss
+
+            >>> iters = 5
+            >>> batch_size = 16
+            >>> support_gpu = paddle.is_compiled_with_cuda()
+            >>> if support_gpu:
+            ...     main_program = paddle.static.Program()
+            ...     startup_program = paddle.static.Program()
+            ...     place = paddle.CUDAPlace(0)
+            ...     x, y, loss = build_program(main_program, startup_program)
+            ...
+            ...     feeder = paddle.DataFeeder(feed_list=[x, y], place=place)
+            ...     train_reader = paddle.batch(
+            ...         paddle.dataset.mnist.train(), batch_size=batch_size)
     """
     helper = LayerHelper('fused_bn_add_act', **locals())
 
@@ -1550,27 +1541,29 @@ def _pull_gpups_sparse(
     :attr:`input`.
 
     Args:
-        input(Tensor): Input is a Tensor<int64>, which
-            contains the IDs information.
-        size(int|list of int): The embedding size parameter of each input, which indicates the size of
+        input (Tensor): Input is a Tensor<int64>, which contains the IDs information.
+        size (int|list of int): The embedding size parameter of each input, which indicates the size of
             each embedding vector respectively.
-        dtype(str): The dtype refers to the data type of output tensor. Only supports
-        float32 now.
+        dtype (str, optional): The dtype refers to the data type of output tensor. Only supports float32 now. Default is float32.
+        is_distributed (bool, optional): Whether to use distributed mode. Default is False.
+        is_sparse (bool, optional): Whether to use sparse mode. Default is False.
 
     Returns:
-        Tensor: The tensor storing the embeddings of the \
-                  supplied inputs, whose size are indicated by size respectively.
+        Tensor: The tensor storing the embeddings of the supplied inputs, whose sizes are indicated by size respectively.
 
     Examples:
         .. code-block:: python
 
-          import paddle.incubate as incubate
-          slots = []
-          data_1 = paddle.static.data(name='sequence', shape=[-1,1], dtype='int64', lod_level=1)
-          slots.append(data_1)
-          data_2 = paddle.static.data(name='sequence', shape=[-1,1], dtype='int64', lod_level=1)
-          slots.append(data_2)
-          embs = incubate.layers.pull_gpups_sparse(input=slots, size=[11, 35])
+            >>> import paddle.incubate as incubate
+            >>> import paddle
+            >>> paddle.enable_static()
+
+            >>> slots = []
+            >>> data_1 = paddle.static.data(name='sequence', shape=[-1,1], dtype='int64', lod_level=1)
+            >>> slots.append(data_1)
+            >>> data_2 = paddle.static.data(name='sequence', shape=[-1,1], dtype='int64', lod_level=1)
+            >>> slots.append(data_2)
+            >>> embs = incubate.layers._pull_gpups_sparse(input=slots, size=[11, 35])
     """
     helper = LayerHelper('pull_gpups_sparse', **locals())
     if dtype != 'float32':
@@ -1613,23 +1606,26 @@ def _pull_box_sparse(
     :attr:`input`.
 
     Args:
-        input(Tensor): Input is a Tensor<int64>, which
-            contains the IDs information.
-        size(int): The embedding size parameter, which indicates the size of
+        input (Tensor): Input is a Tensor<int64>, which contains the IDs information.
+        size (int): The embedding size parameter, which indicates the size of
             each embedding vector respectively.
-        dtype(str): The dtype refers to the data type of output tensor. Only supports
-        float32 now.
+        dtype (str, optional): The dtype refers to the data type of output tensor. Only supports float32 now. Default is float32.
+        is_distributed (bool, optional): Whether to use distributed mode. Default is False.
+        is_sparse (bool, optional): Whether to use sparse mode. Default is False.
 
     Returns:
-        Tensor: The tensor storing the embeddings of the \
-                  supplied inputs.
+        Tensor: The tensor storing the embeddings of the supplied inputs.
 
     Examples:
         .. code-block:: python
 
-          import paddle.incubate as incubate
-          data = paddle.static.data(name='sequence', shape=[-1,1], dtype='int64', lod_level=1)
-          emb = incubate.layers.pull_box_sparse(input=data, size=[11])
+            >>> import paddle.incubate as incubate
+            >>> import paddle
+            >>> paddle.enable_static()
+
+            >>> x = paddle.static.data(name='x', shape=[-1, 1], dtype='int64', lod_level=1)
+            >>> y = paddle.static.data(name='y', shape=[-1, 1], dtype='int64', lod_level=1)
+            >>> emb_x, emb_y = incubate.layers._pull_box_sparse([x, y], size=1)
     """
     helper = LayerHelper('pull_box_sparse', **locals())
     if dtype != 'float32':
diff --git a/python/paddle/incubate/nn/functional/fused_dropout_add.py b/python/paddle/incubate/nn/functional/fused_dropout_add.py
index 79f5adfcc33..4c47a2302ef 100644
--- a/python/paddle/incubate/nn/functional/fused_dropout_add.py
+++ b/python/paddle/incubate/nn/functional/fused_dropout_add.py
@@ -51,15 +51,27 @@ def fused_dropout_add(
 
     Examples:
 
-        ..  code-block:: python
-
-            # required: gpu
-            import paddle
-            from paddle.incubate.nn.functional import fused_dropout_add
-
-            x = paddle.randn([4, 10], dtype='float16')
-            y = paddle.randn([4, 10], dtype='float16')
-            out = fused_dropout_add(x, y, p=0.5)
+        .. code-block:: python
+
+            >>> # doctest: +REQUIRES(env:GPU)
+            >>> import paddle
+            >>> from paddle.incubate.nn.functional import fused_dropout_add
+
+            >>> paddle.set_device('gpu')
+            >>> paddle.seed(2023)
+            >>> x = paddle.randn([4, 10], dtype="float32")
+            >>> y = paddle.randn([4, 10], dtype="float32")
+            >>> out = fused_dropout_add(x, y, p=0.5)
+            >>> print(out)
+            Tensor(shape=[4, 10], dtype=float32, place=Place(gpu:0), stop_gradient=True,
+            [[-0.49133155,  0.53819323, -2.58393312,  0.06336236, -1.09908366,
+               0.22085167,  2.19751787,  0.05034769,  0.53417486,  0.84864247],
+             [ 0.78248203, -1.59652555, -0.14399840, -0.77985179, -0.17006736,
+              -0.30991879, -0.36593807, -0.51025450,  1.46401680,  0.61627960],
+             [ 4.50472546, -0.48472026,  0.60729283,  0.33509624, -0.25593102,
+              -1.45173049,  1.06727099,  0.00440830, -0.77340341,  0.67393088],
+             [ 1.29453969,  0.07568165,  0.71947742, -0.71768606, -2.57172823,
+               1.89179027,  3.26482797,  1.10493207, -1.04569530, -1.04862499]])
     """
     if isinstance(p, (int, float)):
         # fast return for p == 0
diff --git a/python/paddle/incubate/nn/functional/fused_ec_moe.py b/python/paddle/incubate/nn/functional/fused_ec_moe.py
index ca2057fc016..9f067acbb0d 100644
--- a/python/paddle/incubate/nn/functional/fused_ec_moe.py
+++ b/python/paddle/incubate/nn/functional/fused_ec_moe.py
@@ -37,25 +37,20 @@ def fused_ec_moe(
     Examples:
         .. code-block:: python
 
-            # required: gpu
-            import paddle
-            from paddle.incubate.nn.functional import fused_ec_moe
+            >>> # doctest: +REQUIRES(env:GPU)
+            >>> import paddle
+            >>> from paddle.incubate.nn.functional import fused_ec_moe
 
-            batch = 10
-            seq_len = 128
-            d_model = 1024
-            d_feed_forward = d_model * 4
-            num_expert = 8
-
-            x = paddle.randn([batch, seq_len, d_model])
-            gate = paddle.randn([batch, seq_len, num_expert])
-            bmm0_weight = paddle.randn([num_expert, d_model, d_feed_forward])
-            bmm0_bias = paddle.randn([num_expert, d_model, d_feed_forward])
-            bmm1_weight = paddle.randn([num_expert, d_model, d_feed_forward])
-            bmm1_bias = paddle.randn([num_expert, d_model, d_feed_forward])
-            out = fused_ec_moe(x, gate, bmm0_weight, bmm0_bias, bmm1_weight, bmm1_bias, act_type="gelu")
-
-            print(out.shape) # [batch, seq_len, num_expert]
+            >>> paddle.set_device('gpu')
+            >>> x = paddle.randn([10, 128, 1024])
+            >>> gate = paddle.randn([10, 128, 8])
+            >>> bmm0_weight = paddle.randn([8, 1024, 4096])
+            >>> bmm0_bias = paddle.randn([8, 1024, 4096])
+            >>> bmm1_weight = paddle.randn([8, 1024, 4096])
+            >>> bmm1_bias = paddle.randn([8, 1024, 4096])
+            >>> out = fused_ec_moe(x, gate, bmm0_weight, bmm0_bias, bmm1_weight, bmm1_bias, act_type="gelu")
+            >>> print(out.shape)
+            [10, 128, 1024]
     """
     helper = LayerHelper('fused_moe', **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
diff --git a/python/paddle/incubate/nn/functional/fused_gate_attention.py b/python/paddle/incubate/nn/functional/fused_gate_attention.py
index 5bc2211c33c..560d6717fda 100644
--- a/python/paddle/incubate/nn/functional/fused_gate_attention.py
+++ b/python/paddle/incubate/nn/functional/fused_gate_attention.py
@@ -39,7 +39,7 @@ def fused_gate_attention(
     to information from different representation subspaces. This API only
     support self_attention. The pseudo code is as follows:
 
-    .. code-block:: python
+    .. code-block:: text
 
         c = c ** (-0.5)
         q = paddle.einsum('nbqa,ahc->nbqhc', q_data, query_w) * c
@@ -64,20 +64,20 @@ def fused_gate_attention(
     Args:
         query (Tensor): The input query tensor. The shape is [batch_size, msa_len, res_len, q_dim].
         key (Tensor, optional): The input key tensor, which can be set when
-                                merge_qkv is False. The shape is [batch_size, msa_len, m_size, kv_dim].
-        query_weight (Tensor, optional): The weight of query linear, which
-                                         should be set when input key is not None. The shape is [q_dim, num_heads, head_dim].
-        key_weight (Tensor, optional): The weight of key linear, which should
-                                       be set when input key is not None. The shape is [kv_dim, num_heads, head_dim].
-        value_weight (Tensor, optional): The weight of value linear, which should
-                                         be set when input key is not None. The shape is [kv_dim, num_heads, head_dim].
-        qkv_weight (Tensor, optional): The weight of qkv linear, which should
-                                       be set when merge_qkv is True. The shape is [3, num_heads, head_dim, q_dim].
-        gate_linear_weight (Tensor, optional): The weight of gating linear,
-                                       which should be set when has_gating is True. The shape is [q_dim, num_heads, head_dim].
-        gate_linear_bias (Tensor, optional): The bias of gating linear, which
-                                             should be set when has_gating is True. The shape is [num_heads, head_dim]. Default None.
-        out_linear_weight (Tensor, optional): The weight of output linear. The shape is [num_heads, head_dim, q_dim].
+            merge_qkv is False. The shape is [batch_size, msa_len, m_size, kv_dim]. Default None.
+        query_weight (Tensor, optional): The weight of query linear, which should be set when input
+            key is not None. The shape is [q_dim, num_heads, head_dim]. Default None.
+        key_weight (Tensor, optional): The weight of key linear, which should be set when input key
+            is not None. The shape is [kv_dim, num_heads, head_dim]. Default None.
+        value_weight (Tensor, optional): The weight of value linear, which should be set when input
+            key is not None. The shape is [kv_dim, num_heads, head_dim]. Default None.
+        qkv_weight (Tensor, optional): The weight of qkv linear, which should be set when merge_qkv
+            is True. The shape is [3, num_heads, head_dim, q_dim]. Default None.
+        gate_linear_weight (Tensor, optional): The weight of gating linear, which should be set when
+            has_gating is True. The shape is [q_dim, num_heads, head_dim]. Default None.
+        gate_linear_bias (Tensor, optional): The bias of gating linear, which should be set when
+            has_gating is True. The shape is [num_heads, head_dim]. Default None.
+        out_linear_weight (Tensor, optional): The weight of output linear. The shape is [num_heads, head_dim, q_dim]. Default None.
         out_linear_bias (Tensor): The bias of output linear, the shape is [q_dim]. Default None.
         nonbatched_bias (Tensor, optional): The extra bias. The shape is [batch_size, 1, num_heads, res_len, m_size]. Default None.
         attn_mask (Tensor, optional):  The attention mask. The shape is [batch_size, msa_len, 1, 1, res_len]. Default None.
@@ -92,54 +92,54 @@ def fused_gate_attention(
 
         .. code-block:: python
 
-            # required: gpu
-            import paddle
-            import paddle.incubate.nn.functional as F
-
-            # batch_size = 2
-            # msa_len = 4
-            # res_len = 2
-            # q_dim = 4
-            # num_heads = 8
-            # head_dim = 4
-            # m_size = res_len (when merge_qkv is True)
-
-            # query: [batch_size, msa_len, res_len, q_dim]
-            query = paddle.rand(shape=[2, 4, 2, 4], dtype="float32")
-
-            # qkv_weight:  [3, n_heads, head_dim, q_dim]
-            qkv_weight = paddle.rand(shape=[3, 8, 4, 4], dtype="float32")
-
-            # nonbatched_bias: [batch_size, 1, num_heads, res_len, m_size]
-            nonbatched_bias = paddle.rand(shape=[2, 1, 8, 2, 2], dtype="float32")
-
-            # attn_mask: [batch_size, msa_len, 1, 1, m_size]
-            attn_mask = paddle.rand(shape=[2, 4, 1, 1, 2], dtype="float32")
-
-            # gate_linear_weight: [q_dim, num_heads, head_dim]
-            gate_linear_weight = paddle.rand(shape=[4, 8, 4], dtype="float32")
-            # gate_bias: [num_heads, head_dim]
-            gate_linear_bias = paddle.rand(shape=[8, 4], dtype="float32")
-
-            # out_linear_weight: [num_heads, head_dim, q_dim]
-            out_linear_weight = paddle.rand(shape=[8, 4, 4], dtype="float32")
-            # out_linear_bias: [q_dim]
-            out_linear_bias = paddle.rand(shape=[4], dtype="float32")
-
-            # output: [batch_size, msa_len, res_len, q_dim]
-            output = F.fused_gate_attention(
-                query=query,
-                qkv_weight=qkv_weight,
-                gate_linear_weight=gate_linear_weight,
-                gate_linear_bias=gate_linear_bias,
-                out_linear_weight=out_linear_weight,
-                out_linear_bias=out_linear_bias,
-                nonbatched_bias=nonbatched_bias,
-                attn_mask=attn_mask,
-                has_gating=True,
-                merge_qkv=True)
-            print(output.shape)
-            # [2, 4, 2, 4]
+            >>> # doctest: +REQUIRES(env:GPU)
+            >>> import paddle
+            >>> import paddle.incubate.nn.functional as F
+
+            >>> # batch_size = 2
+            >>> # msa_len = 4
+            >>> # res_len = 2
+            >>> # q_dim = 4
+            >>> # num_heads = 8
+            >>> # head_dim = 4
+            >>> # m_size = res_len (when merge_qkv is True)
+
+            >>> # query: [batch_size, msa_len, res_len, q_dim]
+            >>> query = paddle.rand(shape=[2, 4, 2, 4], dtype="float32")
+
+            >>> # qkv_weight: [3, num_heads, head_dim, q_dim]
+            >>> qkv_weight = paddle.rand(shape=[3, 8, 4, 4], dtype="float32")
+
+            >>> # nonbatched_bias: [batch_size, 1, num_heads, res_len, m_size]
+            >>> nonbatched_bias = paddle.rand(shape=[2, 1, 8, 2, 2], dtype="float32")
+
+            >>> # attn_mask: [batch_size, msa_len, 1, 1, m_size]
+            >>> attn_mask = paddle.rand(shape=[2, 4, 1, 1, 2], dtype="float32")
+
+            >>> # gate_linear_weight: [q_dim, num_heads, head_dim]
+            >>> gate_linear_weight = paddle.rand(shape=[4, 8, 4], dtype="float32")
+            >>> # gate_bias: [num_heads, head_dim]
+            >>> gate_linear_bias = paddle.rand(shape=[8, 4], dtype="float32")
+
+            >>> # out_linear_weight: [num_heads, head_dim, q_dim]
+            >>> out_linear_weight = paddle.rand(shape=[8, 4, 4], dtype="float32")
+            >>> # out_linear_bias: [q_dim]
+            >>> out_linear_bias = paddle.rand(shape=[4], dtype="float32")
+
+            >>> # output: [batch_size, msa_len, res_len, q_dim]
+            >>> output = F.fused_gate_attention(
+            ...     query=query,
+            ...     qkv_weight=qkv_weight,
+            ...     gate_linear_weight=gate_linear_weight,
+            ...     gate_linear_bias=gate_linear_bias,
+            ...     out_linear_weight=out_linear_weight,
+            ...     out_linear_bias=out_linear_bias,
+            ...     nonbatched_bias=nonbatched_bias,
+            ...     attn_mask=attn_mask,
+            ...     has_gating=True,
+            ...     merge_qkv=True)
+            >>> print(output.shape)
+            [2, 4, 2, 4]
 
     """
     if in_dynamic_mode():
diff --git a/python/paddle/incubate/nn/functional/fused_matmul_bias.py b/python/paddle/incubate/nn/functional/fused_matmul_bias.py
index 0fbb63025e3..526f8a3fec0 100644
--- a/python/paddle/incubate/nn/functional/fused_matmul_bias.py
+++ b/python/paddle/incubate/nn/functional/fused_matmul_bias.py
@@ -28,11 +28,11 @@ def fused_matmul_bias(
     Args:
         x (Tensor): the first input Tensor to be multiplied.
         y (Tensor): the second input Tensor to be multiplied. Its rank must be 2.
-        bias (Tensor|None): the input bias Tensor. If it is None, no bias addition would
-            be performed. Otherwise, the bias is added to the matrix multiplication result.
-        transpose_x (bool): Whether to transpose :math:`x` before multiplication.
-        transpose_y (bool): Whether to transpose :math:`y` before multiplication.
-        name(str|None): For detailed information, please refer to
+        bias (Tensor, optional): the input bias Tensor. If it is None, no bias addition would
+            be performed. Otherwise, the bias is added to the matrix multiplication result. Default: None.
+        transpose_x (bool, optional): Whether to transpose :math:`x` before multiplication. Default: False.
+        transpose_y (bool, optional): Whether to transpose :math:`y` before multiplication. Default: False.
+        name (str, optional): For detailed information, please refer to
             :ref:`api_guide_Name` . Usually name is no need to set and None by default.
 
     Returns:
@@ -41,15 +41,18 @@ def fused_matmul_bias(
     Examples:
         .. code-block:: python
 
-            # required: gpu
-            import paddle
-            from paddle.incubate.nn.functional import fused_matmul_bias
-
-            x = paddle.randn([3, 4])
-            y = paddle.randn([4, 5])
-            bias = paddle.randn([5])
-            out = fused_matmul_bias(x, y, bias)
-            print(out.shape) # [3, 5]
+            >>> # doctest: +SKIP('fused_gemm_epilogue is only supported when CUDA version >= 11.6')
+            >>> # doctest: +REQUIRES(env:GPU)
+            >>> import paddle
+            >>> from paddle.incubate.nn.functional import fused_matmul_bias
+
+            >>> paddle.set_device('gpu')
+            >>> x = paddle.randn([3, 4])
+            >>> y = paddle.randn([4, 5])
+            >>> bias = paddle.randn([5])
+            >>> out = fused_matmul_bias(x, y, bias)
+            >>> print(out.shape)
+            [3, 5]
     """
     if bias is None:
         return matmul(x, y, transpose_x, transpose_y, name)
@@ -76,10 +79,10 @@ def fused_linear(x, weight, bias=None, transpose_weight=False, name=None):
     Args:
         x (Tensor): the input Tensor to be multiplied.
         weight (Tensor): the weight Tensor to be multiplied. Its rank must be 2.
-        bias (Tensor|None): the input bias Tensor. If it is None, no bias addition would
-            be performed. Otherwise, the bias is added to the matrix multiplication result.
-        transpose_weight (bool): Whether to transpose :math:`weight` before multiplication.
-        name(str|None): For detailed information, please refer to
+        bias (Tensor, optional): the input bias Tensor. If it is None, no bias addition would
+            be performed. Otherwise, the bias is added to the matrix multiplication result. Default: None.
+        transpose_weight (bool, optional): Whether to transpose :math:`weight` before multiplication. Default: False.
+        name (str, optional): For detailed information, please refer to
             :ref:`api_guide_Name` . Usually name is no need to set and None by default.
 
     Returns:
@@ -88,15 +91,18 @@ def fused_linear(x, weight, bias=None, transpose_weight=False, name=None):
     Examples:
         .. code-block:: python
 
-            # required: gpu
-            import paddle
-            from paddle.incubate.nn.functional import fused_linear
-
-            x = paddle.randn([3, 4])
-            weight = paddle.randn([4, 5])
-            bias = paddle.randn([5])
-            out = fused_linear(x, weight, bias)
-            print(out.shape) # [3, 5]
+            >>> # doctest: +SKIP('fused_gemm_epilogue is only supported when CUDA version >= 11.6')
+            >>> # doctest: +REQUIRES(env:GPU)
+            >>> import paddle
+            >>> from paddle.incubate.nn.functional import fused_linear
+
+            >>> paddle.set_device('gpu')
+            >>> x = paddle.randn([3, 4])
+            >>> weight = paddle.randn([4, 5])
+            >>> bias = paddle.randn([5])
+            >>> out = fused_linear(x, weight, bias)
+            >>> print(out.shape)
+            [3, 5]
     """
     return fused_matmul_bias(x, weight, bias, False, transpose_weight, name)
 
@@ -109,25 +115,32 @@ def fused_linear_activation(
 
     Args:
         x (Tensor): the input Tensor to be multiplied.
-        weight (Tensor): the weight Tensor to be multiplied. Its rank must be 2.
+        y (Tensor): the weight Tensor to be multiplied. Its rank must be 2.
         bias (Tensor): the input bias Tensor, the bias is added to the matrix multiplication result.
-        transpose_weight (bool): Whether to transpose :math:`weight` before multiplication.
-        activation(str|None): Activation function, Currently, the available activation functions are limited to "gelu" (Gaussian Error Linear Unit) and "relu" (Rectified Linear Unit). These activation functions are applied to the output of the bias add.
+        trans_x (bool, optional): Whether to transpose :math:`x` before multiplication.
+        trans_y (bool, optional): Whether to transpose :math:`y` before multiplication.
+        activation (str, optional): Activation function. Currently, the available activation functions are
+            limited to "gelu" (Gaussian Error Linear Unit) and "relu" (Rectified Linear Unit).
+            These activation functions are applied to the output of the bias add. Default: None.
+
     Returns:
         Tensor: the output Tensor.
 
     Examples:
         .. code-block:: python
 
-            # required: gpu
-            import paddle
-            from paddle.incubate.nn.functional import fused_linear_activation
-
-            x = paddle.randn([3, 4])
-            weight = paddle.randn([4, 5])
-            bias = paddle.randn([5])
-            out = fused_linear_activation(x, weight, bias)
-            print(out.shape) # [3, 5]
+            >>> # doctest: +SKIP('fused_gemm_epilogue is only supported when CUDA version >= 11.6')
+            >>> # doctest: +REQUIRES(env:GPU)
+            >>> import paddle
+            >>> from paddle.incubate.nn.functional import fused_linear_activation
+
+            >>> paddle.set_device('gpu')
+            >>> x = paddle.randn([3, 4])
+            >>> weight = paddle.randn([4, 5])
+            >>> bias = paddle.randn([5])
+            >>> out = fused_linear_activation(x, weight, bias)
+            >>> print(out.shape)
+            [3, 5]
     """
     if activation is None:
         activation = "none"
diff --git a/python/paddle/incubate/nn/functional/fused_rotary_position_embedding.py b/python/paddle/incubate/nn/functional/fused_rotary_position_embedding.py
index f68dfb1dcd5..0b667687c11 100644
--- a/python/paddle/incubate/nn/functional/fused_rotary_position_embedding.py
+++ b/python/paddle/incubate/nn/functional/fused_rotary_position_embedding.py
@@ -44,14 +44,13 @@ def fused_rotary_position_embedding(
 
     Examples:
 
-        ..  code-block:: python
+        .. code-block:: python
 
-            >>> # required: gpu
             >>> # doctest: +REQUIRES(env:GPU)
             >>> import paddle
             >>> from paddle.incubate.nn.functional import fused_rotary_position_embedding
 
-            >>> paddle.device.set_device('gpu')
+            >>> paddle.set_device('gpu')
 
             >>> # batch_size = 2
             >>> # seq_len = 2
diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py
index 0e522533c85..c8ed387c5b7 100644
--- a/python/paddle/tensor/linalg.py
+++ b/python/paddle/tensor/linalg.py
@@ -45,44 +45,44 @@ def transpose(x, perm, name=None):
     Args:
         x (Tensor): The input Tensor. It is a N-D Tensor of data types bool, float32, float64, int32.
         perm (list|tuple): Permute the input according to the data of perm.
-        name (str): The name of this layer. It is optional.
+        name (str, optional): The name of this layer. For more information, please refer to :ref:`api_guide_Name`. Default is None.
 
     Returns:
         Tensor: A transposed n-D Tensor, with data type being bool, float32, float64, int32, int64.
 
-    For Example:
+    Examples:
 
         .. code-block:: text
 
-         x = [[[ 1  2  3  4] [ 5  6  7  8] [ 9 10 11 12]]
-             [[13 14 15 16] [17 18 19 20] [21 22 23 24]]]
-         shape(x) =  [2,3,4]
-
-         # Example 1
-         perm0 = [1,0,2]
-         y_perm0 = [[[ 1  2  3  4] [13 14 15 16]]
-                   [[ 5  6  7  8]  [17 18 19 20]]
-                   [[ 9 10 11 12]  [21 22 23 24]]]
-         shape(y_perm0) = [3,2,4]
-
-         # Example 2
-         perm1 = [2,1,0]
-         y_perm1 = [[[ 1 13] [ 5 17] [ 9 21]]
-                   [[ 2 14] [ 6 18] [10 22]]
-                   [[ 3 15]  [ 7 19]  [11 23]]
-                   [[ 4 16]  [ 8 20]  [12 24]]]
-         shape(y_perm1) = [4,3,2]
+            x = [[[ 1  2  3  4] [ 5  6  7  8] [ 9 10 11 12]]
+                 [[13 14 15 16] [17 18 19 20] [21 22 23 24]]]
+            shape(x) =  [2,3,4]
+
+            # Example 1
+            perm0 = [1,0,2]
+            y_perm0 = [[[ 1  2  3  4] [13 14 15 16]]
+                       [[ 5  6  7  8]  [17 18 19 20]]
+                       [[ 9 10 11 12]  [21 22 23 24]]]
+            shape(y_perm0) = [3,2,4]
+
+            # Example 2
+            perm1 = [2,1,0]
+            y_perm1 = [[[ 1 13] [ 5 17] [ 9 21]]
+                       [[ 2 14] [ 6 18] [10 22]]
+                       [[ 3 15]  [ 7 19]  [11 23]]
+                       [[ 4 16]  [ 8 20]  [12 24]]]
+            shape(y_perm1) = [4,3,2]
 
     Examples:
 
         .. code-block:: python
 
-            import paddle
+            >>> import paddle
 
-            x = paddle.randn([2, 3, 4])
-            x_transposed = paddle.transpose(x, perm=[1, 0, 2])
-            print(x_transposed.shape)
-            # [3L, 2L, 4L]
+            >>> x = paddle.randn([2, 3, 4])
+            >>> x_transposed = paddle.transpose(x, perm=[1, 0, 2])
+            >>> print(x_transposed.shape)
+            [3, 2, 4]
 
     """
     if in_dynamic_mode():
@@ -180,10 +180,9 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None):
     Args:
         x (Tensor): The input tensor which is a Tensor.
         y (Tensor): The input tensor which is a Tensor.
-        transpose_x (bool, optional): Whether to transpose :math:`x` before multiplication.
-        transpose_y (bool, optional): Whether to transpose :math:`y` before multiplication.
-        name(str, optional): A name for this layer(optional). If set None, the layer
-            will be named automatically.
+        transpose_x (bool, optional): Whether to transpose :math:`x` before multiplication. Default is False.
+        transpose_y (bool, optional): Whether to transpose :math:`y` before multiplication. Default is False.
+        name (str, optional): If set None, the layer will be named automatically. For more information, please refer to :ref:`api_guide_Name`. Default is None.
 
     Returns:
         Tensor: The output Tensor.
@@ -192,42 +191,42 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None):
 
         .. code-block:: python
 
-            import paddle
-
-            # vector * vector
-            x = paddle.rand([10])
-            y = paddle.rand([10])
-            z = paddle.matmul(x, y)
-            print(z.shape)
-            # ()
-
-            # matrix * vector
-            x = paddle.rand([10, 5])
-            y = paddle.rand([5])
-            z = paddle.matmul(x, y)
-            print(z.shape)
-            # (10,)
-
-            # batched matrix * broadcasted vector
-            x = paddle.rand([10, 5, 2])
-            y = paddle.rand([2])
-            z = paddle.matmul(x, y)
-            print(z.shape)
-            # (10, 5)
-
-            # batched matrix * batched matrix
-            x = paddle.rand([10, 5, 2])
-            y = paddle.rand([10, 2, 5])
-            z = paddle.matmul(x, y)
-            print(z.shape)
-            # (10, 5, 5)
-
-            # batched matrix * broadcasted matrix
-            x = paddle.rand([10, 1, 5, 2])
-            y = paddle.rand([1, 3, 2, 5])
-            z = paddle.matmul(x, y)
-            print(z.shape)
-            # (10, 3, 5, 5)
+            >>> import paddle
+
+            >>> # vector * vector
+            >>> x = paddle.rand([10])
+            >>> y = paddle.rand([10])
+            >>> z = paddle.matmul(x, y)
+            >>> print(z.shape)
+            []
+
+            >>> # matrix * vector
+            >>> x = paddle.rand([10, 5])
+            >>> y = paddle.rand([5])
+            >>> z = paddle.matmul(x, y)
+            >>> print(z.shape)
+            [10]
+
+            >>> # batched matrix * broadcasted vector
+            >>> x = paddle.rand([10, 5, 2])
+            >>> y = paddle.rand([2])
+            >>> z = paddle.matmul(x, y)
+            >>> print(z.shape)
+            [10, 5]
+
+            >>> # batched matrix * batched matrix
+            >>> x = paddle.rand([10, 5, 2])
+            >>> y = paddle.rand([10, 2, 5])
+            >>> z = paddle.matmul(x, y)
+            >>> print(z.shape)
+            [10, 5, 5]
+
+            >>> # batched matrix * broadcasted matrix
+            >>> x = paddle.rand([10, 1, 5, 2])
+            >>> y = paddle.rand([1, 3, 2, 5])
+            >>> z = paddle.matmul(x, y)
+            >>> print(z.shape)
+            [10, 3, 5, 5]
 
     """
     if in_dynamic_mode():
@@ -305,54 +304,61 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
-            x = paddle.arange(24, dtype="float32").reshape([2, 3, 4]) - 12
-            # x: Tensor(shape=[2, 3, 4], dtype=float32, place=Place(cpu), stop_gradient=True,
-            #          [[[-12., -11., -10., -9. ],
-            #            [-8. , -7. , -6. , -5. ],
-            #            [-4. , -3. , -2. , -1. ]],
-
-            #           [[ 0. ,  1. ,  2. ,  3. ],
-            #            [ 4. ,  5. ,  6. ,  7. ],
-            #            [ 8. ,  9. ,  10.,  11.]]])
-
-            # compute frobenius norm along last two dimensions.
-            out_fro = paddle.linalg.norm(x, p='fro', axis=[0,1])
-            # out_fro: Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True,
-            #                 [17.43559647, 16.91153526, 16.73320007, 16.91153526])
-
-            # compute 2-order vector norm along last dimension.
-            out_pnorm = paddle.linalg.norm(x, p=2, axis=-1)
-            # out_pnorm: Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
-            #                [[21.11871147, 13.19090557, 5.47722578 ],
-            #                 [3.74165750 , 11.22497177, 19.13112640]])
-
-            # compute 2-order  norm along [0,1] dimension.
-            out_pnorm = paddle.linalg.norm(x, p=2, axis=[0,1])
-            # out_pnorm: Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True,
-            #                  [17.43559647, 16.91153526, 16.73320007, 16.91153526])
-
-            # compute inf-order  norm
-            out_pnorm = paddle.linalg.norm(x, p=float("inf"))
-            # out_pnorm  = Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
-            #                    12.)
-
-            out_pnorm = paddle.linalg.norm(x, p=float("inf"), axis=0)
-            # out_pnorm: Tensor(shape=[3, 4], dtype=float32, place=Place(cpu), stop_gradient=True,
-            #                 [[12., 11., 10., 9. ],
-            #                  [8. , 7. , 6. , 7. ],
-            #                  [8. , 9. , 10., 11.]])
-
-            # compute -inf-order  norm
-            out_pnorm = paddle.linalg.norm(x, p=-float("inf"))
-            # out_pnorm: Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
-            #                  0.)
-
-            out_pnorm = paddle.linalg.norm(x, p=-float("inf"), axis=0)
-            # out_pnorm: Tensor(shape=[3, 4], dtype=float32, place=Place(cpu), stop_gradient=True,
-            #                  [[0., 1., 2., 3.],
-            #                  [4., 5., 6., 5.],
-            #                  [4., 3., 2., 1.]])
+            >>> import paddle
+            >>> x = paddle.arange(24, dtype="float32").reshape([2, 3, 4]) - 12
+            >>> print(x)
+            Tensor(shape=[2, 3, 4], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[[-12., -11., -10., -9. ],
+              [-8. , -7. , -6. , -5. ],
+              [-4. , -3. , -2. , -1. ]],
+             [[ 0. ,  1. ,  2. ,  3. ],
+              [ 4. ,  5. ,  6. ,  7. ],
+              [ 8. ,  9. ,  10.,  11.]]])
+
+            >>> # compute frobenius norm along the first two dimensions (axis=[0,1]).
+            >>> out_fro = paddle.linalg.norm(x, p='fro', axis=[0,1])
+            >>> print(out_fro)
+            Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [17.43559647, 16.91153526, 16.73320007, 16.91153526])
+
+            >>> # compute 2-order vector norm along last dimension.
+            >>> out_pnorm = paddle.linalg.norm(x, p=2, axis=-1)
+            >>> print(out_pnorm)
+            Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[21.11871147, 13.19090557, 5.47722578 ],
+             [3.74165750 , 11.22497177, 19.13112640]])
+
+            >>> # compute 2-order  norm along [0,1] dimension.
+            >>> out_pnorm = paddle.linalg.norm(x, p=2, axis=[0,1])
+            >>> print(out_pnorm)
+            Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [17.43559647, 16.91153526, 16.73320007, 16.91153526])
+
+            >>> # compute inf-order  norm
+            >>> out_pnorm = paddle.linalg.norm(x, p=float("inf"))
+            >>> print(out_pnorm)
+            Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
+            12.)
+
+            >>> out_pnorm = paddle.linalg.norm(x, p=float("inf"), axis=0)
+            >>> print(out_pnorm)
+            Tensor(shape=[3, 4], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[12., 11., 10., 9. ],
+             [8. , 7. , 6. , 7. ],
+             [8. , 9. , 10., 11.]])
+
+            >>> # compute -inf-order  norm
+            >>> out_pnorm = paddle.linalg.norm(x, p=-float("inf"))
+            >>> print(out_pnorm)
+            Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
+            0.)
+
+            >>> out_pnorm = paddle.linalg.norm(x, p=-float("inf"), axis=0)
+            >>> print(out_pnorm)
+            Tensor(shape=[3, 4], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[0., 1., 2., 3.],
+             [4., 5., 6., 5.],
+             [4., 3., 2., 1.]])
     """
 
     def frobenius_norm(input, dim=None, keepdim=False, name=None):
@@ -360,8 +366,10 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None):
         The frobenius norm OP is to calculate the frobenius norm of certain two dimensions of Tensor `input`.
         Args:
           input (Variable): Tensor, data type float32, float64.
-          dim (list, optional): None for last two dimensions.
+          dim (list, optional): None for last two dimensions. Default None.
           keepdim (bool, optional): Whether keep the dimensions as the `input`, Default False.
+          name (str, optional): The default value is None. Normally there is no need for
+              user to set this property. For more information, please refer to :ref:`api_guide_Name`.
         """
         if dim is not None and not (isinstance(dim, list) and len(dim) == 2):
             raise ValueError(
@@ -400,9 +408,12 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None):
         Calculate the p-order vector norm for certain  dimension of Tensor `input`.
         Args:
           input (Variable): Tensor, data type float32, float64.
-          porder (float, optional): None for porder=2.0.
-          axis (int, optional): None for last dimension.
+          porder (float, optional): None for porder=2.0. Default None.
+          axis (int, optional): None for last dimension. Default None.
           keepdim (bool, optional): Whether keep the dimensions as the `input`, Default False.
+          asvector (bool, optional): Whether keep the result as a vector, Default False.
+          name (str, optional): The default value is None. Normally there is no need for
+              user to set this property. For more information, please refer to :ref:`api_guide_Name`.
         """
         if in_dynamic_mode():
             if axis is None:
@@ -682,21 +693,29 @@ def dist(x, y, p=2, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
-
-            x = paddle.to_tensor([[3, 3],[3, 3]], dtype="float32")
-            y = paddle.to_tensor([[3, 3],[3, 1]], dtype="float32")
-            out = paddle.dist(x, y, 0)
-            print(out) # out = 1.
-
-            out = paddle.dist(x, y, 2)
-            print(out) # out = 2.
-
-            out = paddle.dist(x, y, float("inf"))
-            print(out) # out = 2.
-
-            out = paddle.dist(x, y, float("-inf"))
-            print(out) # out = 0.
+            >>> import paddle
+
+            >>> x = paddle.to_tensor([[3, 3],[3, 3]], dtype="float32")
+            >>> y = paddle.to_tensor([[3, 3],[3, 1]], dtype="float32")
+            >>> out = paddle.dist(x, y, 0)
+            >>> print(out)
+            Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
+            1.)
+
+            >>> out = paddle.dist(x, y, 2)
+            >>> print(out)
+            Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
+            2.)
+
+            >>> out = paddle.dist(x, y, float("inf"))
+            >>> print(out)
+            Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
+            2.)
+
+            >>> out = paddle.dist(x, y, float("-inf"))
+            >>> print(out)
+            Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
+            0.)
     """
     if in_dynamic_mode():
         return _C_ops.dist(x, y, p)
@@ -740,83 +759,95 @@ def cond(x, p=None, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
-
-            x = paddle.to_tensor([[1., 0, -1], [0, 1, 0], [1, 0, 1]])
-
-            # compute conditional number when p is None
-            out = paddle.linalg.cond(x)
-            # Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True,
-            #        1.41421342)
-
-            # compute conditional number when order of the norm is 'fro'
-            out_fro = paddle.linalg.cond(x, p='fro')
-            # Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True,
-            #        3.16227770)
-
-            # compute conditional number when order of the norm is 'nuc'
-            out_nuc = paddle.linalg.cond(x, p='nuc')
-            # Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True,
-            #        9.24263859)
-
-            # compute conditional number when order of the norm is 1
-            out_1 = paddle.linalg.cond(x, p=1)
-            # Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True,
-            #        2.)
-
-            # compute conditional number when order of the norm is -1
-            out_minus_1 = paddle.linalg.cond(x, p=-1)
-            # Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True,
-            #        1.)
-
-            # compute conditional number when order of the norm is 2
-            out_2 = paddle.linalg.cond(x, p=2)
-            # Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True,
-            #        1.41421342)
-
-            # compute conditional number when order of the norm is -1
-            out_minus_2 = paddle.linalg.cond(x, p=-2)
-            # Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True,
-            #        0.70710683)
-
-            # compute conditional number when order of the norm is inf
-            out_inf = paddle.linalg.cond(x, p=float("inf"))
-            # Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True,
-            #        2.)
-
-            # compute conditional number when order of the norm is -inf
-            out_minus_inf = paddle.linalg.cond(x, p=-float("inf"))
-            # Tensor(shape=[], dtype=float32, place=Place(gpu:0), stop_gradient=True,
-            #        1.)
-
-            a = paddle.randn([2, 4, 4])
-            # Tensor(shape=[2, 4, 4], dtype=float32, place=Place(gpu:0), stop_gradient=True,
-            #        [[[-0.06784091, -0.07095790,  1.31792855, -0.58959651],
-            #          [ 0.20818676, -0.85640615, -0.89998871, -1.47439921],
-            #          [-0.49132481,  0.42250812, -0.77383220, -2.19794774],
-            #          [-0.33551720, -1.70003879, -1.09795380, -0.63737559]],
-
-            #         [[ 1.12026262, -0.16119350, -1.21157813,  2.74383283],
-            #          [-0.15999718,  0.18798758, -0.69392562,  1.35720372],
-            #          [-0.53013402, -2.26304483,  1.40843511, -1.02288902],
-            #          [ 0.69533503,  2.05261683, -0.02251151, -1.43127477]]])
-
-            a_cond_fro = paddle.linalg.cond(a, p='fro')
-            # Tensor(shape=[2], dtype=float32, place=Place(gpu:0), stop_gradient=True,
-            #        [8.86691189 , 75.23817444])
-
-            b = paddle.randn([2, 3, 4])
-            # Tensor(shape=[2, 3, 4], dtype=float32, place=Place(gpu:0), stop_gradient=True,
-            #        [[[-0.43754861,  1.80796063, -0.78729683, -1.82264030],
-            #          [-0.27670753,  0.06620564,  0.29072434, -0.31155765],
-            #          [ 0.34123746, -0.05444612,  0.05001324, -1.46877074]],
-
-            #         [[-0.64331555, -1.51103854, -1.26277697, -0.68024760],
-            #          [ 2.59375715, -1.06665540,  0.96575671, -0.73330832],
-            #          [-0.47064447, -0.23945692, -0.95150250, -1.07125998]]])
-            b_cond_2 = paddle.linalg.cond(b, p=2)
-            # Tensor(shape=[2], dtype=float32, place=Place(gpu:0), stop_gradient=True,
-            #        [6.64228773, 3.89068866])
+            >>> import paddle
+            >>> paddle.seed(2023)
+            >>> x = paddle.to_tensor([[1., 0, -1], [0, 1, 0], [1, 0, 1]])
+
+            >>> # compute conditional number when p is None
+            >>> out = paddle.linalg.cond(x)
+            >>> print(out)
+            Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
+            1.41421378)
+
+            >>> # compute conditional number when order of the norm is 'fro'
+            >>> out_fro = paddle.linalg.cond(x, p='fro')
+            >>> print(out_fro)
+            Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
+            3.16227770)
+
+            >>> # compute conditional number when order of the norm is 'nuc'
+            >>> out_nuc = paddle.linalg.cond(x, p='nuc')
+            >>> print(out_nuc)
+            Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
+            9.24264145)
+
+            >>> # compute conditional number when order of the norm is 1
+            >>> out_1 = paddle.linalg.cond(x, p=1)
+            >>> print(out_1)
+            Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
+            2.)
+
+            >>> # compute conditional number when order of the norm is -1
+            >>> out_minus_1 = paddle.linalg.cond(x, p=-1)
+            >>> print(out_minus_1)
+            Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
+            1.)
+
+            >>> # compute conditional number when order of the norm is 2
+            >>> out_2 = paddle.linalg.cond(x, p=2)
+            >>> print(out_2)
+            Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
+            1.41421378)
+
+            >>> # compute conditional number when order of the norm is -2
+            >>> out_minus_2 = paddle.linalg.cond(x, p=-2)
+            >>> print(out_minus_2)
+            Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
+            0.70710671)
+
+            >>> # compute conditional number when order of the norm is inf
+            >>> out_inf = paddle.linalg.cond(x, p=float("inf"))
+            >>> print(out_inf)
+            Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
+            2.)
+
+            >>> # compute conditional number when order of the norm is -inf
+            >>> out_minus_inf = paddle.linalg.cond(x, p=-float("inf"))
+            >>> print(out_minus_inf)
+            Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
+            1.)
+
+            >>> a = paddle.randn([2, 4, 4])
+            >>> print(a)
+            Tensor(shape=[2, 4, 4], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[[ 0.06132207,  1.11349595,  0.41906244, -0.24858207],
+              [-1.85169315, -1.50370061,  1.73954511,  0.13331604],
+              [ 1.66359663, -0.55764782, -0.59911072, -0.57773495],
+              [-1.03176904, -0.33741450, -0.29695082, -1.50258386]],
+             [[ 0.67233968, -1.07747352,  0.80170447, -0.06695852],
+              [-1.85003340, -0.23008066,  0.65083790,  0.75387722],
+              [ 0.61212337, -0.52664012,  0.19209868, -0.18707706],
+              [-0.00711021,  0.35236868, -0.40404350,  1.28656745]]])
+
+            >>> a_cond_fro = paddle.linalg.cond(a, p='fro')
+            >>> print(a_cond_fro)
+            Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [6.37173700 , 35.15114594])
+
+            >>> b = paddle.randn([2, 3, 4])
+            >>> print(b)
+            Tensor(shape=[2, 3, 4], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[[ 0.03306439,  0.70149767,  0.77064633, -0.55978841],
+              [-0.84461296,  0.99335045, -1.23486686,  0.59551388],
+              [-0.63035583, -0.98797107,  0.09410731,  0.47007179]],
+             [[ 0.85850012, -0.98949534, -1.63086998,  1.07340240],
+              [-0.05492965,  1.04750168, -2.33754158,  1.16518629],
+              [ 0.66847134, -1.05326962, -0.05703246, -0.48190674]]])
+
+            >>> b_cond_2 = paddle.linalg.cond(b, p=2)
+            >>> print(b_cond_2)
+            Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [2.86566353, 6.85834455])
 
     """
 
@@ -1081,7 +1112,7 @@ def dot(x, y, name=None):
 
     Parameters:
         x(Tensor): 1-D or 2-D ``Tensor``. Its dtype should be ``float32``, ``float64``, ``int32``, ``int64``, ``complex64``, ``complex128``
-        y(Tensor): 1-D or 2-D ``Tensor``. Its dtype soulde be ``float32``, ``float64``, ``int32``, ``int64``, ``complex64``, ``complex128``
+        y(Tensor): 1-D or 2-D ``Tensor``. Its dtype should be ``float32``, ``float64``, ``int32``, ``int64``, ``complex64``, ``complex128``
         name(str, optional): Name of the output. Default is None. It's used to print debug info for developers. Details: :ref:`api_guide_Name`
 
     Returns:
@@ -1089,21 +1120,25 @@ def dot(x, y, name=None):
 
     Examples:
 
-    .. code-block:: python
+        .. code-block:: python
 
-        import paddle
+            >>> import paddle
 
-        # 1-D Tensor * 1-D Tensor
-        x = paddle.to_tensor([1, 2, 3])
-        y = paddle.to_tensor([4, 5, 6])
-        z = paddle.dot(x, y)
-        print(z)  # 32
+            >>> # 1-D Tensor * 1-D Tensor
+            >>> x = paddle.to_tensor([1, 2, 3])
+            >>> y = paddle.to_tensor([4, 5, 6])
+            >>> z = paddle.dot(x, y)
+            >>> print(z)
+            Tensor(shape=[], dtype=int64, place=Place(cpu), stop_gradient=True,
+            32)
 
-        # 2-D Tensor * 2-D Tensor
-        x = paddle.to_tensor([[1, 2, 3], [2, 4, 6]])
-        y = paddle.to_tensor([[4, 5, 6], [4, 5, 6]])
-        z = paddle.dot(x, y)
-        print(z)  # [32, 64]
+            >>> # 2-D Tensor * 2-D Tensor
+            >>> x = paddle.to_tensor([[1, 2, 3], [2, 4, 6]])
+            >>> y = paddle.to_tensor([[4, 5, 6], [4, 5, 6]])
+            >>> z = paddle.dot(x, y)
+            >>> print(z)
+            Tensor(shape=[2], dtype=int64, place=Place(cpu), stop_gradient=True,
+            [32, 64])
 
     """
     if in_dynamic_mode():
@@ -1167,31 +1202,30 @@ def cov(x, rowvar=True, ddof=True, fweights=None, aweights=None, name=None):
     element Cij is the covariance of xi and xj. The element Cii is the variance of xi itself.
 
     Parameters:
-        x(Tensor): A N-D(N<=2) Tensor containing multiple variables and observations. By default, each row of x represents a variable. Also see rowvar below.
-        rowvar(Bool, optional): If rowvar is True (default), then each row represents a variable, with observations in the columns. Default: True
-        ddof(Bool, optional): If ddof=True will return the unbiased estimate, and ddof=False will return the simple average. Default: True
-        fweights(Tensor, optional): 1-D Tensor of integer frequency weights; The number of times each observation vector should be repeated. Default: None
-        aweights(Tensor, optional): 1-D Tensor of observation vector weights. How important of the observation vector, larger data means this element is more important. Default: None
-        name(str, optional): Name of the output. Default is None. It's used to print debug info for developers. Details: :ref:`api_guide_Name`
+        x (Tensor): A N-D(N<=2) Tensor containing multiple variables and observations. By default, each row of x represents a variable. Also see rowvar below.
+        rowvar (bool, optional): If rowvar is True (default), then each row represents a variable, with observations in the columns. Default: True.
+        ddof (bool, optional): If ddof=True will return the unbiased estimate, and ddof=False will return the simple average. Default: True.
+        fweights (Tensor, optional): 1-D Tensor of integer frequency weights; The number of times each observation vector should be repeated. Default: None.
+        aweights (Tensor, optional): 1-D Tensor of observation vector weights, indicating the importance of each observation vector; a larger value means the observation is more important. Default: None.
+        name (str, optional): Name of the output. Default is None. It's used to print debug info for developers. Details: :ref:`api_guide_Name` .
 
     Returns:
         Tensor: The covariance matrix Tensor of the variables.
 
     Examples:
 
-    .. code-block:: python
-
-        import paddle
+        .. code-block:: python
 
-        xt = paddle.rand((3, 4))
-        paddle.linalg.cov(xt)
+            >>> import paddle
+            >>> paddle.seed(2023)
 
-        '''
-        Tensor(shape=[3, 3], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
-            [[0.07918842, 0.06127326, 0.01493049],
-                [0.06127326, 0.06166256, 0.00302668],
-                [0.01493049, 0.00302668, 0.01632146]])
-        '''
+            >>> xt = paddle.rand((3, 4))
+            >>> paddle.linalg.cov(xt)
+            >>> print(xt)
+            Tensor(shape=[3, 4], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[0.86583614, 0.52014720, 0.25960937, 0.90525323],
+             [0.42400089, 0.40641287, 0.97020894, 0.74437362],
+             [0.51785129, 0.73292869, 0.97786582, 0.04315904]])
     """
     op_type = 'cov'
     if len(x.shape) > 2 or len(x.shape) < 1:
@@ -1289,35 +1323,48 @@ def t(input, name=None):
 
     Args:
         input (Tensor): The input Tensor. It is a N-D (N<=2) Tensor of data types float32, float64, int32, int64.
-        name(str, optional): The default value is None.  Normally there is no need for
-            user to set this property.  For more information, please refer to :ref:`api_guide_Name`
+        name (str, optional): The default value is None.  Normally there is no need for
+            user to set this property.  For more information, please refer to :ref:`api_guide_Name` .
+
     Returns:
         Tensor: A transposed n-D Tensor, with data type being float16, float32, float64, int32, int64.
 
     Examples:
 
         .. code-block:: python
-           :name: code-example
-             import paddle
-
-             # Example 1 (0-D tensor)
-             x = paddle.to_tensor([0.79])
-             paddle.t(x) # [0.79]
-
-             # Example 2 (1-D tensor)
-             x = paddle.to_tensor([0.79, 0.84, 0.32])
-             paddle.t(x) # [0.79000002, 0.83999997, 0.31999999]
-             paddle.t(x).shape # [3]
-
-             # Example 3 (2-D tensor)
-             x = paddle.to_tensor([[0.79, 0.84, 0.32],
-                                  [0.64, 0.14, 0.57]])
-             x.shape # [2, 3]
-             paddle.t(x)
-             # [[0.79000002, 0.63999999],
-             #  [0.83999997, 0.14000000],
-             #  [0.31999999, 0.56999999]]
-             paddle.t(x).shape # [3, 2]
+            :name: code-example
+
+            >>> import paddle
+
+            >>> # Example 1 (1-D tensor with a single element)
+            >>> x = paddle.to_tensor([0.79])
+            >>> out = paddle.t(x)
+            >>> print(out)
+            Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [0.79000002])
+
+            >>> # Example 2 (1-D tensor)
+            >>> x = paddle.to_tensor([0.79, 0.84, 0.32])
+            >>> out2 = paddle.t(x)
+            >>> print(out2)
+            Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [0.79000002, 0.83999997, 0.31999999])
+            >>> print(paddle.t(x).shape)
+            [3]
+
+            >>> # Example 3 (2-D tensor)
+            >>> x = paddle.to_tensor([[0.79, 0.84, 0.32],
+            ...                       [0.64, 0.14, 0.57]])
+            >>> print(x.shape)
+            [2, 3]
+            >>> out3 = paddle.t(x)
+            >>> print(out3)
+            Tensor(shape=[3, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[0.79000002, 0.63999999],
+             [0.83999997, 0.14000000],
+             [0.31999999, 0.56999999]])
+            >>> print(paddle.t(x).shape)
+            [3, 2]
 
     """
     if len(input.shape) > 2:
@@ -1375,24 +1422,28 @@ def cross(x, y, axis=9, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
-
-            x = paddle.to_tensor([[1.0, 1.0, 1.0],
-                                  [2.0, 2.0, 2.0],
-                                  [3.0, 3.0, 3.0]])
-            y = paddle.to_tensor([[1.0, 1.0, 1.0],
-                                  [1.0, 1.0, 1.0],
-                                  [1.0, 1.0, 1.0]])
-
-            z1 = paddle.cross(x, y)
-            # [[-1. -1. -1.]
-            #  [ 2.  2.  2.]
-            #  [-1. -1. -1.]]
-
-            z2 = paddle.cross(x, y, axis=1)
-            # [[0. 0. 0.]
-            #  [0. 0. 0.]
-            #  [0. 0. 0.]]
+            >>> import paddle
+
+            >>> x = paddle.to_tensor([[1.0, 1.0, 1.0],
+            ...                       [2.0, 2.0, 2.0],
+            ...                       [3.0, 3.0, 3.0]])
+            >>> y = paddle.to_tensor([[1.0, 1.0, 1.0],
+            ...                       [1.0, 1.0, 1.0],
+            ...                       [1.0, 1.0, 1.0]])
+
+            >>> z1 = paddle.cross(x, y)
+            >>> print(z1)
+            Tensor(shape=[3, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[-1., -1., -1.],
+             [ 2.,  2.,  2.],
+             [-1., -1., -1.]])
+
+            >>> z2 = paddle.cross(x, y, axis=1)
+            >>> print(z2)
+            Tensor(shape=[3, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[0., 0., 0.],
+             [0., 0., 0.],
+             [0., 0., 0.]])
     """
     if in_dynamic_mode():
         axis = K_DEFAULT_DIM if axis is None else axis
@@ -1439,7 +1490,7 @@ def cholesky(x, upper=False, name=None):
             where * is zero or more batch dimensions, and matrices on the
             inner-most 2 dimensions all should be symmetric positive-definite.
             Its data type should be float32 or float64.
-        upper (bool): The flag indicating whether to return upper or lower
+        upper (bool, optional): The flag indicating whether to return upper or lower
             triangular matrices. Default: False.
         name (str, optional): Name for the operation (optional, default is None).
             For more information, please refer to :ref:`api_guide_Name`.
@@ -1451,14 +1502,19 @@ def cholesky(x, upper=False, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
+            >>> import paddle
+            >>> paddle.seed(2023)
 
-            a = paddle.rand([3, 3], dtype="float32")
-            a_t = paddle.transpose(a, [1, 0])
-            x = paddle.matmul(a, a_t) + 1e-03
+            >>> a = paddle.rand([3, 3], dtype="float32")
+            >>> a_t = paddle.transpose(a, [1, 0])
+            >>> x = paddle.matmul(a, a_t) + 1e-03
 
-            out = paddle.linalg.cholesky(x, upper=False)
-            print(out)
+            >>> out = paddle.linalg.cholesky(x, upper=False)
+            >>> print(out)
+            Tensor(shape=[3, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[1.04337072, 0.        , 0.        ],
+             [1.06467664, 0.17859250, 0.        ],
+             [1.30602181, 0.08326444, 0.22790681]])
     """
     if in_dynamic_mode():
         return _C_ops.cholesky(x, upper)
@@ -1486,12 +1542,12 @@ def matrix_rank(x, tol=None, hermitian=False, name=None):
     Args:
         x (Tensor): The input tensor. Its shape should be `[..., m, n]`, where `...` is zero or more batch dimensions. If `x` is a batch
             of matrices then the output has the same batch dimensions. The data type of `x` should be float32 or float64.
-        tol (float,Tensor,optional): the tolerance value. Default: None. If `tol` is not specified, and `sigma` is the largest
-            singular value (or eigenvalues in absolute value), and `eps` is the epsilon value for the dtype of `x`, then `tol` is computed
-            with formula `tol=sigma * max(m,n) * eps`. Note that if `x` is a batch of matrices, `tol` is computed this way for every batch.
-        hermitian (bool,optional): indicates whether `x` is Hermitian. Default: False. When hermitian=True, `x` is assumed to be Hermitian,
+        tol (float|Tensor, optional): the tolerance value. If `tol` is not specified, and `sigma` is the largest singular value
+            (or eigenvalues in absolute value), and `eps` is the epsilon value for the dtype of `x`, then `tol` is computed with formula
+            `tol=sigma * max(m,n) * eps`. Note that if `x` is a batch of matrices, `tol` is computed this way for every batch. Default: None.
+        hermitian (bool, optional): indicates whether `x` is Hermitian. When hermitian=True, `x` is assumed to be Hermitian,
             enabling a more efficient method for finding eigenvalues, but `x` is not checked inside the function. Instead, We just use
-            the lower triangular of the matrix to compute.
+            the lower triangular of the matrix to compute. Default: False.
         name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
@@ -1500,19 +1556,21 @@ def matrix_rank(x, tol=None, hermitian=False, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
+            >>> import paddle
 
-            a = paddle.eye(10)
-            b = paddle.linalg.matrix_rank(a)
-            print(b)
-            # b = 10
+            >>> a = paddle.eye(10)
+            >>> b = paddle.linalg.matrix_rank(a)
+            >>> print(b)
+            Tensor(shape=[], dtype=int32, place=Place(cpu), stop_gradient=True,
+            10)
 
-            c = paddle.ones(shape=[3, 4, 5, 5])
-            d = paddle.linalg.matrix_rank(c, tol=0.01, hermitian=True)
-            print(d)
-            # d = [[1, 1, 1, 1],
-            #      [1, 1, 1, 1],
-            #      [1, 1, 1, 1]]
+            >>> c = paddle.ones(shape=[3, 4, 5, 5])
+            >>> d = paddle.linalg.matrix_rank(c, tol=0.01, hermitian=True)
+            >>> print(d)
+            Tensor(shape=[3, 4], dtype=int32, place=Place(cpu), stop_gradient=True,
+            [[1, 1, 1, 1],
+             [1, 1, 1, 1],
+             [1, 1, 1, 1]])
 
     """
     if in_dynamic_mode():
@@ -1567,13 +1625,13 @@ def bmm(x, y, name=None):
 
     Both of the two input tensors must be three-dementional and share the same batch size.
 
-    if x is a (b, m, k) tensor, y is a (b, k, n) tensor, the output will be a (b, m, n) tensor.
+    If x is a (b, m, k) tensor, y is a (b, k, n) tensor, the output will be a (b, m, n) tensor.
 
     Args:
         x (Tensor): The input Tensor.
         y (Tensor): The input Tensor.
-        name(str|None): A name for this layer(optional). If set None, the layer
-            will be named automatically.
+        name (str|None): A name for this layer(optional). If set None, the layer
+            will be named automatically. Default: None.
 
     Returns:
         Tensor: The product Tensor.
@@ -1581,23 +1639,23 @@ def bmm(x, y, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
-
-            # In imperative mode:
-            # size x: (2, 2, 3) and y: (2, 3, 2)
-            x = paddle.to_tensor([[[1.0, 1.0, 1.0],
-                                [2.0, 2.0, 2.0]],
-                                [[3.0, 3.0, 3.0],
-                                [4.0, 4.0, 4.0]]])
-            y = paddle.to_tensor([[[1.0, 1.0],[2.0, 2.0],[3.0, 3.0]],
-                                [[4.0, 4.0],[5.0, 5.0],[6.0, 6.0]]])
-            out = paddle.bmm(x, y)
-            # Tensor(shape=[2, 2, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
-            #        [[[6. , 6. ],
-            #          [12., 12.]],
-
-            #         [[45., 45.],
-            #          [60., 60.]]])
+            >>> import paddle
+
+            >>> # In imperative mode:
+            >>> # size x: (2, 2, 3) and y: (2, 3, 2)
+            >>> x = paddle.to_tensor([[[1.0, 1.0, 1.0],
+            ...                     [2.0, 2.0, 2.0]],
+            ...                     [[3.0, 3.0, 3.0],
+            ...                     [4.0, 4.0, 4.0]]])
+            >>> y = paddle.to_tensor([[[1.0, 1.0],[2.0, 2.0],[3.0, 3.0]],
+            ...                     [[4.0, 4.0],[5.0, 5.0],[6.0, 6.0]]])
+            >>> out = paddle.bmm(x, y)
+            >>> print(out)
+            Tensor(shape=[2, 2, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[[6. , 6. ],
+              [12., 12.]],
+             [[45., 45.],
+              [60., 60.]]])
 
     """
     if in_dynamic_mode():
@@ -1639,9 +1697,9 @@ def histogram(input, bins=100, min=0, max=0, name=None):
     Args:
         input (Tensor): A Tensor(or LoDTensor) with shape :math:`[N_1, N_2,..., N_k]` . The data type of the input Tensor
             should be float32, float64, int32, int64.
-        bins (int, optional): number of histogram bins.
-        min (int, optional): lower end of the range (inclusive).
-        max (int, optional): upper end of the range (inclusive).
+        bins (int, optional): number of histogram bins. Default: 100.
+        min (int, optional): lower end of the range (inclusive). Default: 0.
+        max (int, optional): upper end of the range (inclusive). Default: 0.
         name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
 
     Returns:
@@ -1650,11 +1708,13 @@ def histogram(input, bins=100, min=0, max=0, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
+            >>> import paddle
 
-            inputs = paddle.to_tensor([1, 2, 1])
-            result = paddle.histogram(inputs, bins=4, min=0, max=3)
-            print(result) # [0, 2, 1, 0]
+            >>> inputs = paddle.to_tensor([1, 2, 1])
+            >>> result = paddle.histogram(inputs, bins=4, min=0, max=3)
+            >>> print(result)
+            Tensor(shape=[4], dtype=int64, place=Place(cpu), stop_gradient=True,
+            [0, 2, 1, 0])
     """
     if in_dynamic_mode():
         return _C_ops.histogram(input, bins, min, max)
@@ -1681,8 +1741,8 @@ def bincount(x, weights=None, minlength=0, name=None):
         x (Tensor): A Tensor with non-negative integer. Should be 1-D tensor.
         weights (Tensor, optional): Weight for each value in the input tensor. Should have the same shape as input. Default is None.
         minlength (int, optional): Minimum number of bins. Should be non-negative integer. Default is 0.
-        name(str, optional): The default value is None.  Normally there is no need for user to set this
-            property.  For more information, please refer to :ref:`api_guide_Name`.
+        name (str, optional): Normally there is no need for user to set this property.
+            For more information, please refer to :ref:`api_guide_Name`. Default is None.
 
     Returns:
         Tensor: The tensor of frequency.
@@ -1690,15 +1750,19 @@ def bincount(x, weights=None, minlength=0, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
+            >>> import paddle
 
-            x = paddle.to_tensor([1, 2, 1, 4, 5])
-            result1 = paddle.bincount(x)
-            print(result1) # [0, 2, 1, 0, 1, 1]
+            >>> x = paddle.to_tensor([1, 2, 1, 4, 5])
+            >>> result1 = paddle.bincount(x)
+            >>> print(result1)
+            Tensor(shape=[6], dtype=int64, place=Place(cpu), stop_gradient=True,
+            [0, 2, 1, 0, 1, 1])
 
-            w = paddle.to_tensor([2.1, 0.4, 0.1, 0.5, 0.5])
-            result2 = paddle.bincount(x, weights=w)
-            print(result2) # [0., 2.19999981, 0.40000001, 0., 0.50000000, 0.50000000]
+            >>> w = paddle.to_tensor([2.1, 0.4, 0.1, 0.5, 0.5])
+            >>> result2 = paddle.bincount(x, weights=w)
+            >>> print(result2)
+            Tensor(shape=[6], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [0.        , 2.19999981, 0.40000001, 0.        , 0.50000000, 0.50000000])
     """
     if x.dtype not in [paddle.int32, paddle.int64]:
         raise TypeError("Elements in Input(x) should all be integers")
@@ -1738,8 +1802,8 @@ def mv(x, vec, name=None):
             should be one of float32, float64.
         vec (Tensor): A tensor with shape :math:`[N]` , The data type of the input Tensor x
             should be one of float32, float64.
-        name(str, optional): The default value is None.  Normally there is no need for user to set this
-            property.  For more information, please refer to :ref:`api_guide_Name`.
+        name (str, optional): Normally there is no need for user to set this property.
+            For more information, please refer to :ref:`api_guide_Name`. Default is None.
 
     Returns:
         Tensor: The tensor which is producted by x and vec.
@@ -1747,17 +1811,17 @@ def mv(x, vec, name=None):
     Examples:
         .. code-block:: python
 
-            # x: [M, N], vec: [N]
-            # paddle.mv(x, vec)  # out: [M]
+            >>> # x: [M, N], vec: [N]
+            >>> # paddle.mv(x, vec)  # out: [M]
 
-            import paddle
+            >>> import paddle
 
-            x = paddle.to_tensor([[2, 1, 3], [3, 0, 1]]).astype("float64")
-            vec = paddle.to_tensor([3, 5, 1]).astype("float64")
-            out = paddle.mv(x, vec)
-            print(out)
-            # Tensor(shape=[2], dtype=float64, place=Place(cpu), stop_gradient=True,
-            #        [14., 10.])
+            >>> x = paddle.to_tensor([[2, 1, 3], [3, 0, 1]]).astype("float64")
+            >>> vec = paddle.to_tensor([3, 5, 1]).astype("float64")
+            >>> out = paddle.mv(x, vec)
+            >>> print(out)
+            Tensor(shape=[2], dtype=float64, place=Place(cpu), stop_gradient=True,
+            [14., 10.])
     """
     if in_dynamic_mode():
         return _C_ops.mv(x, vec)
@@ -1803,8 +1867,8 @@ def det(x, name=None):
         x (Tensor): the input matrix of size `(n, n)` or the
             batch of matrices of size `(*, n, n)` where `*` is one or more
             batch dimensions.
-        name(str, optional): Name of the output. Default is None. It's used
-            to print debug info for developers. Details: :ref:`api_guide_Name`
+        name (str, optional): Name of the output. It's used to print debug info for
+            developers. Details: :ref:`api_guide_Name`. Default is None.
 
     Returns:
         Tensor, the determinant value of a square matrix or batches of square matrices.
@@ -1812,15 +1876,13 @@ def det(x, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
-
-            x =  paddle.randn([3,3,3])
-
-            A = paddle.linalg.det(x)
-
-            print(A)
-
-            # [ 0.02547996,  2.52317095, -6.15900707])
+            >>> import paddle
+            >>> paddle.seed(2023)
+            >>> x =  paddle.randn([3,3,3])
+            >>> A = paddle.linalg.det(x)
+            >>> print(A)
+            Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [-1.29280925,  0.77832544,  0.89754158])
 
 
     """
@@ -1854,15 +1916,17 @@ def slogdet(x, name=None):
     """
 
     Calculates the sign and natural logarithm of the absolute value of a square matrix's or batches square matrices' determinant.
-    The determinant can be computed with ``sign * exp`` (logabsdet)
+    The determinant can be computed with ``sign * exp`` (logabsdet).
 
-    Supports input of float, double
+    Supports input of float, double.
 
-    Note that for matrices that have zero determinant, this returns ``(0, -inf)``
+    Note that for matrices that have zero determinant, this returns ``(0, -inf)``.
 
     Args:
         x (Tensor): the batch of matrices of size :math:`(*, n, n)`
             where math:`*` is one or more batch dimensions.
+        name (str, optional): Name of the output. It's used to print debug info for
+            developers. Details: :ref:`api_guide_Name`. Default is None.
 
     Returns:
         y (Tensor), A tensor containing the sign of the determinant and the natural logarithm
@@ -1871,16 +1935,16 @@ def slogdet(x, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
-
-            x =  paddle.randn([3,3,3])
-
-            A = paddle.linalg.slogdet(x)
-
-            print(A)
-
-            # [[ 1.        ,  1.        , -1.        ],
-            # [-0.98610914, -0.43010661, -0.10872950]])
+            >>> import paddle
+            >>> paddle.seed(2023)
+            >>> x =  paddle.randn([3,3,3])
+            >>> A = paddle.linalg.slogdet(x)
+            >>> print(A)
+            >>> # doctest: +SKIP
+            Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[-1.        ,  1.        ,  1.        ],
+             [ 0.25681755, -0.25061053, -0.10809582]])
+            >>> # doctest: -SKIP
 
     """
     if in_dynamic_mode():
@@ -1931,8 +1995,8 @@ def svd(x, full_matrices=False, name=None):
             If full_matrices = False, svd op will use a economic method to store U and V.
             which means shape of U is `[..., N, K]`, shape of V is `[..., M, K]`. K = min(M, N).
             Default value is False.
-        name (str, optional): Name for the operation (optional, default is None).
-            For more information, please refer to :ref:`api_guide_Name`.
+        name (str, optional): Name for the operation. For more information,
+            please refer to :ref:`api_guide_Name`. Default value is None.
 
     Returns:
         - U (Tensor), is the singular value decomposition result U.
@@ -1944,25 +2008,29 @@ def svd(x, full_matrices=False, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
-
-            x = paddle.to_tensor([[1.0, 2.0], [1.0, 3.0], [4.0, 6.0]]).astype('float64')
-            x = x.reshape([3, 2])
-            u, s, vh = paddle.linalg.svd(x)
-            print (u)
-            #U = [[ 0.27364809, -0.21695147  ],
-            #      [ 0.37892198, -0.87112408 ],
-            #      [ 0.8840446 ,  0.44053933 ]]
-
-            print (s)
-            #S = [8.14753743, 0.78589688]
-            print (vh)
-            #VT= [[ 0.51411221,  0.85772294],
-            #     [ 0.85772294, -0.51411221]]
-
-            # one can verify : U * S * VT == X
-            #                  U * UH == I
-            #                  V * VH == I
+            >>> import paddle
+
+            >>> x = paddle.to_tensor([[1.0, 2.0], [1.0, 3.0], [4.0, 6.0]]).astype('float64')
+            >>> x = x.reshape([3, 2])
+            >>> u, s, vh = paddle.linalg.svd(x)
+            >>> print (u)
+            Tensor(shape=[3, 2], dtype=float64, place=Place(cpu), stop_gradient=True,
+            [[-0.27364809, -0.21695147],
+             [-0.37892198, -0.87112408],
+             [-0.88404460,  0.44053933]])
+
+            >>> print (s)
+            Tensor(shape=[2], dtype=float64, place=Place(cpu), stop_gradient=True,
+            [8.14753743, 0.78589688])
+
+            >>> print (vh)
+            Tensor(shape=[2, 2], dtype=float64, place=Place(cpu), stop_gradient=True,
+            [[-0.51411221, -0.85772294],
+             [ 0.85772294, -0.51411221]])
+
+            >>> # one can verify : U * S * VT == X
+            >>> #                  U * UH == I
+            >>> #                  V * VH == I
     """
 
     if in_dynamic_mode():
@@ -2002,8 +2070,9 @@ def pca_lowrank(x, q=None, center=True, niter=2, name=None):
             Default value is :math:`q=min(6,N,M)`.
         center (bool, optional): if True, center the input tensor.
             Default value is True.
-        name (str, optional): Name for the operation (optional, default is None).
-            For more information, please refer to :ref:`api_guide_Name`.
+        niter (int, optional): number of iterations to perform. Default: 2.
+        name (str, optional): Name for the operation. For more information,
+            please refer to :ref:`api_guide_Name`. Default: None.
 
     Returns:
         - Tensor U, is N x q matrix.
@@ -2015,29 +2084,30 @@ def pca_lowrank(x, q=None, center=True, niter=2, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
-
-            x = paddle.randn((5, 5), dtype='float64')
-            U, S, V = paddle.linalg.pca_lowrank(x)
-            print(U)
-            # Tensor(shape=[5, 5], dtype=float64, place=Place(gpu:0), stop_gradient=True,
-            #        [[ 0.41057070,  0.40364287,  0.59099574, -0.34529432,  0.44721360],
-            #         [-0.30243321,  0.55670611, -0.15025419,  0.61321785,  0.44721360],
-            #         [ 0.57427340, -0.15936327, -0.66414981, -0.06097905,  0.44721360],
-            #         [-0.63897516, -0.09968973, -0.17298615, -0.59316819,  0.44721360],
-            #         [-0.04343573, -0.70129598,  0.39639442,  0.38622370,  0.44721360]])
-
-            print(S)
-            # Tensor(shape=[5], dtype=float64, place=Place(gpu:0), stop_gradient=True,
-            #        [3.33724265, 2.57573259, 1.69479048, 0.68069312, 0.00000000])
-
-            print(V)
-            # Tensor(shape=[5, 5], dtype=float64, place=Place(gpu:0), stop_gradient=True,
-            #        [[ 0.09800724, -0.32627008, -0.23593953,  0.81840445,  0.39810690],
-            #         [-0.60100303,  0.63741176, -0.01953663,  0.09023999,  0.47326173],
-            #         [ 0.25073864, -0.21305240, -0.32662950, -0.54786156,  0.69634740],
-            #         [ 0.33057205,  0.48282641, -0.75998527,  0.06744040, -0.27472705],
-            #         [ 0.67604895,  0.45688227,  0.50959437,  0.13179682,  0.23908071]])
+            >>> import paddle
+            >>> paddle.seed(2023)
+
+            >>> x = paddle.randn((5, 5), dtype='float64')
+            >>> U, S, V = paddle.linalg.pca_lowrank(x)
+            >>> print(U)
+            Tensor(shape=[5, 5], dtype=float64, place=Place(cpu), stop_gradient=True,
+            [[ 0.80131563,  0.11962647,  0.27667179, -0.25891214,  0.44721360],
+             [-0.12642301,  0.69917551, -0.17899393,  0.51296394,  0.44721360],
+             [ 0.08997135, -0.69821706, -0.20059228,  0.51396579,  0.44721360],
+             [-0.23871837, -0.02815453, -0.59888153, -0.61932365,  0.44721360],
+             [-0.52614559, -0.09243040,  0.70179595, -0.14869394,  0.44721360]])
+
+            >>> print(S)
+            Tensor(shape=[5], dtype=float64, place=Place(cpu), stop_gradient=True,
+            [2.60101614, 2.40554940, 1.49768346, 0.19064830, 0.00000000])
+
+            >>> print(V)
+            Tensor(shape=[5, 5], dtype=float64, place=Place(cpu), stop_gradient=True,
+            [[ 0.58339481, -0.17143771,  0.00522143,  0.57976310,  0.54231640],
+             [ 0.22334335,  0.72963474, -0.30148399, -0.39388750,  0.41438019],
+             [ 0.05416913,  0.34666487,  0.93549758,  0.00063507,  0.04162998],
+             [-0.39519094,  0.53074980, -0.16687419,  0.71175586, -0.16638919],
+             [-0.67131070, -0.19071018,  0.07795789, -0.04615811,  0.71046714]])
     """
 
     def conjugate(x):
@@ -2172,25 +2242,28 @@ def matrix_power(x, n, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
-
-            x = paddle.to_tensor([[1, 2, 3],
-                                  [1, 4, 9],
-                                  [1, 8, 27]], dtype='float64')
-            print(paddle.linalg.matrix_power(x, 2))
-            # [[6.  , 34. , 102.],
-            #  [14. , 90. , 282.],
-            #  [36. , 250., 804.]]
-
-            print(paddle.linalg.matrix_power(x, 0))
-            # [[1., 0., 0.],
-            #  [0., 1., 0.],
-            #  [0., 0., 1.]]
-
-            print(paddle.linalg.matrix_power(x, -2))
-            # [[ 12.91666667, -12.75000000,  2.83333333 ],
-            #  [-7.66666667 ,  8.         , -1.83333333 ],
-            #  [ 1.80555556 , -1.91666667 ,  0.44444444 ]]
+            >>> import paddle
+
+            >>> x = paddle.to_tensor([[1, 2, 3],
+            ...                       [1, 4, 9],
+            ...                       [1, 8, 27]], dtype='float64')
+            >>> print(paddle.linalg.matrix_power(x, 2))
+            Tensor(shape=[3, 3], dtype=float64, place=Place(cpu), stop_gradient=True,
+            [[6.  , 34. , 102.],
+             [14. , 90. , 282.],
+             [36. , 250., 804.]])
+
+            >>> print(paddle.linalg.matrix_power(x, 0))
+            Tensor(shape=[3, 3], dtype=float64, place=Place(cpu), stop_gradient=True,
+            [[1., 0., 0.],
+             [0., 1., 0.],
+             [0., 0., 1.]])
+
+            >>> print(paddle.linalg.matrix_power(x, -2))
+            Tensor(shape=[3, 3], dtype=float64, place=Place(cpu), stop_gradient=True,
+            [[ 12.91666667, -12.75000000,  2.83333333 ],
+             [-7.66666667 ,  8.         , -1.83333333 ],
+             [ 1.80555556 , -1.91666667 ,  0.44444444 ]])
     """
     if in_dynamic_mode():
         return _C_ops.matrix_power(x, n)
@@ -2218,14 +2291,14 @@ def qr(x, mode="reduced", name=None):
         x (Tensor): The input tensor. Its shape should be `[..., M, N]`,
             where ... is zero or more batch dimensions. M and N can be arbitrary
             positive number. The data type of x should be float32 or float64.
-        mode (str, optional): A flag to control the behavior of qr, the default is "reduced".
+        mode (str, optional): A flag to control the behavior of qr.
             Suppose x's shape is `[..., M, N]` and denoting `K = min(M, N)`:
             If mode = "reduced", qr op will return reduced Q and R matrices,
             which means Q's shape is `[..., M, K]` and R's shape is `[..., K, N]`.
             If mode = "complete", qr op will return complete Q and R matrices,
             which means Q's shape is `[..., M, M]` and R's shape is `[..., M, N]`.
             If mode = "r", qr op will only return reduced R matrix, which means
-            R's shape is `[..., K, N]`.
+            R's shape is `[..., K, N]`. Default: "reduced".
         name (str, optional): Name for the operation (optional, default is None).
             For more information, please refer to :ref:`api_guide_Name`.
 
@@ -2236,21 +2309,21 @@ def qr(x, mode="reduced", name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
-
-            x = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]).astype('float64')
-            q, r = paddle.linalg.qr(x)
-            print (q)
-            print (r)
-
-            # Q = [[-0.16903085,  0.89708523],
-            #      [-0.50709255,  0.27602622],
-            #      [-0.84515425, -0.34503278]])
-
-            # R = [[-5.91607978, -7.43735744],
-            #      [ 0.        ,  0.82807867]])
-
-            # one can verify : X = Q * R ;
+            >>> import paddle
+
+            >>> x = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]).astype('float64')
+            >>> q, r = paddle.linalg.qr(x)
+            >>> print (q)
+            Tensor(shape=[3, 2], dtype=float64, place=Place(cpu), stop_gradient=True,
+            [[-0.16903085,  0.89708523],
+             [-0.50709255,  0.27602622],
+             [-0.84515425, -0.34503278]])
+            >>> print (r)
+            Tensor(shape=[2, 2], dtype=float64, place=Place(cpu), stop_gradient=True,
+            [[-5.91607978, -7.43735744],
+             [ 0.        ,  0.82807867]])
+
+            >>> # one can verify : X = Q * R ;
     """
     if in_dynamic_mode():
         q, r = _C_ops.qr(x, mode)
@@ -2318,42 +2391,41 @@ def lu(x, pivot=True, get_infos=False, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
-
-            x = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]).astype('float64')
-            lu,p,info = paddle.linalg.lu(x, get_infos=True)
-
-            # >>> lu:
-            # Tensor(shape=[3, 2], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
-            #    [[5.        , 6.        ],
-            #        [0.20000000, 0.80000000],
-            #        [0.60000000, 0.50000000]])
-            # >>> p
-            # Tensor(shape=[2], dtype=int32, place=CUDAPlace(0), stop_gradient=True,
-            #    [3, 3])
-            # >>> info
-            # Tensor(shape=[], dtype=int32, place=CUDAPlace(0), stop_gradient=True,
-            #    0)
-
-            P,L,U = paddle.linalg.lu_unpack(lu,p)
-
-            # >>> P
-            # (Tensor(shape=[3, 3], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
-            # [[0., 1., 0.],
-            # [0., 0., 1.],
-            # [1., 0., 0.]]),
-            # >>> L
-            # Tensor(shape=[3, 2], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
-            # [[1.        , 0.        ],
-            # [0.20000000, 1.        ],
-            # [0.60000000, 0.50000000]]),
-            # >>> U
-            # Tensor(shape=[2, 2], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
-            # [[5.        , 6.        ],
-            # [0.        , 0.80000000]]))
-
-
-            # one can verify : X = P @ L @ U ;
+            >>> import paddle
+
+            >>> x = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]).astype('float64')
+            >>> lu,p,info = paddle.linalg.lu(x, get_infos=True)
+
+            >>> print(lu)
+            Tensor(shape=[3, 2], dtype=float64, place=Place(cpu), stop_gradient=True,
+            [[5.        , 6.        ],
+             [0.20000000, 0.80000000],
+             [0.60000000, 0.50000000]])
+            >>> print(p)
+            Tensor(shape=[2], dtype=int32, place=Place(cpu), stop_gradient=True,
+            [3, 3])
+            >>> print(info)
+            Tensor(shape=[1], dtype=int32, place=Place(cpu), stop_gradient=True,
+            [0])
+
+            >>> P,L,U = paddle.linalg.lu_unpack(lu,p)
+
+            >>> print(P)
+            Tensor(shape=[3, 3], dtype=float64, place=Place(cpu), stop_gradient=True,
+            [[0., 1., 0.],
+             [0., 0., 1.],
+             [1., 0., 0.]])
+            >>> print(L)
+            Tensor(shape=[3, 2], dtype=float64, place=Place(cpu), stop_gradient=True,
+            [[1.        , 0.        ],
+             [0.20000000, 1.        ],
+             [0.60000000, 0.50000000]])
+            >>> print(U)
+            Tensor(shape=[2, 2], dtype=float64, place=Place(cpu), stop_gradient=True,
+            [[5.        , 6.        ],
+             [0.        , 0.80000000]])
+
+            >>> # one can verify : X = P @ L @ U ;
     """
 
     if in_dynamic_mode():
@@ -2397,7 +2469,7 @@ def lu_unpack(x, y, unpack_ludata=True, unpack_pivots=True, name=None):
 
         y (Tensor): Pivots get from paddle.linalg.lu.
 
-        unpack_ludata (bool,optional): whether to unpack L and U from x. Default: True.
+        unpack_ludata (bool, optional): whether to unpack L and U from x. Default: True.
 
         unpack_pivots (bool, optional): whether to unpack permutation matrix P from Pivtos. Default: True.
 
@@ -2415,41 +2487,41 @@ def lu_unpack(x, y, unpack_ludata=True, unpack_pivots=True, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
-
-            x = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]).astype('float64')
-            lu,p,info = paddle.linalg.lu(x, get_infos=True)
-
-            # >>> lu:
-            # Tensor(shape=[3, 2], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
-            #    [[5.        , 6.        ],
-            #        [0.20000000, 0.80000000],
-            #        [0.60000000, 0.50000000]])
-            # >>> p
-            # Tensor(shape=[2], dtype=int32, place=CUDAPlace(0), stop_gradient=True,
-            #    [3, 3])
-            # >>> info
-            # Tensor(shape=[], dtype=int32, place=CUDAPlace(0), stop_gradient=True,
-            #    0)
-
-            P,L,U = paddle.linalg.lu_unpack(lu,p)
-
-            # >>> P
-            # (Tensor(shape=[3, 3], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
-            # [[0., 1., 0.],
-            # [0., 0., 1.],
-            # [1., 0., 0.]]),
-            # >>> L
-            # Tensor(shape=[3, 2], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
-            # [[1.        , 0.        ],
-            # [0.20000000, 1.        ],
-            # [0.60000000, 0.50000000]]),
-            # >>> U
-            # Tensor(shape=[2, 2], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
-            # [[5.        , 6.        ],
-            # [0.        , 0.80000000]]))
-
-            # one can verify : X = P @ L @ U ;
+            >>> import paddle
+
+            >>> x = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]).astype('float64')
+            >>> lu,p,info = paddle.linalg.lu(x, get_infos=True)
+
+            >>> print(lu)
+            Tensor(shape=[3, 2], dtype=float64, place=Place(cpu), stop_gradient=True,
+            [[5.        , 6.        ],
+             [0.20000000, 0.80000000],
+             [0.60000000, 0.50000000]])
+            >>> print(p)
+            Tensor(shape=[2], dtype=int32, place=Place(cpu), stop_gradient=True,
+            [3, 3])
+            >>> print(info)
+            Tensor(shape=[1], dtype=int32, place=Place(cpu), stop_gradient=True,
+            [0])
+
+            >>> P,L,U = paddle.linalg.lu_unpack(lu,p)
+
+            >>> print(P)
+            Tensor(shape=[3, 3], dtype=float64, place=Place(cpu), stop_gradient=True,
+            [[0., 1., 0.],
+             [0., 0., 1.],
+             [1., 0., 0.]])
+            >>> print(L)
+            Tensor(shape=[3, 2], dtype=float64, place=Place(cpu), stop_gradient=True,
+            [[1.        , 0.        ],
+             [0.20000000, 1.        ],
+             [0.60000000, 0.50000000]])
+            >>> print(U)
+            Tensor(shape=[2, 2], dtype=float64, place=Place(cpu), stop_gradient=True,
+            [[5.        , 6.        ],
+             [0.        , 0.80000000]])
+
+            >>> # one can verify : X = P @ L @ U ;
     """
     if x.ndim < 2:
         raise ValueError(
@@ -2507,27 +2579,25 @@ def eig(x, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
-
-            paddle.device.set_device("cpu")
-
-            x = paddle.to_tensor([[1.6707249, 7.2249975, 6.5045543],
-                               [9.956216,  8.749598,  6.066444 ],
-                               [4.4251957, 1.7983172, 0.370647 ]])
-            w, v = paddle.linalg.eig(x)
-            print(v)
-            # Tensor(shape=[3, 3], dtype=complex128, place=CPUPlace, stop_gradient=False,
-            #       [[(-0.5061363550800655+0j) , (-0.7971760990842826+0j) ,
-            #         (0.18518077798279986+0j)],
-            #        [(-0.8308237755993192+0j) ,  (0.3463813401919749+0j) ,
-            #         (-0.6837005269141947+0j) ],
-            #        [(-0.23142567697893396+0j),  (0.4944999840400175+0j) ,
-            #         (0.7058765252952796+0j) ]])
-
-            print(w)
-            # Tensor(shape=[3], dtype=complex128, place=CPUPlace, stop_gradient=False,
-            #       [ (16.50471283351188+0j)  , (-5.5034820550763515+0j) ,
-            #         (-0.21026087843552282+0j)])
+            >>> import paddle
+
+            >>> x = paddle.to_tensor([[1.6707249, 7.2249975, 6.5045543],
+            ...                       [9.956216,  8.749598,  6.066444 ],
+            ...                       [4.4251957, 1.7983172, 0.370647 ]])
+            >>> w, v = paddle.linalg.eig(x)
+            >>> print(v)
+            Tensor(shape=[3, 3], dtype=complex64, place=Place(cpu), stop_gradient=True,
+            [[ (0.5061365365982056+0j) ,  (0.7971761226654053+0j) ,
+               (0.1851806491613388+0j) ],
+             [ (0.8308236598968506+0j) , (-0.3463813066482544+0j) ,
+               (-0.6837005615234375+0j) ],
+             [ (0.23142573237419128+0j), (-0.49449989199638367+0j),
+               (0.7058765292167664+0j) ]])
+
+            >>> print(w)
+            Tensor(shape=[3], dtype=complex64, place=Place(cpu), stop_gradient=True,
+            [ (16.50470733642578+0j)  , (-5.503481388092041+0j)  ,
+              (-0.21026138961315155+0j)])
     """
 
     if in_dynamic_mode():
@@ -2570,18 +2640,20 @@ def eigvals(x, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
+            >>> import paddle
+            >>> paddle.seed(2023)
 
-            paddle.set_device("cpu")
-            paddle.seed(1234)
+            >>> x = paddle.rand(shape=[3, 3], dtype='float64')
+            >>> print(x)
+            Tensor(shape=[3, 3], dtype=float64, place=Place(cpu), stop_gradient=True,
+            [[0.86583615, 0.52014721, 0.25960938],
+             [0.90525323, 0.42400090, 0.40641288],
+             [0.97020893, 0.74437359, 0.51785128]])
 
-            x = paddle.rand(shape=[3, 3], dtype='float64')
-            # [[0.02773777, 0.93004224, 0.06911496],
-            #  [0.24831591, 0.45733623, 0.07717843],
-            #  [0.48016702, 0.14235102, 0.42620817]])
-
-            print(paddle.linalg.eigvals(x))
-            # [(-0.27078833542132674+0j), (0.29962280156230725+0j), (0.8824477020120244+0j)] #complex128
+            >>> print(paddle.linalg.eigvals(x))
+            Tensor(shape=[3], dtype=complex128, place=Place(cpu), stop_gradient=True,
+            [ (1.788956694280852+0j)  ,  (0.16364484879581526+0j),
+              (-0.14491322408727625+0j)])
     """
 
     x_shape = list(x.shape)
@@ -2641,33 +2713,32 @@ def multi_dot(x, name=None):
 
     Args:
         x ([Tensor]): The input tensors which is a list Tensor.
-        name(str|None): A name for this layer(optional). If set None, the layer
-            will be named automatically.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
         Tensor: The output Tensor.
 
-
     Examples:
 
-    .. code-block:: python
+        .. code-block:: python
 
-        import paddle
+            >>> import paddle
 
-        # A * B
-        A = paddle.rand([3, 4])
-        B = paddle.rand([4, 5])
-        out = paddle.linalg.multi_dot([A, B])
-        print(out.shape)
-        # [3, 5]
+            >>> # A * B
+            >>> A = paddle.rand([3, 4])
+            >>> B = paddle.rand([4, 5])
+            >>> out = paddle.linalg.multi_dot([A, B])
+            >>> print(out.shape)
+            [3, 5]
 
-        # A * B * C
-        A = paddle.rand([10, 5])
-        B = paddle.rand([5, 8])
-        C = paddle.rand([8, 7])
-        out = paddle.linalg.multi_dot([A, B, C])
-        print(out.shape)
-        # [10, 7]
+            >>> # A * B * C
+            >>> A = paddle.rand([10, 5])
+            >>> B = paddle.rand([5, 8])
+            >>> C = paddle.rand([8, 7])
+            >>> out = paddle.linalg.multi_dot([A, B, C])
+            >>> print(out.shape)
+            [10, 7]
 
     """
     if in_dynamic_mode():
@@ -2703,9 +2774,9 @@ def eigh(x, UPLO='L', name=None):
     Args:
         x (Tensor): A tensor with shape :math:`[*, N, N]` , The data type of the input Tensor x
             should be one of float32, float64, complex64, complex128.
-        UPLO(str, optional): (string, default 'L'), 'L' represents the lower triangular matrix,
-                        "'U' represents the upper triangular matrix.".
-        name(str, optional): The default value is None.  Normally there is no need for user to set this
+        UPLO (str, optional): 'L' represents the lower triangular matrix,
+            'U' represents the upper triangular matrix. Default: 'L'.
+        name (str, optional): The default value is None. Normally there is no need for user to set this
             property.  For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
@@ -2717,15 +2788,17 @@ def eigh(x, UPLO='L', name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
+            >>> import paddle
 
-            x = paddle.to_tensor([[1, -2j], [2j, 5]])
-            out_value, out_vector = paddle.linalg.eigh(x, UPLO='L')
-            print(out_value)
-            #[0.17157288, 5.82842712]
-            print(out_vector)
-            #[(-0.9238795325112867+0j), (-0.3826834323650898+0j)],
-            #[ 0.3826834323650898j    , -0.9238795325112867j    ]]
+            >>> x = paddle.to_tensor([[1, -2j], [2j, 5]])
+            >>> out_value, out_vector = paddle.linalg.eigh(x, UPLO='L')
+            >>> print(out_value)
+            Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [0.17157286, 5.82842731])
+            >>> print(out_vector)
+            Tensor(shape=[2, 2], dtype=complex64, place=Place(cpu), stop_gradient=True,
+            [[(-0.9238795042037964+0j), (-0.3826833963394165+0j)],
+             [ 0.3826833963394165j    , -0.9238795042037964j    ]])
 
     """
     if in_dynamic_mode():
@@ -2789,21 +2862,18 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None):
     If x is hermitian or symmetric matrix, svd will be replaced with eigh.
 
     Args:
-        x(Tensor): The input tensor. Its shape should be (*, m, n)
+        x (Tensor): The input tensor. Its shape should be (*, m, n)
             where * is zero or more batch dimensions. m and n can be
             arbitraty positive number. The data type of x should be
             float32 or float64 or complex64 or complex128. When data
             type is complex64 or cpmplex128, hermitian should be set
             True.
-
-        rcond(Tensor, optional): the tolerance value to determine
+        rcond (Tensor, optional): the tolerance value to determine
             when is a singular value zero. Default:1e-15.
-
-        hermitian(bool, optional): indicates whether x is Hermitian
+        hermitian (bool, optional): indicates whether x is Hermitian
             if complex or symmetric if real. Default: False.
-
-        name(str|None): A name for this layer(optional). If set None,
-            the layer will be named automatically.
+        name (str, optional): The default value is None. Normally there is no need for user to set this
+            property. For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
         Tensor: The tensor with same data type with x. it represents
@@ -2812,25 +2882,24 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
-
-            x = paddle.arange(15).reshape((3, 5)).astype('float64')
-            input = paddle.to_tensor(x)
-            out = paddle.linalg.pinv(input)
-            print(input)
-            print(out)
-
-            # input:
-            # [[0. , 1. , 2. , 3. , 4. ],
-            # [5. , 6. , 7. , 8. , 9. ],
-            # [10., 11., 12., 13., 14.]]
-
-            # out:
-            # [[-0.22666667, -0.06666667,  0.09333333],
-            # [-0.12333333, -0.03333333,  0.05666667],
-            # [-0.02000000,  0.00000000,  0.02000000],
-            # [ 0.08333333,  0.03333333, -0.01666667],
-            # [ 0.18666667,  0.06666667, -0.05333333]]
+            >>> import paddle
+
+            >>> x = paddle.arange(15).reshape((3, 5)).astype('float64')
+            >>> input = paddle.to_tensor(x)
+            >>> out = paddle.linalg.pinv(input)
+            >>> print(input)
+            Tensor(shape=[3, 5], dtype=float64, place=Place(cpu), stop_gradient=True,
+            [[0. , 1. , 2. , 3. , 4. ],
+             [5. , 6. , 7. , 8. , 9. ],
+             [10., 11., 12., 13., 14.]])
+
+            >>> print(out)
+            Tensor(shape=[5, 3], dtype=float64, place=Place(cpu), stop_gradient=True,
+            [[-0.22666667, -0.06666667,  0.09333333],
+             [-0.12333333, -0.03333333,  0.05666667],
+             [-0.02000000, -0.00000000,  0.02000000],
+             [ 0.08333333,  0.03333333, -0.01666667],
+             [ 0.18666667,  0.06666667, -0.05333333]])
 
             # one can verify : x * out * x = x ;
             # or              out * x * out = x ;
@@ -3034,7 +3103,7 @@ def solve(x, y, name=None):
             more batch dimensions. Its data type should be float32 or float64.
         y (Tensor): A vector/matrix or a batch of vectors/matrices. Its shape should be ``[*, M, K]``, where ``*`` is zero or
             more batch dimensions. Its data type should be float32 or float64.
-        name(str, optional): Name for the operation (optional, default is None).
+        name (str, optional): Name for the operation (optional, default is None).
             For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
@@ -3045,18 +3114,19 @@ def solve(x, y, name=None):
 
         .. code-block:: python
 
-            # a square system of linear equations:
-            # 2*X0 + X1 = 9
-            # X0 + 2*X1 = 8
+            >>> # a square system of linear equations:
+            >>> # 3*X0 + X1 = 9
+            >>> # X0 + 2*X1 = 8
 
-            import paddle
+            >>> import paddle
 
-            x = paddle.to_tensor([[3, 1],[1, 2]], dtype="float64")
-            y = paddle.to_tensor([9, 8], dtype="float64")
-            out = paddle.linalg.solve(x, y)
+            >>> x = paddle.to_tensor([[3, 1],[1, 2]], dtype="float64")
+            >>> y = paddle.to_tensor([9, 8], dtype="float64")
+            >>> out = paddle.linalg.solve(x, y)
 
-            print(out)
-            # [2., 3.])
+            >>> print(out)
+            Tensor(shape=[2], dtype=float64, place=Place(cpu), stop_gradient=True,
+            [2., 3.])
     """
     if in_dynamic_mode():
         return _C_ops.solve(x, y)
@@ -3077,7 +3147,7 @@ def triangular_solve(
     x, y, upper=True, transpose=False, unitriangular=False, name=None
 ):
     r"""
-    Computes the solution of a system of equations with a triangular coefficient.  `x` is coefficient matrix
+    Computes the solution of a system of equations with a triangular coefficient. `x` is coefficient matrix
     `y` is multiple right-hand sides of equations.
 
     Input `x` and `y` is 2D matrices or batches of 2D matrices. If the inputs are batches, the outputs is also
@@ -3103,7 +3173,7 @@ def triangular_solve(
         transpose (bool, optional): whether `x` should be transposed before calculation. Default: False.
         unitriangular (bool, optional): whether `x` is unit triangular. If True, the diagonal elements of `x` are assumed
             to be 1 and not referenced from `x` . Default: False.
-        name(str, optional): Name for the operation (optional, default is None).
+        name (str, optional): Name for the operation (optional, default is None).
             For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
@@ -3112,20 +3182,23 @@ def triangular_solve(
     Examples:
         .. code-block:: python
 
-            # a square system of linear equations:
-            # x1 +   x2  +   x3 = 0
-            #      2*x2  +   x3 = -9
-            #               -x3 = 5
-
-            import paddle
-            x = paddle.to_tensor([[1, 1, 1],
-                                  [0, 2, 1],
-                                  [0, 0,-1]], dtype="float64")
-            y = paddle.to_tensor([[0], [-9], [5]], dtype="float64")
-            out = paddle.linalg.triangular_solve(x, y, upper=True)
-
-            print(out)
-            # [7, -2, -5]
+            >>> # a square system of linear equations:
+            >>> # x1 +   x2  +   x3 = 0
+            >>> #      2*x2  +   x3 = -9
+            >>> #               -x3 = 5
+
+            >>> import paddle
+            >>> x = paddle.to_tensor([[1, 1, 1],
+            ...                       [0, 2, 1],
+            ...                       [0, 0,-1]], dtype="float64")
+            >>> y = paddle.to_tensor([[0], [-9], [5]], dtype="float64")
+            >>> out = paddle.linalg.triangular_solve(x, y, upper=True)
+
+            >>> print(out)
+            Tensor(shape=[3, 1], dtype=float64, place=Place(cpu), stop_gradient=True,
+            [[ 7.],
+             [-2.],
+             [-5.]])
     """
     if in_dynamic_mode():
         return _C_ops.triangular_solve(x, y, upper, transpose, unitriangular)
@@ -3166,7 +3239,7 @@ def cholesky_solve(x, y, upper=False, name=None):
         y (Tensor): The input matrix which is upper or lower triangular Cholesky factor of square matrix A. Its shape should be `[*, M, M]`, where `*` is zero or
             more batch dimensions. Its data type should be float32 or float64.
         upper (bool, optional): whether to consider the Cholesky factor as a lower or upper triangular matrix. Default: False.
-        name(str, optional): Name for the operation (optional, default is None).
+        name (str, optional): Name for the operation (optional, default is None).
             For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
@@ -3175,16 +3248,19 @@ def cholesky_solve(x, y, upper=False, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
+            >>> import paddle
 
-            u = paddle.to_tensor([[1, 1, 1],
-                                    [0, 2, 1],
-                                    [0, 0,-1]], dtype="float64")
-            b = paddle.to_tensor([[0], [-9], [5]], dtype="float64")
-            out = paddle.linalg.cholesky_solve(b, u, upper=True)
+            >>> u = paddle.to_tensor([[1, 1, 1],
+            ...                       [0, 2, 1],
+            ...                       [0, 0,-1]], dtype="float64")
+            >>> b = paddle.to_tensor([[0], [-9], [5]], dtype="float64")
+            >>> out = paddle.linalg.cholesky_solve(b, u, upper=True)
 
-            print(out)
-            # [-2.5, -7, 9.5]
+            >>> print(out)
+            Tensor(shape=[3, 1], dtype=float64, place=Place(cpu), stop_gradient=True,
+            [[-2.50000000],
+             [-7.        ],
+             [ 9.50000000]])
     """
     if in_dynamic_mode():
         return _C_ops.cholesky_solve(x, y, upper)
@@ -3225,13 +3301,13 @@ def eigvalsh(x, UPLO='L', name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
+            >>> import paddle
 
-            x = paddle.to_tensor([[1, -2j], [2j, 5]])
-            out_value = paddle.eigvalsh(x, UPLO='L')
-            print(out_value)
-            # Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
-            #        [0.17157286, 5.82842731])
+            >>> x = paddle.to_tensor([[1, -2j], [2j, 5]])
+            >>> out_value = paddle.eigvalsh(x, UPLO='L')
+            >>> print(out_value)
+            Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [0.17157286, 5.82842731])
     """
     if in_dynamic_mode():
         values, _ = _C_ops.eigvalsh(x, UPLO, x.stop_gradient)
@@ -3312,31 +3388,36 @@ def lstsq(x, y, rcond=None, driver=None, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
-
-            paddle.set_device("cpu")
-            x = paddle.to_tensor([[1, 3], [3, 2], [5, 6.]])
-            y = paddle.to_tensor([[3, 4, 6], [5, 3, 4], [1, 2, 1.]])
-            results = paddle.linalg.lstsq(x, y, driver="gelsd")
-            print(results[0])
-            # [[ 0.78350395, -0.22165027, -0.62371236],
-            # [-0.11340097,  0.78866047,  1.14948535]]
-            print(results[1])
-            # [19.81443405, 10.43814468, 30.56185532])
-            print(results[2])
-            # 2
-            print(results[3])
-            # [9.03455734, 1.54167950]
-
-            x = paddle.to_tensor([[10, 2, 3], [3, 10, 5], [5, 6, 12.]])
-            y = paddle.to_tensor([[4, 2, 9], [2, 0, 3], [2, 5, 3.]])
-            results = paddle.linalg.lstsq(x, y, driver="gels")
-            print(results[0])
-            # [[ 0.39386186,  0.10230173,  0.93606132],
-            # [ 0.10741687, -0.29028133,  0.11892585],
-            # [-0.05115091,  0.51918161, -0.19948854]]
-            print(results[1])
-            # []
+            >>> import paddle
+
+            >>> x = paddle.to_tensor([[1, 3], [3, 2], [5, 6.]])
+            >>> y = paddle.to_tensor([[3, 4, 6], [5, 3, 4], [1, 2, 1.]])
+            >>> results = paddle.linalg.lstsq(x, y, driver="gelsd")
+            >>> print(results[0])
+            Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[ 0.78350395, -0.22165027, -0.62371236],
+             [-0.11340097,  0.78866047,  1.14948535]])
+            >>> print(results[1])
+            Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [19.81443405, 10.43814468, 30.56185532])
+            >>> print(results[2])
+            Tensor(shape=[], dtype=int32, place=Place(cpu), stop_gradient=True,
+            2)
+            >>> print(results[3])
+            Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [9.03455734, 1.54167950])
+
+            >>> x = paddle.to_tensor([[10, 2, 3], [3, 10, 5], [5, 6, 12.]])
+            >>> y = paddle.to_tensor([[4, 2, 9], [2, 0, 3], [2, 5, 3.]])
+            >>> results = paddle.linalg.lstsq(x, y, driver="gels")
+            >>> print(results[0])
+            Tensor(shape=[3, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[ 0.39386186,  0.10230169,  0.93606132],
+             [ 0.10741688, -0.29028130,  0.11892584],
+             [-0.05115093,  0.51918161, -0.19948851]])
+            >>> print(results[1])
+            Tensor(shape=[0], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [])
     """
     device = paddle.get_device()
     if device == "cpu":
@@ -3456,11 +3537,11 @@ def corrcoef(x, rowvar=True, name=None):
 
     The values of `R` are between -1 and 1.
 
-    Parameters:
+    Args:
 
-        x(Tensor): A N-D(N<=2) Tensor containing multiple variables and observations. By default, each row of x represents a variable. Also see rowvar below.
-        rowvar(Bool, optional): If rowvar is True (default), then each row represents a variable, with observations in the columns. Default: True.
-        name(str, optional): Name of the output. Default is None. It's used to print debug info for developers. Details: :ref:`api_guide_Name`.
+        x (Tensor): A N-D(N<=2) Tensor containing multiple variables and observations. By default, each row of x represents a variable. Also see rowvar below.
+        rowvar (bool, optional): If rowvar is True (default), then each row represents a variable, with observations in the columns. Default: True.
+        name (str, optional): Name of the output. It's used to print debug info for developers. Details: :ref:`api_guide_Name`. Default: None.
 
     Returns:
 
@@ -3469,15 +3550,15 @@ def corrcoef(x, rowvar=True, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
-
-            xt = paddle.rand((3,4))
-            print(paddle.linalg.corrcoef(xt))
+            >>> import paddle
+            >>> paddle.seed(2023)
 
-            # Tensor(shape=[3, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
-            # [[ 1.        , -0.73702252,  0.66228950],
-            # [-0.73702258,  1.        , -0.77104872],
-            # [ 0.66228974, -0.77104825,  1.        ]])
+            >>> xt = paddle.rand((3,4))
+            >>> print(paddle.linalg.corrcoef(xt))
+            Tensor(shape=[3, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[ 0.99999988, -0.47689581, -0.89559376],
+             [-0.47689593,  1.        ,  0.16345492],
+             [-0.89559382,  0.16345496,  1.        ]])
 
     """
     if len(x.shape) > 2 or len(x.shape) < 1:
@@ -3545,13 +3626,15 @@ def cdist(
     Examples:
         .. code-block:: python
 
-            import paddle
-            x = paddle.to_tensor([[0.9041,  0.0196], [-0.3108, -2.4423], [-0.4821,  1.059]], dtype=paddle.float32)
-            y = paddle.to_tensor([[-2.1763, -0.4713], [-0.6986,  1.3702]], dtype=paddle.float32)
-            distance = paddle.cdist(x, y)
-            print(distance)
-            # Tensor(shape=[3, 2], dtype=float32, place=Place(gpu:0), stop_gradient=True,
-            # [[3.1193, 2.0959], [2.7138, 3.8322], [2.2830, 0.3791]])
+            >>> import paddle
+            >>> x = paddle.to_tensor([[0.9041,  0.0196], [-0.3108, -2.4423], [-0.4821,  1.059]], dtype=paddle.float32)
+            >>> y = paddle.to_tensor([[-2.1763, -0.4713], [-0.6986,  1.3702]], dtype=paddle.float32)
+            >>> distance = paddle.cdist(x, y)
+            >>> print(distance)
+            Tensor(shape=[3, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[3.11927032, 2.09589314],
+             [2.71384072, 3.83217239],
+             [2.28300953, 0.37910119]])
     """
 
     check_variable_and_dtype(x, 'x', ('float32', 'float64'), 'cdist')
diff --git a/python/paddle/tensor/logic.py b/python/paddle/tensor/logic.py
index 58a3b1fc0ea..7bf39eb2e59 100755
--- a/python/paddle/tensor/logic.py
+++ b/python/paddle/tensor/logic.py
@@ -129,12 +129,15 @@ def logical_and(x, y, out=None, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
+            >>> import paddle
+
+            >>> x = paddle.to_tensor([True])
+            >>> y = paddle.to_tensor([True, False, True, False])
+            >>> res = paddle.logical_and(x, y)
+            >>> print(res)
+            Tensor(shape=[4], dtype=bool, place=Place(cpu), stop_gradient=True,
+            [True , False, True , False])
 
-            x = paddle.to_tensor([True])
-            y = paddle.to_tensor([True, False, True, False])
-            res = paddle.logical_and(x, y)
-            print(res) # [True False True False]
     """
     if in_dynamic_mode():
         return _C_ops.logical_and(x, y)
@@ -188,15 +191,15 @@ def logical_or(x, y, out=None, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
+            >>> import paddle
 
-            x = paddle.to_tensor([True, False], dtype="bool").reshape([2, 1])
-            y = paddle.to_tensor([True, False, True, False], dtype="bool").reshape([2, 2])
-            res = paddle.logical_or(x, y)
-            print(res)
-            # Tensor(shape=[2, 2], dtype=bool, place=Place(cpu), stop_gradient=True,
-            #        [[True , True ],
-            #         [True , False]])
+            >>> x = paddle.to_tensor([True, False], dtype="bool").reshape([2, 1])
+            >>> y = paddle.to_tensor([True, False, True, False], dtype="bool").reshape([2, 2])
+            >>> res = paddle.logical_or(x, y)
+            >>> print(res)
+            Tensor(shape=[2, 2], dtype=bool, place=Place(cpu), stop_gradient=True,
+            [[True , True ],
+             [True , False]])
     """
     if in_dynamic_mode():
         return _C_ops.logical_or(x, y)
@@ -249,15 +252,15 @@ def logical_xor(x, y, out=None, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
+            >>> import paddle
 
-            x = paddle.to_tensor([True, False], dtype="bool").reshape([2, 1])
-            y = paddle.to_tensor([True, False, True, False], dtype="bool").reshape([2, 2])
-            res = paddle.logical_xor(x, y)
-            print(res)
-            # Tensor(shape=[2, 2], dtype=bool, place=Place(cpu), stop_gradient=True,
-            #        [[False, True ],
-            #         [True , False]])
+            >>> x = paddle.to_tensor([True, False], dtype="bool").reshape([2, 1])
+            >>> y = paddle.to_tensor([True, False, True, False], dtype="bool").reshape([2, 2])
+            >>> res = paddle.logical_xor(x, y)
+            >>> print(res)
+            Tensor(shape=[2, 2], dtype=bool, place=Place(cpu), stop_gradient=True,
+            [[False, True ],
+             [True , False]])
     """
     if in_dynamic_mode():
         return _C_ops.logical_xor(x, y)
@@ -300,6 +303,7 @@ def logical_not(x, out=None, name=None):
         .. _Introduction to Tensor: ../../guides/beginner/tensor_en.html#chapter5-broadcasting-of-tensor
 
     Args:
+
         x(Tensor):  Operand of logical_not operator. Must be a Tensor of type bool, int8, int16, in32, in64, float16, float32, or float64, complex64, complex128.
         out(Tensor): The ``Tensor`` that specifies the output of the operator, which can be any ``Tensor`` that has been created in the program. The default value is None, and a new ``Tensor` will be created to save the output.
         name(str|None): The default value is None. Normally there is no need for users to set this property. For more information, please refer to :ref:`api_guide_Name`.
@@ -310,11 +314,13 @@ def logical_not(x, out=None, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
+            >>> import paddle
 
-            x = paddle.to_tensor([True, False, True, False])
-            res = paddle.logical_not(x)
-            print(res) # [False  True False  True]
+            >>> x = paddle.to_tensor([True, False, True, False])
+            >>> res = paddle.logical_not(x)
+            >>> print(res)
+            Tensor(shape=[4], dtype=bool, place=Place(cpu), stop_gradient=True,
+            [False, True , False, True ])
     """
     if in_dynamic_mode():
         return _C_ops.logical_not(x)
@@ -340,9 +346,7 @@ def is_empty(x, name=None):
 
     Args:
         x (Tensor): The Tensor to be tested.
-        name (str, optional): The default value is ``None`` . Normally users
-                            don't have to set this parameter. For more information,
-                            please refer to :ref:`api_guide_Name` .
+        name (str, optional): The default value is ``None`` . Normally users don't have to set this parameter. For more information, please refer to :ref:`api_guide_Name` .
 
     Returns:
         Tensor: A bool scalar Tensor. True if 'x' is an empty Tensor.
@@ -350,12 +354,13 @@ def is_empty(x, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
+            >>> import paddle
 
-            input = paddle.rand(shape=[4, 32, 32], dtype='float32')
-            res = paddle.is_empty(x=input)
-            # res: Tensor(shape=[], dtype=bool, place=Place(cpu), stop_gradient=True,
-            #        False)
+            >>> input = paddle.rand(shape=[4, 32, 32], dtype='float32')
+            >>> res = paddle.is_empty(x=input)
+            >>> print(res)
+            Tensor(shape=[], dtype=bool, place=Place(cpu), stop_gradient=True,
+            False)
 
     """
     if in_dynamic_mode():
@@ -394,15 +399,19 @@ def equal_all(x, y, name=None):
     Examples:
         .. code-block:: python
 
-          import paddle
-
-          x = paddle.to_tensor([1, 2, 3])
-          y = paddle.to_tensor([1, 2, 3])
-          z = paddle.to_tensor([1, 4, 3])
-          result1 = paddle.equal_all(x, y)
-          print(result1) # result1 = True
-          result2 = paddle.equal_all(x, z)
-          print(result2) # result2 = False
+            >>> import paddle
+
+            >>> x = paddle.to_tensor([1, 2, 3])
+            >>> y = paddle.to_tensor([1, 2, 3])
+            >>> z = paddle.to_tensor([1, 4, 3])
+            >>> result1 = paddle.equal_all(x, y)
+            >>> print(result1)
+            Tensor(shape=[], dtype=bool, place=Place(cpu), stop_gradient=True,
+            True)
+            >>> result2 = paddle.equal_all(x, z)
+            >>> print(result2)
+            Tensor(shape=[], dtype=bool, place=Place(cpu), stop_gradient=True,
+            False)
     """
     if in_dynamic_mode():
         return _C_ops.equal_all(x, y)
@@ -429,11 +438,11 @@ def allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None):
     two tensors are elementwise equal within a tolerance.
 
     Args:
-        x(Tensor): The input tensor, it's data type should be float16, float32, float64..
-        y(Tensor): The input tensor, it's data type should be float16, float32, float64..
-        rtol(rtoltype, optional): The relative tolerance. Default: :math:`1e-5` .
-        atol(atoltype, optional): The absolute tolerance. Default: :math:`1e-8` .
-        equal_nan(equalnantype, optional): ${equal_nan_comment}.
+        x (Tensor): The input tensor, its data type should be float16, float32, float64.
+        y (Tensor): The input tensor, its data type should be float16, float32, float64.
+        rtol (float, optional): The relative tolerance. Default: :math:`1e-5` .
+        atol (float, optional): The absolute tolerance. Default: :math:`1e-8` .
+        equal_nan (bool, optional): Whether two NaNs at the same position are treated as equal. Default: False.
         name (str, optional): Name for the operation. For more information, please
             refer to :ref:`api_guide_Name`. Default: None.
 
@@ -443,27 +452,28 @@ def allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None):
     Examples:
         .. code-block:: python
 
-          import paddle
-
-          x = paddle.to_tensor([10000., 1e-07])
-          y = paddle.to_tensor([10000.1, 1e-08])
-          result1 = paddle.allclose(x, y, rtol=1e-05, atol=1e-08,
-                                  equal_nan=False, name="ignore_nan")
-          # False
-
-          result2 = paddle.allclose(x, y, rtol=1e-05, atol=1e-08,
-                                      equal_nan=True, name="equal_nan")
-          # False
-
-          x = paddle.to_tensor([1.0, float('nan')])
-          y = paddle.to_tensor([1.0, float('nan')])
-          result1 = paddle.allclose(x, y, rtol=1e-05, atol=1e-08,
-                                  equal_nan=False, name="ignore_nan")
-          # False
-
-          result2 = paddle.allclose(x, y, rtol=1e-05, atol=1e-08,
-                                      equal_nan=True, name="equal_nan")
-          # True
+            >>> import paddle
+
+            >>> x = paddle.to_tensor([10000., 1e-07])
+            >>> y = paddle.to_tensor([10000.1, 1e-08])
+            >>> result1 = paddle.allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name="ignore_nan")
+            >>> print(result1)
+            Tensor(shape=[], dtype=bool, place=Place(cpu), stop_gradient=True,
+            False)
+            >>> result2 = paddle.allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=True, name="equal_nan")
+            >>> print(result2)
+            Tensor(shape=[], dtype=bool, place=Place(cpu), stop_gradient=True,
+            False)
+            >>> x = paddle.to_tensor([1.0, float('nan')])
+            >>> y = paddle.to_tensor([1.0, float('nan')])
+            >>> result1 = paddle.allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name="ignore_nan")
+            >>> print(result1)
+            Tensor(shape=[], dtype=bool, place=Place(cpu), stop_gradient=True,
+            False)
+            >>> result2 = paddle.allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=True, name="equal_nan")
+            >>> print(result2)
+            Tensor(shape=[], dtype=bool, place=Place(cpu), stop_gradient=True,
+            True)
     """
 
     if in_dynamic_mode():
@@ -502,9 +512,9 @@ def equal(x, y, name=None):
         The output has no gradient.
 
     Args:
-        x(Tensor): Tensor, data type is bool, float16, float32, float64, int32, int64.
-        y(Tensor): Tensor, data type is bool, float16, float32, float64, int32, int64.
-        name(str, optional): The default value is None.  Normally there is no need for
+        x (Tensor): Tensor, data type is bool, float16, float32, float64, int32, int64.
+        y (Tensor): Tensor, data type is bool, float16, float32, float64, int32, int64.
+        name (str, optional): The default value is None. Normally there is no need for
             user to set this property.  For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
@@ -514,12 +524,14 @@ def equal(x, y, name=None):
     Examples:
         .. code-block:: python
 
-          import paddle
+            >>> import paddle
 
-          x = paddle.to_tensor([1, 2, 3])
-          y = paddle.to_tensor([1, 3, 2])
-          result1 = paddle.equal(x, y)
-          print(result1)  # result1 = [True False False]
+            >>> x = paddle.to_tensor([1, 2, 3])
+            >>> y = paddle.to_tensor([1, 3, 2])
+            >>> result1 = paddle.equal(x, y)
+            >>> print(result1)
+            Tensor(shape=[3], dtype=bool, place=Place(cpu), stop_gradient=True,
+            [True , False, False])
     """
     if not isinstance(y, (int, bool, float, Variable)):
         raise TypeError(
@@ -599,9 +611,9 @@ def greater_equal(x, y, name=None):
         The output has no gradient.
 
     Args:
-        x(Tensor): First input to compare which is N-D tensor. The input data type should be bool, float16, float32, float64, int32, int64.
-        y(Tensor): Second input to compare which is N-D tensor. The input data type should be bool, float16, float32, float64, int32, int64.
-        name(str, optional): The default value is None.  Normally there is no need for
+        x (Tensor): First input to compare which is N-D tensor. The input data type should be bool, float16, float32, float64, int32, int64.
+        y (Tensor): Second input to compare which is N-D tensor. The input data type should be bool, float16, float32, float64, int32, int64.
+        name (str, optional): The default value is None.  Normally there is no need for
             user to set this property.  For more information, please refer to :ref:`api_guide_Name`.
     Returns:
         Tensor: The output shape is same as input :attr:`x`. The output data type is bool.
@@ -609,12 +621,14 @@ def greater_equal(x, y, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
+            >>> import paddle
 
-            x = paddle.to_tensor([1, 2, 3])
-            y = paddle.to_tensor([1, 3, 2])
-            result1 = paddle.greater_equal(x, y)
-            print(result1)  # result1 = [True False True]
+            >>> x = paddle.to_tensor([1, 2, 3])
+            >>> y = paddle.to_tensor([1, 3, 2])
+            >>> result1 = paddle.greater_equal(x, y)
+            >>> print(result1)
+            Tensor(shape=[3], dtype=bool, place=Place(cpu), stop_gradient=True,
+            [True , False, True ])
     """
     if in_dynamic_mode():
         return _C_ops.greater_equal(x, y)
@@ -685,9 +699,9 @@ def greater_than(x, y, name=None):
         The output has no gradient.
 
     Args:
-        x(Tensor): First input to compare which is N-D tensor. The input data type should be bool, float16, float32, float64, int32, int64.
-        y(Tensor): Second input to compare which is N-D tensor. The input data type should be bool, float16, float32, float64, int32, int64.
-        name(str, optional): The default value is None.  Normally there is no need for
+        x (Tensor): First input to compare which is N-D tensor. The input data type should be bool, float16, float32, float64, int32, int64.
+        y (Tensor): Second input to compare which is N-D tensor. The input data type should be bool, float16, float32, float64, int32, int64.
+        name (str, optional): The default value is None.  Normally there is no need for
             user to set this property.  For more information, please refer to :ref:`api_guide_Name`.
     Returns:
         Tensor: The output shape is same as input :attr:`x`. The output data type is bool.
@@ -695,12 +709,14 @@ def greater_than(x, y, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
+            >>> import paddle
 
-            x = paddle.to_tensor([1, 2, 3])
-            y = paddle.to_tensor([1, 3, 2])
-            result1 = paddle.greater_than(x, y)
-            print(result1)  # result1 = [False False True]
+            >>> x = paddle.to_tensor([1, 2, 3])
+            >>> y = paddle.to_tensor([1, 3, 2])
+            >>> result1 = paddle.greater_than(x, y)
+            >>> print(result1)
+            Tensor(shape=[3], dtype=bool, place=Place(cpu), stop_gradient=True,
+            [False, False, True ])
     """
     if in_dynamic_mode():
         return _C_ops.greater_than(x, y)
@@ -771,9 +787,9 @@ def less_equal(x, y, name=None):
         The output has no gradient.
 
     Args:
-        x(Tensor): First input to compare which is N-D tensor. The input data type should be bool, float16, float32, float64, int32, int64.
-        y(Tensor): Second input to compare which is N-D tensor. The input data type should be bool, float16, float32, float64, int32, int64.
-        name(str, optional): The default value is None.  Normally there is no need for
+        x (Tensor): First input to compare which is N-D tensor. The input data type should be bool, float16, float32, float64, int32, int64.
+        y (Tensor): Second input to compare which is N-D tensor. The input data type should be bool, float16, float32, float64, int32, int64.
+        name (str, optional): The default value is None.  Normally there is no need for
             user to set this property.  For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
@@ -782,12 +798,14 @@ def less_equal(x, y, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
+            >>> import paddle
 
-            x = paddle.to_tensor([1, 2, 3])
-            y = paddle.to_tensor([1, 3, 2])
-            result1 = paddle.less_equal(x, y)
-            print(result1)  # result1 = [True True False]
+            >>> x = paddle.to_tensor([1, 2, 3])
+            >>> y = paddle.to_tensor([1, 3, 2])
+            >>> result1 = paddle.less_equal(x, y)
+            >>> print(result1)
+            Tensor(shape=[3], dtype=bool, place=Place(cpu), stop_gradient=True,
+            [True , True , False])
     """
     if in_dynamic_mode():
         return _C_ops.less_equal(x, y)
@@ -858,9 +876,9 @@ def less_than(x, y, name=None):
         The output has no gradient.
 
     Args:
-        x(Tensor): First input to compare which is N-D tensor. The input data type should be bool, float16, float32, float64, int32, int64.
-        y(Tensor): Second input to compare which is N-D tensor. The input data type should be bool, float16, float32, float64, int32, int64.
-        name(str, optional): The default value is None.  Normally there is no need for
+        x (Tensor): First input to compare which is N-D tensor. The input data type should be bool, float16, float32, float64, int32, int64.
+        y (Tensor): Second input to compare which is N-D tensor. The input data type should be bool, float16, float32, float64, int32, int64.
+        name (str, optional): The default value is None.  Normally there is no need for
             user to set this property.  For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
@@ -869,12 +887,14 @@ def less_than(x, y, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
+            >>> import paddle
 
-            x = paddle.to_tensor([1, 2, 3])
-            y = paddle.to_tensor([1, 3, 2])
-            result1 = paddle.less_than(x, y)
-            print(result1)  # result1 = [False True False]
+            >>> x = paddle.to_tensor([1, 2, 3])
+            >>> y = paddle.to_tensor([1, 3, 2])
+            >>> result1 = paddle.less_than(x, y)
+            >>> print(result1)
+            Tensor(shape=[3], dtype=bool, place=Place(cpu), stop_gradient=True,
+            [False, True , False])
     """
     if in_dynamic_mode():
         return _C_ops.less_than(x, y)
@@ -945,9 +965,9 @@ def not_equal(x, y, name=None):
         The output has no gradient.
 
     Args:
-        x(Tensor): First input to compare which is N-D tensor. The input data type should be bool, float32, float64, int32, int64.
-        y(Tensor): Second input to compare which is N-D tensor. The input data type should be bool, float32, float64, int32, int64.
-        name(str, optional): The default value is None.  Normally there is no need for
+        x (Tensor): First input to compare which is N-D tensor. The input data type should be bool, float32, float64, int32, int64.
+        y (Tensor): Second input to compare which is N-D tensor. The input data type should be bool, float32, float64, int32, int64.
+        name (str, optional): The default value is None.  Normally there is no need for
             user to set this property.  For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
@@ -956,12 +976,14 @@ def not_equal(x, y, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
+            >>> import paddle
 
-            x = paddle.to_tensor([1, 2, 3])
-            y = paddle.to_tensor([1, 3, 2])
-            result1 = paddle.not_equal(x, y)
-            print(result1)  # result1 = [False True True]
+            >>> x = paddle.to_tensor([1, 2, 3])
+            >>> y = paddle.to_tensor([1, 3, 2])
+            >>> result1 = paddle.not_equal(x, y)
+            >>> print(result1)
+            Tensor(shape=[3], dtype=bool, place=Place(cpu), stop_gradient=True,
+            [False, True , True ])
     """
     if in_dynamic_mode():
         return _C_ops.not_equal(x, y)
@@ -1037,15 +1059,17 @@ def is_tensor(x):
     Examples:
         .. code-block:: python
 
-            import paddle
+            >>> import paddle
 
-            input1 = paddle.rand(shape=[2, 3, 5], dtype='float32')
-            check = paddle.is_tensor(input1)
-            print(check)  #True
+            >>> input1 = paddle.rand(shape=[2, 3, 5], dtype='float32')
+            >>> check = paddle.is_tensor(input1)
+            >>> print(check)
+            True
 
-            input3 = [1, 4]
-            check = paddle.is_tensor(input3)
-            print(check)  #False
+            >>> input3 = [1, 4]
+            >>> check = paddle.is_tensor(input3)
+            >>> print(check)
+            False
 
     """
     if in_dynamic_mode():
@@ -1113,7 +1137,9 @@ def bitwise_and(x, y, out=None, name=None):
     Args:
         x (Tensor): Input Tensor of ``bitwise_and`` . It is a N-D Tensor of bool, uint8, int8, int16, int32, int64.
         y (Tensor): Input Tensor of ``bitwise_and`` . It is a N-D Tensor of bool, uint8, int8, int16, int32, int64.
-        out(Tensor): Result of ``bitwise_and`` . It is a N-D Tensor with the same data type of input Tensor.
+        out (Tensor, optional): Result of ``bitwise_and`` . It is an N-D Tensor with the same data type as the input Tensor. Default: None.
+        name (str, optional): The default value is None.  Normally there is no need for
+            user to set this property.  For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
         Tensor: Result of ``bitwise_and`` . It is a N-D Tensor with the same data type of input Tensor.
@@ -1121,11 +1147,13 @@ def bitwise_and(x, y, out=None, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
-            x = paddle.to_tensor([-5, -1, 1])
-            y = paddle.to_tensor([4,  2, -3])
-            res = paddle.bitwise_and(x, y)
-            print(res)  # [0, 2, 1]
+            >>> import paddle
+            >>> x = paddle.to_tensor([-5, -1, 1])
+            >>> y = paddle.to_tensor([4,  2, -3])
+            >>> res = paddle.bitwise_and(x, y)
+            >>> print(res)
+            Tensor(shape=[3], dtype=int64, place=Place(cpu), stop_gradient=True,
+            [0, 2, 1])
     """
     if in_dynamic_mode() and out is None:
         return _C_ops.bitwise_and(x, y)
@@ -1167,7 +1195,9 @@ def bitwise_or(x, y, out=None, name=None):
     Args:
         x (Tensor): Input Tensor of ``bitwise_or`` . It is a N-D Tensor of bool, uint8, int8, int16, int32, int64.
         y (Tensor): Input Tensor of ``bitwise_or`` . It is a N-D Tensor of bool, uint8, int8, int16, int32, int64.
-        out(Tensor): Result of ``bitwise_or`` . It is a N-D Tensor with the same data type of input Tensor.
+        out (Tensor, optional): Result of ``bitwise_or`` . It is an N-D Tensor with the same data type as the input Tensor. Default: None.
+        name (str, optional): The default value is None.  Normally there is no need for
+            user to set this property.  For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
         Tensor: Result of ``bitwise_or`` . It is a N-D Tensor with the same data type of input Tensor.
@@ -1175,11 +1205,13 @@ def bitwise_or(x, y, out=None, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
-            x = paddle.to_tensor([-5, -1, 1])
-            y = paddle.to_tensor([4,  2, -3])
-            res = paddle.bitwise_or(x, y)
-            print(res)  # [-1, -1, -3]
+            >>> import paddle
+            >>> x = paddle.to_tensor([-5, -1, 1])
+            >>> y = paddle.to_tensor([4,  2, -3])
+            >>> res = paddle.bitwise_or(x, y)
+            >>> print(res)
+            Tensor(shape=[3], dtype=int64, place=Place(cpu), stop_gradient=True,
+            [-1, -1, -3])
     """
     if in_dynamic_mode() and out is None:
         return _C_ops.bitwise_or(x, y)
@@ -1222,7 +1254,9 @@ def bitwise_xor(x, y, out=None, name=None):
     Args:
         x (Tensor): Input Tensor of ``bitwise_xor`` . It is a N-D Tensor of bool, uint8, int8, int16, int32, int64.
         y (Tensor): Input Tensor of ``bitwise_xor`` . It is a N-D Tensor of bool, uint8, int8, int16, int32, int64.
-        out(Tensor): Result of ``bitwise_xor`` . It is a N-D Tensor with the same data type of input Tensor.
+        out (Tensor, optional): Result of ``bitwise_xor`` . It is an N-D Tensor with the same data type as the input Tensor. Default: None.
+        name (str, optional): The default value is None.  Normally there is no need for
+            user to set this property.  For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
         Tensor: Result of ``bitwise_xor`` . It is a N-D Tensor with the same data type of input Tensor.
@@ -1230,11 +1264,13 @@ def bitwise_xor(x, y, out=None, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
-            x = paddle.to_tensor([-5, -1, 1])
-            y = paddle.to_tensor([4,  2, -3])
-            res = paddle.bitwise_xor(x, y)
-            print(res) # [-1, -3, -4]
+            >>> import paddle
+            >>> x = paddle.to_tensor([-5, -1, 1])
+            >>> y = paddle.to_tensor([4,  2, -3])
+            >>> res = paddle.bitwise_xor(x, y)
+            >>> print(res)
+            Tensor(shape=[3], dtype=int64, place=Place(cpu), stop_gradient=True,
+            [-1, -3, -4])
     """
     if in_dynamic_mode() and out is None:
         return _C_ops.bitwise_xor(x, y)
@@ -1275,7 +1311,9 @@ def bitwise_not(x, out=None, name=None):
 
     Args:
         x (Tensor): Input Tensor of ``bitwise_not`` . It is a N-D Tensor of bool, uint8, int8, int16, int32, int64.
-        out(Tensor): Result of ``bitwise_not`` . It is a N-D Tensor with the same data type of input Tensor.
+        out (Tensor, optional): Result of ``bitwise_not`` . It is an N-D Tensor with the same data type as the input Tensor. Default: None.
+        name (str, optional): The default value is None.  Normally there is no need for
+            user to set this property.  For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
         Tensor: Result of ``bitwise_not`` . It is a N-D Tensor with the same data type of input Tensor.
@@ -1283,10 +1321,12 @@ def bitwise_not(x, out=None, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
-            x = paddle.to_tensor([-5, -1, 1])
-            res = paddle.bitwise_not(x)
-            print(res) # [4, 0, -2]
+            >>> import paddle
+            >>> x = paddle.to_tensor([-5, -1, 1])
+            >>> res = paddle.bitwise_not(x)
+            >>> print(res)
+            Tensor(shape=[3], dtype=int64, place=Place(cpu), stop_gradient=True,
+            [ 4,  0, -2])
     """
     if in_dynamic_mode() and out is None:
         return _C_ops.bitwise_not(x)
@@ -1334,25 +1374,32 @@ def isclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None):
     Examples:
         .. code-block:: python
 
-          import paddle
-
-          x = paddle.to_tensor([10000., 1e-07])
-          y = paddle.to_tensor([10000.1, 1e-08])
-          result1 = paddle.isclose(x, y, rtol=1e-05, atol=1e-08,
-                                  equal_nan=False, name="ignore_nan")
-          # [True, False]
-          result2 = paddle.isclose(x, y, rtol=1e-05, atol=1e-08,
-                                      equal_nan=True, name="equal_nan")
-          # [True, False]
-
-          x = paddle.to_tensor([1.0, float('nan')])
-          y = paddle.to_tensor([1.0, float('nan')])
-          result1 = paddle.isclose(x, y, rtol=1e-05, atol=1e-08,
-                                  equal_nan=False, name="ignore_nan")
-          # [True, False]
-          result2 = paddle.isclose(x, y, rtol=1e-05, atol=1e-08,
-                                      equal_nan=True, name="equal_nan")
-          # [True, True]
+            >>> import paddle
+
+            >>> x = paddle.to_tensor([10000., 1e-07])
+            >>> y = paddle.to_tensor([10000.1, 1e-08])
+            >>> result1 = paddle.isclose(x, y, rtol=1e-05, atol=1e-08,
+            ...                          equal_nan=False, name="ignore_nan")
+            >>> print(result1)
+            Tensor(shape=[2], dtype=bool, place=Place(cpu), stop_gradient=True,
+            [True , False])
+            >>> result2 = paddle.isclose(x, y, rtol=1e-05, atol=1e-08,
+            ...                          equal_nan=True, name="equal_nan")
+            >>> print(result2)
+            Tensor(shape=[2], dtype=bool, place=Place(cpu), stop_gradient=True,
+            [True , False])
+            >>> x = paddle.to_tensor([1.0, float('nan')])
+            >>> y = paddle.to_tensor([1.0, float('nan')])
+            >>> result1 = paddle.isclose(x, y, rtol=1e-05, atol=1e-08,
+            ...                          equal_nan=False, name="ignore_nan")
+            >>> print(result1)
+            Tensor(shape=[2], dtype=bool, place=Place(cpu), stop_gradient=True,
+            [True , False])
+            >>> result2 = paddle.isclose(x, y, rtol=1e-05, atol=1e-08,
+            ...                          equal_nan=True, name="equal_nan")
+            >>> print(result2)
+            Tensor(shape=[2], dtype=bool, place=Place(cpu), stop_gradient=True,
+            [True, True])
     """
 
     if in_dynamic_mode():
-- 
GitLab