From 7bfd799d001aeee13d749b0f67621b19c1409fa3 Mon Sep 17 00:00:00 2001
From: cnn <liuhui29@baidu.com>
Date: Thu, 22 Oct 2020 07:23:12 +0800
Subject: [PATCH] Release 2.0rc cherry pick api rename #28108 (#28184)

* rename count_include_pad-->exclusive  return_indices-->return_mask

* remove track_running_stats

* fix typo.

* rename xxxd-->xxxxD

* solve conflicts
---
 .../parallel_dygraph_sync_batch_norm.py       |   5 +-
 .../unittests/test_adaptive_max_pool2d.py     |   2 +-
 .../fluid/tests/unittests/test_pool1d_api.py  |  11 +-
 .../fluid/tests/unittests/test_pool2d_api.py  |  10 +-
 .../fluid/tests/unittests/test_pool3d_api.py  |  10 +-
 python/paddle/nn/functional/pooling.py        | 120 +++++++++---------
 python/paddle/nn/layer/norm.py                |  56 +-------
 python/paddle/nn/layer/pooling.py             | 100 +++++++--------
 8 files changed, 132 insertions(+), 182 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py
index dcf5151578a..d525009fbd7 100644
--- a/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py
+++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py
@@ -63,10 +63,7 @@ class TestLayer(fluid.dygraph.Layer):
             bias_attr=False)
 
         self._sync_batch_norm2 = SyncBatchNorm(
-            num_filters,
-            weight_attr=False,
-            bias_attr=False,
-            track_running_stats=False)
+            num_filters, weight_attr=False, bias_attr=False)
 
     def forward(self, inputs):
         y = self._conv(inputs)
diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py
index 944725fab64..18860db9dae 100644
--- a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py
+++ b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py
@@ -150,7 +150,7 @@ class TestAdaptiveMaxPool2DAPI(unittest.TestCase):
             x = paddle.to_tensor(self.x_np)
 
             out_1 = paddle.nn.functional.adaptive_max_pool2d(
-                x=x, return_indices=False, output_size=[3, 3])
+                x=x, return_mask=False, output_size=[3, 3])
 
             out_2 = paddle.nn.functional.adaptive_max_pool2d(x=x, output_size=5)
 
diff --git a/python/paddle/fluid/tests/unittests/test_pool1d_api.py b/python/paddle/fluid/tests/unittests/test_pool1d_api.py
index cc2490d1f12..00f75337baa 100644
--- a/python/paddle/fluid/tests/unittests/test_pool1d_api.py
+++ b/python/paddle/fluid/tests/unittests/test_pool1d_api.py
@@ -148,11 +148,7 @@ class TestPool1D_API(unittest.TestCase):
             input_np = np.random.random([2, 3, 32]).astype("float32")
             input = fluid.dygraph.to_variable(input_np)
             result = F.avg_pool1d(
-                input,
-                kernel_size=2,
-                stride=2,
-                padding=[1],
-                count_include_pad=True)
+                input, kernel_size=2, stride=2, padding=[1], exclusive=True)
 
             result_np = avg_pool1D_forward_naive(
                 input_np, ksize=[2], strides=[2], paddings=[1], exclusive=False)
@@ -160,7 +156,8 @@ class TestPool1D_API(unittest.TestCase):
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
             avg_pool1d_dg = paddle.nn.AvgPool1D(
-                kernel_size=2, stride=None, padding=1, count_include_pad=True)
+                kernel_size=2, stride=None, padding=1, exclusive=True)
+
             result = avg_pool1d_dg(input)
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
@@ -200,7 +197,7 @@ class TestPool1D_API(unittest.TestCase):
             input_np = np.random.random([2, 3, 32]).astype("float32")
             input = fluid.dygraph.to_variable(input_np)
             result, index = F.max_pool1d(
-                input, kernel_size=2, stride=2, padding=0, return_indices=True)
+                input, kernel_size=2, stride=2, padding=0, return_mask=True)
 
             result_np = max_pool1D_forward_naive(
                 input_np, ksize=[2], strides=[2], paddings=[0])
diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_api.py b/python/paddle/fluid/tests/unittests/test_pool2d_api.py
index 66505327c2d..f4432bf3386 100644
--- a/python/paddle/fluid/tests/unittests/test_pool2d_api.py
+++ b/python/paddle/fluid/tests/unittests/test_pool2d_api.py
@@ -134,7 +134,7 @@ class TestPool2D_API(unittest.TestCase):
             input_np = np.random.random([2, 3, 32, 32]).astype("float32")
             input = fluid.dygraph.to_variable(input_np)
             result = max_pool2d(
-                input, kernel_size=2, stride=2, padding=0, return_indices=False)
+                input, kernel_size=2, stride=2, padding=0, return_mask=False)
 
             result_np = pool2D_forward_naive(
                 input_np,
@@ -159,7 +159,7 @@ class TestPool2D_API(unittest.TestCase):
                 kernel_size=2,
                 stride=2,
                 padding=0,
-                return_indices=False,
+                return_mask=False,
                 data_format="NHWC")
 
             result_np = pool2D_forward_naive(
@@ -222,7 +222,7 @@ class TestPool2D_API(unittest.TestCase):
                 kernel_size=2,
                 stride=None,
                 padding="SAME",
-                return_indices=True)
+                return_mask=True)
 
             result_np = pool2D_forward_naive(
                 input_np,
@@ -269,7 +269,7 @@ class TestPool2D_API(unittest.TestCase):
                 kernel_size=2,
                 stride=2,
                 padding=padding,
-                return_indices=False)
+                return_mask=False)
 
             result_np = pool2D_forward_naive(
                 input_np,
@@ -490,7 +490,7 @@ class TestPool2DError_API(unittest.TestCase):
                     padding=0,
                     ceil_mode=False,
                     data_format='NHWC',
-                    return_indices=True)
+                    return_mask=True)
 
         self.assertRaises(ValueError, run9)
 
diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_api.py b/python/paddle/fluid/tests/unittests/test_pool3d_api.py
index b2700303ee4..91158fe674b 100644
--- a/python/paddle/fluid/tests/unittests/test_pool3d_api.py
+++ b/python/paddle/fluid/tests/unittests/test_pool3d_api.py
@@ -83,7 +83,7 @@ class TestPool3D_API(unittest.TestCase):
                 stride=2,
                 padding=1,
                 ceil_mode=False,
-                count_include_pad=True)
+                exclusive=True)
 
             result_np = avg_pool3D_forward_naive(
                 input_np,
@@ -100,7 +100,7 @@ class TestPool3D_API(unittest.TestCase):
                 stride=None,
                 padding=1,
                 ceil_mode=False,
-                count_include_pad=True)
+                exclusive=True)
             result = avg_pool3d_dg(input)
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
@@ -175,7 +175,7 @@ class TestPool3D_API(unittest.TestCase):
                 stride=2,
                 padding=0,
                 data_format="NDHWC",
-                return_indices=False)
+                return_mask=False)
 
             result_np = pool3D_forward_naive(
                 input_np,
@@ -239,7 +239,7 @@ class TestPool3D_API(unittest.TestCase):
                 kernel_size=2,
                 stride=None,
                 padding="SAME",
-                return_indices=True)
+                return_mask=True)
 
             result_np = pool3D_forward_naive(
                 input_np,
@@ -467,7 +467,7 @@ class TestPool3DError_API(unittest.TestCase):
                     stride=2,
                     padding=0,
                     data_format='NDHWC',
-                    return_indices=True)
+                    return_mask=True)
 
         self.assertRaises(ValueError, run10)
 
diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py
index 73652ff1266..73e3cb31221 100755
--- a/python/paddle/nn/functional/pooling.py
+++ b/python/paddle/nn/functional/pooling.py
@@ -157,7 +157,7 @@ def avg_pool1d(x,
                kernel_size,
                stride=None,
                padding=0,
-               count_include_pad=True,
+               exclusive=True,
                ceil_mode=False,
                name=None):
     """
@@ -179,7 +179,7 @@ def avg_pool1d(x,
             4. A list[int] or tuple(int) whose length is 2. It has the form [pad_before, pad_after].
             5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
             The default value is 0.
-        count_include_pad (bool): Whether to exclude padding points in average pooling
+        exclusive (bool): Whether to exclude padding points in average pooling
                           mode, default is `True`.
         ceil_mode (bool): ${ceil_mode_comment}Whether to use the ceil function to calculate output height and width.
             If it is set to False, the floor function will be used. The default value is False.
@@ -230,8 +230,8 @@ def avg_pool1d(x,
             x, 'pooling_type', 'avg', 'ksize', kernel_size, 'global_pooling',
             False, 'strides', stride, 'paddings', padding, 'padding_algorithm',
             padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode,
-            'use_mkldnn', False, 'exclusive', not count_include_pad,
-            'data_format', data_format)
+            'use_mkldnn', False, 'exclusive', not exclusive, 'data_format',
+            data_format)
         return squeeze(output, [2])
 
     op_type = 'pool2d'
@@ -253,7 +253,7 @@ def avg_pool1d(x,
             "use_cudnn": True,
             "ceil_mode": ceil_mode,
             "use_mkldnn": False,
-            "exclusive": not count_include_pad,
+            "exclusive": not exclusive,
             "data_format": data_format,
         })
 
@@ -265,7 +265,7 @@ def avg_pool2d(x,
                stride=None,
                padding=0,
                ceil_mode=False,
-               count_include_pad=True,
+               exclusive=True,
                divisor_override=None,
                data_format="NCHW",
                name=None):
@@ -294,7 +294,7 @@ def avg_pool2d(x,
             5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
             The default value is 0.
         ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape
-        count_include_pad (bool): Whether to exclude padding points in average pooling
+        exclusive (bool): Whether to exclude padding points in average pooling
                           mode, default is `true`.
         divisor_override (float): if specified, it will be used as divisor, otherwise kernel_size will be used. Default None.
         data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NHWC"`.
@@ -338,8 +338,8 @@ def avg_pool2d(x,
             x, 'pooling_type', 'avg', 'ksize', kernel_size, 'global_pooling',
             False, 'padding_algorithm', padding_algorithm, 'strides', stride,
             'paddings', padding, 'use_cudnn', True, 'ceil_mode', ceil_mode,
-            'use_mkldnn', False, 'exclusive', not count_include_pad,
-            'data_format', data_format)
+            'use_mkldnn', False, 'exclusive', not exclusive, 'data_format',
+            data_format)
         if divisor_override is None:
             return output
         else:
@@ -365,7 +365,7 @@ def avg_pool2d(x,
             "use_cudnn": True,
             "ceil_mode": ceil_mode,
             "use_mkldnn": False,
-            "exclusive": not count_include_pad,
+            "exclusive": not exclusive,
             "data_format": data_format,
         })
 
@@ -381,7 +381,7 @@ def avg_pool3d(x,
                stride=None,
                padding=0,
                ceil_mode=False,
-               count_include_pad=True,
+               exclusive=True,
                divisor_override=None,
                data_format="NCDHW",
                name=None):
@@ -408,7 +408,7 @@ def avg_pool3d(x,
             5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
             The default value is 0.
         ceil_mode (bool): ${ceil_mode_comment}
-        count_include_pad (bool): Whether to exclude padding points in average pooling
+        exclusive (bool): Whether to exclude padding points in average pooling
                           mode, default is True.
         divisor_override (int|float) if specified, it will be used as divisor, otherwise kernel_size will be used. Default None.
         data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`.
@@ -452,8 +452,8 @@ def avg_pool3d(x,
             x, 'pooling_type', 'avg', 'ksize', kernel_size, 'strides', stride,
             'paddings', padding, 'global_pooling', False, 'padding_algorithm',
             padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode,
-            'use_mkldnn', False, 'exclusive', not count_include_pad,
-            'data_format', data_format)
+            'use_mkldnn', False, 'exclusive', not exclusive, 'data_format',
+            data_format)
         if divisor_override is None:
             return output
         else:
@@ -481,7 +481,7 @@ def avg_pool3d(x,
             "use_cudnn": True,
             "ceil_mode": ceil_mode,
             "use_mkldnn": False,
-            "exclusive": not count_include_pad,
+            "exclusive": not exclusive,
             "data_format": data_format,
         })
 
@@ -497,7 +497,7 @@ def max_pool1d(x,
                kernel_size,
                stride=None,
                padding=0,
-               return_indices=False,
+               return_mask=False,
                ceil_mode=False,
                name=None):
     """
@@ -519,7 +519,7 @@ def max_pool1d(x,
             4. A list[int] or tuple(int) whose length is 2. It has the form [pad_before, pad_after].
             5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
             The default value is 0.
-        return_indices (bool): Whether return the max indices along with the outputs. default is `False`.
+        return_mask (bool): Whether return the max indices along with the outputs. default is `False`.
         ceil_mode (bool): Whether to use the ceil function to calculate output height and width. False is the default.
             If it is set to False, the floor function will be used. Default False.
         name(str, optional): For detailed information, please refer
@@ -542,7 +542,7 @@ def max_pool1d(x,
           data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32))
           pool_out = F.max_pool1d(data, kernel_size=2, stride=2, padding=0)
           # pool_out shape: [1, 3, 16]
-          pool_out, indices = F.max_pool1d(data, kernel_size=2, stride=2, padding=0, return_indices=True)
+          pool_out, indices = F.max_pool1d(data, kernel_size=2, stride=2, padding=0, return_mask=True)
           # pool_out shape: [1, 3, 16],  indices shape: [1, 3, 16]
     """
     """NCL to NCHW"""
@@ -563,16 +563,16 @@ def max_pool1d(x,
     padding = _expand_low_nd_padding(padding)
 
     if in_dygraph_mode():
-        if return_indices:
+        if return_mask:
             pool_out = core.ops.max_pool2d_with_index(
                 x, 'ksize', kernel_size, 'global_pooling', False, 'strides',
                 stride, 'paddings', padding, 'padding_algorithm',
                 padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode,
                 'use_mkldnn', False, 'exclusive', True, 'data_format',
                 data_format)
-            return (squeeze(pool_out[0], [2]), squeeze(
-                pool_out[1],
-                [2])) if return_indices else squeeze(pool_out[0], [2])
+            return (squeeze(pool_out[0], [2]),
+                    squeeze(pool_out[1],
+                            [2])) if return_mask else squeeze(pool_out[0], [2])
         else:
             pool_out = core.ops.pool2d(
                 x, 'pooling_type', 'max', 'ksize', kernel_size,
@@ -582,7 +582,7 @@ def max_pool1d(x,
                 'data_format', data_format)
             return squeeze(pool_out, [2])
 
-    op_type = 'max_pool2d_with_index' if return_indices else "pool2d"
+    op_type = 'max_pool2d_with_index' if return_mask else "pool2d"
     helper = LayerHelper(op_type, **locals())
     dtype = helper.input_dtype()
     pool_out = helper.create_variable_for_type_inference(dtype)
@@ -608,14 +608,14 @@ def max_pool1d(x,
         })
 
     return (squeeze(pool_out, [2]),
-            squeeze(mask, [2])) if return_indices else squeeze(pool_out, [2])
+            squeeze(mask, [2])) if return_mask else squeeze(pool_out, [2])
 
 
 def max_pool2d(x,
                kernel_size,
                stride=None,
                padding=0,
-               return_indices=False,
+               return_mask=False,
                ceil_mode=False,
                data_format="NCHW",
                name=None):
@@ -643,7 +643,7 @@ def max_pool2d(x,
             5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
             The default value is 0.
         ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape
-        return_indices (bool): Whether to return the max indices along with the outputs. Default False, only support `"NCHW"` data format
+        return_mask (bool): Whether to return the max indices along with the outputs. Default False, only support `"NCHW"` data format
         data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NHWC"`.
                         The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
                         `[batch_size, input_channels, input_height, input_width]`.
@@ -668,12 +668,12 @@ def max_pool2d(x,
                                 kernel_size=2,
                                 stride=2, padding=0)
           # output.shape [1, 3, 16, 16]
-          # for return_indices=True
+          # for return_mask=True
           out, max_indices = F.max_pool2d(x,
                                              kernel_size=2,
                                              stride=2,
                                              padding=0,
-                                             return_indices=True)
+                                             return_mask=True)
           # out.shape [1, 3, 16, 16], max_indices.shape [1, 3, 16, 16],
     """
     check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'max_pool2d')
@@ -693,20 +693,20 @@ def max_pool2d(x,
     padding, padding_algorithm = _update_padding_nd(
         padding, num_dims=2, channel_last=channel_last, ceil_mode=ceil_mode)
 
-    if data_format == "NHWC" and return_indices:
+    if data_format == "NHWC" and return_mask:
         raise ValueError(
-            "When setting return_indices to true, data_format must be set to NCHW in API:max_pool2d"
+            "When setting return_mask to true, data_format must be set to NCHW in API:max_pool2d"
         )
 
     if in_dygraph_mode():
-        if return_indices:
+        if return_mask:
             output = core.ops.max_pool2d_with_index(
                 x, 'ksize', kernel_size, 'global_pooling', False, 'strides',
                 stride, 'paddings', padding, 'padding_algorithm',
                 padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode,
                 'use_mkldnn', False, 'exclusive', True, 'data_format',
                 data_format)
-            return output if return_indices else output[0]
+            return output if return_mask else output[0]
         else:
             output = core.ops.pool2d(
                 x, 'pooling_type', 'max', 'ksize', kernel_size,
@@ -716,7 +716,7 @@ def max_pool2d(x,
                 'data_format', data_format)
             return output
 
-    op_type = 'max_pool2d_with_index' if return_indices else "pool2d"
+    op_type = 'max_pool2d_with_index' if return_mask else "pool2d"
     helper = LayerHelper(op_type, **locals())
     dtype = helper.input_dtype()
     pool_out = helper.create_variable_for_type_inference(dtype)
@@ -741,14 +741,14 @@ def max_pool2d(x,
             "data_format": data_format,
         })
 
-    return (pool_out, mask) if return_indices else pool_out
+    return (pool_out, mask) if return_mask else pool_out
 
 
 def max_pool3d(x,
                kernel_size,
                stride=None,
                padding=0,
-               return_indices=False,
+               return_mask=False,
                ceil_mode=False,
                data_format="NCDHW",
                name=None):
@@ -773,7 +773,7 @@ def max_pool3d(x,
             5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
             The default value is 0.
         ceil_mode (bool): ${ceil_mode_comment}
-        return_indices (bool): Whether to return the max indices along with the outputs. Default False. Only support "NDCHW" data_format.
+        return_mask (bool): Whether to return the max indices along with the outputs. Default False. Only support "NDCHW" data_format.
         data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`.
                         The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of:
                         `[batch_size, input_channels, input_depth, input_height, input_width]`.
@@ -798,13 +798,13 @@ def max_pool3d(x,
                                 kernel_size=2,
                                 stride=2, padding=0)
           output.shape [1, 3, 16, 16, 16]
-          # for return_indices=True
+          # for return_mask=True
           x = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32, 32]).astype(np.float32))
           output, max_indices = paddle.nn.functional.max_pool3d(x,
                                         kernel_size = 2,
                                         stride = 2,
                                         padding=0,
-                                        return_indices=True)
+                                        return_mask=True)
           # output.shape [None, 3, 16, 16, 16], max_indices.shape [None, 3, 16, 16, 16],
     """
     check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'max_pool3d')
@@ -819,20 +819,20 @@ def max_pool3d(x,
     padding, padding_algorithm = _update_padding_nd(
         padding, 3, channel_last=channel_last, ceil_mode=ceil_mode)
 
-    if data_format == "NDHWC" and return_indices:
+    if data_format == "NDHWC" and return_mask:
         raise ValueError(
-            "When setting return_indices to true, data_format must be set to NCDHW in API:max_pool3d"
+            "When setting return_mask to true, data_format must be set to NCDHW in API:max_pool3d"
         )
 
     if in_dygraph_mode():
-        if return_indices:
+        if return_mask:
             output = core.ops.max_pool3d_with_index(
                 x, 'pooling_type', 'max', 'ksize', kernel_size, 'strides',
                 stride, 'paddings', padding, 'global_pooling', False,
                 'padding_algorithm', padding_algorithm, 'use_cudnn', True,
                 'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive', True,
                 'data_format', data_format)
-            return output if return_indices else output[0]
+            return output if return_mask else output[0]
         else:
             output = core.ops.pool3d(
                 x, 'pooling_type', 'max', 'ksize', kernel_size,
@@ -842,7 +842,7 @@ def max_pool3d(x,
                 'data_format', data_format)
             return output
 
-    op_type = "max_pool3d_with_index" if return_indices else "pool3d"
+    op_type = "max_pool3d_with_index" if return_mask else "pool3d"
     helper = LayerHelper(op_type, **locals())
     dtype = helper.input_dtype()
     pool_out = helper.create_variable_for_type_inference(dtype)
@@ -867,7 +867,7 @@ def max_pool3d(x,
             "data_format": data_format,
         })
 
-    return (pool_out, mask) if return_indices else pool_out
+    return (pool_out, mask) if return_mask else pool_out
 
 
 def adaptive_avg_pool1d(x, output_size, name=None):
@@ -1148,7 +1148,7 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None):
     return pool_out
 
 
-def adaptive_max_pool1d(x, output_size, return_indices=False, name=None):
+def adaptive_max_pool1d(x, output_size, return_mask=False, name=None):
     """
     This API implements adaptive max pooling 1d operation.
     See more details in :ref:`api_nn_pooling_AdaptiveMaxPool1d` .
@@ -1159,7 +1159,7 @@ def adaptive_max_pool1d(x, output_size, return_indices=False, name=None):
                               where N is batch size, C is the number of channels, L is the
                               length of the feature. The data type is float32 or float64.
         output_size (int): The pool kernel size. The value should be an integer.
-        return_indices (bool): If true, the index of max pooling point will be returned along
+        return_mask (bool): If true, the index of max pooling point will be returned along
                 with outputs. It cannot be set in average pooling type. Default False.
         name(str, optional): For detailed information, please refer
                                  to :ref:`api_guide_Name`. Usually name is no need to set and
@@ -1190,7 +1190,7 @@ def adaptive_max_pool1d(x, output_size, return_indices=False, name=None):
               data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32))
               pool_out = F.adaptive_max_pool1d(data, output_size=16)
               # pool_out shape: [1, 3, 16])
-              pool_out, indices = F.adaptive_max_pool1d(data, output_size=16, return_indices=True)
+              pool_out, indices = F.adaptive_max_pool1d(data, output_size=16, return_mask=True)
               # pool_out shape: [1, 3, 16] indices  shape: [1, 3, 16]
     """
     pool_type = 'max'
@@ -1198,7 +1198,7 @@ def adaptive_max_pool1d(x, output_size, return_indices=False, name=None):
                              'adaptive_max_pool1d')
     _check_input(x, 3)
     check_type(output_size, 'pool_size', int, 'adaptive_max_pool1d')
-    check_type(return_indices, 'return_indices', bool, 'adaptive_max_pool1d')
+    check_type(return_mask, 'return_mask', bool, 'adaptive_max_pool1d')
 
     pool_size = [1] + utils.convert_to_list(output_size, 1, 'pool_size')
 
@@ -1209,7 +1209,7 @@ def adaptive_max_pool1d(x, output_size, return_indices=False, name=None):
         pool_out = core.ops.max_pool2d_with_index(
             x, 'pooling_type', pool_type, 'ksize', pool_size, 'adaptive', True)
         return (squeeze(pool_out[0], [2]), squeeze(
-            pool_out[1], [2])) if return_indices else squeeze(pool_out[0], [2])
+            pool_out[1], [2])) if return_mask else squeeze(pool_out[0], [2])
 
     helper = LayerHelper(l_type, **locals())
     dtype = helper.input_dtype()
@@ -1229,10 +1229,10 @@ def adaptive_max_pool1d(x, output_size, return_indices=False, name=None):
         })
 
     return (squeeze(pool_out, [2]),
-            squeeze(mask, [2])) if return_indices else squeeze(pool_out, [2])
+            squeeze(mask, [2])) if return_mask else squeeze(pool_out, [2])
 
 
-def adaptive_max_pool2d(x, output_size, return_indices=False, name=None):
+def adaptive_max_pool2d(x, output_size, return_mask=False, name=None):
     """
         This operation applies a 2D adaptive max pooling on input tensor.
         See more details in :ref:`api_nn_pooling_AdaptiveMaxPool2d` .
@@ -1240,7 +1240,7 @@ def adaptive_max_pool2d(x, output_size, return_indices=False, name=None):
         Args:
             x (Tensor): The input tensor of adaptive max pool2d operator, which is a 4-D tensor. The data type can be float16, float32, float64, int32 or int64.
             output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain two elements, (H, W). H and W can be either a int, or None which means the size will be the same as that of the input.
-            return_indices (bool): If true, the index of max pooling point will be returned along with outputs. Default False.
+            return_mask (bool): If true, the index of max pooling point will be returned along with outputs. Default False.
             name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default.
 
         Returns:
@@ -1280,7 +1280,7 @@ def adaptive_max_pool2d(x, output_size, return_indices=False, name=None):
                                  'adaptive_max_pool2d')
     _check_input(x, 4)
     #check_type(output_size, 'pool_size', (int), 'adaptive_max_pool2d')
-    check_type(return_indices, 'return_indices', bool, 'adaptive_max_pool2d')
+    check_type(return_mask, 'return_mask', bool, 'adaptive_max_pool2d')
 
     in_h, in_w = x.shape[2:4]
     if isinstance(output_size, int):
@@ -1295,7 +1295,7 @@ def adaptive_max_pool2d(x, output_size, return_indices=False, name=None):
     if in_dygraph_mode():
         pool_out = core.ops.max_pool2d_with_index(
             x, 'pooling_type', 'max', 'ksize', output_size, 'adaptive', True)
-        return pool_out if return_indices else pool_out[0]
+        return pool_out if return_mask else pool_out[0]
 
     l_type = 'max_pool2d_with_index'
 
@@ -1315,11 +1315,11 @@ def adaptive_max_pool2d(x, output_size, return_indices=False, name=None):
             "ksize": output_size,
             "adaptive": True,
         })
-    #return (pool_out, mask) if return_indices else pool_out
+    #return (pool_out, mask) if return_mask else pool_out
     return pool_out
 
 
-def adaptive_max_pool3d(x, output_size, return_indices=False, name=None):
+def adaptive_max_pool3d(x, output_size, return_mask=False, name=None):
     """
         This operation applies a 3D adaptive max pooling on input tensor.
         See more details in :ref:`api_nn_pooling_AdaptiveMaxPool3d` .
@@ -1327,7 +1327,7 @@ def adaptive_max_pool3d(x, output_size, return_indices=False, name=None):
         Args:
             x (Tensor): The input tensor of adaptive max pool3d operator, which is a 5-D tensor. The data type can be float32, float64.
             output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain three elements, (D, H, W). D, H and W can be either a int, or None which means the size will be the same as that of the input.
-            return_indices (bool): If true, the index of max pooling point will be returned along with outputs. Default False.
+            return_mask (bool): If true, the index of max pooling point will be returned along with outputs. Default False.
             name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default.
 
         Returns:
@@ -1371,7 +1371,7 @@ def adaptive_max_pool3d(x, output_size, return_indices=False, name=None):
                                  'adaptive_max_pool3d')
     _check_input(x, 5)
     #check_type(output_size, 'pool_size', (int), 'adaptive_max_pool3d')
-    check_type(return_indices, 'return_indices', bool, 'adaptive_max_pool3d')
+    check_type(return_mask, 'return_mask', bool, 'adaptive_max_pool3d')
 
     in_l, in_h, in_w = x.shape[2:5]
     if isinstance(output_size, int):
@@ -1388,7 +1388,7 @@ def adaptive_max_pool3d(x, output_size, return_indices=False, name=None):
     if in_dygraph_mode():
         pool_out = core.ops.max_pool3d_with_index(
             x, 'pooling_type', 'max', 'ksize', output_size, 'adaptive', True)
-        return pool_out if return_indices else pool_out[0]
+        return pool_out if return_mask else pool_out[0]
 
     l_type = 'max_pool3d_with_index'
 
@@ -1409,4 +1409,4 @@ def adaptive_max_pool3d(x, output_size, return_indices=False, name=None):
             "adaptive": True,
         })
 
-    return (pool_out, mask) if return_indices else pool_out
+    return (pool_out, mask) if return_mask else pool_out
diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py
index a996844c8f5..5e2292d40d2 100644
--- a/python/paddle/nn/layer/norm.py
+++ b/python/paddle/nn/layer/norm.py
@@ -73,7 +73,6 @@ class _InstanceNormBase(layers.Layer):
                  momentum=0.9,
                  weight_attr=None,
                  bias_attr=None,
-                 track_running_stats=False,
                  data_format="NCHW",
                  name=None):
         super(_InstanceNormBase, self).__init__()
@@ -135,9 +134,6 @@ class InstanceNorm1D(_InstanceNormBase):
         epsilon(float, optional): A value added to the denominator for
             numerical stability. Default is 1e-5.
         momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
-        track_running_stats(bool, optional): Whether to use global mean and
-            variance. In train mode, when setting track_running_stats True, the global mean
-            and variance are also used during train period. Default: False.
         weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale`
              of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm
 	     will create ParamAttr as weight_attr, the name of scale can be set in ParamAttr.
@@ -159,9 +155,6 @@ class InstanceNorm1D(_InstanceNormBase):
     Returns:
         None.
 
-    **Note**:
-        Momentum and track_running_stats is not effective. The next version will fix the problem .
-
 
     Examples:
 
@@ -214,9 +207,6 @@ class InstanceNorm2D(_InstanceNormBase):
         epsilon(float, optional): A value added to the denominator for
             numerical stability. Default is 1e-5.
         momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
-        track_running_stats(bool, optional): Whether to use global mean and
-            variance. In train mode, when setting track_running_stats True, the global mean
-            and variance are also used during train period. Default: False.
         weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale`
              of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm
 	     will create ParamAttr as weight_attr, the name of scale can be set in ParamAttr.
@@ -237,8 +227,6 @@ class InstanceNorm2D(_InstanceNormBase):
     Returns:
         None.
 
-    **Note**:
-        Momentum and track_running_stats is not effective. The next version will fix the problem .
 
     Examples:
 
@@ -290,9 +278,6 @@ class InstanceNorm3D(_InstanceNormBase):
         epsilon(float, optional): A value added to the denominator for
             numerical stability. Default is 1e-5.
         momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
-        track_running_stats(bool, optional): Whether to use global mean and
-            variance. In train mode, when setting track_running_stats True, the global mean
-            and variance are also used during train period. Default: False.
         weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale`
              of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm
 	     will create ParamAttr as weight_attr, the name of scale can be set in ParamAttr.
@@ -313,8 +298,6 @@ class InstanceNorm3D(_InstanceNormBase):
     Returns:
         None.
 
-    **Note**:
-        Momentum and track_running_stats is not effective. The next version will fix the problem .
 
     Examples:
 
@@ -570,7 +553,6 @@ class _BatchNormBase(layers.Layer):
                  weight_attr=None,
                  bias_attr=None,
                  data_format='NCHW',
-                 track_running_stats=True,
                  name=None):
         super(_BatchNormBase, self).__init__()
         self._num_features = num_features
@@ -636,7 +618,6 @@ class _BatchNormBase(layers.Layer):
         self._momentum = momentum
         self._epsilon = epsilon
         self._fuse_with_relu = False
-        self._track_running_stats = track_running_stats
         self._name = name
 
     def _check_input_dim(self, input):
@@ -651,11 +632,7 @@ class _BatchNormBase(layers.Layer):
 
         self._check_input_dim(input)
 
-        if not self.training and not self._track_running_stats:
-            raise ValueError(
-                'When inference, expected track_running_stats is True.')
-
-        if self.training and not self._track_running_stats:
+        if self.training:
             warnings.warn(
                 "When training, we now always track global mean and variance.")
 
@@ -720,9 +697,6 @@ class BatchNorm1D(_BatchNormBase):
             will create ParamAttr as bias_attr. If it is set to Fasle, the weight is not learnable.
             If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None.
         data_format(str, optional): Specify the input data format, may be "NC", "NCL" or "NLC". Defalut "NCL".
-        track_running_stats(bool, optional): Whether to use global mean and variance. In train period, 
-            True will track global mean and variance used for inference. When inference, track_running_stats must be 
-            True. Default: True.
         name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`..
 
     Shape:
@@ -732,9 +706,6 @@ class BatchNorm1D(_BatchNormBase):
 
     Returns:
         None.
-
-    **Note**:
-        Now track_running_stats is actucal always true. The next version will fix the problem .
     
 
     Examples:
@@ -817,9 +788,6 @@ class BatchNorm2D(_BatchNormBase):
             will create ParamAttr as bias_attr. If it is set to Fasle, the weight is not learnable.
             If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None.
         data_format(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: NCHW.
-        track_running_stats(bool, optional): Whether to use global mean and variance. In train period, 
-            True will track global mean and variance used for inference. When inference, track_running_stats must be 
-            True. Default: True.
         name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`..
 
     Shape:
@@ -830,9 +798,6 @@ class BatchNorm2D(_BatchNormBase):
     Returns:
         None
 
-    **Note**:
-        Now track_running_stats is actucal always true. The next version will fix the problem .
-
     Examples:
         .. code-block:: python
 
@@ -912,9 +877,6 @@ class BatchNorm3D(_BatchNormBase):
             will create ParamAttr as bias_attr. If it is set to Fasle, the weight is not learnable.
             If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None.
         data_format(str, optional): Specify the input data format, the data format can be "NCDHW" or "NDHWC. Default: NCDHW.
-        track_running_stats(bool, optional): Whether to use global mean and variance. In train period, 
-            True will track global mean and variance used for inference. When inference, track_running_stats must be 
-            True. Default: True.
         name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`..
 
     Shape:
@@ -925,9 +887,6 @@ class BatchNorm3D(_BatchNormBase):
     Returns:
         None
 
-    **Note**:
-        Now track_running_stats is actucal always true. The next version will fix the problem .
-
     Examples:
         .. code-block:: python
 
@@ -1024,8 +983,6 @@ class SyncBatchNorm(_BatchNormBase):
              will create ParamAttr as bias_attr. If the Initializer of the bias_attr
              is not set, the bias is initialized zero. If it is set to False, this layer will not 
              have trainable bias parameter. Default: None.
-        track_running_stats(bool, optional): Whether to compute global stats, which including running mean and 
-             running variance. Default: True.
 
     Shapes:
         input: Tensor that the dimension from 2 to 5.
@@ -1055,11 +1012,10 @@ class SyncBatchNorm(_BatchNormBase):
                  weight_attr=None,
                  bias_attr=None,
                  data_format='NCHW',
-                 track_running_stats=True,
                  name=None):
         super(SyncBatchNorm,
               self).__init__(num_features, momentum, epsilon, weight_attr,
-                             bias_attr, data_format, track_running_stats, name)
+                             bias_attr, data_format, name)
 
     def forward(self, x):
         # create output
@@ -1147,10 +1103,10 @@ class SyncBatchNorm(_BatchNormBase):
         """
         layer_output = layer
         if isinstance(layer, _BatchNormBase):
-            layer_output = SyncBatchNorm(
-                layer._num_features, layer._momentum, layer._epsilon,
-                layer._weight_attr, layer._bias_attr, layer._data_format,
-                layer._track_running_stats, layer._name)
+            layer_output = SyncBatchNorm(layer._num_features, layer._momentum,
+                                         layer._epsilon, layer._weight_attr,
+                                         layer._bias_attr, layer._data_format,
+                                         layer._name)
 
             if layer._weight_attr != False and layer._bias_attr != False:
                 with no_grad():
diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py
index 9e544cb02e7..0b0a4909f85 100755
--- a/python/paddle/nn/layer/pooling.py
+++ b/python/paddle/nn/layer/pooling.py
@@ -35,7 +35,7 @@ __all__ = [
 class AvgPool1D(layers.Layer):
     """
     This operation applies a 1D average pooling over an input signal composed
-    of several input planes, based on the input, output_size, return_indices parameters.
+    of several input planes, based on the input, output_size, return_mask parameters.
     Input(X) and output(Out) are in NCL format, where N is batch
     size, C is the number of channels, L is the length of the feature.
     The output tensor shape will be [N, C, output_size].
@@ -61,7 +61,7 @@ class AvgPool1D(layers.Layer):
             4. A list[int] or tuple(int) whose length is 2. It has the form [pad_before, pad_after].
             5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
             The default value is 0.
-        count_include_pad (bool): Whether to exclude padding points in average pooling
+        exclusive (bool): Whether to exclude padding points in average pooling
                           mode, default is `True`.
         ceil_mode (bool): ${ceil_mode_comment}Whether to use the ceil function to calculate output height and width.
             If it is set to False, the floor function will be used. The default value is False.
@@ -103,7 +103,7 @@ class AvgPool1D(layers.Layer):
                  kernel_size,
                  stride=None,
                  padding=0,
-                 count_include_pad=True,
+                 exclusive=True,
                  ceil_mode=False,
                  name=None):
         super(AvgPool1D, self).__init__()
@@ -111,12 +111,12 @@ class AvgPool1D(layers.Layer):
         self.stride = stride
         self.padding = padding
         self.ceil_mode = ceil_mode
-        self.count_include_pad = count_include_pad
+        self.exclusive = exclusive
         self.name = name
 
     def forward(self, x):
         out = F.avg_pool1d(x, self.kernel_size, self.stride, self.padding,
-                           self.count_include_pad, self.ceil_mode, self.name)
+                           self.exclusive, self.ceil_mode, self.name)
         return out
 
 
@@ -156,7 +156,7 @@ class AvgPool2D(layers.Layer):
             5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
             The default value is 0.
         ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape
-        count_include_pad (bool): Whether to exclude padding points in average pooling
+        exclusive (bool): Whether to exclude padding points in average pooling
                           mode, default is `true`.
         divisor_override (float): if specified, it will be used as divisor, otherwise kernel_size will be used. Default None.
         data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`.
@@ -197,7 +197,7 @@ class AvgPool2D(layers.Layer):
                  stride=None,
                  padding=0,
                  ceil_mode=False,
-                 count_include_pad=True,
+                 exclusive=True,
                  divisor_override=None,
                  data_format="NCHW",
                  name=None):
@@ -206,7 +206,7 @@ class AvgPool2D(layers.Layer):
         self.stride = stride
         self.padding = padding
         self.ceil_mode = ceil_mode
-        self.count_include_pad = count_include_pad
+        self.exclusive = exclusive
         self.divisor = divisor_override
         self.data_format = data_format
         self.name = name
@@ -218,7 +218,7 @@ class AvgPool2D(layers.Layer):
             stride=self.stride,
             padding=self.padding,
             ceil_mode=self.ceil_mode,
-            count_include_pad=self.count_include_pad,
+            exclusive=self.exclusive,
             divisor_override=self.divisor,
             data_format=self.data_format,
             name=self.name)
@@ -247,7 +247,7 @@ class AvgPool3D(layers.Layer):
             5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
             The default value is 0.
         ceil_mode (bool): ${ceil_mode_comment}
-        count_include_pad (bool): Whether to exclude padding points in average pooling
+        exclusive (bool): Whether to exclude padding points in average pooling
                           mode, default is True.
         divisor_override (int|float) if specified, it will be used as divisor, otherwise kernel_size will be used. Default None.
         data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`.
@@ -289,7 +289,7 @@ class AvgPool3D(layers.Layer):
                  stride,
                  padding=0,
                  ceil_mode=False,
-                 count_include_pad=True,
+                 exclusive=True,
                  divisor_override=None,
                  data_format="NCDHW",
                  name=None):
@@ -298,7 +298,7 @@ class AvgPool3D(layers.Layer):
         self.stride = stride
         self.padding = padding
         self.ceil_mode = ceil_mode
-        self.count_include_pad = count_include_pad
+        self.exclusive = exclusive
         self.divisor = divisor_override
         self.data_format = data_format
         self.name = name
@@ -310,7 +310,7 @@ class AvgPool3D(layers.Layer):
             stride=self.stride,
             padding=self.padding,
             ceil_mode=self.ceil_mode,
-            count_include_pad=self.count_include_pad,
+            exclusive=self.exclusive,
             divisor_override=self.divisor,
             data_format=self.data_format,
             name=self.name)
@@ -319,7 +319,7 @@ class AvgPool3D(layers.Layer):
 class MaxPool1D(layers.Layer):
     """
     Applies a 1D max pooling over an input signal composed of several input planes based
-    on the input, output_size, return_indices parameters.
+    on the input, output_size, return_mask parameters.
     Input(X) and output(Out) are in NCL format, where N is batch
     size, C is the number of channels, L is the length of the feature.
 
@@ -343,7 +343,7 @@ class MaxPool1D(layers.Layer):
             4. A list[int] or tuple(int) whose length is 2. It has the form [pad_before, pad_after].
             5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
             The default value is 0.
-        return_indices (bool): Whether return the max indices along with the outputs. default is `False`.
+        return_mask (bool): Whether return the max indices along with the outputs. default is `False`.
         ceil_mode (bool): Whether to use the ceil function to calculate output height and width. False is the default.
             If it is set to False, the floor function will be used. Default False.
         name(str, optional): For detailed information, please refer
@@ -377,7 +377,7 @@ class MaxPool1D(layers.Layer):
           pool_out = MaxPool1D(data)
           # pool_out shape: [1, 3, 16]
 
-          MaxPool1D = nn.MaxPool1D(kernel_size=2, stride=2, padding=0, return_indices=True)
+          MaxPool1D = nn.MaxPool1D(kernel_size=2, stride=2, padding=0, return_mask=True)
           pool_out, indices = MaxPool1D(data)
           # pool_out shape: [1, 3, 16], indices shape: [1, 3, 16]
 
@@ -387,7 +387,7 @@ class MaxPool1D(layers.Layer):
                  kernel_size,
                  stride=None,
                  padding=0,
-                 return_indices=False,
+                 return_mask=False,
                  ceil_mode=False,
                  name=None):
         super(MaxPool1D, self).__init__()
@@ -395,12 +395,12 @@ class MaxPool1D(layers.Layer):
         self.stride = stride
         self.padding = padding
         self.ceil_mode = ceil_mode
-        self.return_indices = return_indices
+        self.return_mask = return_mask
         self.name = name
 
     def forward(self, input):
         out = F.max_pool1d(input, self.kernel_size, self.stride, self.padding,
-                           self.return_indices, self.ceil_mode, self.name)
+                           self.return_mask, self.ceil_mode, self.name)
         return out
 
 
@@ -440,7 +440,7 @@ class MaxPool2D(layers.Layer):
             5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
             The default value is 0.
         ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape
-        return_indices (bool): Whether to return the max indices along with the outputs.
+        return_mask (bool): Whether to return the max indices along with the outputs.
         data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`.
                         The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
                         `[batch_size, input_channels, input_height, input_width]`.
@@ -473,8 +473,8 @@ class MaxPool2D(layers.Layer):
           output = MaxPool2D(input)
           # output.shape [1, 3, 16, 16]
 
-          # for return_indices=True
-          MaxPool2D = nn.MaxPool2D(kernel_size=2,stride=2, padding=0, return_indices=True)
+          # for return_mask=True
+          MaxPool2D = nn.MaxPool2D(kernel_size=2, stride=2, padding=0, return_mask=True)
           output, max_indices = MaxPool2D(input)
           # output.shape [1, 3, 16, 16], max_indices.shape [1, 3, 16, 16],
     """
@@ -483,7 +483,7 @@ class MaxPool2D(layers.Layer):
                  kernel_size,
                  stride=None,
                  padding=0,
-                 return_indices=False,
+                 return_mask=False,
                  ceil_mode=False,
                  data_format="NCHW",
                  name=None):
@@ -491,7 +491,7 @@ class MaxPool2D(layers.Layer):
         self.ksize = kernel_size
         self.stride = stride
         self.padding = padding
-        self.return_indices = return_indices
+        self.return_mask = return_mask
         self.ceil_mode = ceil_mode
         self.data_format = data_format
         self.name = name
@@ -502,7 +502,7 @@ class MaxPool2D(layers.Layer):
             kernel_size=self.ksize,
             stride=self.stride,
             padding=self.padding,
-            return_indices=self.return_indices,
+            return_mask=self.return_mask,
             data_format=self.data_format,
             name=self.name)
 
@@ -530,7 +530,7 @@ class MaxPool3D(layers.Layer):
             5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
             The default value is 0.
         ceil_mode (bool): ${ceil_mode_comment}
-        return_indices (bool): Whether to return the max indices along with the outputs.
+        return_mask (bool): Whether to return the max indices along with the outputs.
         data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`.
                         The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of:
                         `[batch_size, input_channels, input_depth, input_height, input_width]`.
@@ -564,8 +564,8 @@ class MaxPool3D(layers.Layer):
           output = MaxPool3D(input)
           # output.shape [1, 2, 3, 16, 16]
 
-          # for return_indices=True
-          MaxPool3D = nn.MaxPool3D(kernel_size=2,stride=2, padding=0, return_indices=True)
+          # for return_mask=True
+          MaxPool3D = nn.MaxPool3D(kernel_size=2, stride=2, padding=0, return_mask=True)
           output, max_indices = MaxPool3D(input)
           # output.shape [1, 2, 3, 16, 16], max_indices.shape [1, 2, 3, 16, 16],
     """
@@ -574,7 +574,7 @@ class MaxPool3D(layers.Layer):
                  kernel_size,
                  stride,
                  padding,
-                 return_indices=False,
+                 return_mask=False,
                  ceil_mode=False,
                  data_format="NCDHW",
                  name=None):
@@ -582,7 +582,7 @@ class MaxPool3D(layers.Layer):
         self.ksize = kernel_size
         self.stride = stride
         self.padding = padding
-        self.return_indices = return_indices
+        self.return_mask = return_mask
         self.ceil_mode = ceil_mode
         self.data_format = data_format
         self.name = name
@@ -593,7 +593,7 @@ class MaxPool3D(layers.Layer):
             kernel_size=self.ksize,
             stride=self.stride,
             padding=self.padding,
-            return_indices=self.return_indices,
+            return_mask=self.return_mask,
             data_format=self.data_format,
             name=self.name)
 
@@ -602,7 +602,7 @@ class AdaptiveAvgPool1D(layers.Layer):
     """
 
     This operation applies a 1D adaptive average pooling over an input signal composed
-    of several input planes, based on the input, output_size, return_indices parameters.
+    of several input planes, based on the input, output_size, return_mask parameters.
     Input(X) and output(Out) are in NCL format, where N is batch
     size, C is the number of channels, L is the length of the feature.
     The output tensor shape will be [N, C, output_size].
@@ -841,7 +841,7 @@ class AdaptiveMaxPool1D(layers.Layer):
     """
 
     This operation applies a 1D adaptive max pooling over an input signal composed
-    of several input planes, based on the input, output_size, return_indices parameters.
+    of several input planes, based on the input, output_size, return_mask parameters.
     Input(X) and output(Out) are in NCL format, where N is batch
     size, C is the number of channels, L is the length of the feature.
     The output tensor shape will be [N, C, output_size].
@@ -859,7 +859,7 @@ class AdaptiveMaxPool1D(layers.Layer):
     Args:
         output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
              it must contain one int.
-        return_indices (bool): If true, the index of max pooling point will be returned along
+        return_mask (bool): If true, the index of max pooling point will be returned along
             with outputs. It cannot be set in average pooling type. Default False.
         name(str, optional): For detailed information, please refer
                              to :ref:`api_guide_Name`. Usually name is no need to set and
@@ -898,22 +898,22 @@ class AdaptiveMaxPool1D(layers.Layer):
           pool_out = AdaptiveMaxPool1D(data)
           # pool_out shape: [1, 3, 16]
 
-          # for return_indices = true
-          AdaptiveMaxPool1D = nn.AdaptiveMaxPool1D(output_size=16, return_indices=True)
+          # for return_mask = true
+          AdaptiveMaxPool1D = nn.AdaptiveMaxPool1D(output_size=16, return_mask=True)
           pool_out, indices = AdaptiveMaxPool1D(data)
           # pool_out shape: [1, 3, 16], indices shape: [1, 3, 16]
 
     """
 
-    def __init__(self, output_size, return_indices=False, name=None):
+    def __init__(self, output_size, return_mask=False, name=None):
         super(AdaptiveMaxPool1D, self).__init__()
         self.output_size = output_size
-        self.return_indices = return_indices
+        self.return_mask = return_mask
         self.name = name
 
     def forward(self, input):
-        return F.adaptive_max_pool1d(input, self.output_size,
-                                     self.return_indices, self.name)
+        return F.adaptive_max_pool1d(input, self.output_size, self.return_mask,
+                                     self.name)
 
 
 class AdaptiveMaxPool2D(layers.Layer):
@@ -932,7 +932,7 @@ class AdaptiveMaxPool2D(layers.Layer):
        Output(i ,j) &= max(Input[hstart:hend, wstart:wend])
     Parameters:
         output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain two element, (H, W). H and W can be either a int, or None which means the size will be the same as that of the input.
-        return_indices (bool): If true, the index of max pooling point will be returned along with outputs. It cannot be set in average pooling type. Default False.
+        return_mask (bool): If true, the index of max pooling point will be returned along with outputs. It cannot be set in average pooling type. Default False.
         name(str, optional): For detailed information, please refer
                              to :ref:`api_guide_Name`. Usually name is no need to set and
                              None by default.
@@ -965,21 +965,21 @@ class AdaptiveMaxPool2D(layers.Layer):
             paddle.disable_static()
             input_data = np.random.rand(2, 3, 32, 32)
             x = paddle.to_tensor(input_data)
-            adaptive_max_pool = paddle.nn.AdaptiveMaxPool2D(output_size=3, return_indices=True)
+            adaptive_max_pool = paddle.nn.AdaptiveMaxPool2D(output_size=3, return_mask=True)
             pool_out, indices = adaptive_max_pool(x = x)
     """
 
-    def __init__(self, output_size, return_indices=False, name=None):
+    def __init__(self, output_size, return_mask=False, name=None):
         super(AdaptiveMaxPool2D, self).__init__()
         self._output_size = output_size
-        self._return_indices = return_indices
+        self._return_mask = return_mask
         self._name = name
 
     def forward(self, x):
         return F.adaptive_max_pool2d(
             x,
             output_size=self._output_size,
-            return_indices=self._return_indices,
+            return_mask=self._return_mask,
             name=self._name)
 
 
@@ -1002,7 +1002,7 @@ class AdaptiveMaxPool3D(layers.Layer):
 
     Parameters:
         output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain three elements, (D, H, W). D, H and W can be either a int, or None which means the size will be the same as that of the input.
-        return_indices (bool): If true, the index of max pooling point will be returned along with outputs. Default False.
+        return_mask (bool): If true, the index of max pooling point will be returned along with outputs. Default False.
         name(str, optional): For detailed information, please refer
                              to :ref:`api_guide_Name`. Usually name is no need to set and
                              None by default.
@@ -1040,21 +1040,21 @@ class AdaptiveMaxPool3D(layers.Layer):
             pool = paddle.nn.AdaptiveMaxPool3D(output_size=4)
             out = pool(x)
             # out shape: [2, 3, 4, 4, 4]
-            pool = paddle.nn.AdaptiveMaxPool3D(output_size=3, return_indices=True)
+            pool = paddle.nn.AdaptiveMaxPool3D(output_size=3, return_mask=True)
             out, indices = pool(x)
             # out shape: [2, 3, 4, 4, 4], indices shape: [2, 3, 4, 4, 4]
 
     """
 
-    def __init__(self, output_size, return_indices=False, name=None):
+    def __init__(self, output_size, return_mask=False, name=None):
         super(AdaptiveMaxPool3D, self).__init__()
         self._output_size = output_size
-        self._return_indices = return_indices
+        self._return_mask = return_mask
         self._name = name
 
     def forward(self, x):
         return F.adaptive_max_pool3d(
             x,
             output_size=self._output_size,
-            return_indices=self._return_indices,
+            return_mask=self._return_mask,
             name=self._name)
-- 
GitLab