diff --git a/python/paddle/fluid/tests/unittests/test_bce_loss.py b/python/paddle/fluid/tests/unittests/test_bce_loss.py
index 738d30ad93804a11b34022de794a2a2d5f0c4fa4..21571e0981065a0a1e2a5db03e91b4df0ea55d9a 100644
--- a/python/paddle/fluid/tests/unittests/test_bce_loss.py
+++ b/python/paddle/fluid/tests/unittests/test_bce_loss.py
@@ -66,18 +66,20 @@ class TestBCELoss(unittest.TestCase):
             self.assertTrue(np.allclose(dy_result, expected))
 
     def test_BCELoss_weight(self):
-        input_np = np.random.random(size=(20, 30)).astype(np.float64)
-        label_np = np.random.random(size=(20, 30)).astype(np.float64)
-        weight_np = np.random.random(size=(20, 30)).astype(np.float64)
+        input_np = np.random.random(size=(2, 3, 4, 10)).astype(np.float64)
+        label_np = np.random.random(size=(2, 3, 4, 10)).astype(np.float64)
+        weight_np = np.random.random(size=(3, 4, 10)).astype(np.float64)
         prog = fluid.Program()
         startup_prog = fluid.Program()
         place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
         ) else fluid.CPUPlace()
         with fluid.program_guard(prog, startup_prog):
-            input = fluid.data(name='input', shape=[None, 30], dtype='float64')
-            label = fluid.data(name='label', shape=[None, 30], dtype='float64')
+            input = fluid.data(
+                name='input', shape=[None, 3, 4, 10], dtype='float64')
+            label = fluid.data(
+                name='label', shape=[None, 3, 4, 10], dtype='float64')
             weight = fluid.data(
-                name='weight', shape=[None, 30], dtype='float64')
+                name='weight', shape=[3, 4, 10], dtype='float64')
             bce_loss = paddle.nn.loss.BCELoss(weight=weight)
             res = bce_loss(input, label)
diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py
index 594777a43b3720d5c3db644b042779849ca46785..e9c1485bff59dc4e226bd14835a26d7256f13923 100644
--- a/python/paddle/nn/layer/loss.py
+++ b/python/paddle/nn/layer/loss.py
@@ -315,40 +315,56 @@ class L1Loss(fluid.dygraph.Layer):
 
 class BCELoss(fluid.dygraph.Layer):
     """
-    This op accepts input predictions and target label and returns binary
-    cross entropy error.
-    For predictions label, and target label, the loss is calculated as follows.
+    This interface is used to construct a callable object of the ``BCELoss`` class.
+    The BCELoss layer measures the binary_cross_entropy loss between input predictions
+    and target labels. The binary_cross_entropy loss can be described as:
+
     If :attr:`weight` is set, the loss is:
+
+
     .. math:: Out = -1 * weight * (label * log(input) + (1 - label) * log(1 - input))
     If :attr:`weight` is None, the loss is:
+
+
     .. math:: Out = -1 * (label * log(input) + (1 - label) * log(1 - input))
 
     If :attr:`reduction` set to ``'none'``, the unreduced loss is:
+
     .. math:: Out = Out
     If :attr:`reduction` set to ``'mean'``, the reduced mean loss is:
+
     .. math:: Out = MEAN(Out)
     If :attr:`reduction` set to ``'sum'``, the reduced sum loss is:
+
     .. math:: Out = SUM(Out)
+
+    Note that the input predictions should always be the output of sigmoid, and the target
+    labels should be numbers between 0 and 1.
+
+    The shape of input predictions and target labels is [N, *], where N is batch_size and `*`
+    means any number of additional dimensions. If ``reduction`` is ``'none'``, the shape of
+    the output is the same as the input; otherwise the output is a scalar.
+
     Parameters:
-        input (Variable): Input tensor, the data type is float32,
-            float64. Input must in (0, 1).
-        label (Variable): Label tensor, has the same shape with input,
-            the data type is float32, float64.
-        weight (Variable, optional): Weight tensor, a manual rescaling weight given
-            to each class. It has the same dimensions as class number and the data type
-            is float32, float64, int32, int64. Default is ``'None'``.
+        weight (Variable, optional): A manual rescaling weight applied to the loss of each
+            element. If given, it has to be a Variable whose shape matches the trailing
+            dimensions of input, and the data type is float32, float64. Default is ``'None'``.
         reduction (str, optional): Indicate how to average the loss by batch_size,
             the candicates are ``'none'`` | ``'mean'`` | ``'sum'``.
+            If :attr:`reduction` is ``'none'``, the unreduced loss is returned;
             If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
+            If :attr:`reduction` is ``'sum'``, the summed loss is returned.
             Default is ``'mean'``.
-    Returns:
-        The tensor variable storing the bce_loss of input and label.
-    Return type: Variable.
+
+    Returns:
+        A callable object of BCELoss.
+
     Examples:
         .. code-block:: python
+            # declarative mode
             import paddle.fluid as fluid
             import numpy as np
@@ -409,7 +425,7 @@ class BCELoss(fluid.dygraph.Layer):
         if self.weight is not None:
             if isinstance(self.weight, fluid.framework.Variable):
                 w = self.weight
-                out = fluid.layers.elementwise_mul(out, w, axis=0)
+                out = fluid.layers.elementwise_mul(out, w, axis=-1)
             else:
                 raise ValueError(
                     "The weight is not a Variable, please convert to Variable.")
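For reference, a minimal NumPy sketch (mine, not part of the PR or the test file) of the weighted computation the updated test exercises. With `axis=-1`, `elementwise_mul` aligns the weight's dimensions with the trailing dimensions of the per-element loss, so a weight of shape `(3, 4, 10)` rescales an input of shape `(2, 3, 4, 10)` across every non-batch dimension before the default `'mean'` reduction; the seed and variable names below are illustrative only.

```python
import numpy as np

np.random.seed(0)
input_np = np.random.random(size=(2, 3, 4, 10)).astype(np.float64)  # sigmoid-like outputs in [0, 1)
label_np = np.random.random(size=(2, 3, 4, 10)).astype(np.float64)  # targets between 0 and 1
weight_np = np.random.random(size=(3, 4, 10)).astype(np.float64)    # one weight per non-batch element

# Per-element binary cross entropy: -(label * log(p) + (1 - label) * log(1 - p))
bce = -1 * (label_np * np.log(input_np) + (1 - label_np) * np.log(1 - input_np))

# NumPy broadcasting aligns weight_np with the trailing dims of bce,
# mirroring elementwise_mul(out, w, axis=-1) in the new forward().
weighted = bce * weight_np

expected = weighted.mean()  # reduction='mean' collapses the loss to a scalar
print(expected)
```

Under the old `axis=0`, the weight's dimensions were aligned with the leading (batch) dimensions instead, which is why the previous test could only pass a weight with the same `(20, 30)` shape as the input; a `(3, 4, 10)` weight against a `(2, 3, 4, 10)` input would not have matched.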