Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into update-api-reference-1

b77c886e · qiaolongfei · 82a4cf19 · e6654c1c · b77c886e · b77c886e
12 changed file
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -443,7 +443,7 @@ class SwishOpMaker : public framework::OpProtoAndCheckerMaker {
    AddComment(R"DOC(
 Swish Activation Operator.
-$$out = \frac{x}{1 + e^{- \beta x}}$$
+$$out = \\frac{x}{1 + e^{- \\beta x}}$$
 )DOC");
  }

--- a/paddle/fluid/operators/clip_by_norm_op.cc
+++ b/paddle/fluid/operators/clip_by_norm_op.cc
@@ -54,10 +54,19 @@ be linearly scaled to make the L2 norm of $Out$ equal to $max\_norm$, as
 shown in the following formula:
 $$
-Out = \frac{max\_norm * X}{norm(X)},
+Out = \\frac{max\\_norm * X}{norm(X)},
 $$
 where $norm(X)$ represents the L2 norm of $X$.
+Examples:
+        .. code-block:: python
+            data = fluid.layers.data(
+                name='data', shape=[2, 4, 6], dtype='float32')
+            reshaped = fluid.layers.clip_by_norm(
+                x=data, max_norm=0.5)
 )DOC");
  }
 };

--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
@@ -204,8 +204,6 @@ void Pool2dOpMaker::Make() {
  // TODO(dzhwinter): need to registered layout transform function
  AddComment(R"DOC(
-Pool2d Operator.
 The pooling2d operation calculates the output based on
 the input, pooling_type and ksize, strides, paddings parameters.
 Input(X) and output(Out) are in NCHW format, where N is batch size, C is the
@@ -215,19 +213,28 @@ These two elements represent height and width, respectively.
 The input(X) size and output(Out) size may be different.
 Example:
  Input:
       X shape: $(N, C, H_{in}, W_{in})$
  Output:
       Out shape: $(N, C, H_{out}, W_{out})$
  For ceil_mode = false:
       $$
-       H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
+       H_{out} = \\frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1
-       W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1
+       $$
+       $$
+       W_{out} = \\frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1
       $$
  For ceil_mode = true:
       $$
-       H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0] + strides[0] - 1)}{strides[0]} + 1 \\
+       H_{out} = \\frac{(H_{in} - ksize[0] + 2 * paddings[0] + strides[0] - 1)}{strides[0]} + 1
-       W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1] + strides[1] - 1)}{strides[1]} + 1
+       $$
+       $$
+       W_{out} = \\frac{(W_{in} - ksize[1] + 2 * paddings[1] + strides[1] - 1)}{strides[1]} + 1
       $$
 )DOC");

--- a/paddle/fluid/operators/uniform_random_batch_size_like_op.cc
+++ b/paddle/fluid/operators/uniform_random_batch_size_like_op.cc
@@ -35,10 +35,10 @@ class UniformRandomBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker {
 protected:
  void Apply() override {
    AddComment(R"DOC(
-Uniform random operator
+UniformRandomBatchSizeLike operator.
 This operator initializes a tensor with the same batch_size as the Input tensor
- with random values sampled from a uniform distribution.
+with random values sampled from a uniform distribution.
 )DOC");
    AddAttr<float>("min",

--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -1034,6 +1034,37 @@ class Block(object):
 class Program(object):
+    """
+    Python Program. Beneath it is a ProgramDesc, which is used to
+    create the C++ Program. A program is a self-contained, programming-
+    language-like container. It has at least one Block; when a
+    control flow op such as conditional_block or while_op is included,
+    it will contain nested blocks.
+    Please refer to framework.proto for details.
+    Notes: we have default_startup_program and default_main_program
+    by default, and the pair shares the parameters.
+    The default_startup_program runs only once to initialize parameters;
+    default_main_program runs in every minibatch and adjusts the weights.
+    Args:
+        None
+    Returns:
+        Python Program
+    Examples:
+       .. code-block:: python
+         main_program = Program()
+         startup_program = Program()
+         with fluid.program_guard(main_program=main_program, startup_program=startup_program):
+            x = fluid.layers.data(name="x", shape=[-1, 784], dtype='float32')
+            y = fluid.layers.data(name="y", shape=[-1, 1], dtype='int32')
+            z = fluid.layers.fc(name="fc", input=x, size=10, act="relu")
+    """
    def __init__(self):
        self.desc = core.ProgramDesc()
        self.blocks = [Block(self, 0)]
@@ -1099,6 +1130,8 @@ class Program(object):
    def clone(self, for_test=False):
        """Clone the Program object
+        Args:
+           for_test(bool): indicate whether clone for test.
        Set for_test to False when we want to clone the program for training.
        Set for_test to True when we want to clone the program for testing.
@@ -1109,8 +1142,9 @@ class Program(object):
                the is_test attributes in these operators will be set to True for
                testing purposes, otherwise, they remain unchanged.
-        Returns(Program):
+        Returns:
-            The cloned Program object.
+            Program: The cloned Program object.
        """
        if for_test:
            p = self.inference_optimize()
@@ -1228,6 +1262,7 @@ class Program(object):
    def copy_param_info_from(self, other):
        """
        Copy the information of parameters from other program.
        Args:
            other(Program): Other program
@@ -1246,6 +1281,7 @@ class Program(object):
    def copy_data_info_from(self, other):
        """
        Copy the information of data variables from other program.
        Args:
            other(Program): Other program
@@ -1299,6 +1335,7 @@ class Parameter(Variable):
    def to_string(self, throw_on_error, with_details=False):
        """
        To debug string.
        Args:
            throw_on_error(bool): raise exception when self is not initialized
                when throw_on_error is True

--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -822,17 +822,25 @@ def max_sequence_len(rank_table):
 def lod_tensor_to_array(x, table):
-    """ Convert a LOD_TENSOR to an LOD_TENSOR_ARRAY.
+    """ 
+    Convert a LoDTensor to a LoDTensorArray.
+    This function splits a LoDTensor into a LoDTensorArray according to its LoD
+    information. LoDTensorArray is an alias of C++ std::vector<LoDTensor> in 
+    PaddlePaddle. The generated LoDTensorArray of this function can be further read 
+    or written by `read_from_array()` and `write_to_array()` operators. However, 
+    this function is generally an internal component of PaddlePaddle `DynamicRNN`. 
+    Users should not use it directly.
    Args:
-        x (Variable|list): The LOD tensor to be converted to a LOD tensor array.
+        x (Variable|list): The LoDTensor to be converted to a LoDTensorArray.
        table (ParamAttr|list): The variable that stores the level of lod
                                which is ordered by sequence length in
-                                descending order.
+                                descending order. It is generally generated 
+                                by `layers.lod_rank_table()` API.
    Returns:
-        Variable: The variable of type array that has been converted from a
+        Variable: The LoDTensorArray that has been converted from the input tensor.
-                  tensor.
    Examples:
        .. code-block:: python
@@ -897,8 +905,7 @@ def increment(x, value=1.0, in_place=True):
        in_place (bool): If the increment should be performed in-place.
    Returns:
-        Variable: The tensor variable storing the transformation of
+        Variable: The elementwise-incremented object.
-                  element-wise increment of each value in the input.
    Examples:
        .. code-block:: python
@@ -940,7 +947,7 @@ def array_write(x, i, array=None):
        Variable: The output LOD_TENSOR_ARRAY where the input tensor is written.
    Examples:
-        .. code-block::python
+        .. code-block:: python
          tmp = fluid.layers.zeros(shape=[10], dtype='int32')
          i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)
@@ -1054,14 +1061,31 @@ def equal(x, y, cond=None, **ignored):
 def array_read(array, i):
-    """This function performs the operation to read the data in as an
+    """
+    This function performs the operation to read the data in as an
    LOD_TENSOR_ARRAY.
+    .. code-block:: text
+        Given:
+        array = [0.6, 0.1, 0.3, 0.1]
+        And:
+        i = 2
+        Then:
+        output = 0.3
    Args:
-        array (Variable|list): The input tensor that will be written to an array.
+        array (Variable|list): The input tensor that stores the data to be read.
-        i (Variable|list): The subscript index in tensor array, that points the
+        i (Variable|list): The index of the data to be read from input array.
-                           place where data will be written to.
    Returns:
        Variable: The tensor type variable that has the data written to it.
    Examples:
        .. code-block:: python
@@ -1154,6 +1178,13 @@ def array_length(array):
 class ConditionalBlockGuard(BlockGuard):
+    """
+    ConditionalBlockGuard is derived from BlockGuard. It is dedicated to
+    holding a ConditionalBlock, and helps users enter and exit the
+    ConditionalBlock via Python's 'with' keyword. However, ConditionalBlockGuard
+    is generally an internal component of IfElse; users should not use it directly.
+    """
    def __init__(self, block):
        if not isinstance(block, ConditionalBlock):
            raise TypeError("block should be conditional block")
@@ -1875,26 +1906,26 @@ def reorder_lod_tensor_by_rank(x, rank_table):
 def is_empty(x, cond=None, **ignored):
    """
-    **Is Empty**
+    Test whether a Variable is empty.
-    This layer returns the truth value of whether the variable is empty.
    Args:
-        x(Variable): Operand of *is_empty*
+        x (Variable): The Variable to be tested.
-        cond(Variable|None): Optional output variable to store the result
+        cond (Variable|None): Output parameter. Returns the test result 
-                             of *is_empty*
+                              of given 'x'. Default: None
    Returns:
-        Variable: The tensor variable storing the output of *is_empty*.
+        Variable: A bool scalar. True if 'x' is an empty Variable.
    Raises:
        TypeError: If input cond is not a variable, or cond's dtype is
-                   not bool
+                   not bool.
    Examples:
        .. code-block:: python
-          less = fluid.layers.is_empty(x=input)
+          res = fluid.layers.is_empty(x=input)
+          # or:
+          fluid.layers.is_empty(x=input, cond=res)
    """
    helper = LayerHelper("is_empty", **locals())
    if cond is None:

--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -544,6 +544,41 @@ def shuffle(reader, buffer_size):
 def batch(reader, batch_size):
+    """
+    This layer is a reader decorator. It takes a reader and adds
+    'batching' decoration on it. When reading with the resulting
+    decorated reader, output data will be automatically organized
+    into batches.
+    Args:
+        reader(Variable): The reader to be decorated with 'batching'.
+        batch_size(int): The batch size.
+    Returns:
+        Variable: The reader which has been decorated with 'batching'.
+    Examples:
+        .. code-block:: python
+            raw_reader = fluid.layers.io.open_files(filenames=['./data1.recordio',
+                                                           './data2.recordio'],
+                                                    shapes=[(3,224,224), (1)],
+                                                    lod_levels=[0, 0],
+                                                    dtypes=['float32', 'int64'],
+                                                    thread_num=2,
+                                                    buffer_size=2)
+            batch_reader = fluid.layers.batch(reader=raw_reader, batch_size=5)
+            # If we read data with the raw_reader:
+            #     data = fluid.layers.read_file(raw_reader)
+            # We can only get data instance by instance.
+            # 
+            # However, if we read data with the batch_reader:
+            #     data = fluid.layers.read_file(batch_reader)
+            # Each 5 adjacent instances will be automatically combined together 
+            # to become a batch. So what we get('data') is a batch data instead 
+            # of an instance.
+    """
    return __create_unshared_decorated_reader__(
        'create_batch_reader', reader, {'batch_size': int(batch_size)})
@@ -589,15 +624,41 @@ def parallel(reader):
                                              {})
-def read_file(file_obj):
+def read_file(reader):
+    """
+    Execute the given reader and get data via it.
+    A reader is also a Variable. It can be a raw reader generated by 
+    `fluid.layers.open_files()` or a decorated one generated by 
+    `fluid.layers.double_buffer()` and so on.
+    Args:
+        reader(Variable): The reader to execute.
+    Returns:
+        Tuple[Variable]: Data read via the given reader.
+    Examples:
+        .. code-block:: python
+            data_file = fluid.layers.open_files(
+                filenames=['mnist.recordio'],
+                shapes=[(-1, 784), (-1, 1)],
+                lod_levels=[0, 0],
+                dtypes=["float32", "int64"])
+            data_file = fluid.layers.double_buffer(
+                fluid.layers.batch(data_file, batch_size=64))
+            input, label = fluid.layers.read_file(data_file)
+    """
    helper = LayerHelper('read_file')
    out = [
        helper.create_tmp_variable(
            stop_gradient=True, dtype='float32')
-        for _ in range(len(file_obj.desc.shapes()))
+        for _ in range(len(reader.desc.shapes()))
    ]
    helper.append_op(
-        type='read', inputs={'Reader': [file_obj]}, outputs={'Out': out})
+        type='read', inputs={'Reader': [reader]}, outputs={'Out': out})
    if len(out) == 1:
        return out[0]
    else:

--- a/python/paddle/fluid/layers/learning_rate_scheduler.py
+++ b/python/paddle/fluid/layers/learning_rate_scheduler.py
@@ -71,21 +71,40 @@ def noam_decay(d_model, warmup_steps):
 def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False):
-    """Applies exponential decay to the learning rate.
+    """
+    Applies exponential decay to the learning rate. 
+    When training a model, it is often recommended to lower the learning rate as the 
+    training progresses. By using this function, the learning rate will be decayed by 
+    'decay_rate' every 'decay_steps' steps.
+    >>> if staircase == True:
+    >>>     decayed_learning_rate = learning_rate * decay_rate ^ floor(global_step / decay_steps)
+    >>> else:
+    >>>     decayed_learning_rate = learning_rate * decay_rate ^ (global_step / decay_steps)
-    ```python
-    decayed_learning_rate = learning_rate *
-            decay_rate ^ (global_step / decay_steps)
-    ```
    Args:
-        learning_rate: A scalar float32 value or a Variable. This
+        learning_rate(Variable|float): The initial learning rate.
-          will be the initial learning rate during training
+        decay_steps(int): See the decay computation above.
-        decay_steps: A Python `int32` number.
+        decay_rate(float): The decay rate. See the decay computation above.
-        decay_rate: A Python `float` number.
+        staircase(Boolean): If True, decay the learning rate at discrete intervals.
-        staircase: Boolean. If set true, decay the learning rate every decay_steps.
+                            Default: False
    Returns:
-        The decayed learning rate
+        Variable: The decayed learning rate
+    Examples:
+        .. code-block:: python
+          base_lr = 0.1
+          sgd_optimizer = fluid.optimizer.SGD(
+                learning_rate=fluid.layers.exponential_decay(
+                    learning_rate=base_lr,
+                    decay_steps=10000,
+                    decay_rate=0.5,
+                    staircase=True))
+          sgd_optimizer.minimize(avg_cost)
    """
    global_step = _decay_step_counter()
@@ -129,22 +148,39 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False):
 def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False):
-    """Applies inverse time decay to the initial learning rate.
+    """
+    Applies inverse time decay to the initial learning rate.
-    >>> if staircase:
+    When training a model, it is often recommended to lower the learning rate as the 
+    training progresses. By using this function, an inverse decay function will be 
+    applied to the initial learning rate.
+    >>> if staircase == True:
    >>>     decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_step))
    >>> else:
    >>>     decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_step)
    Args:
-        learning_rate: A scalar float32 value or a Variable. This
+        learning_rate(Variable|float): The initial learning rate.
-          will be the initial learning rate during training.
+        decay_steps(int): See the decay computation above.
-        decay_steps: A Python `int32` number.
+        decay_rate(float): The decay rate. See the decay computation above.
-        decay_rate: A Python `float` number.
+        staircase(Boolean): If True, decay the learning rate at discrete intervals.
-        staircase: Boolean. If set true, decay the learning rate every decay_steps.
+                            Default: False
    Returns:
-        The decayed learning rate
+        Variable: The decayed learning rate
+    Examples:
+        .. code-block:: python
+          base_lr = 0.1
+          sgd_optimizer = fluid.optimizer.SGD(
+                learning_rate=fluid.layers.inverse_time_decay(
+                    learning_rate=base_lr,
+                    decay_steps=10000,
+                    decay_rate=0.5,
+                    staircase=True))
+          sgd_optimizer.minimize(avg_cost)
    """
    global_step = _decay_step_counter()

--- a/python/paddle/fluid/layers/metric.py
+++ b/python/paddle/fluid/layers/metric.py
@@ -27,8 +27,32 @@ __all__ = ['accuracy', 'auc']
 def accuracy(input, label, k=1, correct=None, total=None):
    """
+    accuracy layer.
+    Refer to the https://en.wikipedia.org/wiki/Precision_and_recall
    This function computes the accuracy using the input and label.
-    The output is the top k inputs and their indices.
+    If the correct label occurs in top k predictions, then correct will increment by one.
+    Note: the dtype of accuracy is determined by input. the input and label dtype can be different.
+    Args:
+        input(Variable): The input of accuracy layer, which is the predictions of network.
+          Carrying LoD information is supported.
+        label(Variable): The label of dataset.
+        k(int): The top k predictions for each class will be checked.
+        correct(Variable): The correct predictions count.
+        total(Variable): The total entries count.
+    Returns:
+        Variable: The correct rate.
+    Examples:
+        .. code-block:: python
+           data = fluid.layers.data(name="data", shape=[-1, 32, 32], dtype="float32")
+           label = fluid.layers.data(name="label", shape=[-1,1], dtype="int32")
+           predict = fluid.layers.fc(input=data, size=10)
+           acc = fluid.layers.accuracy(input=predict, label=label, k=5)
    """
    helper = LayerHelper("accuracy", **locals())
    topk_out, topk_indices = nn.topk(input, k=k)

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -91,6 +91,8 @@ __all__ = [
    'gather',
    'random_crop',
    'mean_iou',
+    'relu',
+    'log',
 ]
@@ -106,14 +108,15 @@ def fc(input,
    """
    **Fully Connected Layer**
-    The fully connected layer can take multiple tensors as its inputs. It
+    This function creates a fully connected layer in the network. It can take 
-    creates a variable called weights for each input tensor, which represents
+    multiple tensors as its inputs. It creates a variable called weights for 
-    a fully connected weight matrix from each input unit to each output unit.
+    each input tensor, which represents a fully connected weight matrix from 
-    The fully connected layer multiplies each input tensor with its coresponding
+    each input unit to each output unit. The fully connected layer multiplies 
-    weight to produce an output Tensor. If multiple input tensors are given,
+    each input tensor with its corresponding weight to produce an output Tensor.
-    the results of multiple multiplications will be sumed up. If bias_attr is
+    If multiple input tensors are given, the results of multiple multiplications 
-    not None, a bias variable will be created and added to the output. Finally,
+    will be summed up. If bias_attr is not None, a bias variable will be created
-    if activation is not None, it will be applied to the output as well.
+    and added to the output. Finally, if activation is not None, it will be applied 
+    to the output as well.
    This process can be formulated as follows:
@@ -154,7 +157,7 @@ def fc(input,
        name (str, default None): The name of this layer.
    Returns:
-        A tensor variable storing the transformation result.
+        Variable: The transformation result.
    Raises:
        ValueError: If rank of the input tensor is less than 2.
@@ -162,8 +165,7 @@ def fc(input,
    Examples:
        .. code-block:: python
-          data = fluid.layers.data(
+          data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
-              name="data", shape=[32, 32], dtype="float32")
          fc = fluid.layers.fc(input=data, size=1000, act="tanh")
    """
@@ -845,11 +847,14 @@ def linear_chain_crf(input, label, param_attr=None):
    Args:
        input(${emission_type}): ${emission_comment}
+        input(${transition_type}): ${transition_comment}
        label(${label_type}): ${label_comment}
        param_attr(ParamAttr): The attribute of the learnable parameter.
    Returns:
-        ${log_likelihood_comment}
+        output(${emission_exps_type}): ${emission_exps_comment} \n
+        output(${transition_exps_type}): ${transition_exps_comment} \n
+        output(${log_likelihood_type}): ${log_likelihood_comment}
    """
    helper = LayerHelper('linear_chain_crf', **locals())
@@ -911,7 +916,7 @@ def cos_sim(X, Y):
    Args:
        X (Variable): The input X.
        Y (Variable): The input Y.
    Returns:
        Variable: the output of cosine(X, Y).
    """
@@ -1117,7 +1122,7 @@ def chunk_eval(input,
        chunk_scheme (str): ${chunk_scheme_comment}
        num_chunk_types (int): ${num_chunk_types_comment}
        excluded_chunk_types (list): ${excluded_chunk_types_comment}
    Returns:
        tuple: tuple containing: (precision, recall, f1_score,
               num_infer_chunks, num_label_chunks,
@@ -1177,15 +1182,11 @@ def sequence_conv(input,
        bias_attr (ParamAttr|None): attributes for bias
        param_attr (ParamAttr|None): attributes for parameter
        act (str): the activation type
    Returns:
        Variable: output of sequence_conv
    """
-    # FIXME(dzh) : want to unify the argument of python layer
-    # function. So we ignore some unecessary attributes.
-    # such as, padding_trainable, context_start.
    helper = LayerHelper('sequence_conv', **locals())
    dtype = helper.input_dtype()
    filter_shape = [filter_size * input.shape[1], num_filters]
@@ -1740,6 +1741,7 @@ def sequence_last_step(input):
    return sequence_pool(input=input, pool_type="last")
+@templatedoc()
 def pool2d(input,
           pool_size=-1,
           pool_type="max",
@@ -1751,24 +1753,45 @@ def pool2d(input,
           use_mkldnn=False,
           name=None):
    """
-    This function adds the operator for pooling in 2 dimensions, using the
+    ${comment}
-    pooling configurations mentioned in input parameters.
    Args:
-        input (Variable): ${input_comment}
+        input (Variable): The input tensor of pooling operator. The format of 
-        pool_size (int): ${ksize_comment}
+                          input tensor is NCHW, where N is batch size, C is 
-        pool_type (str): ${pooling_type_comment}
+                          the number of channels, H is the height of the 
+                          feature, and W is the width of the feature.
+        pool_size (int): The side length of pooling windows. All pooling 
+                         windows are squares with pool_size on a side.
+        pool_type: ${pooling_type_comment}
        pool_stride (int): stride of the pooling layer.
        pool_padding (int): padding size.
-        global_pooling (bool): ${global_pooling_comment}
+        global_pooling: ${global_pooling_comment}
-        use_cudnn (bool): ${use_cudnn_comment}
+        use_cudnn: ${use_cudnn_comment}
-        ceil_mode (bool): ${ceil_mode_comment}
+        ceil_mode: ${ceil_mode_comment}
-        use_mkldnn (bool): ${use_mkldnn_comment}
+        use_mkldnn: ${use_mkldnn_comment}
-        name (str): A name for this layer(optional). If set None, the layer
+        name (str|None): A name for this layer(optional). If set None, the 
-            will be named automatically.
+                        layer will be named automatically.
    Returns:
-        Variable: output of pool2d layer.
+        Variable: The pooling result.
+    Raises:
+        ValueError: If 'pool_type' is neither "max" nor "avg"
+        ValueError: If 'global_pooling' is False and 'pool_size' is -1
+        ValueError: If 'use_cudnn' is not a bool value.
+    Examples:
+        .. code-block:: python
+          data = fluid.layers.data(
+              name='data', shape=[3, 32, 32], dtype='float32')
+          pool2d = fluid.layers.pool2d(
+                            input=data, 
+                            pool_size=2, 
+                            pool_type='max', 
+                            pool_stride=1, 
+                            global_pooling=False)
    """
    if pool_type not in ["max", "avg"]:
        raise ValueError(
@@ -2127,15 +2150,37 @@ def layer_norm(input,
 def beam_search_decode(ids, scores, name=None):
    """
-    ${beam_search_decode}
+    Beam Search Decode
+    This layer packs the output of the beam search layer into sentences and
+    associated scores. It is usually called after the beam search layer.
+    Typically, the output of beam search layer is a tensor of selected ids, with
+    a tensor of the score of each id. Beam search layer's output ids, however, 
+    are generated directly during the tree search, and they are stacked by each 
+    level of the search tree. Thus we need to reorganize them into sentences, 
+    based on the score of each id. This layer takes the output of beam search
+    layer as input and repack them into sentences.
    Args:
-        ids (Variable): ${ids_comment}
+        ids (Variable): The selected ids, output of beam search layer. 
-        scores (Variable): ${scores_comment}
+        scores (Variable): The associated scores of the ids, output of beam
+            search layer.
        name (str): The name of this layer. It is optional.
    Returns:
-        tuple: a tuple of two output variable: sentence_ids, sentence_scores
+        tuple(Variable): a tuple of two output tensors: sentence_ids, sentence_scores.
+        sentence_ids is a tensor with shape [size, length], where size is the
+        beam size of beam search, and length is the length of each sentence. 
+        Note that the length of sentences may vary.
+        sentence_scores is a tensor with the same shape as sentence_ids.
+    Examples:
+        .. code-block:: python
+            ids, scores = fluid.layers.beam_search(
+                pre_ids, ids, scores, beam_size, end_id)
+            sentence_ids, sentence_scores = fluid.layers.beam_search_decode(
+                ids, scores)
    """
    helper = LayerHelper('beam_search_decode', **locals())
    sentence_ids = helper.create_tmp_variable(dtype=ids.dtype)
@@ -2567,7 +2612,7 @@ def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0):
        beam_size (int): ${beam_size_comment}
        end_id (int): ${end_id_comment}
        level (int): ${level_comment}
    Returns:
        tuple: a tuple of beam_search output variables: selected_ids, selected_scores
    '''
@@ -3016,7 +3061,7 @@ def split(input, num_or_sections, dim=-1, name=None):
                       will be named automatically.
    Returns:
-        List: The list of segmented tensor variables.
+        list(Variable): The list of segmented tensor variables.
    Examples:
        .. code-block:: python
@@ -3225,25 +3270,51 @@ def topk(input, k, name=None):
    This operator is used to find values and indices of the k largest entries
    for the last dimension.
-    If the input is a vector (rank=1), finds the k largest entries in the vector
+    If the input is a vector (1-D Tensor), finds the k largest entries in the vector
    and outputs their values and indices as vectors. Thus values[j] is the j-th
    largest entry in input, and its index is indices[j].
    If the input is a Tensor with higher rank, this operator computes the top k
    entries along the last dimension.
+    For example:
+    .. code-block:: text
+        If:
+            input = [[5, 4, 2, 3],
+                     [9, 7, 10, 25],
+                     [6, 2, 10, 1]]
+            k = 2
+        Then:
+            The first output:
+            values = [[5, 4],
+                      [10, 25],
+                      [6, 10]]
+            The second output:
+            indices = [[0, 1],
+                       [2, 3],
+                       [0, 2]]
    Args:
        input(Variable): The input variable which can be a vector or Tensor with
            higher rank.
-        k(int): An integer value to specify the top k largest elements.
+        k(int):  The number of top elements to look for along the last dimension 
+                 of input.
        name(str|None): A name for this layer(optional). If set None, the layer
-                       will be named automatically.
+                       will be named automatically. 
+                       Default: None
    Returns:
-        values(Variable): The k largest elements along each last dimensional
+        Tuple[Variable]: A tuple with two elements. Each element is a Variable. 
-            slice.
+        The first one is k largest elements along each last 
-        indices(Variable): The indices of values within the last dimension of
+        dimensional slice. The second one is indices of values 
-            input.
+        within the last dimension of input.
+    Raises:
+        ValueError: If k < 1 or k is not less than the last dimension of input
    Examples:
        .. code-block:: python
@@ -3251,7 +3322,7 @@ def topk(input, k, name=None):
            top5_values, top5_indices = layers.topk(input, k=5)
    """
    shape = input.shape
-    if k < 1 and k >= shape[-1]:
+    if k < 1 or k >= shape[-1]:
        raise ValueError("k must be greater than 0 and less than %d." %
                         (shape[-1]))
@@ -3269,8 +3340,7 @@ def topk(input, k, name=None):
    return values, indices
-def edit_distance(input, label, normalized=True, ignored_tokens=None,
+def edit_distance(input, label, normalized=True, ignored_tokens=None):
-                  name=None):
    """
    EditDistance operator computes the edit distances between a batch of
    hypothesis strings and their references. Edit distance, also called
@@ -3284,21 +3354,21 @@ def edit_distance(input, label, normalized=True, ignored_tokens=None,
    "kitten" -> "sitten" -> "sittin" -> "sitting"
-    Input(Hyps) is a LoDTensor consisting of all the hypothesis strings with
+    The input is a LoDTensor consisting of all the hypothesis strings with
    the total number denoted by `batch_size`, and the separation is specified
    by the LoD information. And the `batch_size` reference strings are arranged
-    in order in the same way in the LoDTensor Input(Refs).
+    in order in the same way in the input LoDTensor.
-    Output(Out) contains the `batch_size` results and each stands for the edit
+    The output contains the `batch_size` results and each stands for the edit
    distance for a pair of strings respectively. If Attr(normalized) is true,
    the edit distance will be divided by the length of reference string.
    Args:
        input(Variable): The indices for hypothesis strings.
        label(Variable): The indices for reference strings.
-        normalized(bool): Indicated whether to normalize the edit distance by
+        normalized(bool, default True): Indicates whether to normalize the edit distance by
                          the length of reference string.
-        ignored_tokens(list of int): Tokens that should be removed before
+        ignored_tokens(list<int>, default None): Tokens that should be removed before
                                     calculating edit distance.
        name (str): The name of this layer. It is optional.
@@ -3310,7 +3380,6 @@ def edit_distance(input, label, normalized=True, ignored_tokens=None,
            x = fluid.layers.data(name='x', shape=[8], dtype='float32')
            y = fluid.layers.data(name='y', shape=[7], dtype='float32')
            cost = fluid.layers.edit_distance(input=x,label=y)
    """
    helper = LayerHelper("edit_distance", **locals())
@@ -3430,35 +3499,33 @@ def warpctc(input, label, blank=0, norm_by_times=False):
    input tensor.
    Args:
-        input(Variable): (LodTensor, default: LoDTensor<float>),
+       input (Variable): The unscaled probabilities of variable-length sequences,
-            the unscaled probabilities of variable-length sequences,
+         which is a 2-D Tensor with LoD information.
-            which is a 2-D Tensor with LoD information.
+         Its shape is [Lp, num_classes + 1], where Lp is the sum of all input
-            It's shape is [Lp, num_classes + 1], where Lp is the sum of all input
+         sequences' length and num_classes is the true number of classes.
-            sequences' length and num_classes is the true number of classes.
+         (not including the blank label).
-            (not including the blank label).
+       label (Variable): The ground truth of variable-length sequence, 
-        label(Variable): (LodTensor, default: LoDTensor<int>), the ground truth
+         which is a 2-D Tensor with LoD information. It is of the shape [Lg, 1],
-            of variable-length sequence, which is a 2-D Tensor with LoD
+         where Lg is the sum of all labels' length.
-            information. It is of the shape [Lg, 1], where Lg is th sum of
+       blank (int, default 0): The blank label index of Connectionist
-            all labels' length.
+         Temporal Classification (CTC) loss, which is in the
-        blank (int): default 0, the blank label index of Connectionist
+         half-opened interval [0, num_classes + 1).
-            Temporal Classification (CTC) loss, which is in the
+       norm_by_times(bool, default false): Whether to normalize the gradients 
-            half-opened interval [0, num_classes + 1).
+         by the number of time-step, which is also the sequence's length. 
-        norm_by_times (bool): default false, whether to normalize
+         There is no need to normalize the gradients if warpctc layer was 
-            the gradients by the number of time-step, which is also the
+         followed by a mean_op.
-            sequence's length. There is no need to normalize the gradients
-            if warpctc layer was follewed by a mean_op.
    Returns:
        Variable: The Connectionist Temporal Classification (CTC) loss,
        which is a 2-D Tensor of the shape [batch_size, 1].
    Examples:
        .. code-block:: python
-            y = layers.data(
-                name='y', shape=[11, 8], dtype='float32', lod_level=1)
+            label = fluid.layers.data(name='label', shape=[11, 8], dtype='float32', lod_level=1)
-            y_predict = layers.data(
+            predict = fluid.layers.data(name='predict', shape=[11, 1], dtype='float32')
-                name='y_predict', shape=[11, 1], dtype='float32')
+            cost = fluid.layers.warpctc(input=predict, label=label)
-            cost = layers.warpctc(input=y_predict, label=y)
    """
    helper = LayerHelper('warpctc', **locals())
@@ -3487,17 +3554,21 @@ def sequence_reshape(input, new_dim):
    .. code-block:: text
        x is a LoDTensor:
-            x.lod  = [[2, 4]]
+            x.lod  = [[0, 2, 6]]
-            x.data = [[1, 2], [3, 4],
+            x.data = [[1,  2], [3,  4],
-                      [5, 6], [7, 8], [9, 10], [11, 12]]
+                      [5,  6], [7,  8],
+                      [9, 10], [11, 12]]
            x.dims = [6, 2]
        set new_dim = 4
        then out is a LoDTensor:
-            out.lod  = [[1, 2]]
-            out.data = [[1, 2, 3, 4],
+            out.lod  = [[0, 1, 3]]
-                        [5, 6, 7, 8], [9, 10, 11, 12]]
+            out.data = [[1,  2,  3,  4],
+                        [5,  6,  7,  8],
+                        [9, 10, 11, 12]]
            out.dims = [3, 4]
    Currently, only 1-level LoDTensor is supported and please make sure
@@ -3505,19 +3576,19 @@ def sequence_reshape(input, new_dim):
    no remainder for each sequence.
    Args:
-        input (Variable): (LodTensor, default: LoDTensor<float>), a 2-D LoDTensor
-            with shape being [N, M] where M for dimension.
+       input (Variable): A 2-D LoDTensor with shape [N, M], where M is the dimension.
-        new_dim (int): New dimension which the input LoDTensor is reshaped to.
+       new_dim (int): New dimension that the input LoDTensor is reshaped to.
    Returns:
        Variable: Reshaped LoDTensor according to new dimension.
    Examples:
        .. code-block:: python
-            x = fluid.layers.data(name='x', shape=[5, 20],
+            x = fluid.layers.data(name='x', shape=[5, 20], dtype='float32', lod_level=1)
-                              dtype='float32', lod_level=1)
+            x_reshaped = fluid.layers.sequence_reshape(input=x, new_dim=10)
-            x_reshaped = layers.sequence_reshape(input=x, new_dim=10)
    """
    helper = LayerHelper('sequence_reshape', **locals())
    out = helper.create_tmp_variable(helper.input_dtype())
@@ -3553,7 +3624,7 @@ def nce(input,
        param_attr (ParamAttr|None): attributes for parameter
        bias_attr (ParamAttr|None): attributes for bias
        num_neg_samples (int): ${num_neg_samples_comment}
    Returns:
        Variable: The output nce loss.
@@ -3723,8 +3794,6 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None):
    Examples:
-    As an example:
        .. code-block:: text
            Given:
@@ -3768,7 +3837,7 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None):
            output.lod = [[4, 4]]
-        The simple usage is:
+     Examples:
        .. code-block:: python
@@ -4253,9 +4322,7 @@ def lrn(input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None):
    .. math::
-        Output(i, x, y) = Input(i, x, y) / \left(
+      Output(i, x, y) = Input(i, x, y) / \\left(k + \\alpha \\sum\\limits^{\\min(C, c + n/2)}_{j = \\max(0, c - n/2)}(Input(j, x, y))^2\\right)^{\\beta}
-        k + \alpha \sum\limits^{\min(C, c + n/2)}_{j = \max(0, c - n/2)}
-        (Input(j, x, y))^2 \right)^{\beta}
    In the above equation:
@@ -4769,6 +4836,62 @@ def random_crop(x, shape, seed=None):
    return out
+def log(x):
+    """
+    Calculates the natural log of the given input tensor, element-wise.
+    .. math::
+        Out = \\ln(x)
+    Args:
+        x (Variable): Input tensor.
+    Returns:
+        Variable: The natural log of the input tensor computed element-wise.
+    Examples:
+        .. code-block:: python
+            x = fluid.layers.data(name="x", shape=[3, 4], dtype="float32")
+            output = fluid.layers.log(x)
+    """
+    helper = LayerHelper('log', **locals())
+    # The tensor argument of this layer is named `x`, so name it explicitly.
+    # NOTE(review): input_dtype() is assumed to look up the 'input' keyword
+    # by default -- confirm against LayerHelper.
+    dtype = helper.input_dtype(input_param_name='x')
+    out = helper.create_tmp_variable(dtype)
+    # Feed the parameter `x`; a bare `input` here would refer to the Python
+    # builtin function rather than the tensor.
+    helper.append_op(type="log", inputs={"X": x}, outputs={"Out": out})
+    return out
+def relu(x):
+    """
+    Relu takes one input data (Tensor) and produces one output data (Tensor)
+    where the rectified linear function, y = max(0, x), is applied to
+    the tensor elementwise.
+    .. math::
+        Out = \\max(0, x)
+    Args:
+        x (Variable): The input tensor.
+    Returns:
+        Variable: The output tensor with the same shape as input.
+    Examples:
+        .. code-block:: python
+            x = fluid.layers.data(name="x", shape=[3, 4], dtype="float32")
+            output = fluid.layers.relu(x)
+    """
+    helper = LayerHelper('relu', **locals())
+    # The tensor argument of this layer is named `x`, so name it explicitly.
+    # NOTE(review): input_dtype() is assumed to look up the 'input' keyword
+    # by default -- confirm against LayerHelper.
+    dtype = helper.input_dtype(input_param_name='x')
+    out = helper.create_tmp_variable(dtype)
+    # Feed the parameter `x`; a bare `input` here would refer to the Python
+    # builtin function rather than the tensor.
+    helper.append_op(type="relu", inputs={"X": x}, outputs={"Out": out})
+    return out
 def mean_iou(input, label, num_classes):
    """
    Mean Intersection-Over-Union is a common evaluation metric for
@@ -4795,11 +4918,10 @@ def mean_iou(input, label, num_classes):
        out_wrong(Variable): A Tensor with shape [num_classes]. The wrong numbers of each class.
        out_correct(Variable): A Tensor with shape [num_classes]. The correct numbers of each class. 
    Examples:
        .. code-block:: python
            iou, wrongs, corrects = fluid.layers.mean_iou(predict, label, num_classes)
    """
    helper = LayerHelper('mean_iou', **locals())

--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -17,7 +17,6 @@ __activations__ = [
    'sigmoid',
    'logsigmoid',
    'exp',
-    'relu',
    'tanh',
    'tanh_shrink',
    'softshrink',
@@ -29,7 +28,6 @@ __activations__ = [
    'sin',
    'round',
    'reciprocal',
-    'log',
    'square',
    'softplus',
    'softsign',

--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -108,16 +108,29 @@ def create_global_var(shape,
                      force_cpu=False,
                      name=None):
    """
-    Create a global variable. such as global_step
+    Create a new variable in the global block(block 0).
    Args:
        shape(list[int]): shape of the variable
-        value(float): the value of the variable
+        value(float): the value of the variable. The new created 
-        dtype(string): element type of the parameter
+                      variable will be filled with it.
-        persistable(bool): if this variable is persistable
+        dtype(string): data type of the variable
-        force_cpu(bool): force this variable to be on CPU
+        persistable(bool): if this variable is persistable. 
+                           Default: False
+        force_cpu(bool): force this variable to be on CPU. 
+                         Default: False
+        name(str|None): The name of the variable. If set to None the variable 
+                        name will be generated automatically. 
+                        Default: None
    Returns:
        Variable: the created Variable
+    Examples:
+        .. code-block:: python
+            var = fluid.layers.create_global_var(shape=[2,3], value=1.0, dtype='float32',
+                                 persistable=True, force_cpu=True, name='new_var')
    """
    helper = LayerHelper("global_var", **locals())
    var = helper.create_global_variable(
@@ -175,7 +188,8 @@ def concat(input, axis=0, name=None):
    Examples:
        .. code-block:: python
-          out = fluid.layers.concat(input=[Efirst, Esecond, Ethird, Efourth])
+           out = fluid.layers.concat(input=[Efirst, Esecond, Ethird, Efourth])
    """
    helper = LayerHelper('concat', **locals())
    out = helper.create_tmp_variable(dtype=helper.input_dtype())
@@ -188,19 +202,21 @@ def concat(input, axis=0, name=None):
 def sums(input, out=None):
-    """This function performs the sum operation on the input and returns the
+    """
+    This function performs the sum operation on the input and returns the
    result as the output.
    Args:
        input (Variable|list): The input tensor that has the elements
                               that need to be summed up.
+        out (Variable|None): Output parameter. The sum result.
+                             Default: None
    Returns:
-        Variable: The tensor type variable that has the sum of input
+        Variable: the sum of input. The same as the argument 'out'
-                  written to it.
    Examples:
-        .. code-block::python
+        .. code-block:: python
          tmp = fluid.layers.zeros(shape=[10], dtype='int32')
          i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)
@@ -371,13 +387,13 @@ def argmin(x, axis=0):
        x(Variable): The input to compute the indices of
                     the min elements.
        axis(int): Axis to compute indices along.
    Returns:
        Variable: The tensor variable storing the output
    Examples:
        .. code-block:: python
          out = fluid.layers.argmin(x=in, axis=0)
          out = fluid.layers.argmin(x=in, axis=-1)  
    """
@@ -402,13 +418,13 @@ def argmax(x, axis=0):
        x(Variable): The input to compute the indices of
                     the max elements.
        axis(int): Axis to compute indices along.
    Returns:
        Variable: The tensor variable storing the output
    Examples:
        .. code-block:: python
          out = fluid.layers.argmax(x=in, axis=0)
          out = fluid.layers.argmax(x=in, axis=-1)  
    """
@@ -456,11 +472,12 @@ def zeros(shape, dtype, force_cpu=False):
    It also sets *stop_gradient* to True.
    Args:
-        shape(tuple|list|None): Shape of output tensor
+        shape(tuple|list|None): Shape of output tensor.
-        dtype(np.dtype|core.VarDesc.VarType|str): Data type of output tensor
+        dtype(np.dtype|core.VarDesc.VarType|str): Data type of output tensor.
+        force_cpu(bool, default False): Whether to make output stay on CPU.
    Returns:
-        Variable: The tensor variable storing the output
+        Variable: The tensor variable storing the output.
    Examples:
        .. code-block:: python