Fix doc of warpctc, array_read, edit_distance and sequence_reshape.

297a1698 · wanghaoshuang · e0a8c584 · 297a1698 · 297a1698
隐藏空白更改
内联并排

Showing with 522 additions and 283 deletions

python/paddle/fluid/layers/control_flow.py python/paddle/fluid/layers/control_flow.py +39 -24

python/paddle/fluid/layers/nn.py python/paddle/fluid/layers/nn.py +483 -259

未找到文件。
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 import contextlib
-from layer_function_generator import autodoc
+from layer_function_generator import autodoc, templatedoc
 from tensor import assign, fill_constant
 from .. import core
 from ..framework import Program, Variable, Operator
@@ -721,26 +721,22 @@ def lod_rank_table(x, level=0):
    return table
+@templatedoc()
 def max_sequence_len(rank_table):
-    """Max Sequence Len Operator. Given a LoDRankTable object, this layer
+    """
-    returns the max length of a batch of sequences. In fact, a LoDRankTable
+    ${comment}
-    object contains a list of tuples(<sequence index, sequence length>) and
-    the list is already sorted by sequence length in descending order, so the
+    >>> import paddle.fluid as fluid
-    operator just returns the sequence length of the first tuple element.
+    >>> x = fluid.layers.data(name='x', shape=[10], dtype='float32',
+    ...                       lod_level=1)
+    >>> rank_table = layers.lod_rank_table(x=x, level=0)
+    >>> max_seq_len = layers.max_sequence_len(rank_table)
    Args:
-        rank_table (Variable): Input variable which is a LoDRankTable object.
+        rank_table(${rank_table_type}): ${rank_table_comment}.
    Returns:
-        Variable: The max length of sequence.
+        ${out_comment}.
-    Examples:
-        .. code-block:: python
-            x = fluid.layers.data(name='x', shape=[10],
-                            dtype='float32', lod_level=1)
-            rank_table = layers.lod_rank_table(x=x, level=0)
-            max_seq_len = layers.max_sequence_len(rank_table)
    """
    helper = LayerHelper("max_seqence_len", **locals())
    res = helper.create_tmp_variable(dtype="int64")
@@ -978,19 +974,38 @@ def equal(x, y, cond=None, **ignored):
 def array_read(array, i):
-    """This function performs the operation to read the data in as an
+    """
+    This function performs the operation to read the data in as an
    LOD_TENSOR_ARRAY.
+    .. code-block:: text
+        Given:
+        array = [0.6, 0.1, 0.3, 0.1]
+        And:
+        i = 2
+        Then:
+        output = 0.3
    Args:
-        array (Variable|list): The input tensor that will be written to an array.
+        array (Variable|list): The input tensor that stores the data to be read.
-        i (Variable|list): The subscript index in tensor array, that points the
+        i (Variable|list): The index of the data to be read from input array.
-                           place where data will be written to.
    Returns:
        Variable: The tensor type variable that holds the data read from the array.
    Examples:
-        .. code-block::python
+        .. code-block:: python
-          tmp = fluid.layers.zeros(shape=[10], dtype='int32')
-          i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)
+            tmp = fluid.layers.zeros(shape=[10], dtype='int32')
-          arr = layers.array_read(tmp, i=i)
+            i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)
+            arr = layers.array_read(tmp, i=i)
    """
    helper = LayerHelper('array_read', **locals())
    if not isinstance(

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -12,78 +12,33 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-All layers just related to the neural network.
+All layers just related to the neural network. 
 """
 from ..layer_helper import LayerHelper
 from ..initializer import Normal, Constant
 from ..framework import Variable
 from ..param_attr import ParamAttr
-from layer_function_generator import autodoc
+from layer_function_generator import autodoc, templatedoc
 from tensor import concat
 import utils
+import random
 __all__ = [
-    'fc',
+    'fc', 'embedding', 'dynamic_lstm', 'dynamic_lstmp', 'dynamic_gru',
-    'embedding',
+    'gru_unit', 'linear_chain_crf', 'crf_decoding', 'cos_sim', 'cross_entropy',
-    'dynamic_lstm',
+    'square_error_cost', 'chunk_eval', 'sequence_conv', 'conv2d',
-    'dynamic_lstmp',
+    'sequence_pool', 'sequence_softmax', 'softmax', 'pool2d', 'batch_norm',
-    'dynamic_gru',
+    'beam_search_decode', 'conv2d_transpose', 'sequence_expand', 'lstm_unit',
-    'gru_unit',
+    'reduce_sum', 'reduce_mean', 'reduce_max', 'reduce_min', 'reduce_prod',
-    'linear_chain_crf',
+    'sequence_first_step', 'sequence_last_step', 'dropout', 'split',
-    'crf_decoding',
+    'ctc_greedy_decoder', 'edit_distance', 'l2_normalize', 'matmul', 'topk',
-    'cos_sim',
+    'warpctc', 'sequence_reshape', 'transpose', 'im2sequence', 'nce',
-    'cross_entropy',
+    'beam_search', 'row_conv', 'multiplex', 'layer_norm',
-    'square_error_cost',
+    'softmax_with_cross_entropy', 'smooth_l1', 'one_hot',
-    'chunk_eval',
+    'autoincreased_step_counter', 'reshape', 'lod_reset', 'lrn', 'pad',
-    'sequence_conv',
+    'label_smooth', 'roi_pool', 'dice_loss', 'image_resize',
-    'conv2d',
+    'image_resize_short', 'resize_bilinear', 'gather', 'random_crop', 'mean_iou'
-    'sequence_pool',
-    'sequence_softmax',
-    'softmax',
-    'pool2d',
-    'batch_norm',
-    'beam_search_decode',
-    'conv2d_transpose',
-    'sequence_expand',
-    'lstm_unit',
-    'reduce_sum',
-    'reduce_mean',
-    'reduce_max',
-    'reduce_min',
-    'reduce_prod',
-    'sequence_first_step',
-    'sequence_last_step',
-    'dropout',
-    'split',
-    'ctc_greedy_decoder',
-    'edit_distance',
-    'l2_normalize',
-    'matmul',
-    'topk',
-    'warpctc',
-    'sequence_reshape',
-    'transpose',
-    'im2sequence',
-    'nce',
-    'beam_search',
-    'row_conv',
-    'multiplex',
-    'layer_norm',
-    'softmax_with_cross_entropy',
-    'smooth_l1',
-    'one_hot',
-    'autoincreased_step_counter',
-    'reshape',
-    'lod_reset',
-    'lrn',
-    'pad',
-    'label_smooth',
-    'roi_pool',
-    'dice_loss',
-    'resize_bilinear',
-    'gather',
-    'random_crop',
 ]
@@ -92,7 +47,6 @@ def fc(input,
       num_flatten_dims=1,
       param_attr=None,
       bias_attr=None,
-       use_cudnn=False,
       use_mkldnn=False,
       act=None,
       is_test=False,
@@ -219,6 +173,7 @@ def embedding(input,
            have two elements which indicate the size of the dictionary of
            embeddings and the size of each embedding vector respectively.
        is_sparse(bool): The flag indicating whether to use sparse update.
+        is_distributed (bool): Whether to run lookup table from remote parameter server.
        padding_idx(int|long|None): If :attr:`None`, it makes no effect to lookup.
            Otherwise the given :attr:`padding_idx` indicates padding the output
            with zeros whenever lookup encounters it in :attr:`input`. If
@@ -258,9 +213,10 @@ def embedding(input,
    return tmp
-# TODO(qijun): expose H0 and C0
 def dynamic_lstm(input,
                 size,
+                 h_0=None,
+                 c_0=None,
                 param_attr=None,
                 bias_attr=None,
                 use_peepholes=True,
@@ -321,6 +277,13 @@ def dynamic_lstm(input,
                         (T X 4D), where T is the total time steps in this
                         mini-batch, D is the hidden size.
        size(int): 4 * hidden size.
+        h_0(Variable): The initial hidden state is an optional input, default is zero.
+                       This is a tensor with shape (N x D), where N is the
+                       batch size and D is the hidden size.
+        c_0(Variable): The initial cell state is an optional input, default is zero.
+                       This is a tensor with shape (N x D), where N is the
+                       batch size. `h_0` and `c_0` can be NULL but only at the same time.
        param_attr(ParamAttr|None): The parameter attribute for the learnable
                               hidden-hidden weights.
@@ -384,12 +347,20 @@ def dynamic_lstm(input,
    cell = helper.create_tmp_variable(dtype)
    batch_gate = helper.create_tmp_variable(dtype)
    batch_cell_pre_act = helper.create_tmp_variable(dtype)
+    inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
+    batch_size = input.shape[0]
+    if h_0:
+        assert h_0.shape == (batch_size, size), \
+            'The shape of h0 should be (batch_size, %d)' % size
+        inputs['H0'] = h_0
+    if c_0:
+        assert c_0.shape == (batch_size, size), \
+            'The shape of c0 should be (batch_size, %d)' % size
+        inputs['C0'] = c_0
    helper.append_op(
        type='lstm',
-        inputs={'Input': input,
+        inputs=inputs,
-                'Weight': weight,
-                'Bias': bias},
        outputs={
            'Hidden': hidden,
            'Cell': cell,
@@ -651,8 +622,9 @@ def dynamic_gru(input,
            :attr:`False`.
        gate_activation(str): The activation for update gate and reset gate.
            Choices = ["sigmoid", "tanh", "relu", "identity"], default "sigmoid".
-        activation(str): The activation for candidate hidden state.
+        candidate_activation(str): The activation for candidate hidden state.
            Choices = ["sigmoid", "tanh", "relu", "identity"], default "tanh".
+        h_0 (Variable): The hidden output of the first time step.
    Returns:
        Variable: The hidden state of GRU. The shape is :math:`(T \\times D)`, \
@@ -673,11 +645,13 @@ def dynamic_gru(input,
        attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype)
    bias = helper.create_parameter(
        attr=helper.bias_attr, shape=[1, 3 * size], dtype=dtype, is_bias=True)
+    batch_size = input.shape[0]
    inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
    if h_0 != None:
        assert h_0.shape == (
-            size, size), 'The shape of h0 should be(%d, %d)' % (size, size)
+            batch_size, size
-        inputs['h0'] = h_0
+        ), 'The shape of h0 should be(batch_size, %d)' % size
+        inputs['H0'] = h_0
    hidden = helper.create_tmp_variable(dtype)
    batch_gate = helper.create_tmp_variable(dtype)
@@ -799,7 +773,22 @@ def gru_unit(input,
    return updated_hidden, reset_hidden_pre, gate
+@templatedoc()
 def linear_chain_crf(input, label, param_attr=None):
+    """
+    Linear Chain CRF.
+    ${comment}
+    Args:
+        input(${emission_type}): ${emission_comment}
+        label(${label_type}): ${label_comment}
+        param_attr(ParamAttr): The attribute of the learnable parameter.
+    Returns:
+        ${log_likelihood_comment}
+    """
    helper = LayerHelper('linear_chain_crf', **locals())
    size = input.shape[1]
    transition = helper.create_parameter(
@@ -825,7 +814,19 @@ def linear_chain_crf(input, label, param_attr=None):
    return log_likelihood
+@templatedoc()
 def crf_decoding(input, param_attr, label=None):
+    """
+    ${comment}
+    Args:
+        input(${emission_type}): ${emission_comment}
+        param_attr(ParamAttr): The parameter attribute for training.
+        label(${label_type}): ${label_comment}
+    Returns:
+        ${viterbi_path_comment}
+    """
    helper = LayerHelper('crf_decoding', **locals())
    transition = helper.get_parameter(param_attr.name)
    viterbi_path = helper.create_tmp_variable(dtype=helper.input_dtype())
@@ -843,6 +844,13 @@ def cos_sim(X, Y):
    """
    This function performs the cosine similarity between two tensors
    X and Y and returns that as the output.
+    Args:
+        X (Variable): The input X.
+        Y (Variable): The input Y.
+    Returns:
+        Variable: the output of cosine(X, Y).
    """
    helper = LayerHelper('cos_sim', **locals())
    out = helper.create_tmp_variable(dtype=X.dtype)
@@ -869,15 +877,15 @@ def dropout(x, dropout_prob, is_test=False, seed=None, name=None):
    unchanged.
    Args:
-       x(variable): The input tensor.
+        x (Variable): The input tensor.
-       dropout_prob(float): Probability of setting units to zero.
+        dropout_prob (float): Probability of setting units to zero.
-       is_test(bool): A flag indicating whether it is in test phrase or not.
+        is_test (bool): A flag indicating whether it is in test phase or not.
-       seed(int): A Python integer used to create random seeds. If this
+        seed (int): A Python integer used to create random seeds. If this
-                  parameter is set to None, a random seed is used.
+                    parameter is set to None, a random seed is used.
-                  NOTE: If an integer seed is given, always the same output
+                    NOTE: If an integer seed is given, always the same output
-                  units will be dropped. DO NOT use a fixed seed in training.
+                    units will be dropped. DO NOT use a fixed seed in training.
-       name(str|None): A name for this layer(optional). If set None, the layer
+        name (str|None): A name for this layer(optional). If set None, the layer
-                    will be named automatically.
+                         will be named automatically.
    Returns:
        Variable: A tensor variable.
@@ -999,8 +1007,8 @@ def square_error_cost(input, label):
        * :math:`Out`: Output value, same shape with :math:`X`.
    Args:
-       input(Variable): Input tensor, has predictions.
+        input (Variable): Input tensor, has predictions.
-       label(Variable): Label tensor, has target labels.
+        label (Variable): Label tensor, has target labels.
    Returns:
        Variable: The tensor variable storing the element-wise squared error \
@@ -1029,6 +1037,7 @@ def square_error_cost(input, label):
    return square_out
+@templatedoc()
 def chunk_eval(input,
               label,
               chunk_scheme,
@@ -1037,6 +1046,18 @@ def chunk_eval(input,
    """
    This function computes and outputs the precision, recall and
    F1-score of chunk detection.
+    Args:
+        input (Variable): prediction output of the network.
+        label (Variable): label of the test data set.
+        chunk_scheme (str): ${chunk_scheme_comment}
+        num_chunk_types (int): ${num_chunk_types_comment}
+        excluded_chunk_types (list): ${excluded_chunk_types_comment}
+    Returns:
+        tuple: tuple containing: (precision, recall, f1_score,
+               num_infer_chunks, num_label_chunks,
+               num_correct_chunks)
    """
    helper = LayerHelper("chunk_eval", **locals())
@@ -1069,6 +1090,7 @@ def chunk_eval(input,
            num_correct_chunks)
+@templatedoc()
 def sequence_conv(input,
                  num_filters,
                  filter_size=3,
@@ -1081,6 +1103,19 @@ def sequence_conv(input,
    This function creates the op for sequence_conv, using the inputs and
    other convolutional configurations for the filters and stride as given
    in the input parameters to the function.
+    Args:
+        input (Variable): ${x_comment}
+        num_filters (int): number of filters.
+        filter_size (int): the filter size (H and W).
+        filter_stride (int): stride of the filter.
+        padding (bool): if True, add paddings.
+        bias_attr (ParamAttr|None): attributes for bias
+        param_attr (ParamAttr|None): attributes for parameter
+        act (str): the activation type
+    Returns:
+        Variable: output of sequence_conv
    """
    # FIXME(dzh) : want to unify the argument of python layer
@@ -1180,48 +1215,49 @@ def conv2d(input,
        - Input:
-          Input shape: $(N, C_{in}, H_{in}, W_{in})$
+          Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
-          Filter shape: $(C_{out}, C_{in}, H_f, W_f)$
+          Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)`
        - Output:
-          Output shape: $(N, C_{out}, H_{out}, W_{out})$
+          Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
        Where
        .. math::
-        H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
+            H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
-        W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
+            W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
    Args:
-       input(Variable): The input image with [N, C, H, W] format.
+        input (Variable): The input image with [N, C, H, W] format.
-       num_filters(int): The number of filter. It is as same as the output
+        num_filters (int): The number of filters. It is the same as the output
-           image channel.
+            image channel.
-       filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
+        filter_size (int|tuple|None): The filter size. If filter_size is a tuple,
-           it must contain two integers, (filter_size_H, filter_size_W).
+            it must contain two integers, (filter_size_H, filter_size_W).
-           Otherwise, the filter will be a square.
+            Otherwise, the filter will be a square.
-       stride(int|tuple): The stride size. If stride is a tuple, it must
+        stride (int|tuple): The stride size. If stride is a tuple, it must
-           contain two integers, (stride_H, stride_W). Otherwise, the
+            contain two integers, (stride_H, stride_W). Otherwise, the
-           stride_H = stride_W = stride. Default: stride = 1.
+            stride_H = stride_W = stride. Default: stride = 1.
-       padding(int|tuple): The padding size. If padding is a tuple, it must
+        padding (int|tuple): The padding size. If padding is a tuple, it must
-           contain two integers, (padding_H, padding_W). Otherwise, the
+            contain two integers, (padding_H, padding_W). Otherwise, the
-           padding_H = padding_W = padding. Default: padding = 0.
+            padding_H = padding_W = padding. Default: padding = 0.
-       dilation(int|tuple): The dilation size. If dilation is a tuple, it must
+        dilation (int|tuple): The dilation size. If dilation is a tuple, it must
-           contain two integers, (dilation_H, dilation_W). Otherwise, the
+            contain two integers, (dilation_H, dilation_W). Otherwise, the
-           dilation_H = dilation_W = dilation. Default: dilation = 1.
+            dilation_H = dilation_W = dilation. Default: dilation = 1.
-       groups(int): The groups number of the Conv2d Layer. According to grouped
+        groups (int): The groups number of the Conv2d Layer. According to grouped
-           convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
+            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
-           the first half of the filters is only connected to the first half
+            the first half of the filters is only connected to the first half
-           of the input channels, while the second half of the filters is only
+            of the input channels, while the second half of the filters is only
-           connected to the second half of the input channels. Default: groups=1
+            connected to the second half of the input channels. Default: groups=1
-       param_attr(ParamAttr): The parameters to the Conv2d Layer. Default: None
+        param_attr (ParamAttr): The parameters to the Conv2d Layer. Default: None
-       bias_attr(ParamAttr): Bias parameter for the Conv2d layer. Default: None
+        bias_attr (ParamAttr): Bias parameter for the Conv2d layer. Default: None
-       use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
+        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
-           library is installed. Default: True
+            library is installed. Default: True
-       act(str): Activation type. Default: None
+        use_mkldnn (bool): Use mkldnn kernels or not.
-       name(str|None): A name for this layer(optional). If set None, the layer
+        act (str): Activation type. Default: None
-           will be named automatically.
+        name (str|None): A name for this layer(optional). If set None, the layer
+            will be named automatically.
    Returns:
        Variable: The tensor variable storing the convolution and \
@@ -1379,7 +1415,7 @@ def sequence_pool(input, pool_type):
 def sequence_first_step(input):
    """
-    This funciton get the first step of sequence.
+    This function gets the first step of sequence.
    .. code-block:: text
@@ -1412,7 +1448,7 @@ def sequence_first_step(input):
 def sequence_last_step(input):
    """
-    This funciton get the last step of sequence.
+    This function gets the last step of sequence.
    .. code-block:: text
@@ -1456,6 +1492,22 @@ def pool2d(input,
    """
    This function adds the operator for pooling in 2 dimensions, using the
    pooling configurations mentioned in input parameters.
+    Args:
+        input (Variable): ${input_comment}
+        pool_size (int): ${ksize_comment}
+        pool_type (str): ${pooling_type_comment}
+        pool_stride (int): stride of the pooling layer.
+        pool_padding (int): padding size.
+        global_pooling (bool): ${global_pooling_comment}
+        use_cudnn (bool): ${use_cudnn_comment}
+        ceil_mode (bool): ${ceil_mode_comment}
+        use_mkldnn (bool): ${use_mkldnn_comment}
+        name (str): A name for this layer(optional). If set None, the layer
+            will be named automatically.
+    Returns:
+        Variable: output of pool2d layer.
    """
    if pool_type not in ["max", "avg"]:
        raise ValueError(
@@ -1513,6 +1565,25 @@ def batch_norm(input,
    """
    This function helps create an operator to implement
    the BatchNorm layer using the configurations from the input parameters.
+    Args:
+        input (Variable): the input variable.
+        act (str): activation type
+        is_test (bool): whether to run batch_norm as test mode.
+        momentum (float): momentum
+        epsilon (float): epsilon, default 1e-05
+        param_attr (ParamAttr|None): attributes for parameter
+        bias_attr (ParamAttr|None): attributes for bias
+        data_layout (str): data layout, default NCHW
+        in_place (bool): if True, do not create tmp variable
+        use_mkldnn (bool): ${use_mkldnn_comment}
+        name (str): The name of this layer. It is optional.
+        moving_mean_name (str): The name of moving mean variable name, optional.
+        moving_variance_name (str): The name of moving variance name, optional.
+        do_model_average_for_mean_and_var (bool): Whether to do model average for mean and variance.
+    Returns:
+        Variable: output of batch_norm layer.
    """
    helper = LayerHelper('batch_norm', **locals())
    dtype = helper.input_dtype()
@@ -1640,6 +1711,7 @@ def layer_norm(input,
        bias_attr(ParamAttr|None): The parameter attribute for the learnable
            bias :math:`b`.
        act(str): Activation to be applied to the output of layer normalization.
+        name (str): The name of this layer. It is optional.
    Returns:
        Variable: A tensor variable with the same shape as the input.
@@ -1691,6 +1763,17 @@ def layer_norm(input,
 def beam_search_decode(ids, scores, name=None):
+    """
+    ${beam_search_decode}
+    Args:
+        ids (Variable): ${ids_comment}
+        scores (Variable): ${scores_comment}
+        name (str): The name of this layer. It is optional.
+    Returns:
+        tuple: a tuple of two output variable: sentence_ids, sentence_scores
+    """
    helper = LayerHelper('beam_search_decode', **locals())
    sentence_ids = helper.create_tmp_variable(dtype=ids.dtype)
    sentence_scores = helper.create_tmp_variable(dtype=ids.dtype)
@@ -1766,46 +1849,46 @@ def conv2d_transpose(input,
           W_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1
    Args:
-       input(Variable): The input image with [N, C, H, W] format.
+        input(Variable): The input image with [N, C, H, W] format.
-       num_filters(int): The number of the filter. It is as same as the output
+        num_filters(int): The number of the filter. It is as same as the output
-           image channel.
+            image channel.
-       output_size(int|tuple|None): The output image size. If output size is a
+        output_size(int|tuple|None): The output image size. If output size is a
-           tuple, it must contain two integers, (image_H, image_W). This
+            tuple, it must contain two integers, (image_H, image_W). This
-           parameter only works when filter_size is None.
+            parameter only works when filter_size is None.
-       filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
+        filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
-           it must contain two integers, (filter_size_H, filter_size_W).
+            it must contain two integers, (filter_size_H, filter_size_W).
-           Otherwise, the filter will be a square. None if use output size to
+            Otherwise, the filter will be a square. None if use output size to
-           calculate filter_size.
+            calculate filter_size.
-       padding(int|tuple): The padding size. If padding is a tuple, it must
+        padding(int|tuple): The padding size. If padding is a tuple, it must
-           contain two integers, (padding_H, padding_W). Otherwise, the
+            contain two integers, (padding_H, padding_W). Otherwise, the
-           padding_H = padding_W = padding. Default: padding = 0.
+            padding_H = padding_W = padding. Default: padding = 0.
-       stride(int|tuple): The stride size. If stride is a tuple, it must
+        stride(int|tuple): The stride size. If stride is a tuple, it must
-           contain two integers, (stride_H, stride_W). Otherwise, the
+            contain two integers, (stride_H, stride_W). Otherwise, the
-           stride_H = stride_W = stride. Default: stride = 1.
+            stride_H = stride_W = stride. Default: stride = 1.
-       dilation(int|tuple): The dilation size. If dilation is a tuple, it must
+        dilation(int|tuple): The dilation size. If dilation is a tuple, it must
-           contain two integers, (dilation_H, dilation_W). Otherwise, the
+            contain two integers, (dilation_H, dilation_W). Otherwise, the
-           dilation_H = dilation_W = dilation. Default: dilation = 1.
+            dilation_H = dilation_W = dilation. Default: dilation = 1.
-       groups(int): The groups number of the Conv2d transpose layer. Inspired by
+        groups(int): The groups number of the Conv2d transpose layer. Inspired by
-           grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
+            grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
-           when group=2, the first half of the filters is only connected to the
+            when group=2, the first half of the filters is only connected to the
-           first half of the input channels, while the second half of the
+            first half of the input channels, while the second half of the
-           filters is only connected to the second half of the input channels.
+            filters is only connected to the second half of the input channels.
-           Default: groups=1
+            Default: groups=1
-       param_attr(ParamAttr): The parameters to the Conv2d_transpose Layer.
+        param_attr(ParamAttr): The parameters to the Conv2d_transpose Layer.
-                              Default: None
+                               Default: None
-       bias_attr(ParamAttr): Bias parameter for the Conv2d layer. Default: None
+        bias_attr(ParamAttr): Bias parameter for the Conv2d layer. Default: None
-       use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
+        use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
-           library is installed. Default: True
+            library is installed. Default: True
-       act(str): Activation type. Default: None
+        act(str): Activation type. Default: None
-       name(str|None): A name for this layer(optional). If set None, the layer
+        name(str|None): A name for this layer(optional). If set None, the layer
-           will be named automatically.
+            will be named automatically.
    Returns:
-       Variable: The tensor variable storing the convolution transpose result.
+        Variable: The tensor variable storing the convolution transpose result.
    Raises:
-       ValueError: If the shapes of input, filter_size, stride, padding and
+        ValueError: If the shapes of input, filter_size, stride, padding and
-                   groups mismatch.
+                    groups mismatch.
    Examples:
       .. code-block:: python
@@ -1942,6 +2025,17 @@ def sequence_expand(x, y, ref_level=-1, name=None):
 def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0):
    '''
    This function implements the beam search algorithm.
+    Args:
+        pre_ids (Variable): ${pre_ids_comment}
+        ids (Variable): ${ids_comment}
+        scores (Variable): ${scores_comment}
+        beam_size (int): ${beam_size_comment}
+        end_id (int): ${end_id_comment}
+        level (int): ${level_comment}
+    Returns:
+        tuple: a tuple of beam_search output variables: selected_ids, selected_scores
    '''
    helper = LayerHelper('beam_search', **locals())
    score_type = scores.dtype
@@ -2437,19 +2531,21 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None):
    The l2 normalize layer normalizes `x` along dimension `axis` using an L2
    norm. For a 1-D tensor (`dim` is fixed to 0), this layer computes
-    output = x / sqrt(max(sum(x**2), epsilon))
+    .. math::
+        y = \\frac{x}{ \\sqrt{\\sum {x^2} + \\epsilon }}
    For `x` with more dimensions, this layer independently normalizes each 1-D
    slice along dimension `axis`.
    Args:
-       x(Variable|list): The input tensor to l2_normalize layer.
+        x(Variable|list): The input tensor to l2_normalize layer.
-       axis(int): Dimension along which to normalize the input.
+        axis(int): The axis on which to apply normalization. If `axis < 0`,
-       epsilon(float): A lower bound value for `x`'s l2 norm. sqrt(epsilon) will
+            the dimension to normalize is rank(X) + axis. -1 is the
-                       be used as the divisor if the l2 norm of `x` is less than
+            last dimension.
-                       sqrt(epsilon).
+        epsilon(float): The epsilon value is used to avoid division by zero,
-       name(str|None): A name for this layer(optional). If set None, the layer
+            the default value is 1e-12.
-                       will be named automatically.
+        name(str|None): A name for this layer(optional). If set None, the layer
+            will be named automatically.
    Returns:
@@ -2468,46 +2564,17 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None):
        axis = 0
    helper = LayerHelper("l2_normalize", **locals())
-    square = helper.create_tmp_variable(dtype=x.dtype)
+    out = helper.create_tmp_variable(dtype=x.dtype)
-    helper.append_op(type="square", inputs={"X": x}, outputs={"Out": square})
+    norm = helper.create_tmp_variable(dtype=x.dtype)
-    reduced_sum = helper.create_tmp_variable(dtype=x.dtype)
    helper.append_op(
-        type="reduce_sum",
+        type="norm",
-        inputs={"X": square},
+        inputs={"X": x},
-        outputs={"Out": reduced_sum},
+        outputs={"Out": out,
+                 "Norm": norm},
        attrs={
-            "dim": [1] if axis is None else [axis],
+            "axis": 1 if axis is None else axis,
-            "keep_dim": True,
+            "epsilon": epsilon,
-            "reduce_all": False
        })
-    # TODO(caoying) A lower bound value epsilon for the norm is needed to
-    # imporve the numeric stability of reciprocal. This requires a maximum_op.
-    rsquare = helper.create_tmp_variable(dtype=x.dtype)
-    helper.append_op(
-        type="reciprocal", inputs={"X": reduced_sum}, outputs={"Out": rsquare})
-    # TODO(caoying) the current elementwise_mul operator does not support a
-    # general broadcast rule which broadcasts input(Y) to have the same
-    # dimension with Input(X) starting from a specified dimension. So this
-    # exanpsion is requred. Once a general broadcast rule is spported, this
-    # expanding canbe removed.
-    rsquare_expanded = helper.create_tmp_variable(dtype=x.dtype)
-    expand_times = [1] * len(x.shape)
-    expand_times[axis] = int(x.shape[axis])
-    helper.append_op(
-        type="expand",
-        inputs={"X": rsquare},
-        outputs={"Out": rsquare_expanded},
-        attrs={"expand_times": expand_times})
-    out = helper.create_tmp_variable(dtype=x.dtype)
-    helper.append_op(
-        type="elementwise_mul",
-        inputs={"X": x,
-                "Y": rsquare_expanded},
-        outputs={"Out": out})
    return out
@@ -2666,8 +2733,7 @@ def topk(input, k, name=None):
    return values, indices
-def edit_distance(input, label, normalized=True, ignored_tokens=None,
+def edit_distance(input, label, normalized=True, ignored_tokens=None):
-                  name=None):
    """
    EditDistance operator computes the edit distances between a batch of
    hypothesis strings and their references. Edit distance, also called
@@ -2681,26 +2747,23 @@ def edit_distance(input, label, normalized=True, ignored_tokens=None,
    "kitten" -> "sitten" -> "sittin" -> "sitting"
-    Input(Hyps) is a LoDTensor consisting of all the hypothesis strings with
+    The input is a LoDTensor consisting of all the hypothesis strings with
    the total number denoted by `batch_size`, and the separation is specified
    by the LoD information. And the `batch_size` reference strings are arranged
-    in order in the same way in the LoDTensor Input(Refs).
+    in order in the same way in the label LoDTensor.
-    Output(Out) contains the `batch_size` results and each stands for the edit
+    The output contains the `batch_size` results and each stands for the edit
    distance for a pair of strings respectively. If Attr(normalized) is true,
    the edit distance will be divided by the length of reference string.
    Args:
        input(Variable): The indices for hypothesis strings.
        label(Variable): The indices for reference strings.
+        normalized(bool, default True): Indicates whether to normalize the edit distance by
-        normalized(bool): Indicated whether to normalize the edit distance by
                          the length of reference string.
+        ignored_tokens(list<int>, default None): Tokens that should be removed before
-        ignored_tokens(list of int): Tokens that should be removed before
                                     calculating edit distance.
+        name (str): The name of this layer. It is optional.
    Returns:
        Variable: sequence-to-sequence edit distance in shape [batch_size, 1].
@@ -2710,7 +2773,6 @@ def edit_distance(input, label, normalized=True, ignored_tokens=None,
            x = fluid.layers.data(name='x', shape=[8], dtype='float32')
            y = fluid.layers.data(name='y', shape=[7], dtype='float32')
            cost = fluid.layers.edit_distance(input=x,label=y)
    """
    helper = LayerHelper("edit_distance", **locals())
@@ -2790,10 +2852,10 @@ def ctc_greedy_decoder(input, blank, name=None):
                         where Lp is the sum of all input sequences' length and
                         num_classes is the true number of classes. (not
                         including the blank label).
        blank(int): the blank label index of Connectionist Temporal
                    Classification (CTC) loss, which is in the half-opened
                    interval [0, num_classes + 1).
+        name (str): The name of this layer. It is optional.
    Returns:
        Variable: CTC greedy decode result. If all the sequences in result were
@@ -2830,35 +2892,33 @@ def warpctc(input, label, blank=0, norm_by_times=False):
    input tensor.
    Args:
-       input(Variable): (LodTensor, default: LoDTensor<float>),
+       input (Variable): The unscaled probabilities of variable-length sequences,
-         the unscaled probabilities of variable-length sequences,
         which is a 2-D Tensor with LoD information.
         Its shape is [Lp, num_classes + 1], where Lp is the sum of all input
         sequences' length and num_classes is the true number of classes.
         (not including the blank label).
-       label(Variable): (LodTensor, default: LoDTensor<int>), the ground truth
+       label (Variable): The ground truth of variable-length sequence, 
-         of variable-length sequence, which is a 2-D Tensor with LoD
+         which is a 2-D Tensor with LoD information. It is of the shape [Lg, 1],
-         information. It is of the shape [Lg, 1], where Lg is th sum of
+         where Lg is the sum of all labels' length.
-         all labels' length.
+       blank (int, default 0): The blank label index of Connectionist
-       blank: (int, default: 0), the blank label index of Connectionist
         Temporal Classification (CTC) loss, which is in the
         half-opened interval [0, num_classes + 1).
-       norm_by_times: (bool, default: false), whether to normalize
+       norm_by_times(bool, default false): Whether to normalize the gradients 
-       the gradients by the number of time-step, which is also the
+         by the number of time-step, which is also the sequence's length. 
-       sequence's length. There is no need to normalize the gradients
+         There is no need to normalize the gradients if warpctc layer was 
-       if warpctc layer was follewed by a mean_op.
+         followed by a mean_op.
    Returns:
        Variable: The Connectionist Temporal Classification (CTC) loss,
        which is a 2-D Tensor of the shape [batch_size, 1].
    Examples:
        .. code-block:: python
-            y = layers.data(
-                name='y', shape=[11, 8], dtype='float32', lod_level=1)
+            label = layers.data(shape=[11, 8], dtype='float32', lod_level=1)
-            y_predict = layers.data(
+            predict = layers.data(shape=[11, 1], dtype='float32')
-                name='y_predict', shape=[11, 1], dtype='float32')
+            cost = layers.warpctc(input=predict, label=label)
-            cost = layers.warpctc(input=y_predict, label=y)
    """
    helper = LayerHelper('warpctc', **locals())
@@ -2888,16 +2948,21 @@ def sequence_reshape(input, new_dim):
        x is a LoDTensor:
            x.lod  = [[0, 2, 6]]
-            x.data = [[1, 2], [3, 4],
+            x.data = [[1,  2], [3,  4],
-                      [5, 6], [7, 8], [9, 10], [11, 12]]
+                      [5,  6], [7,  8],
+                      [9, 10], [11, 12]]
            x.dims = [6, 2]
        set new_dim = 4
        then out is a LoDTensor:
            out.lod  = [[0, 1, 3]]
-            out.data = [[1, 2, 3, 4],
-                        [5, 6, 7, 8], [9, 10, 11, 12]]
+            out.data = [[1,  2,  3,  4],
+                        [5,  6,  7,  8],
+                        [9, 10, 11, 12]]
            out.dims = [3, 4]
    Currently, only 1-level LoDTensor is supported and please make sure
@@ -2905,18 +2970,18 @@ def sequence_reshape(input, new_dim):
    no remainder for each sequence.
    Args:
-       input (Variable): (LodTensor, default: LoDTensor<float>), a 2-D LoDTensor
-                with shape being [N, M] where M for dimension.
+       input (Variable): A 2-D LoDTensor with shape being [N, M] where M is the dimension.
-       new_dim (int): New dimension which the input LoDTensor is reshaped to.
+       new_dim (int): New dimension that the input LoDTensor is reshaped to.
    Returns:
        Variable: Reshaped LoDTensor according to new dimension.
    Examples:
        .. code-block:: python
-            x = fluid.layers.data(name='x', shape=[5, 20],
+            x = fluid.layers.data(shape=[5, 20], dtype='float32', lod_level=1)
-                              dtype='float32', lod_level=1)
            x_reshaped = layers.sequence_reshape(input=x, new_dim=10)
    """
    helper = LayerHelper('sequence_reshape', **locals())
@@ -2929,7 +2994,10 @@ def sequence_reshape(input, new_dim):
    return out
-@autodoc()
+# FIXME(wuyi): let docstring_checker.py understand @autodoc.
+# For now, the comments in c++ use types like Tensor, but in python side
+# the type is often "Variable", and arguments may vary.
+@templatedoc(op_type="nce")
 def nce(input,
        label,
        num_total_classes,
@@ -2937,6 +3005,21 @@ def nce(input,
        param_attr=None,
        bias_attr=None,
        num_neg_samples=None):
+    """
+    ${comment}
+    Args:
+        input (Variable): input variable.
+        label (Variable): label.
+        num_total_classes (int): ${num_total_classes_comment}
+        sample_weight (int): ${sample_weight_comment}
+        param_attr (ParamAttr|None): attributes for parameter
+        bias_attr (ParamAttr|None): attributes for bias
+        num_neg_samples (int): ${num_neg_samples_comment}
+    Returns:
+        Variable: output of nce layer.
+    """
    helper = LayerHelper('nce', **locals())
    assert isinstance(input, Variable)
    dim = input.shape[1]
@@ -2994,8 +3077,9 @@ def transpose(x, perm, name=None):
    perm[i]-th dimension of `x`.
    Args:
-       input (Variable): (Tensor), A Tensor.
+        x (Variable): The input Tensor.
-       perm (list): A permutation of the dimensions of `input`.
+        perm (list): A permutation of the dimensions of `x`.
+        name (str): The name of this layer. It is optional.
    Returns:
        Variable: A transposed Tensor.
@@ -3228,9 +3312,9 @@ def multiplex(inputs, index):
    row of the matrix, then `O[i]` is equal to :math:`I_{ID[i]}[i]`.
    Args:
-       inputs (list): A list of variables to gather from. All variables have the
+        inputs (list): A list of variables to gather from. All variables have the
                same shape and the rank is at least 2.
-       index (Variable): Tensor<int32>, index variable which is a 2-D tensor
+        index (Variable): Tensor<int32>, index variable which is a 2-D tensor
                with shape [M, 1] where M is the batch size.
    Returns:
@@ -3429,7 +3513,8 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1):
        begin(int): The first value of this counter.
        step(int): The increment step between each execution.
-    Returns(Variable): The global run counter.
+    Returns:
+        Variable: The global run counter.
    """
    helper = LayerHelper('global_step_counter')
    if counter_name is None:
@@ -3490,7 +3575,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
    the corresponding dimension of x.
    Args:
-        input(variable): The input tensor.
+        x(variable): The input tensor.
        shape(list): The new shape. At most one dimension of the new shape can
                     be -1.
        actual_shape(variable): An optional input. If provided, reshape
@@ -3502,8 +3587,10 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
        inplace(bool): If this flag is set true, a new output tensor is created
                       whose data is copied from input x, otherwise the output
                       shares data with input without copying.
+        name (str): The name of this layer. It is optional.
-    Returns(variable): The output tensor.
+    Returns:
+        Variable: The output tensor.
    Examples:
        .. code-block:: python
@@ -3929,22 +4016,25 @@ def dice_loss(input, label, epsilon=0.00001):
    return reduce_mean(dice_score)
-def resize_bilinear(input, out_shape=None, scale=None, name=None):
+def image_resize(input,
+                 out_shape=None,
+                 scale=None,
+                 name=None,
+                 resample='BILINEAR'):
    """
-    The mathematical meaning of resize bilinear layer is
+    Resize a batch of images.
-    Bilinear interpolation.
-    Bilinear interpolation is an extension of linear interpolation for
-    interpolating functions of two variables (e.g. H-direction and
-    W-direction in this layer) on a rectilinear 2D grid.
-    For details, please refer to Wikipedia:
+    The input must be a tensor of the shape (num_batches, channels, in_h, in_w), 
-    https://en.wikipedia.org/wiki/Bilinear_interpolation
+    and the resizing only applies on the last two dimensions (height and width).
+    Supporting resample methods:
+        'BILINEAR' : Bilinear interpolation
    Args:
-        input (Variable): The input tensor of resize bilinear layer,
+        input (Variable): The input tensor of image resize layer,
                          This is a 4-D tensor of the shape
                          (num_batches, channels, in_h, in_w).
-        out_shape(list|tuple|Variable|None): Output shape of resize bilinear
+        out_shape(list|tuple|Variable|None): Output shape of image resize
                                    layer, the shape is (out_h, out_w).
                                    Default: None
        scale(float|None): The multiplier for the input height or width.
@@ -3953,6 +4043,8 @@ def resize_bilinear(input, out_shape=None, scale=None, name=None):
                         Default: None
        name(str|None): A name for this layer(optional). If set None, the layer
                        will be named automatically.
+        resample(str): The resample method. It can only be 'BILINEAR' currently.
+                       Default: 'BILINEAR'
    Returns:
        out (Variable): The output is a 4-D tensor of the shape
@@ -3961,8 +4053,12 @@ def resize_bilinear(input, out_shape=None, scale=None, name=None):
    Examples:
        .. code-block:: python
-            out = fluid.layers.resize_bilinear(input, out_shape=[12, 12])
+            out = fluid.layers.image_resize(input, out_shape=[12, 12])
    """
+    resample_methods = {'BILINEAR': 'bilinear_interp'}
+    if resample not in resample_methods:
+        raise ValueError(
+            "The 'resample' of image_resize can only be 'BILINEAR' currently.")
    if out_shape is None and scale is None:
        raise ValueError("One of out_shape and scale must not be None")
    helper = LayerHelper('bilinear_interp', **locals())
@@ -3990,7 +4086,7 @@ def resize_bilinear(input, out_shape=None, scale=None, name=None):
    out = helper.create_tmp_variable(dtype)
    helper.append_op(
-        type="bilinear_interp",
+        type=resample_methods[resample],
        inputs=inputs,
        outputs={"Out": out},
        attrs={"out_h": out_h,
@@ -3998,6 +4094,62 @@ def resize_bilinear(input, out_shape=None, scale=None, name=None):
    return out
+@templatedoc(op_type="bilinear_interp")
+def resize_bilinear(input, out_shape=None, scale=None, name=None):
+    """
+    ${comment}
+    Args:
+        input(${x_type}): ${x_comment}.
+        out_shape(${out_size_type}): ${out_size_comment}.
+        scale(float|None): The multiplier for the input height or width. At
+             least one of out_shape or scale must be set. And out_shape has
+             a higher priority than scale. Default: None.
+        name(str|None): The output variable name.
+    Returns:
+        ${out_comment}.
+    """
+    return image_resize(input, out_shape, scale, name, 'BILINEAR')
+def image_resize_short(input, out_short_len, resample='BILINEAR'):
+    """
+    Resize a batch of images. The short edge of input images will be 
+    resized to the given 'out_short_len'. The long edge of input images 
+    will be resized proportionately to make images' length-width ratio 
+    constant.
+    Args:
+        input (Variable): The input tensor of image resize layer,
+                          This is a 4-D tensor of the shape
+                          (num_batches, channels, in_h, in_w).
+        out_short_len(int): The length of output images' short edge.
+        resample (str): resample method, default: BILINEAR.
+    Returns:
+        out (Variable): The output is a 4-D tensor of the shape
+                        (num_batches, channels, out_h, out_w).
+    """
+    in_shape = input.shape
+    if len(in_shape) != 4:
+        raise ValueError(
+            "The rank of input must be 4 (num_batches, channels, in_h, in_w).")
+    hw = in_shape[2:4]
+    short_idx = hw.index(min(hw))
+    long_idx = 1 - short_idx
+    out_shape = list(hw)
+    out_shape[short_idx] = out_short_len
+    out_shape[long_idx] = int(
+        float(out_shape[long_idx]) * (float(out_short_len) / float(hw[
+            short_idx])) + 0.5)
+    return image_resize(input=input, out_shape=out_shape, resample=resample)
 def gather(input, index):
    """
    Output is obtained by gathering entries of the outer-most dimension 
@@ -4005,7 +4157,7 @@ def gather(input, index):
    .. math::
-	Out = X[Index]
+        Out = X[Index]
    .. code-block:: text
@@ -4013,8 +4165,8 @@ def gather(input, index):
                Given:
-    		X = [[1, 2],
+                X = [[1, 2],
-         	     [3, 4],
+                     [3, 4],
                     [5, 6]]
                Index = [1, 2]
@@ -4032,6 +4184,7 @@ def gather(input, index):
        output (Variable): The output is a tensor with the same rank as input.
    Examples:
        .. code-block:: python
            output = fluid.layers.gather(x, index)
@@ -4047,10 +4200,31 @@ def gather(input, index):
    return out
-def random_crop(input, shape, seed=1):
+@templatedoc()
+def random_crop(x, shape, seed=None):
+    """
+    ${comment}
+    Examples:
+        >>> img = fluid.layers.data("img", [3, 256, 256])
+        >>> cropped_img = fluid.layers.random_crop(img, shape=[3, 224, 224])
+    Args:
+        x(${x_type}): ${x_comment}
+        shape(${shape_type}): ${shape_comment}
+        seed(int|${seed_type}|None): ${seed_comment} By default, the seed is
+            drawn from `random.randint(-65536, 65535)`.
+    Returns:
+        ${out_comment}
+    """
    helper = LayerHelper("random_crop", **locals())
    dtype = helper.input_dtype()
    out = helper.create_tmp_variable(dtype)
+    if seed is None:
+        seed = random.randint(-65536, 65535)
    if isinstance(seed, int):
        seed_value = seed
        seed = helper.create_tmp_variable(dtype="int64")
@@ -4069,9 +4243,59 @@ def random_crop(input, shape, seed=1):
    seed_out = helper.create_tmp_variable(dtype="int64")
    helper.append_op(
        type="random_crop",
-        inputs={"X": input,
+        inputs={"X": x,
                "Seed": seed},
        outputs={"Out": out,
                 "SeedOut": seed_out},
        attrs={"shape": shape})
    return out
+def mean_iou(input, label, num_classes):
+    """
+    Mean Intersection-Over-Union is a common evaluation metric for
+    semantic image segmentation, which first computes the IOU for each 
+    semantic class and then computes the average over classes. 
+    IOU is defined as follows: 
+    .. math::
+        IOU = true_positive / (true_positive + false_positive + false_negative). 
+    The predictions are accumulated in a confusion matrix and mean-IOU 
+    is then calculated from it.
+    Args:
+        input (Variable): A Tensor of prediction results for semantic labels with type int32 or int64.
+        label (Variable): A Tensor of ground truth labels with type int32 or int64.
+                          Its shape should be the same as input.
+        num_classes (int): The number of possible classes (labels).
+    Returns:
+        mean_iou (Variable): A Tensor representing the mean intersection-over-union with shape [1].
+        out_wrong(Variable): A Tensor with shape [num_classes]. The wrong numbers of each class.
+        out_correct(Variable): A Tensor with shape [num_classes]. The correct numbers of each class. 
+    Examples:
+        .. code-block:: python
+            iou, wrongs, corrects = fluid.layers.mean_iou(predict, label, num_classes)
+    """
+    helper = LayerHelper('mean_iou', **locals())
+    dtype = helper.input_dtype()
+    out_mean_iou = helper.create_tmp_variable(dtype='float32')
+    out_wrong = helper.create_tmp_variable(dtype='int32')
+    out_correct = helper.create_tmp_variable(dtype='int32')
+    helper.append_op(
+        type="mean_iou",
+        inputs={"predictions": input,
+                "labels": label},
+        outputs={
+            "out_mean_iou": out_mean_iou,
+            "out_wrong": out_wrong,
+            "out_correct": out_correct
+        },
+        attrs={"num_classes": num_classes})
+    return out_mean_iou, out_wrong, out_correct