diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index d699004789bdf5b8ab24d8ca7585e6495ef79123..8fa91d1f4f25953df3410c0585f9775dd9d4191c 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -128,7 +128,7 @@ paddle.fluid.layers.dynamic_lstmp (ArgSpec(args=['input', 'size', 'proj_size', '
 paddle.fluid.layers.dynamic_gru (ArgSpec(args=['input', 'size', 'param_attr', 'bias_attr', 'is_reverse', 'gate_activation', 'candidate_activation', 'h_0', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, False, 'sigmoid', 'tanh', None, False)), ('document', '83617c165827e030636c80486d5de6f3'))
 paddle.fluid.layers.gru_unit (ArgSpec(args=['input', 'hidden', 'size', 'param_attr', 'bias_attr', 'activation', 'gate_activation', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, 'tanh', 'sigmoid', False)), ('document', '33974b9bfa69f2f1eb85e6f956dff04e'))
 paddle.fluid.layers.linear_chain_crf (ArgSpec(args=['input', 'label', 'param_attr', 'length'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'b28bdb43160e9667be2a3457d19d9f5b'))
-paddle.fluid.layers.crf_decoding (ArgSpec(args=['input', 'param_attr', 'label', 'length'], varargs=None, keywords=None, defaults=(None, None)), ('document', '933b7e268c4ffa3d5c3ef953a5ee9f0b'))
+paddle.fluid.layers.crf_decoding (ArgSpec(args=['input', 'param_attr', 'label', 'length'], varargs=None, keywords=None, defaults=(None, None)), ('document', '708ce0348b74d3e0c7885c2c524b7fa7'))
 paddle.fluid.layers.cos_sim (ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None), ('document', '48ec1ba2d75c4e2faf8d9a47350462ae'))
 paddle.fluid.layers.cross_entropy (ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100)), ('document', 'd1985a930a59c3bd41a7c1d72594f5b9'))
 paddle.fluid.layers.bpr_loss (ArgSpec(args=['input', 'label', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ae57e6e5136dade436f0df1f11770afa'))
@@ -176,7 +176,7 @@ paddle.fluid.layers.warpctc (ArgSpec(args=['input', 'label', 'blank', 'norm_by_t
 paddle.fluid.layers.sequence_reshape (ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None), ('document', 'eeb1591cfc854c6ffdac77b376313c44'))
 paddle.fluid.layers.transpose (ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '8e72db173d4c082e27cb11f31d8c9bfa'))
 paddle.fluid.layers.im2sequence (ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None)), ('document', 'fe352915a543cec434f74e9b32ac49da'))
-paddle.fluid.layers.nce (ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False)), ('document', '83d4ca6dfb957912807f535756e76992'))
+paddle.fluid.layers.nce (ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False)), ('document', '38297567127888e01542857839058d52'))
 paddle.fluid.layers.sampled_softmax_with_cross_entropy (ArgSpec(args=['logits', 'label', 'num_samples', 'num_true', 'remove_accidental_hits', 'use_customized_samples', 'customized_samples', 'customized_probabilities', 'seed'], varargs=None, keywords=None, defaults=(1, True, False, None, None, 0)), ('document', 'd4435a63d34203339831ee6a86ef9242'))
 paddle.fluid.layers.hsigmoid (ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False)), ('document', 'b83e7dfa81059b39bb137922dc914f50'))
 paddle.fluid.layers.beam_search (ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name', 'return_parent_idx'], varargs=None, keywords=None, defaults=(0, True, None, False)), ('document', '1270395ce97a4e1b556104abbb14f096'))
@@ -188,7 +188,7 @@ paddle.fluid.layers.spectral_norm (ArgSpec(args=['weight', 'dim', 'power_iters',
 paddle.fluid.layers.softmax_with_cross_entropy (ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode', 'return_softmax', 'axis'], varargs=None, keywords=None, defaults=(False, -100, True, False, -1)), ('document', '54e1675aa0364f4a78fa72804ec0f413'))
 paddle.fluid.layers.smooth_l1 (ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', 'ecb75c1b00c4c76c98b482f633b7a10c'))
 paddle.fluid.layers.one_hot (ArgSpec(args=['input', 'depth', 'allow_out_of_range'], varargs=None, keywords=None, defaults=(False,)), ('document', 'cdf5dc2078f1e20dc61dd0bec7e28a29'))
-paddle.fluid.layers.autoincreased_step_counter (ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1)), ('document', '98e7927f09ee2270535b29f048e481ec'))
+paddle.fluid.layers.autoincreased_step_counter (ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1)), ('document', 'd016c137beb9a4528b7378b437d00151'))
 paddle.fluid.layers.reshape (ArgSpec(args=['x', 'shape', 'actual_shape', 'act', 'inplace', 'name'], varargs=None, keywords=None, defaults=(None, None, False, None)), ('document', 'd7a6d59e464a7ef1184eb6caefeb49f1'))
 paddle.fluid.layers.squeeze (ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ebbac07662a6e22e8e299ced880c7775'))
 paddle.fluid.layers.unsqueeze (ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b9bd3129d36a70e7c4385df51ff71c62'))
@@ -206,7 +206,7 @@ paddle.fluid.layers.image_resize_short (ArgSpec(args=['input', 'out_short_len',
 paddle.fluid.layers.resize_bilinear (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners', 'align_mode', 'data_format'], varargs=None, keywords=None, defaults=(None, None, None, None, True, 1, 'NCHW')), ('document', '44da7890c8a362a83a1c0902a1dc1e4d'))
 paddle.fluid.layers.resize_trilinear (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners', 'align_mode', 'data_format'], varargs=None, keywords=None, defaults=(None, None, None, None, True, 1, 'NCDHW')), ('document', '5b4d0f823f94c260fe5e6f7eec60a797'))
 paddle.fluid.layers.resize_nearest (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners', 'data_format'], varargs=None, keywords=None, defaults=(None, None, None, None, True, 'NCHW')), ('document', '0107a5cbae1aef3f381d3d769a6068eb'))
-paddle.fluid.layers.gather (ArgSpec(args=['input', 'index', 'overwrite'], varargs=None, keywords=None, defaults=(True,)), ('document', 'f985c9b66e3aec96fa753a8eb44c991c'))
+paddle.fluid.layers.gather (ArgSpec(args=['input', 'index', 'overwrite'], varargs=None, keywords=None, defaults=(True,)), ('document', '5c52e8512f97a84608bc3b8b3250fc70'))
 paddle.fluid.layers.gather_nd (ArgSpec(args=['input', 'index', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'a7d625028525167b138106f574dffdf9'))
 paddle.fluid.layers.scatter (ArgSpec(args=['input', 'index', 'updates', 'name', 'overwrite'], varargs=None, keywords=None, defaults=(None, True)), ('document', '69b22affd4a6326502af166f04c095ab'))
 paddle.fluid.layers.scatter_nd_add (ArgSpec(args=['ref', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '2607b5c9369fbc52f208de066a80fc25'))
@@ -220,7 +220,7 @@ paddle.fluid.layers.log (ArgSpec(args=['x', 'name'], varargs=None, keywords=None
 paddle.fluid.layers.crop (ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '32196a194f757b4da114a595a5bc6414'))
 paddle.fluid.layers.crop_tensor (ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', 'd460aaf35afbbeb9beea4789aa6e4343'))
 paddle.fluid.layers.rank_loss (ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '8eb36596bb43d7a907d3397c7aedbdb3'))
-paddle.fluid.layers.margin_rank_loss (ArgSpec(args=['label', 'left', 'right', 'margin', 'name'], varargs=None, keywords=None, defaults=(0.1, None)), ('document', '6fc86ed23b420c8a0f6c043563cf3937'))
+paddle.fluid.layers.margin_rank_loss (ArgSpec(args=['label', 'left', 'right', 'margin', 'name'], varargs=None, keywords=None, defaults=(0.1, None)), ('document', '1a177f30e5013fae7ee6c45860cf4946'))
 paddle.fluid.layers.elu (ArgSpec(args=['x', 'alpha', 'name'], varargs=None, keywords=None, defaults=(1.0, None)), ('document', '9af1926c06711eacef9e82d7a9e4d308'))
 paddle.fluid.layers.relu6 (ArgSpec(args=['x', 'threshold', 'name'], varargs=None, keywords=None, defaults=(6.0, None)), ('document', '538fc860b2a1734e118b94e4a1a3ee67'))
 paddle.fluid.layers.pow (ArgSpec(args=['x', 'factor', 'name'], varargs=None, keywords=None, defaults=(1.0, None)), ('document', '00d437d1e0d9450ea75a0495b93b54a7'))
@@ -254,7 +254,7 @@ paddle.fluid.layers.elementwise_floordiv (ArgSpec(args=['x', 'y', 'axis', 'act',
 paddle.fluid.layers.uniform_random_batch_size_like (ArgSpec(args=['input', 'shape', 'dtype', 'input_dim_idx', 'output_dim_idx', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', 0, 0, -1.0, 1.0, 0)), ('document', 'cfa120e583cd4a5bfa120c8a26f98a28'))
 paddle.fluid.layers.gaussian_random (ArgSpec(args=['shape', 'mean', 'std', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32')), ('document', 'ebbf399d4e03190ce5dc9488f05c92f4'))
 paddle.fluid.layers.sampling_id (ArgSpec(args=['x', 'min', 'max', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32')), ('document', 'c39b647b6cf08e058d96ee503d5284fe'))
-paddle.fluid.layers.gaussian_random_batch_size_like (ArgSpec(args=['input', 'shape', 'input_dim_idx', 'output_dim_idx', 'mean', 'std', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0, 0, 0.0, 1.0, 0, 'float32')), ('document', 'b24d0b21361c4bb8ef2cec8c26fb12b2'))
+paddle.fluid.layers.gaussian_random_batch_size_like (ArgSpec(args=['input', 'shape', 'input_dim_idx', 'output_dim_idx', 'mean', 'std', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0, 0, 0.0, 1.0, 0, 'float32')), ('document', '2aed0f546f220364fb1da724a3176f74'))
 paddle.fluid.layers.sum (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', 'f4b60847cb0f1ae00823ba6fb1b11310'))
 paddle.fluid.layers.slice (ArgSpec(args=['input', 'axes', 'starts', 'ends'], varargs=None, keywords=None, defaults=None), ('document', '8c622791994a0d657d8c6c9cefa5bf34'))
 paddle.fluid.layers.strided_slice (ArgSpec(args=['input', 'axes', 'starts', 'ends', 'strides'], varargs=None, keywords=None, defaults=None), ('document', '340d8d656272ea396b441aab848429a2'))
@@ -275,12 +275,12 @@ paddle.fluid.layers.space_to_depth (ArgSpec(args=['x', 'blocksize', 'name'], var
 paddle.fluid.layers.affine_grid (ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '315b50c1cbd9569375b098c56f1e91c9'))
 paddle.fluid.layers.sequence_reverse (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '5b32ed21ab89140a8e758002923a0da3'))
 paddle.fluid.layers.affine_channel (ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name', 'act'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None, None)), ('document', 'ecc4b1323028bde0518d666882d03515'))
-paddle.fluid.layers.similarity_focus (ArgSpec(args=['input', 'axis', 'indexes', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '18ec2e3afeb90e70c8b73d2b71c40fdb'))
+paddle.fluid.layers.similarity_focus (ArgSpec(args=['input', 'axis', 'indexes', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '57256fcb1119dc35ae031889fa601d61'))
 paddle.fluid.layers.hash (ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', 'a0b73c21be618cec0281e7903039e5e3'))
 paddle.fluid.layers.grid_sampler (ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '90c74742f48c70b103f1fbb9eb129066'))
-paddle.fluid.layers.log_loss (ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None)), ('document', 'e3993a477c94729526040ff65d95728e'))
+paddle.fluid.layers.log_loss (ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None)), ('document', 'ef1701e11d60508fe8f02dd2a8c60bdf'))
 paddle.fluid.layers.add_position_encoding (ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e399f9436fed5f7ff480d8532e42c937'))
-paddle.fluid.layers.bilinear_tensor_product (ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '45fc3652a8e1aeffbe4eba371c54f756'))
+paddle.fluid.layers.bilinear_tensor_product (ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '6755168c4b2308e1e4f54cb56fa7dcb2'))
 paddle.fluid.layers.merge_selected_rows (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b2b0e5d5c155ce24bafc38b78cd0b164'))
 paddle.fluid.layers.get_tensor_from_selected_rows (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '2c568321feb4d16c41a83df43f95089d'))
 paddle.fluid.layers.lstm (ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)), ('document', '5193cf1113f9d8d8f682ee5a5fc8b391'))
@@ -900,7 +900,7 @@ paddle.fluid.transpiler.DistributeTranspilerConfig ('paddle.fluid.transpiler.dis
 paddle.fluid.transpiler.DistributeTranspilerConfig.__init__ (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.nets.simple_img_conv_pool (ArgSpec(args=['input', 'num_filters', 'filter_size', 'pool_size', 'pool_stride', 'pool_padding', 'pool_type', 'global_pooling', 'conv_stride', 'conv_padding', 'conv_dilation', 'conv_groups', 'param_attr', 'bias_attr', 'act', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, 'max', False, 1, 0, 1, 1, None, None, None, True)), ('document', '5e89c978199c4ecce2b26d5fed1ec52b'))
 paddle.fluid.nets.sequence_conv_pool (ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type', 'bias_attr'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max', None)), ('document', 'b2d435f782ac8ea3ca480b8d24e7f5b4'))
-paddle.fluid.nets.glu (ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,)), ('document', 'b87bacfc70dd3477ed25ef14aa01389a'))
+paddle.fluid.nets.glu (ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,)), ('document', '3efe197c8e3e75f84a4c464d8b74e943'))
 paddle.fluid.nets.scaled_dot_product_attention (ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0)), ('document', 'b1a07a0000eb9103e3a143ca8c13de5b'))
 paddle.fluid.nets.img_conv_group (ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True)), ('document', '6033b78da39b8b0ed302fbb0f67da502'))
 paddle.fluid.optimizer.SGDOptimizer ('paddle.fluid.optimizer.SGDOptimizer', ('document', 'c3c8dd3193d991adf8bda505560371d6'))
@@ -1009,7 +1009,7 @@ paddle.fluid.optimizer.DGCMomentumOptimizer.backward (ArgSpec(args=['self', 'los
 paddle.fluid.optimizer.DGCMomentumOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.DGCMomentumOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde'))
 paddle.fluid.optimizer.DGCMomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '8387af01322a6defc92c1832faccd304'))
-paddle.fluid.optimizer.LambOptimizer ('paddle.fluid.optimizer.LambOptimizer', ('document', '7dd8b270156a52f1f6b4663336960893'))
+paddle.fluid.optimizer.LambOptimizer ('paddle.fluid.optimizer.LambOptimizer', ('document', '56b5b21dc8fb01174c3bdd0b24f8be4b'))
 paddle.fluid.optimizer.LambOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'lamb_weight_decay', 'beta1', 'beta2', 'epsilon', 'regularization', 'exclude_from_weight_decay_fn', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.01, 0.9, 0.999, 1e-06, None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.LambOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610'))
 paddle.fluid.optimizer.LambOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae'))
@@ -1017,9 +1017,9 @@ paddle.fluid.optimizer.LambOptimizer.backward (ArgSpec(args=['self', 'loss', 'st
 paddle.fluid.optimizer.LambOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.LambOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde'))
 paddle.fluid.optimizer.LambOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '8387af01322a6defc92c1832faccd304'))
-paddle.fluid.optimizer.ExponentialMovingAverage ('paddle.fluid.optimizer.ExponentialMovingAverage', ('document', 'a38b7d5b9f17a295ed15d4c1b9ab4cd0'))
+paddle.fluid.optimizer.ExponentialMovingAverage ('paddle.fluid.optimizer.ExponentialMovingAverage', ('document', 'c93a5e29890877d94c3180b3d5f2d464'))
 paddle.fluid.optimizer.ExponentialMovingAverage.__init__ (ArgSpec(args=['self', 'decay', 'thres_steps', 'name'], varargs=None, keywords=None, defaults=(0.999, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.optimizer.ExponentialMovingAverage.apply (ArgSpec(args=['self', 'executor', 'need_restore'], varargs=None, keywords=None, defaults=(True,)), ('document', '30f494752ac8921dc5835a63637f453a'))
+paddle.fluid.optimizer.ExponentialMovingAverage.apply (ArgSpec(args=['self', 'executor', 'need_restore'], varargs=None, keywords=None, defaults=(True,)), ('document', '178fa374c299906eb88c0df8ed21a337'))
 paddle.fluid.optimizer.ExponentialMovingAverage.restore (ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None), ('document', '8c8a1791608b02a1ede53d6dd3a4fcec'))
 paddle.fluid.optimizer.ExponentialMovingAverage.update (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'ea10f08af6d7aac3b7974aa976e4085f'))
 paddle.fluid.optimizer.PipelineOptimizer ('paddle.fluid.optimizer.PipelineOptimizer', ('document', '6f85382abedb922387b08d98e8d0b69c'))
diff --git a/paddle/fluid/operators/crf_decoding_op.cc b/paddle/fluid/operators/crf_decoding_op.cc
index 671623dcee6948f10bacea4ad0e7505b33a6d84b..4676bd04646ff04d65539b35d9e45eb8a06188c2 100644
--- a/paddle/fluid/operators/crf_decoding_op.cc
+++ b/paddle/fluid/operators/crf_decoding_op.cc
@@ -21,45 +21,47 @@ class CRFDecodingOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() override {
     AddInput(
         "Emission",
-        "(Tensor<float>/LoDTensor<float>). For a LoDTensor input, its "
-        "shape is [N x D] where N is the total sequence length of the "
-        "mini-batch and D is the total tag number. While for a tensor "
-        "input, its shape is [B X S X D] with B the batch size and S the "
-        "sequence length of each sample after padding. This input is the "
-        "unscaled emission weight matrix of the linear_chain_crf operator.");
+        "(Tensor/LoDTensor). For a LoDTensor input, its shape is [N x D] "
+        "where N is the total sequence length of the mini-batch and D is "
+        "the total tag number. While for a tensor input, its shape is "
+        "[B X S X D] with B the batch size and S the sequence length of each "
+        "sample after padding. This input is the unscaled emission weight "
+        "matrix of the linear_chain_crf operator. The data type is float32 "
+        "or float64.");
     AddInput(
         "Transition",
-        "(Tensor<float>). A Tensor with shape [(D + 2) x D]. "
+        "(Tensor). A Tensor with shape [(D + 2) x D]. "
         "This input is the transition weights learned by the linear_chain_crf "
         "operator, denoted as w. The 1st row of w are transition weights for "
         "the start mask. The 2nd row of w are transition weights for the end "
         "mask. Transition weights between other tags begin from the 3rd row of "
-        "w. See more details in comments of the linear_chain_crf operator.");
+        "w. See more details in comments of the linear_chain_crf operator. "
+        "The data type is the same as Input(Emission).");
     AddInput(
         "Label",
-        "(Tensor<int64_t>/LoDTensor<int64_t>). The ground truth with shape "
+        "(Tensor/LoDTensor). The ground truth with shape "
         "[N x 1] (for LoDTensor) or [B x S] (for Tensor). This input is "
-        "optional. See more details in the operator's comments.")
+        "optional. See more details in the operator's comments. The data type "
+        "is int64.")
         .AsDispensable();
     AddOutput(
         "ViterbiPath",
-        "(Tensor<int64_t>/LoDTensor<int64_t>). The decoding results. What to "
+        "(Tensor/LoDTensor). The decoding results. What to "
         "return changes depending on whether the Input(Label) (the ground "
-        "truth) is given. See more details in the operator's comment.");
+        "truth) is given. See more details in the operator's comment. "
+        "The data type is int64.");
     AddInput("Length",
-             "(Tensor<int64_t>). The actual length of each sample before "
+             "(Tensor). The actual length of each sample before "
              "padding with shape [B x 1]. It means the Input(Emission), "
-             "Input(Label) "
-             "and Output(ViterbiPath) are common tensors with padding when "
-             "this input "
-             "is given.")
+             "Input(Label) and Output(ViterbiPath) are common tensors with "
+             "padding when this input is given. The data type is int64.")
         .AsDispensable();
     AddComment(R"DOC(
 The crf_decoding operator reads the emission feature weights and the transition
-feature weights learned by the linear_chain_crf operator. It implements the
-Viterbi algorithm which is a dynamic programming algorithm for finding the most
-likely sequence of hidden states, called the Viterbi path, that results in a
-sequence of observed tags.
+feature weights learned by the linear_chain_crf operator and performs decoding. 
+It implements the Viterbi algorithm which is a dynamic programming algorithm 
+for finding the most likely sequence of hidden states, called the Viterbi path, 
+that results in a sequence of observed tags.
 
 The output of this operator changes according to whether Input(Label) is given:
 
diff --git a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc b/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc
index 4d39358477016d4a2ae01aba635347bc26727474..daf85139103df76ecb01cae1ee0f858d057a9417 100644
--- a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc
+++ b/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc
@@ -46,7 +46,7 @@ class GaussianRandomBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker {
     AddAttr<int>("seed",
                  "(int, default 0) "
                  "Random seed of generator."
-                 "0 means use system wide seed."
+                 "0 means don't specify random seed."
                  "Note that if seed is not 0, this operator will always "
                  "generate the same random numbers every time.")
         .SetDefault(0);
@@ -58,9 +58,9 @@ class GaussianRandomBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker {
     AddComment(R"DOC(
 
 Used to initialize tensors with gaussian random generator.
-The default mean of the distribution is 0. and default standard
-deviation (std) of the distribution is 1.. Uers can set mean and std
-by input arguments.
+The default mean of the distribution is 0, and default standard
+deviation (std) of the distribution is 1.0. Uers can set mean and std
+via input arguments.
 )DOC");
   }
 };
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index aedb6efb98bc61a151d731288293b1536064c5c9..b0998721125af68a0118d7a2f99fb43d454e517f 100755
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -1610,11 +1610,13 @@ def crf_decoding(input, param_attr, label=None, length=None):
     Args:
         input(${emission_type}): ${emission_comment}
 
-        param_attr(ParamAttr): The parameter attribute for training.
+        param_attr (ParamAttr|None): To specify the weight parameter attribute. 
+            Default: None, which means the default weight parameter property is 
+            used. See usage for details in :ref:`api_fluid_ParamAttr` .
 
-        label(${label_type}): ${label_comment}
+        label(${label_type}, optional): ${label_comment}
         
-        label(${length_type}): ${length_comment}
+        length(${length_type}, optional): ${length_comment}
 
     Returns:
         Variable: ${viterbi_path_comment}
@@ -1626,8 +1628,8 @@ def crf_decoding(input, param_attr, label=None, length=None):
 
            # LoDTensor-based example
            num_labels = 10
-           feature = fluid.layers.data(name='word_emb', shape=[784], dtype='float32', lod_level=1)
-           label = fluid.layers.data(name='label', shape=[1], dtype='int64', lod_level=1)
+           feature = fluid.data(name='word_emb', shape=[-1, 784], dtype='float32', lod_level=1)
+           label = fluid.data(name='label', shape=[-1, 1], dtype='int64', lod_level=1)
            emission = fluid.layers.fc(input=feature, size=num_labels)
            
            crf_cost = fluid.layers.linear_chain_crf(input=emission, label=label, 
@@ -1637,9 +1639,9 @@ def crf_decoding(input, param_attr, label=None, length=None):
 
            # Common tensor example
            num_labels, max_len = 10, 20
-           feature = fluid.layers.data(name='word_emb_pad', shape=[max_len, 784], dtype='float32')
-           label = fluid.layers.data(name='label_pad', shape=[max_len, 1], dtype='int64')
-           length = fluid.layers.data(name='length', shape=[1], dtype='int64')
+           feature = fluid.data(name='word_emb_pad', shape=[-1, max_len, 784], dtype='float32')
+           label = fluid.data(name='label_pad', shape=[-1, max_len, 1], dtype='int64')
+           length = fluid.data(name='length', shape=[-1, 1], dtype='int64')
            emission = fluid.layers.fc(input=feature, size=num_labels,
                                       num_flatten_dims=2)
            
@@ -7078,33 +7080,33 @@ def nce(input,
     ${comment}
 
     Args:
-        input (Variable): input variable.
-        label (Variable): label.
-        num_total_classes (int):${num_total_classes_comment}
+        input (Variable): Input variable, 2-D tensor with shape [batch_size, dim], 
+            and data type is float32 or float64.
+        label (Variable): Input label, 2-D tensor with shape [batch_size, num_true_class],
+            and data type is int64.
+        num_total_classes (int):${num_total_classes_comment}.
         sample_weight (Variable|None): A Variable of shape [batch_size, 1]
             storing a weight for each sample. The default weight for each
             sample is 1.0.
-        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
-             of nce. If it is set to None or one attribute of ParamAttr, nce
-             will create ParamAttr as param_attr. If the Initializer of the param_attr
-             is not set, the parameter is initialized with Xavier. Default: None.
-        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of nce.
-             If it is set to False, no bias will be added to the output units.
-             If it is set to None or one attribute of ParamAttr, nce
-             will create ParamAttr as bias_attr. If the Initializer of the bias_attr
-             is not set, the bias is initialized zero. Default: None.
-        num_neg_samples (int): ${num_neg_samples_comment}
-        name (str|None): A name for this layer(optional). If set None, the layer
-             will be named automatically. Default: None.
-        sampler (str): The sampler used to sample class from negtive classes.
+        param_attr (ParamAttr|None): To specify the weight parameter attribute. 
+            Default: None, which means the default weight parameter property is 
+            used. See usage for details in :ref:`api_fluid_ParamAttr` .
+        bias_attr (ParamAttr|None): To specify the bias parameter attribute. 
+            Default: None, which means the default bias parameter property is 
+            used. See usage for details in :ref:`api_fluid_ParamAttr` .
+        num_neg_samples (int): ${num_neg_samples_comment}.
+        name(str|None): For detailed information, please refer to 
+            :ref:`api_guide_Name` . Usually name is no need to set and None by default.
+        sampler (str, optional): The sampler used to sample class from negtive classes.
                        It can be 'uniform', 'log_uniform' or 'custom_dist'.
                        default: 'uniform'.
-        custom_dist (float[]): A float[] with size=num_total_classes.
+        custom_dist (nd.array|None): A numpy ndarray with size=num_total_classes.
                        It is used when sampler is set to 'custom_dist'.
                        custom_dist[i] is the probsbility of i-th class to be sampled.
                        default: None.
-        seed (int): The seed used in sampler. default: 0.
-        is_sparse(bool): The flag indicating whether to use sparse update, the weight@GRAD and bias@GRAD will be changed to SelectedRows.
+        seed (int, optional): The seed used in sampler. Default 0, means no random seed.
+        is_sparse(bool, optional): The flag indicating whether to use sparse update, 
+            the weight@GRAD and bias@GRAD will be changed to SelectedRows. Default False.
 
     Returns:
         Variable: The output nce loss.
@@ -7119,8 +7121,8 @@ def nce(input,
             window_size = 5
             words = []
             for i in xrange(window_size):
-                words.append(fluid.layers.data(
-                    name='word_{0}'.format(i), shape=[1], dtype='int64'))
+                words.append(fluid.data(
+                    name='word_{0}'.format(i), shape=[-1, 1], dtype='int64'))
 
             dict_size = 10000
             label_word = int(window_size / 2) + 1
@@ -8182,17 +8184,17 @@ def one_hot(input, depth, allow_out_of_range=False):
 
 def autoincreased_step_counter(counter_name=None, begin=1, step=1):
     """
-    Create an auto-increase variable
-    which will be automatically increased by 1 every mini-batch
-    Return the run counter of the main program, default is started from 1.
+    Create an auto-increase variable. which will be automatically increased 
+    by 1 in every iteration. By default, the first return of this counter is 1, 
+    and the step size is 1.
 
     Args:
-        counter_name(str): The counter name, default is '@STEP_COUNTER@'.
-        begin(int): The first value of this counter.
-        step(int): The increment step between each execution.
+        counter_name(str, optional): The counter name. Default '@STEP_COUNTER@'.
+        begin(int, optional): The first return value of this counter. Default 1.
+        step(int, optional): The step size. Default 1.
 
     Returns:
-        Variable: The global run counter.
+        Variable: The auto-increased Variable with data type int64.
 
     Examples:
         .. code-block:: python
@@ -9976,9 +9978,11 @@ def gather(input, index, overwrite=True):
                        [5, 6]]
 
     Args:
-        input (Variable): The source input with rank>=1.
-        index (Variable): The index input with rank=1.
-        overwrite (bool): The mode that updating the grad when has same index.
+        input (Variable): The source input tensor with rank>=1. Supported data type is 
+            int32, int64, float32, float64 and uint8 (only for CPU), 
+            float16 (only for GPU).
+        index (Variable): The index input tensor with rank=1. Data type is int32 or int64.
+        overwrite (bool, optional): The mode that updating the grad when has same index.
             If True, use the overwrite mode to update the grad of the same index,
 	    if False, use the accumulate mode to update the grad of the same index. 
 	    Default value is True.
@@ -9993,8 +9997,8 @@ def gather(input, index, overwrite=True):
         .. code-block:: python
 
             import paddle.fluid as fluid
-            x = fluid.layers.data(name='x', shape=[-1, 5], dtype='float32')
-            index = fluid.layers.data(name='index', shape=[-1, 1], dtype='int32')
+            x = fluid.data(name='x', shape=[-1, 5], dtype='float32')
+            index = fluid.data(name='index', shape=[-1, 1], dtype='int32')
             output = fluid.layers.gather(x, index)
     """
     helper = LayerHelper('gather', **locals())
@@ -10996,11 +11000,12 @@ def margin_rank_loss(label, left, right, margin=0.1, name=None):
 
     Args:
        label (Variable): Indicates whether the left is ranked higher than the right or not.
-       left (Variable): Ranking score for left.
-       right (Variable): Ranking score for right.
+           Data type is float32.
+       left (Variable): Ranking score for left. Data type float32.
+       right (Variable): Ranking score for right. Data type float32.
        margin (float): Indicates the given margin.
-       name (str|None): A name for this layer (optional). If set None, the layer
-                       will be named automatically.
+       name(str|None): For detailed information, please refer to 
+           :ref:`api_guide_Name` . Usually name is no need to set and None by default.
 
     Returns:
        Variable: The ranking loss.
@@ -11013,9 +11018,9 @@ def margin_rank_loss(label, left, right, margin=0.1, name=None):
         .. code-block:: python
 
            import paddle.fluid as fluid
-           label = fluid.layers.data(name="label", shape=[-1, 1], dtype="float32")
-           left = fluid.layers.data(name="left", shape=[-1, 1], dtype="float32")
-           right = fluid.layers.data(name="right", shape=[-1, 1], dtype="float32")
+           label = fluid.data(name="label", shape=[-1, 1], dtype="float32")
+           left = fluid.data(name="left", shape=[-1, 1], dtype="float32")
+           right = fluid.data(name="right", shape=[-1, 1], dtype="float32")
            out = fluid.layers.margin_rank_loss(label, left, right)
     """
     helper = LayerHelper('margin_rank_loss', **locals())
@@ -12186,12 +12191,12 @@ def gaussian_random_batch_size_like(input,
     Args:
         input (Variable): ${input_comment}
         shape (tuple|list): ${shape_comment}
-        input_dim_idx (Int): ${input_dim_idx_comment}
-        output_dim_idx (Int): ${output_dim_idx_comment}
-        mean (Float): ${mean_comment}
-        std (Float): ${std_comment}
-        seed (Int): ${seed_comment}
-        dtype(np.dtype|core.VarDesc.VarType|str): The type of output data : float32, float_16, int etc
+        input_dim_idx (int): ${input_dim_idx_comment}
+        output_dim_idx (int): ${output_dim_idx_comment}
+        mean (float): ${mean_comment}
+        std (float): ${std_comment}
+        seed (int): ${seed_comment}
+        dtype(np.dtype|core.VarDesc.VarType|str): The type of output data, float32 or float_64.
 
     Returns:
         out (Variable): ${out_comment}
@@ -12200,7 +12205,7 @@ def gaussian_random_batch_size_like(input,
         .. code-block:: python
 
             import paddle.fluid as fluid
-            input = fluid.layers.data(name="input", shape=[13, 11], dtype='float32')
+            input = fluid.data(name="input", shape=[13, 11], dtype='float32')
 
             out = fluid.layers.gaussian_random_batch_size_like(
                 input, shape=[-1, 11], mean=1.0, std=2.0)
@@ -14000,7 +14005,8 @@ def similarity_focus(input, axis, indexes, name=None):
 
     Args:
         input(Variable): The input tensor variable(default float). It should
-            be a 4-D tensor with shape [BatchSize, A, B, C].
+            be a 4-D tensor with shape [BatchSize, A, B, C]. Data type is 
+            float32 or float64.
         axis(int): Indicating the dimension to be selected. It can only be
             1, 2 or 3.
         indexes(list): Indicating the indexes of the selected dimension.
@@ -14013,7 +14019,7 @@ def similarity_focus(input, axis, indexes, name=None):
         .. code-block:: python
 
             import paddle.fluid as fluid
-            data = fluid.layers.data(
+            data = fluid.data(
                 name='data', shape=[-1, 3, 2, 2], dtype='float32')
             fluid.layers.similarity_focus(input=data, axis=1, indexes=[0])
     """
@@ -14226,13 +14232,15 @@ def log_loss(input, label, epsilon=1e-4, name=None):
               - (1 - label) * \\log{(1 - input + \\epsilon)}
 
     Args:
-        input (Variable|list):  a 2-D tensor with shape [N x 1], where N is the
+        input (Variable|list):  A 2-D tensor with shape [N x 1], where N is the
                                 batch size. This input is a probability computed
-                                by the previous operator.
-        label (Variable|list):  the ground truth which is a 2-D tensor with
-                                shape [N x 1], where N is the batch size.
-        epsilon (float): epsilon
-        name (string): the name of log_loss
+                                by the previous operator. Data type float32.
+        label (Variable|list):  The ground truth which is a 2-D tensor with
+                                shape [N x 1], where N is the batch size. 
+                                Data type float32.
+        epsilon (float, optional): A small number for numerical stability. Default 1e-4.
+        name(str|None): For detailed information, please refer to 
+            :ref:`api_guide_Name` . Usually name is no need to set and None by default.
 
     Returns:
         Variable: A 2-D tensor with shape [N x 1], the negative log loss.
@@ -14241,8 +14249,8 @@ def log_loss(input, label, epsilon=1e-4, name=None):
         .. code-block:: python
 
           import paddle.fluid as fluid
-          label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-          prob = fluid.layers.data(name='prob', shape=[10], dtype='float32')
+          label = fluid.data(name='label', shape=[-1, 1], dtype='int64')
+          prob = fluid.data(name='prob', shape=[-1, 10], dtype='float32')
           cost = fluid.layers.log_loss(input=prob, label=label)
     """
     helper = LayerHelper('log_loss', **locals())
@@ -14380,7 +14388,7 @@ def bilinear_tensor_product(x,
                             param_attr=None,
                             bias_attr=None):
     """
-    **Add Bilinear Tensor Product Layer**
+    **Bilinear Tensor Product Layer**
 
     This layer performs bilinear tensor product on two inputs.
     For example:
@@ -14391,31 +14399,34 @@ def bilinear_tensor_product(x,
     In this formula:
       - :math:`x`: the first input contains M elements, shape is [batch_size, M].
       - :math:`y`: the second input contains N elements, shape is [batch_size, N].
-      - :math:`W_{i}`: the i-th learned weight, shape is [M, N]
+      - :math:`W_{i}`: the i-th learned weight, shape is [M, N].
       - :math:`out_{i}`: the i-th element of out, shape is [batch_size, size].
       - :math:`y^\mathrm{T}`: the transpose of :math:`y_{2}`.
 
     Args:
-        x (Variable): 2-D input tensor with shape [batch_size, M]
-        y (Variable): 2-D input tensor with shape [batch_size, N]
+        x (Variable): 2-D input tensor with shape [batch_size, M]. Data type 
+            is float32 or float64.
+        y (Variable): 2-D input tensor with shape [batch_size, N]. Data type 
+            should be same as **x**.
         size (int): The dimension of this layer.
-        act (str, default None): Activation to be applied to the output of this layer.
-        name (str, default None): The name of this layer.
-        param_attr (ParamAttr, default None): The parameter attribute for the learnable w.
-            parameters/weights of this layer.
-        bias_attr (ParamAttr, default None): The parameter attribute for the bias
-            of this layer. If it is set to False, no bias will be added to the output units.
-            If it is set to None, the bias is initialized zero. Default: None.
-
+        act (str|None): Activation to be applied to the output of this layer. Default None.
+        name(str|None): For detailed information, please refer to 
+            :ref:`api_guide_Name` . Usually name is no need to set and None by default.
+        param_attr (ParamAttr|None): To specify the weight parameter attribute. 
+            Default: None, which means the default weight parameter property is 
+            used. See usage for details in :ref:`api_fluid_ParamAttr` .
+        bias_attr (ParamAttr|None): To specify the bias parameter attribute. 
+            Default: None, which means the default bias parameter property is 
+            used. See usage for details in :ref:`api_fluid_ParamAttr` .
     Returns:
-        Variable: A 2-D Tensor of shape [batch_size, size].
+        Variable: A 2-D Tensor of shape [batch_size, size]. Data type is the same as input **x**.
 
     Examples:
         .. code-block:: python
 
           import paddle.fluid as fluid
-          layer1 = fluid.layers.data("t1", shape=[-1, 5], dtype="float32")
-          layer2 = fluid.layers.data("t2", shape=[-1, 4], dtype="float32")
+          layer1 = fluid.data("t1", shape=[-1, 5], dtype="float32")
+          layer2 = fluid.data("t2", shape=[-1, 4], dtype="float32")
           tensor = fluid.layers.bilinear_tensor_product(x=layer1, y=layer2, size=1000)
     """
     helper = LayerHelper('bilinear_tensor_product', **locals())
diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py
index 5656779140a3d706b226631450ea9f225da8edbf..03c7e10cd49288535e4af6406cfd813a6841a696 100644
--- a/python/paddle/fluid/nets.py
+++ b/python/paddle/fluid/nets.py
@@ -318,8 +318,9 @@ def sequence_conv_pool(input,
 
 def glu(input, dim=-1):
     """
-    The Gated Linear Units(GLU) composed by split, sigmoid activation and element-wise
-    multiplication. Specifically, Split the input into two equal sized parts,
+    The Gated Linear Units(GLU) composed by :ref:`api_fluid_layers_split` , 
+    :ref:`api_fluid_layers_sigmoid`  and :ref:`api_fluid_layers_elementwise_mul` . 
+    Specifically, GLU will plit the input into two equal-sized parts,
     :math:`a` and :math:`b`, along the given dimension and then compute as
     following:
 
@@ -331,18 +332,20 @@ def glu(input, dim=-1):
     <https://arxiv.org/pdf/1612.08083.pdf>`_.
 
     Args:
-        input (Variable): The input variable which is a Tensor or LoDTensor.
-        dim (int): The dimension along which to split. If :math:`dim < 0`, the
+        input (Variable): The input variable which is a Tensor or LoDTensor. 
+                          The supported data types include float32, float64 
+                          and float16 (only for GPU).
+        dim (int, optional): The dimension along which to split. If :math:`dim < 0`, the
             dimension to split along is :math:`rank(input) + dim`. Default -1.
 
     Returns:
-        Variable: Variable with half the size of input.
+        Variable: Variable with half the size and same data type of input.
 
     Examples:
         .. code-block:: python
 
             import paddle.fluid as fluid
-            data = fluid.layers.data(
+            data = fluid.data(
                 name="words", shape=[-1, 6, 3, 9], dtype="float32")
             # shape of output: [-1, 3, 3, 9]
             output = fluid.nets.glu(input=data, dim=1)
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index c6ed4385e4de73f40f5c43ccd505ba4be8768b98..0d84edc7b22bc5035976f6c2dd627410c6724425 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -2254,11 +2254,11 @@ class LambOptimizer(AdamOptimizer):
 
     ..  math::
 
-        m_t &= \\beta_1 m_{t - 1}+ (1 - \\beta_1)g_t \\
+        m_t &= \\beta_1 m_{t - 1}+ (1 - \\beta_1)g_t 
 
-        v_t &= \\beta_2 v_{t - 1}  + (1 - \\beta_2)g_t^2 \\
+        v_t &= \\beta_2 v_{t - 1}  + (1 - \\beta_2)g_t^2
 
-        r_t &= \\frac{m_t}{\\sqrt{v_t}+\\epsilon} \\
+        r_t &= \\frac{m_t}{\\sqrt{v_t}+\\epsilon}
 
         w_t &= w_{t-1} -\\eta_t \\frac{\\left \| w_{t-1}\\right \|}{\\left \| r_t + \\lambda w_{t-1}\\right \|} (r_t + \\lambda w_{t-1})
 
@@ -2267,25 +2267,28 @@ class LambOptimizer(AdamOptimizer):
     learning rate, :math:`\\lambda` the LAMB weight decay rate.
 
     Args:
-        learning_rate (float|Variable): the learning rate used to update parameters. \
-                                        Can be a float value or a Variable with one \
-                                        float value as data element.
-        lamb_weight_decay (float): The LAMB weight decay rate.
-        beta1 (float): The exponential decay rate for the 1st moment estimates.
-        beta2 (float): The exponential decay rate for the 2nd moment estimates.
-        epsilon (float): A small float value for numerical stability.
-        regularization (Regularizer): A Regularizer, such as
-                        fluid.regularizer.L1DecayRegularizer.
-        exclude_from_weight_decay_fn (function): Exclude a parameter from weight 
-            decay when **exclude_from_weight_decay_fn(parameter)** returns true.
-        name (str|None): An optional name prefix.
+        learning_rate (float|Variable, optional): the learning rate used to update parameters. \
+            Can be a float value or a Variable with data type float32. Default 0.001.
+        lamb_weight_decay (float, optional): The LAMB weight decay rate. Default 0.01.
+        beta1 (float, optional): The exponential decay rate for the 1st moment estimates.
+            Default 0.9.
+        beta2 (float, optional): The exponential decay rate for the 2nd moment estimates.
+            Default 0.999.
+        epsilon (float, optional): A small float value for numerical stability. Default 1e-6.
+        regularization (Regularizer|None): A Regularizer, such as
+           fluid.regularizer.L1DecayRegularizer. Default None.
+        exclude_from_weight_decay_fn (function|None): Exclude a parameter from weight 
+            decay when **exclude_from_weight_decay_fn(parameter)** returns true. 
+            Default None.
+        name(str|None): For detailed information, please refer to 
+            :ref:`api_guide_Name` . Usually name is no need to set and None by default.
 
     Examples:
         .. code-block:: python
             
             import paddle.fluid as fluid 
 
-            data = fluid.layers.data(name='x', shape=[5], dtype='float32')
+            data = fluid.data(name='x', shape=[-1, 5], dtype='float32')
             hidden = fluid.layers.fc(input=data, size=10)
             cost = fluid.layers.mean(hidden)
 
@@ -2738,10 +2741,13 @@ class ExponentialMovingAverage(object):
 
 
     Args:
-	decay (float): The exponential decay rate, usually close to 1, such as 
-                       0.999, 0.9999, ... .
-        thres_steps (Variable|None): If not `None`, schedule the decay rate.
-	name (str|None): An optional name prefix.
+	decay (float, optional): The exponential decay rate, usually close to 1, such as 
+            0.999, 0.9999, ... . Default 0.999.
+        thres_steps (Variable|None): If not `None`, schedule the decay rate. 
+            Default None.
+        name (str|None): For detailed information, please refer to 
+            :ref:`api_guide_Name`. Usually name is no need to set and None by 
+            default.
 
 
     Examples:
@@ -2752,7 +2758,7 @@ class ExponentialMovingAverage(object):
 	    import paddle
 	    import paddle.fluid as fluid
 
-	    data = fluid.layers.data(name='x', shape=[5], dtype='float32')
+	    data = fluid.data(name='x', shape=[-1, 5], dtype='float32')
 	    hidden = fluid.layers.fc(input=data, size=10)
 	    cost = fluid.layers.mean(hidden)
 
@@ -2910,7 +2916,8 @@ class ExponentialMovingAverage(object):
         
         Args:
             executor (Executor): The Executor to execute applying.
-            need_restore (bool): Whether to restore parameters after applying.
+            need_restore (bool, optional): Whether to restore parameters after 
+                applying. Default True.
         """
         executor.run(self.apply_program)
         try: