diff --git a/doc/v2/api/config/evaluators.rst b/doc/v2/api/config/evaluators.rst
index 9ac972fb193a2fb525edc507f7ba1303d2c8eabe..458d892e825a7a9bbe7843ad5c508bd5a31f5f0f 100644
--- a/doc/v2/api/config/evaluators.rst
+++ b/doc/v2/api/config/evaluators.rst
@@ -101,7 +101,7 @@ value_printer
     :noindex:
 
 Detection
-=====
+==========
 
 detection_map
 -------------
diff --git a/doc/v2/api/config/layer.rst b/doc/v2/api/config/layer.rst
index 1a6496968cae1fef88142ba9ca3f9e63a81b196d..5a0cfadfce84df41defdf518b7c3a6222d5b30a1 100644
--- a/doc/v2/api/config/layer.rst
+++ b/doc/v2/api/config/layer.rst
@@ -11,7 +11,7 @@ Data layer
 
 data
 ----
-..  autoclass:: paddle.v2.layer.data
+..  autofunction:: paddle.v2.layer.data
     :noindex:
 
 Fully Connected Layers
@@ -21,12 +21,12 @@ Fully Connected Layers
 
 fc
 --
-..  autoclass:: paddle.v2.layer.fc
+..  autofunction:: paddle.v2.layer.fc
     :noindex:
 
 selective_fc
 ------------
-..  autoclass:: paddle.v2.layer.selective_fc
+..  autofunction:: paddle.v2.layer.selective_fc
     :noindex:
 
 Conv Layers
@@ -34,34 +34,34 @@ Conv Layers
 
 conv_operator
 -------------
-..  autoclass:: paddle.v2.layer.conv_operator
+..  autofunction:: paddle.v2.layer.conv_operator
     :noindex:
 
 conv_projection
 ---------------
-..  autoclass:: paddle.v2.layer.conv_projection
+..  autofunction:: paddle.v2.layer.conv_projection
     :noindex:
 
 conv_shift
 ----------
-..  autoclass:: paddle.v2.layer.conv_shift
+..  autofunction:: paddle.v2.layer.conv_shift
     :noindex:
 
 img_conv
 --------
-..  autoclass:: paddle.v2.layer.img_conv
+..  autofunction:: paddle.v2.layer.img_conv
     :noindex:
 
 ..  _api_v2.layer_context_projection:
 
 context_projection
 ------------------
-..  autoclass:: paddle.v2.layer.context_projection
+..  autofunction:: paddle.v2.layer.context_projection
     :noindex:
 
 row_conv
 --------
-..  autoclass:: paddle.v2.layer.row_conv
+..  autofunction:: paddle.v2.layer.row_conv
     :noindex:
 
 Image Pooling Layer
@@ -69,27 +69,27 @@ Image Pooling Layer
 
 img_pool
 --------
-..  autoclass:: paddle.v2.layer.img_pool
+..  autofunction:: paddle.v2.layer.img_pool
     :noindex:
 
 spp
 ---
-..  autoclass:: paddle.v2.layer.spp
+..  autofunction:: paddle.v2.layer.spp
     :noindex:
 
 maxout
 ------
-..  autoclass:: paddle.v2.layer.maxout
+..  autofunction:: paddle.v2.layer.maxout
     :noindex:
 
 roi_pool
 --------
-..  autoclass:: paddle.v2.layer.roi_pool
+..  autofunction:: paddle.v2.layer.roi_pool
     :noindex:
 
 pad
 ----
-..  autoclass:: paddle.v2.layer.pad
+..  autofunction:: paddle.v2.layer.pad
     :noindex:
 
 Norm Layer
@@ -97,27 +97,27 @@ Norm Layer
 
 img_cmrnorm
 -----------
-..  autoclass:: paddle.v2.layer.img_cmrnorm
+..  autofunction:: paddle.v2.layer.img_cmrnorm
     :noindex:
 
 batch_norm
 ----------
-..  autoclass:: paddle.v2.layer.batch_norm
+..  autofunction:: paddle.v2.layer.batch_norm
     :noindex:
 
 sum_to_one_norm
 ---------------
-..  autoclass:: paddle.v2.layer.sum_to_one_norm
+..  autofunction:: paddle.v2.layer.sum_to_one_norm
     :noindex:
 
 cross_channel_norm
 ------------------
-..  autoclass:: paddle.v2.layer.cross_channel_norm
+..  autofunction:: paddle.v2.layer.cross_channel_norm
     :noindex:
 
 row_l2_norm
 -----------
-..  autoclass:: paddle.v2.layer.row_l2_norm
+..  autofunction:: paddle.v2.layer.row_l2_norm
     :noindex:
 
 Recurrent Layers
@@ -125,22 +125,22 @@ Recurrent Layers
 
 recurrent
 ---------
-..  autoclass:: paddle.v2.layer.recurrent
+..  autofunction:: paddle.v2.layer.recurrent
     :noindex:
 
 lstmemory
 ---------
-..  autoclass:: paddle.v2.layer.lstmemory
+..  autofunction:: paddle.v2.layer.lstmemory
     :noindex:
 
 grumemory
 ---------
-..  autoclass:: paddle.v2.layer.grumemory
+..  autofunction:: paddle.v2.layer.grumemory
     :noindex:
 
 gated_unit
 -----------
-..  autoclass:: paddle.v2.layer.gated_unit
+..  autofunction:: paddle.v2.layer.gated_unit
     :noindex:
 
 Recurrent Layer Group
@@ -148,32 +148,32 @@ Recurrent Layer Group
 
 memory
 ------
-..  autoclass:: paddle.v2.layer.memory
+..  autofunction:: paddle.v2.layer.memory
     :noindex:
 
 recurrent_group
 ---------------
-..  autoclass:: paddle.v2.layer.recurrent_group
+..  autofunction:: paddle.v2.layer.recurrent_group
     :noindex:
 
 lstm_step
 ---------
-..  autoclass:: paddle.v2.layer.lstm_step
+..  autofunction:: paddle.v2.layer.lstm_step
     :noindex:
 
 gru_step
 --------
-..  autoclass:: paddle.v2.layer.gru_step
+..  autofunction:: paddle.v2.layer.gru_step
     :noindex:
 
 beam_search
 ------------
-..  autoclass:: paddle.v2.layer.beam_search
+..  autofunction:: paddle.v2.layer.beam_search
     :noindex:
 
 get_output
 ----------
-..  autoclass:: paddle.v2.layer.get_output
+..  autofunction:: paddle.v2.layer.get_output
     :noindex:
 
 Mixed Layer
@@ -183,54 +183,54 @@ Mixed Layer
 
 mixed
 -----
-..  autoclass:: paddle.v2.layer.mixed
+..  autofunction:: paddle.v2.layer.mixed
     :noindex:
 
 ..  _api_v2.layer_embedding:
 
 embedding
 ---------
-..  autoclass:: paddle.v2.layer.embedding
+..  autofunction:: paddle.v2.layer.embedding
     :noindex:
 
 scaling_projection
 ------------------
-..  autoclass:: paddle.v2.layer.scaling_projection
+..  autofunction:: paddle.v2.layer.scaling_projection
     :noindex:
 
 dotmul_projection
 -----------------
-..  autoclass:: paddle.v2.layer.dotmul_projection
+..  autofunction:: paddle.v2.layer.dotmul_projection
     :noindex:
 
 dotmul_operator
 ---------------
-..  autoclass:: paddle.v2.layer.dotmul_operator
+..  autofunction:: paddle.v2.layer.dotmul_operator
     :noindex:
 
 full_matrix_projection
 ----------------------
-..  autoclass:: paddle.v2.layer.full_matrix_projection
+..  autofunction:: paddle.v2.layer.full_matrix_projection
     :noindex:
 
 identity_projection
 -------------------
-..  autoclass:: paddle.v2.layer.identity_projection
+..  autofunction:: paddle.v2.layer.identity_projection
     :noindex:
 
 slice_projection
 -------------------
-..  autoclass:: paddle.v2.layer.slice_projection
+..  autofunction:: paddle.v2.layer.slice_projection
     :noindex:
 
 table_projection
 ----------------
-..  autoclass:: paddle.v2.layer.table_projection
+..  autofunction:: paddle.v2.layer.table_projection
     :noindex:
 
 trans_full_matrix_projection
 ----------------------------
-..  autoclass:: paddle.v2.layer.trans_full_matrix_projection
+..  autofunction:: paddle.v2.layer.trans_full_matrix_projection
     :noindex:
 
 Aggregate Layers
@@ -245,51 +245,46 @@ AggregateLevel
 
 pooling
 -------
-..  autoclass:: paddle.v2.layer.pooling
+..  autofunction:: paddle.v2.layer.pooling
     :noindex:
 
 ..  _api_v2.layer_last_seq:
 
 last_seq
 --------
-..  autoclass:: paddle.v2.layer.last_seq
+..  autofunction:: paddle.v2.layer.last_seq
     :noindex:
 
 ..  _api_v2.layer_first_seq:
 
 first_seq
 ---------
-..  autoclass:: paddle.v2.layer.first_seq
+..  autofunction:: paddle.v2.layer.first_seq
     :noindex:
 
 sub_seq
 ---------
-..  autoclass:: paddle.v2.layer.sub_seq
+..  autofunction:: paddle.v2.layer.sub_seq
     :noindex:
 
 concat
 ------
-..  autoclass:: paddle.v2.layer.concat
+..  autofunction:: paddle.v2.layer.concat
     :noindex:
 
 seq_concat
 ----------
-..  autoclass:: paddle.v2.layer.seq_concat
+..  autofunction:: paddle.v2.layer.seq_concat
     :noindex:
 
 seq_slice
 ---------
-..  autoclass:: paddle.v2.layer.seq_slice
-    :noindex:
-
-kmax_sequence_score
--------------------
-..  autoclass:: paddle.v2.layer.kmax_sequence_score
+..  autofunction:: paddle.v2.layer.seq_slice
     :noindex:
 
 sub_nested_seq
 --------------
-..  autoclass:: paddle.v2.layer.sub_nested_seq
+..  autofunction:: paddle.v2.layer.sub_nested_seq
     :noindex:
 
 Reshaping Layers
@@ -297,7 +292,7 @@ Reshaping Layers
 
 block_expand
 ------------
-..  autoclass:: paddle.v2.layer.block_expand
+..  autofunction:: paddle.v2.layer.block_expand
     :noindex:
 
 ..  _api_v2.layer_expand:
@@ -309,22 +304,22 @@ ExpandLevel
 
 expand
 ------
-..  autoclass:: paddle.v2.layer.expand
+..  autofunction:: paddle.v2.layer.expand
     :noindex:
 
 repeat
 ------
-..  autoclass:: paddle.v2.layer.repeat
+..  autofunction:: paddle.v2.layer.repeat
     :noindex:
 
 rotate
 ------
-..  autoclass:: paddle.v2.layer.rotate
+..  autofunction:: paddle.v2.layer.rotate
     :noindex:
 
 seq_reshape
 -----------
-..  autoclass:: paddle.v2.layer.seq_reshape
+..  autofunction:: paddle.v2.layer.seq_reshape
     :noindex:
 
 Math Layers
@@ -332,94 +327,94 @@ Math Layers
 
 addto
 -----
-..  autoclass:: paddle.v2.layer.addto
+..  autofunction:: paddle.v2.layer.addto
     :noindex:
 
 linear_comb
 -----------
-..  autoclass:: paddle.v2.layer.linear_comb
+..  autofunction:: paddle.v2.layer.linear_comb
     :noindex:
 
 interpolation
 -------------
-..  autoclass:: paddle.v2.layer.interpolation
+..  autofunction:: paddle.v2.layer.interpolation
     :noindex:
 
 bilinear_interp
 ---------------
-..  autoclass:: paddle.v2.layer.bilinear_interp
+..  autofunction:: paddle.v2.layer.bilinear_interp
     :noindex:
 
 dropout
 --------
-..  autoclass:: paddle.v2.layer.dropout
+..  autofunction:: paddle.v2.layer.dropout
     :noindex:
 
 dot_prod
 ---------
-.. autoclass:: paddle.v2.layer.dot_prod
+.. autofunction:: paddle.v2.layer.dot_prod
     :noindex:
 
 out_prod
 --------
-.. autoclass:: paddle.v2.layer.out_prod
+.. autofunction:: paddle.v2.layer.out_prod
     :noindex:
 
 power
 -----
-..  autoclass:: paddle.v2.layer.power
+..  autofunction:: paddle.v2.layer.power
     :noindex:
 
 scaling
 -------
-..  autoclass:: paddle.v2.layer.scaling
+..  autofunction:: paddle.v2.layer.scaling
     :noindex:
 
 clip
 ----
-..  autoclass:: paddle.v2.layer.clip
+..  autofunction:: paddle.v2.layer.clip
     :noindex:
 
 resize
 ------
-..  autoclass:: paddle.v2.layer.resize
+..  autofunction:: paddle.v2.layer.resize
     :noindex:
 
 slope_intercept
 ---------------
-..  autoclass:: paddle.v2.layer.slope_intercept
+..  autofunction:: paddle.v2.layer.slope_intercept
     :noindex:
 
 tensor
 ------
-..  autoclass:: paddle.v2.layer.tensor
+..  autofunction:: paddle.v2.layer.tensor
     :noindex:
 
 ..  _api_v2.layer_cos_sim:
 
 cos_sim
 -------
-..  autoclass:: paddle.v2.layer.cos_sim
+..  autofunction:: paddle.v2.layer.cos_sim
     :noindex:
 
 l2_distance
 -----------
-..  autoclass:: paddle.v2.layer.l2_distance
+..  autofunction:: paddle.v2.layer.l2_distance
     :noindex:
 
 trans
 -----
-..  autoclass:: paddle.v2.layer.trans
+..  autofunction:: paddle.v2.layer.trans
     :noindex:
 
 scale_shift
 -----------
-..  autoclass:: paddle.v2.layer.scale_shift
+..  autofunction:: paddle.v2.layer.scale_shift
     :noindex:
 
 factorization_machine
 ---------------------
-..  autoclass:: paddle.v2.layer.factorization_machine
+..  autofunction:: paddle.v2.layer.factorization_machine
     :noindex:
 
 Sampling Layers
@@ -427,17 +422,17 @@ Sampling Layers
 
 maxid
 -----
-..  autoclass:: paddle.v2.layer.max_id
+..  autofunction:: paddle.v2.layer.max_id
     :noindex:
 
 sampling_id
 -----------
-..  autoclass:: paddle.v2.layer.sampling_id
+..  autofunction:: paddle.v2.layer.sampling_id
     :noindex:
 
 multiplex
 ---------
-..  autoclass:: paddle.v2.layer.multiplex
+..  autofunction:: paddle.v2.layer.multiplex
     :noindex:
 
 ..  _api_v2.layer_costs:
@@ -447,97 +442,97 @@ Cost Layers
 
 cross_entropy_cost
 ------------------
-..  autoclass:: paddle.v2.layer.cross_entropy_cost
+..  autofunction:: paddle.v2.layer.cross_entropy_cost
     :noindex:
 
 cross_entropy_with_selfnorm_cost
 --------------------------------
-..  autoclass:: paddle.v2.layer.cross_entropy_with_selfnorm_cost
+..  autofunction:: paddle.v2.layer.cross_entropy_with_selfnorm_cost
     :noindex:
 
 multi_binary_label_cross_entropy_cost
 -------------------------------------
-..  autoclass:: paddle.v2.layer.multi_binary_label_cross_entropy_cost
+..  autofunction:: paddle.v2.layer.multi_binary_label_cross_entropy_cost
     :noindex:
 
 classification_cost
 -------------------
-.. autoclass:: paddle.v2.layer.classification_cost
+.. autofunction:: paddle.v2.layer.classification_cost
    :noindex:
 
 huber_regression_cost
 -------------------------
-..  autoclass:: paddle.v2.layer.huber_regression_cost
+..  autofunction:: paddle.v2.layer.huber_regression_cost
     :noindex:
 
 huber_classification_cost
 -------------------------
-..  autoclass:: paddle.v2.layer.huber_classification_cost
+..  autofunction:: paddle.v2.layer.huber_classification_cost
     :noindex:
 
 lambda_cost
 -----------
-..  autoclass:: paddle.v2.layer.lambda_cost
+..  autofunction:: paddle.v2.layer.lambda_cost
     :noindex:
 
 square_error_cost
 -----------------
-..  autoclass:: paddle.v2.layer.square_error_cost
+..  autofunction:: paddle.v2.layer.square_error_cost
     :noindex:
 
 rank_cost
 ---------
-..  autoclass:: paddle.v2.layer.rank_cost
+..  autofunction:: paddle.v2.layer.rank_cost
     :noindex:
 
 sum_cost
 ---------
-..  autoclass:: paddle.v2.layer.sum_cost
+..  autofunction:: paddle.v2.layer.sum_cost
     :noindex:
 
 crf
 ---
-..  autoclass:: paddle.v2.layer.crf
+..  autofunction:: paddle.v2.layer.crf
     :noindex:
 
 crf_decoding
 ------------
-..  autoclass:: paddle.v2.layer.crf_decoding
+..  autofunction:: paddle.v2.layer.crf_decoding
     :noindex:
 
 ctc
 ---
-..  autoclass:: paddle.v2.layer.ctc
+..  autofunction:: paddle.v2.layer.ctc
     :noindex:
 
 warp_ctc
 --------
-..  autoclass:: paddle.v2.layer.warp_ctc
+..  autofunction:: paddle.v2.layer.warp_ctc
     :noindex:
 
 nce
 ---
-..  autoclass:: paddle.v2.layer.nce
+..  autofunction:: paddle.v2.layer.nce
     :noindex:
 
 hsigmoid
 ---------
-..  autoclass:: paddle.v2.layer.hsigmoid
+..  autofunction:: paddle.v2.layer.hsigmoid
     :noindex:
 
 smooth_l1_cost
 --------------
-..  autoclass:: paddle.v2.layer.smooth_l1_cost
+..  autofunction:: paddle.v2.layer.smooth_l1_cost
     :noindex:
 
 multibox_loss
 --------------
-..  autoclass:: paddle.v2.layer.multibox_loss
+..  autofunction:: paddle.v2.layer.multibox_loss
     :noindex:
 
 detection_output
 ----------------
-..  autoclass:: paddle.v2.layer.detection_output
+..  autofunction:: paddle.v2.layer.detection_output
     :noindex:
 
 Check Layer
@@ -545,7 +540,7 @@ Check Layer
 
 eos
 ---
-..  autoclass:: paddle.v2.layer.eos
+..  autofunction:: paddle.v2.layer.eos
     :noindex:
 
 Activation
@@ -553,5 +548,5 @@ Activation
 
 prelu
 --------
-..  autoclass:: paddle.v2.layer.prelu
+..  autofunction:: paddle.v2.layer.prelu
     :noindex:
diff --git a/doc/v2/api/index_en.rst b/doc/v2/api/index_en.rst
index b11cd449affd1dcd9d3f42492961469331350942..70c5c524aaf0a9ae003bf4340c3f268c225d4419 100644
--- a/doc/v2/api/index_en.rst
+++ b/doc/v2/api/index_en.rst
@@ -8,4 +8,3 @@ API
     model_configs.rst
     data.rst
     run_logic.rst
-    fluid/index.rst
diff --git a/doc/v2/build_and_install/pip_install_cn.rst b/doc/v2/build_and_install/pip_install_cn.rst
index 853bdb21bbcf07ae1742d2196dbcfe4668828b7b..095da19cd41d29bfa72ab23abd24bec45f925a86 100644
--- a/doc/v2/build_and_install/pip_install_cn.rst
+++ b/doc/v2/build_and_install/pip_install_cn.rst
@@ -60,6 +60,7 @@ paddlepaddle-gpu==0.11.0            使用CUDA 7.5和cuDNN 5编译的0.11.0版
     "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`_"
     "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
     "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cuda9.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
 
 .. _pip_dependency:
 
diff --git a/doc/v2/build_and_install/pip_install_en.rst b/doc/v2/build_and_install/pip_install_en.rst
index fecf6d3712feac3265100a6121901ba784f7d5cc..8406e4aa1fbb953c3b615b10d1bcb2c45974dde0 100644
--- a/doc/v2/build_and_install/pip_install_en.rst
+++ b/doc/v2/build_and_install/pip_install_en.rst
@@ -63,6 +63,7 @@ If the links below shows up the login form, just click "Log in as guest" to star
     "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
     "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
     "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cuda9.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
 
 .. _pip_dependency:
 
diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc
index 63d371310d2a26a1460e527fc51923dfd6e0b8bc..6b06913d1c83f4534238ac3dd22ac4035c0f0fbf 100644
--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
@@ -18,6 +18,17 @@
 namespace paddle {
 namespace operators {
 
+using conv_bwd_data = mkldnn::convolution_backward_data;
+using conv_bwd_weights = mkldnn::convolution_backward_weights;
+using conv_fwd = mkldnn::convolution_forward;
+using framework::DataLayout;
+using mkldnn::memory;
+using mkldnn::primitive;
+using mkldnn::reorder;
+using mkldnn::stream;
+using platform::to_void_cast;
+using platform::GetMKLDNNFormat;
+
 template <typename T>
 class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
  public:
@@ -25,6 +36,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
                    "It must use CPUPlace.");
 
+    // Get unique name for index
+    const std::string key = ctx.op().Output("Output");
+    const std::string key_conv_pd = key + "@conv_pd";
+
     auto& dev_ctx =
         ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
     const auto& mkldnn_engine = dev_ctx.GetEngine();
@@ -33,10 +48,12 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto* filter = ctx.Input<Tensor>("Filter");
     auto* output = ctx.Output<Tensor>("Output");
 
-    // Get an unique name from "argument" name of "Output" variable
-    // This name will be used as key when saving info into device context
-    const std::string key = ctx.op().Output("Output");
-    const std::string key_conv_pd = key + "@conv_pd";
+    PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN &&
+                       input->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Input tensor");
+    PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN &&
+                       filter->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Filter tensor");
 
     std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
@@ -63,60 +80,86 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         paddle::framework::vectorize2int(filter->dims());
     std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
 
-    // TODO(pzelazko-intel): support more formats
-    auto src_md = platform::MKLDNNMemDesc(
-        src_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
-    auto weights_md =
-        platform::MKLDNNMemDesc(weights_tz, mkldnn::memory::data_type::f32,
-                                mkldnn::memory::format::oihw);
-    auto dst_md = platform::MKLDNNMemDesc(
-        dst_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
-
-    auto src_memory =
-        mkldnn::memory({src_md, mkldnn_engine},
-                       reinterpret_cast<void*>(const_cast<T*>(input_data)));
-    auto weights_memory =
-        mkldnn::memory({weights_md, mkldnn_engine},
-                       reinterpret_cast<void*>(const_cast<T*>(filter_data)));
-    auto dst_memory = mkldnn::memory({dst_md, mkldnn_engine}, output_data);
-
-    std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd =
-        ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings,
-                             mkldnn_engine);
-
-    // save conv_pd into global device context to be referred in backward path
-    dev_ctx.SetBlob(key_conv_pd, conv_pd);
+    // create mkldnn memory from input tensors (data/weights)
+    auto user_src_memory = memory(
+        {{{src_tz}, memory::data_type::f32, input->format()}, mkldnn_engine},
+        to_void_cast(input_data));
+    auto user_weights_memory =
+        memory({{{weights_tz}, memory::data_type::f32, filter->format()},
+                mkldnn_engine},
+               to_void_cast(filter_data));
+
+    /* create memory descriptor for convolution without specified format
+     * ('any') which lets a primitive (convolution in this case) choose
+     * the memory format preferred for best performance
+     */
+    auto src_md = platform::MKLDNNMemDesc(src_tz, memory::data_type::f32,
+                                          memory::format::any);
+    auto weights_md = platform::MKLDNNMemDesc(
+        weights_tz, memory::data_type::f32, memory::format::any);
+    auto dst_md = platform::MKLDNNMemDesc(dst_tz, memory::data_type::f32,
+                                          memory::format::any);
+
+    // create a conv primitive descriptor and save it for usage in backward
+    std::shared_ptr<conv_fwd::primitive_desc> conv_pd = ConvFwdPrimitiveDesc(
+        src_md, weights_md, dst_md, strides, paddings, mkldnn_engine);
+
+    // create reorder primitive if the input format is not the preferred one
+    auto src_memory = user_src_memory;
+    primitive reorder_src;
+    bool is_src_reordered = false;
+    if (memory::primitive_desc(conv_pd->src_primitive_desc()) !=
+        user_src_memory.get_primitive_desc()) {
+      src_memory = memory(conv_pd->src_primitive_desc());
+      reorder_src = reorder(user_src_memory, src_memory);
+      is_src_reordered = true;
+    }
+    auto weights_memory = user_weights_memory;
+    primitive reorder_weights;
+    bool is_weights_reordered = false;
+    if (memory::primitive_desc(conv_pd->weights_primitive_desc()) !=
+        user_weights_memory.get_primitive_desc()) {
+      weights_memory = memory(conv_pd->weights_primitive_desc());
+      reorder_weights = reorder(user_weights_memory, weights_memory);
+      is_weights_reordered = true;
+    }
+
+    // create memory primitive for conv dst
+    auto dst_memory = memory(conv_pd->dst_primitive_desc(), output_data);
 
     // create convolution op primitive
-    auto conv_prim = mkldnn::convolution_forward(*conv_pd, src_memory,
-                                                 weights_memory, dst_memory);
+    auto conv_prim = conv_fwd(*conv_pd, src_memory, weights_memory, dst_memory);
 
     // push primitive to stream and wait until it's executed
-    std::vector<mkldnn::primitive> pipeline{conv_prim};
-    mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+    std::vector<primitive> pipeline;
+    if (is_src_reordered) pipeline.push_back(reorder_src);
+    if (is_weights_reordered) pipeline.push_back(reorder_weights);
+    pipeline.push_back(conv_prim);
+    stream(stream::kind::eager).submit(pipeline).wait();
+
+    // Save conv_pd/src_memory/weights_memory for backward pass
+    dev_ctx.SetBlob(key_conv_pd, conv_pd);
+
+    output->set_layout(DataLayout::kMKLDNN);
+    output->set_format(GetMKLDNNFormat(dst_memory));
   }
 
  private:
-  std::unique_ptr<mkldnn::convolution_forward::primitive_desc>
-  ConvFwdPrimitiveDesc(const mkldnn::memory::desc& src,
-                       const mkldnn::memory::desc& weights,
-                       const mkldnn::memory::desc& dst,
-                       const std::vector<int>& strides,
-                       const std::vector<int>& paddings,
-                       const mkldnn::engine& engine) const {
-    mkldnn::memory::dims stride_dims = {strides[0], strides[1]};
-    mkldnn::memory::dims padding_dims = {paddings[0], paddings[1]};
-
-    auto conv_desc = mkldnn::convolution_forward::desc(
-        mkldnn::prop_kind::forward, mkldnn::convolution_direct, src, weights,
-        dst, stride_dims, padding_dims, padding_dims,
-        mkldnn::padding_kind::zero);
-
-    auto p_conv_pd =
-        new mkldnn::convolution_forward::primitive_desc(conv_desc, engine);
-
-    return std::unique_ptr<mkldnn::convolution_forward::primitive_desc>(
-        p_conv_pd);
+  std::unique_ptr<conv_fwd::primitive_desc> ConvFwdPrimitiveDesc(
+      const memory::desc& src, const memory::desc& weights,
+      const memory::desc& dst, const std::vector<int>& strides,
+      const std::vector<int>& paddings, const mkldnn::engine& engine) const {
+    memory::dims stride_dims = {strides[0], strides[1]};
+    memory::dims padding_dims = {paddings[0], paddings[1]};
+
+    auto conv_desc =
+        conv_fwd::desc(mkldnn::prop_kind::forward, mkldnn::convolution_direct,
+                       src, weights, dst, stride_dims, padding_dims,
+                       padding_dims, mkldnn::padding_kind::zero);
+
+    auto p_conv_pd = new conv_fwd::primitive_desc(conv_desc, engine);
+
+    return std::unique_ptr<conv_fwd::primitive_desc>(p_conv_pd);
   }
 };
 
@@ -139,6 +182,19 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
     Tensor* filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
 
+    PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN &&
+                       input->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Input tensor");
+    PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN &&
+                       filter->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Filter tensor");
+    PADDLE_ENFORCE(output->layout() == DataLayout::kMKLDNN &&
+                       output->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Output tensor");
+    PADDLE_ENFORCE(output_grad->layout() == DataLayout::kMKLDNN &&
+                       output_grad->format() != memory::format::format_undef,
+                   "Wrong layout/format set for output_grad tensor");
+
     if (!input_grad && !filter_grad) return;
 
     // Get an unique name from "argument" name of "Output" variable
@@ -167,108 +223,147 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
         paddle::framework::vectorize2int(filter->dims());
     std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
 
-    // TODO(pzelazko-intel): support more formats
-    auto src_md = platform::MKLDNNMemDesc(
-        src_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
-    auto diff_src_md = platform::MKLDNNMemDesc(
-        src_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
-    auto weights_md =
-        platform::MKLDNNMemDesc(weights_tz, mkldnn::memory::data_type::f32,
-                                mkldnn::memory::format::oihw);
-    auto diff_weights_md =
-        platform::MKLDNNMemDesc(weights_tz, mkldnn::memory::data_type::f32,
-                                mkldnn::memory::format::oihw);
-    auto diff_dst_md = platform::MKLDNNMemDesc(
-        dst_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
-
-    // create memory
-    auto diff_dst_memory = mkldnn::memory(
-        {diff_weights_md, mkldnn_engine},
-        reinterpret_cast<void*>(const_cast<T*>(output_grad_data)));
+    // create mkldnn memory from input tensors (input/weights/output_grad)
+    auto user_src_memory = memory(
+        {{{src_tz}, memory::data_type::f32, input->format()}, mkldnn_engine},
+        to_void_cast(input_data));
+    auto user_weights_memory =
+        memory({{{weights_tz}, memory::data_type::f32, filter->format()},
+                mkldnn_engine},
+               to_void_cast(filter_data));
+    auto user_diff_dst_memory =
+        memory({{{dst_tz}, memory::data_type::f32, output_grad->format()},
+                mkldnn_engine},
+               to_void_cast(output_grad_data));
+
+    /* create memory descriptor for conv backward without specified format
+     * ('any') which lets a primitive (conv backward in this case) choose
+     * the memory format preferred for best performance
+     */
+    auto src_md = platform::MKLDNNMemDesc(src_tz, memory::data_type::f32,
+                                          memory::format::any);
+    auto diff_src_md = platform::MKLDNNMemDesc(src_tz, memory::data_type::f32,
+                                               memory::format::any);
+    auto weights_md = platform::MKLDNNMemDesc(
+        weights_tz, memory::data_type::f32, memory::format::any);
+    auto diff_weights_md = platform::MKLDNNMemDesc(
+        weights_tz, memory::data_type::f32, memory::format::any);
+    auto diff_dst_md = platform::MKLDNNMemDesc(dst_tz, memory::data_type::f32,
+                                               memory::format::any);
+
     // Retrieve conv_pd from device context
-    auto conv_pd =
-        std::static_pointer_cast<mkldnn::convolution_forward::primitive_desc>(
-            dev_ctx.GetBlob(key_conv_pd));
+    auto conv_pd = std::static_pointer_cast<conv_fwd::primitive_desc>(
+        dev_ctx.GetBlob(key_conv_pd));
     PADDLE_ENFORCE(conv_pd != nullptr,
                    "Fail to find conv_pd in device context");
 
     // create backward conv primitive for weights
     if (filter_grad) {
-      // create primitive descriptor
-      mkldnn::convolution_backward_weights::primitive_desc conv_bwd_weights_pd =
-          ConvBwdWeightsPrimitiveDesc(src_md, diff_weights_md, diff_dst_md,
-                                      strides, paddings, *conv_pd,
-                                      mkldnn_engine);
-
-      // create memory
+      // create backward convolution primitive descriptor
+      auto conv_bwd_weights_desc = conv_bwd_weights::desc(
+          mkldnn::convolution_direct, src_md, diff_weights_md, diff_dst_md,
+          strides, paddings, paddings, mkldnn::padding_kind::zero);
+      auto conv_bwd_weights_pd = conv_bwd_weights::primitive_desc(
+          conv_bwd_weights_desc, mkldnn_engine, *conv_pd);
+
+      // create reorder primitive if the input format is not the preferred one
+      auto src_memory = user_src_memory;
+      primitive reorder_src;
+      bool is_src_reordered = false;
+      if (memory::primitive_desc(conv_bwd_weights_pd.src_primitive_desc()) !=
+          user_src_memory.get_primitive_desc()) {
+        src_memory = memory(conv_bwd_weights_pd.src_primitive_desc());
+        reorder_src = reorder(user_src_memory, src_memory);
+        is_src_reordered = true;
+      }
+
+      auto diff_dst_memory_4filter = user_diff_dst_memory;
+      primitive reorder_diff_dst_4filter;
+      bool is_diff_dst_reordered_4filter = false;
+      if (memory::primitive_desc(
+              conv_bwd_weights_pd.diff_dst_primitive_desc()) !=
+          user_diff_dst_memory.get_primitive_desc()) {
+        diff_dst_memory_4filter =
+            memory(conv_bwd_weights_pd.diff_dst_primitive_desc());
+        reorder_diff_dst_4filter =
+            reorder(user_diff_dst_memory, diff_dst_memory_4filter);
+        is_diff_dst_reordered_4filter = true;
+      }
+
+      // create mkldnn memory for output (i.e. diff weights)
       auto diff_weights_memory =
-          mkldnn::memory({diff_weights_md, mkldnn_engine},
-                         reinterpret_cast<void*>(filter_grad_data));
-      auto src_memory =
-          mkldnn::memory({src_md, mkldnn_engine},
-                         reinterpret_cast<void*>(const_cast<T*>(input_data)));
+          memory(conv_bwd_weights_pd.diff_weights_primitive_desc(),
+                 reinterpret_cast<void*>(filter_grad_data));
 
       // create backward conv primitive for weights
-      auto conv_bwd_weights_prim = mkldnn::convolution_backward_weights(
-          conv_bwd_weights_pd, src_memory, diff_dst_memory,
-          diff_weights_memory);
+      auto conv_bwd_weights_prim =
+          conv_bwd_weights(conv_bwd_weights_pd, src_memory,
+                           diff_dst_memory_4filter, diff_weights_memory);
 
       // push primitive and execute it
-      std::vector<mkldnn::primitive> pipeline{conv_bwd_weights_prim};
-      mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+      std::vector<primitive> pipeline;
+      if (is_src_reordered) pipeline.push_back(reorder_src);
+      if (is_diff_dst_reordered_4filter)
+        pipeline.push_back(reorder_diff_dst_4filter);
+      pipeline.push_back(conv_bwd_weights_prim);
+      stream(stream::kind::eager).submit(pipeline).wait();
+
+      filter_grad->set_layout(DataLayout::kMKLDNN);
+      filter_grad->set_format(GetMKLDNNFormat(diff_weights_memory));
     }
 
     if (input_grad) {
-      // create primitive descriptor
-      mkldnn::convolution_backward_data::primitive_desc conv_bwd_data_pd =
-          ConvBwdDataPrimitiveDesc(diff_src_md, weights_md, diff_dst_md,
-                                   strides, paddings, *conv_pd, mkldnn_engine);
-
-      // create memory
-      auto diff_src_memory = mkldnn::memory(
-          {diff_src_md, mkldnn_engine},
-          reinterpret_cast<void*>(const_cast<T*>(input_grad_data)));
-      auto weights_memory =
-          mkldnn::memory({weights_md, mkldnn_engine},
-                         reinterpret_cast<void*>(const_cast<T*>(filter_data)));
+      // create backward convolution primitive descriptor
+      auto conv_bwd_data_desc = conv_bwd_data::desc(
+          mkldnn::convolution_direct, diff_src_md, weights_md, diff_dst_md,
+          strides, paddings, paddings, mkldnn::padding_kind::zero);
+      auto conv_bwd_data_pd = conv_bwd_data::primitive_desc(
+          conv_bwd_data_desc, mkldnn_engine, *conv_pd);
+
+      // create reorder primitive if the input format is not the preferred one
+      auto weights_memory = user_weights_memory;
+      primitive reorder_weights;
+      bool is_weights_reordered = false;
+      if (memory::primitive_desc(conv_bwd_data_pd.weights_primitive_desc()) !=
+          user_weights_memory.get_primitive_desc()) {
+        weights_memory = memory(conv_bwd_data_pd.weights_primitive_desc());
+        reorder_weights = reorder(user_weights_memory, weights_memory);
+        is_weights_reordered = true;
+      }
+
+      auto diff_dst_memory_4data = user_diff_dst_memory;
+      primitive reorder_diff_dst_4data;
+      bool is_diff_dst_reordered_4data = false;
+      if (memory::primitive_desc(conv_bwd_data_pd.diff_dst_primitive_desc()) !=
+          user_diff_dst_memory.get_primitive_desc()) {
+        diff_dst_memory_4data =
+            memory(conv_bwd_data_pd.diff_dst_primitive_desc());
+        reorder_diff_dst_4data =
+            reorder(user_diff_dst_memory, diff_dst_memory_4data);
+        is_diff_dst_reordered_4data = true;
+      }
+
+      // create mkldnn memory for output (i.e. diff src)
+      auto diff_src_memory = memory(conv_bwd_data_pd.diff_src_primitive_desc(),
+                                    reinterpret_cast<void*>(input_grad_data));
 
       // create backward conv primitive for data
-      auto conv_bwd_data_prim = mkldnn::convolution_backward_data(
-          conv_bwd_data_pd, diff_dst_memory, weights_memory, diff_src_memory);
+      auto conv_bwd_data_prim =
+          conv_bwd_data(conv_bwd_data_pd, diff_dst_memory_4data, weights_memory,
+                        diff_src_memory);
 
-      // push primitive to stream and wait until it's executed
-      std::vector<mkldnn::primitive> pipeline{conv_bwd_data_prim};
-      mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+      // push primitive and execute it
+      std::vector<primitive> pipeline;
+      if (is_weights_reordered) pipeline.push_back(reorder_weights);
+      if (is_diff_dst_reordered_4data)
+        pipeline.push_back(reorder_diff_dst_4data);
+      pipeline.push_back(conv_bwd_data_prim);
+      stream(stream::kind::eager).submit(pipeline).wait();
+
+      input_grad->set_layout(DataLayout::kMKLDNN);
+      input_grad->set_format(GetMKLDNNFormat(diff_src_memory));
     }
   }  // Compute()
-
- private:
-  mkldnn::convolution_backward_weights::primitive_desc
-  ConvBwdWeightsPrimitiveDesc(
-      const mkldnn::memory::desc& src, const mkldnn::memory::desc& diff_weights,
-      const mkldnn::memory::desc& diff_dst, const std::vector<int>& strides,
-      const std::vector<int>& paddings,
-      const mkldnn::convolution_forward::primitive_desc& conv_pd,
-      const mkldnn::engine& engine) const {
-    auto conv_bwd_weights_desc = mkldnn::convolution_backward_weights::desc(
-        mkldnn::convolution_direct, src, diff_weights, diff_dst, strides,
-        paddings, paddings, mkldnn::padding_kind::zero);
-    return mkldnn::convolution_backward_weights::primitive_desc(
-        conv_bwd_weights_desc, engine, conv_pd);
-  }
-
-  mkldnn::convolution_backward_data::primitive_desc ConvBwdDataPrimitiveDesc(
-      const mkldnn::memory::desc& diff_src, const mkldnn::memory::desc& weights,
-      const mkldnn::memory::desc& diff_dst, const std::vector<int>& strides,
-      const std::vector<int>& paddings,
-      const mkldnn::convolution_forward::primitive_desc& conv_pd,
-      const mkldnn::engine& engine) const {
-    auto conv_bwd_data_desc = mkldnn::convolution_backward_data::desc(
-        mkldnn::convolution_direct, diff_src, weights, diff_dst, strides,
-        paddings, paddings, mkldnn::padding_kind::zero);
-    return mkldnn::convolution_backward_data::primitive_desc(conv_bwd_data_desc,
-                                                             engine, conv_pd);
-  }
 };
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index 27f1313116aad99d34fa8f1d3d6a1e7aced4d394..37153d58439a90190eb2ad82d5dcc145e22dfa48 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -75,9 +75,8 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
 framework::OpKernelType ConvOp::GetExpectedKernelType(
     const framework::ExecutionContext& ctx) const {
   framework::LibraryType library{framework::LibraryType::kPlain};
-
-  std::string data_format = ctx.Attr<std::string>("data_format");
   // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+  std::string data_format = ctx.Attr<std::string>("data_format");
   framework::DataLayout layout = framework::StringToDataLayout(data_format);
 
 #ifdef PADDLE_WITH_CUDA
diff --git a/paddle/fluid/operators/gen_nccl_id_op.cc b/paddle/fluid/operators/gen_nccl_id_op.cc
index 111e58844c83806af4ebe0aa9e2126a9ddec1d8a..f824eee4e7d1ef19c9a38fd5d3369265f9c549a0 100644
--- a/paddle/fluid/operators/gen_nccl_id_op.cc
+++ b/paddle/fluid/operators/gen_nccl_id_op.cc
@@ -67,6 +67,10 @@ class GenNCCLIdOp : public framework::OperatorBase {
       client->AsyncSendVar(ep, dev_ctx, *scope, NCCL_ID_VARNAME);
     }
     client->Wait();
+    for (auto& ep : endpoint_list) {
+      client->AsyncSendBatchBarrier(ep);
+    }
+    client->Wait();
     VLOG(3) << "sending completed...";
   }
 
diff --git a/python/paddle/batch.py b/python/paddle/batch.py
index d48c54fcbb66487617b1946bc69724870c8f879c..3c6a53db3c2287e8ef5931a06ca5dad455665ee0 100644
--- a/python/paddle/batch.py
+++ b/python/paddle/batch.py
@@ -15,7 +15,7 @@
 __all__ = ['batch']
 
 
-def batch(reader, batch_size, drop_last=False):
+def batch(reader, batch_size, drop_last=True):
     """
     Create a batched reader.
 
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 047c3aa2b7acd0e19b05dfae13c28d2f16007e10..3194b72da68ca0567793de29fcac02995136b4c1 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -262,9 +262,10 @@ def embedding(input,
     return tmp
 
 
-# TODO(qijun): expose H0 and C0
 def dynamic_lstm(input,
                  size,
+                 h_0=None,
+                 c_0=None,
                  param_attr=None,
                  bias_attr=None,
                  use_peepholes=True,
@@ -325,6 +326,13 @@ def dynamic_lstm(input,
                          (T X 4D), where T is the total time steps in this
                          mini-batch, D is the hidden size.
         size(int): 4 * hidden size.
+        h_0(Variable): The initial hidden state is an optional input, default is zero.
+                       This is a tensor with shape (N x D), where N is the
+                       batch size and D is the hidden size.
+        c_0(Variable): The initial cell state is an optional input, default is zero.
+                       This is a tensor with shape (N x D), where N is the
+                       batch size. `h_0` and `c_0` can be NULL but only at the same time.
+
         param_attr(ParamAttr|None): The parameter attribute for the learnable
                                hidden-hidden weights.
 
@@ -388,12 +396,20 @@ def dynamic_lstm(input,
     cell = helper.create_tmp_variable(dtype)
     batch_gate = helper.create_tmp_variable(dtype)
     batch_cell_pre_act = helper.create_tmp_variable(dtype)
+    inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
+    batch_size = input.shape[0]
+    if h_0:
+        assert h_0.shape == (batch_size, size), \
+            'The shape of h0 should be (batch_size, %d)' % size
+        inputs['H0'] = h_0
+    if c_0:
+        assert c_0.shape == (batch_size, size), \
+            'The shape of c0 should be (batch_size, %d)' % size
+        inputs['C0'] = c_0
 
     helper.append_op(
         type='lstm',
-        inputs={'Input': input,
-                'Weight': weight,
-                'Bias': bias},
+        inputs=inputs,
         outputs={
             'Hidden': hidden,
             'Cell': cell,
@@ -678,11 +694,13 @@ def dynamic_gru(input,
         attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype)
     bias = helper.create_parameter(
         attr=helper.bias_attr, shape=[1, 3 * size], dtype=dtype, is_bias=True)
+    batch_size = input.shape[0]
     inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
     if h_0 != None:
         assert h_0.shape == (
-            size, size), 'The shape of h0 should be(%d, %d)' % (size, size)
-        inputs['h0'] = h_0
+            batch_size, size
+        ), 'The shape of h0 should be(batch_size, %d)' % size
+        inputs['H0'] = h_0
 
     hidden = helper.create_tmp_variable(dtype)
     batch_gate = helper.create_tmp_variable(dtype)
diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
index 2df3da9cca7042222317de626460909f018cb107..8e222d26907e8fe697b596a67e62cc9df84afe0e 100644
--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
@@ -96,10 +96,11 @@ def train(use_cuda, train_program, params_dirname):
     train_reader = paddle.batch(
         paddle.reader.shuffle(
             cifar10_small_test_set.train10(batch_size=10), buf_size=128 * 10),
-        batch_size=BATCH_SIZE)
+        batch_size=BATCH_SIZE,
+        drop_last=False)
 
     test_reader = paddle.batch(
-        paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE)
+        paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE, drop_last=False)
 
     def event_handler(event):
         if isinstance(event, fluid.EndStepEvent):
diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
index 224cca417e717bbcc54b58be6ac0219be207dea3..dbc7bc06c93157f271c79e85b6925468e861e57f 100644
--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
@@ -73,10 +73,11 @@ def train(use_cuda, train_program, params_dirname):
     train_reader = paddle.batch(
         paddle.reader.shuffle(
             cifar10_small_test_set.train10(batch_size=10), buf_size=128 * 10),
-        batch_size=BATCH_SIZE)
+        batch_size=BATCH_SIZE,
+        drop_last=False)
 
     test_reader = paddle.batch(
-        paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE)
+        paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE, drop_last=False)
 
     def event_handler(event):
         if isinstance(event, fluid.EndStepEvent):
diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py
index 113dda88ca974c9e6241f127091bd96fb2af4a70..8c74be0f08855c20f5aa3ecd75622a51e94a0304 100644
--- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py
+++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py
@@ -87,7 +87,9 @@ def train(use_cuda, train_program, params_dirname):
     def event_handler(event):
         if isinstance(event, fluid.EndEpochEvent):
             test_reader = paddle.batch(
-                paddle.dataset.imdb.test(word_dict), batch_size=BATCH_SIZE)
+                paddle.dataset.imdb.test(word_dict),
+                batch_size=BATCH_SIZE,
+                drop_last=False)
             avg_cost, acc = trainer.test(
                 reader=test_reader, feed_order=['words', 'label'])
 
@@ -113,7 +115,8 @@ def train(use_cuda, train_program, params_dirname):
     train_reader = paddle.batch(
         paddle.reader.shuffle(
             paddle.dataset.imdb.train(word_dict), buf_size=25000),
-        batch_size=BATCH_SIZE)
+        batch_size=BATCH_SIZE,
+        drop_last=False)
 
     trainer.train(
         num_epochs=1,
diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
index 8818cf96fa8f08036f9e23aae786f67b5614b2b9..be347cd5315668dde0454d7959dbf9bcfa465b5f 100644
--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
@@ -56,7 +56,7 @@ BATCH_SIZE = 200
 
 # fix the order of training data
 train_reader = paddle.batch(
-    paddle.dataset.uci_housing.train(), batch_size=BATCH_SIZE)
+    paddle.dataset.uci_housing.train(), batch_size=BATCH_SIZE, drop_last=False)
 
 # train_reader = paddle.batch(
 #     paddle.reader.shuffle(
diff --git a/python/paddle/trainer_config_helpers/attrs.py b/python/paddle/trainer_config_helpers/attrs.py
index e6f87ce61b1d16d4f98f111626776aa52c2ec35b..4e3beaf639bad9fed2862a5477095b66ef4b9aee 100644
--- a/python/paddle/trainer_config_helpers/attrs.py
+++ b/python/paddle/trainer_config_helpers/attrs.py
@@ -240,14 +240,15 @@ class ExtraLayerAttribute(object):
     :type error_clipping_threshold: float
     :param drop_rate: Dropout rate. Dropout will create a mask on layer output.
                       The dropout rate is the zero rate of this mask. The
-                      details of what dropout is please refer to `here
-                      <https://www.cs.toronto.edu/~hinton/absps/
-                      JMLRdropout.pdf>`_.
+                      details of what dropout is please refer to `JMLRdropout
+                      <https://www.cs.toronto.edu/~hinton/absps/JMLRdropout.pdf
+                      >`_.
     :type drop_rate: float
     :param device: device ID of layer. device=-1, use CPU. device>=0, use GPU.
-                   The details allocation in parallel_nn please refer to `here
-                   <http://www.paddlepaddle.org/doc/ui/cmd_argument/
-                   use_case.html#case-2-specify-layers-in-different-devices>`_.
+                   The details allocation in parallel_nn please refer to `use_case
+                   <https://github.com/PaddlePaddle/Paddle/blob/develop/doc/v2
+                   /howto/cmd_parameter/use_case_en.md#case-2-specify-layers-in
+                   -different-devices>`_.
     :type device: int
     """
 
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index ebc31b23e0f5504b4bebccabe996b054c7fbce3b..e6a03759ef431086390e217eabcdff47e610346c 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -2556,7 +2556,7 @@ def img_conv_layer(input,
     the output will be obtained by concatenating the two results.
 
     The details of grouped convolution, please refer to:
-    `ImageNet Classification with Deep Convolutional Neural Networks
+    `ImageNet Classification With Deep Convolutional Neural Networks
     <http://www.cs.toronto.edu/~kriz/imagenet_classification_with_deep_convolutional.pdf>`_
     
     The example usage is:
@@ -5678,8 +5678,8 @@ def warp_ctc_layer(input,
     <https://github.com/baidu-research/warp-ctc>`_ library, which is used in
     `Deep Speech 2: End-toEnd Speech Recognition in English and Mandarin
     <https://arxiv.org/pdf/1512.02595v1.pdf>`_, to compute Connectionist Temporal
-    Classification (CTC) loss. Besides, another `warp-ctc
-    <https://github.com/gangliao/warp-ctc>`_ repository, which is forked from
+    Classification (CTC) loss. Besides, another `warp-ctc repository
+    <https://github.com/gangliao/warp-ctc>`_ , which is forked from
     the official one, is maintained to enable more compiling options. During the
     building process, PaddlePaddle will clone the source codes, build and
     install it to :code:`third_party/install/warpctc` directory.
diff --git a/python/paddle/v2/minibatch.py b/python/paddle/v2/minibatch.py
index d48c54fcbb66487617b1946bc69724870c8f879c..3c6a53db3c2287e8ef5931a06ca5dad455665ee0 100644
--- a/python/paddle/v2/minibatch.py
+++ b/python/paddle/v2/minibatch.py
@@ -15,7 +15,7 @@
 __all__ = ['batch']
 
 
-def batch(reader, batch_size, drop_last=False):
+def batch(reader, batch_size, drop_last=True):
     """
     Create a batched reader.