diff --git a/doc/fluid/api/detection.rst b/doc/fluid/api/detection.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/doc/fluid/api/gen_doc.sh b/doc/fluid/api/gen_doc.sh
index 0f0539355559446fd91f659d61b636db214b5a40..3ee299f5aceb32465574277c25d962360183f1cc 100755
--- a/doc/fluid/api/gen_doc.sh
+++ b/doc/fluid/api/gen_doc.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-python gen_doc.py layers --submodules control_flow device io nn ops tensor > layers.rst
+python gen_doc.py layers --submodules control_flow device io nn ops tensor detection > layers.rst
 
 for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer
 do
diff --git a/doc/fluid/api/io.rst b/doc/fluid/api/io.rst
index 3e956f8302d261b52f9f76ff8eb4a01f9c6381f8..dd9d88b669957c22cd0a07fa4b7e219e2d6e5d61 100644
--- a/doc/fluid/api/io.rst
+++ b/doc/fluid/api/io.rst
@@ -59,21 +59,3 @@ get_inference_program
 ..  autofunction:: paddle.fluid.io.get_inference_program
     :noindex:
 
-save_checkpoint
----------------
-
-..  autofunction:: paddle.fluid.io.save_checkpoint
-    :noindex:
-
-load_checkpoint
----------------
-
-..  autofunction:: paddle.fluid.io.load_checkpoint
-    :noindex:
-
-clean_checkpoint
-----------------
-
-..  autofunction:: paddle.fluid.io.clean_checkpoint
-    :noindex:
-
diff --git a/doc/fluid/api/layers.rst b/doc/fluid/api/layers.rst
index f78e6db3268e44d5f30d83508f07c4ed68106e48..ba33c0d7d650e76711040c40ab9e5fdcf11c3a6c 100644
--- a/doc/fluid/api/layers.rst
+++ b/doc/fluid/api/layers.rst
@@ -181,12 +181,6 @@ Print
 ..  autofunction:: paddle.fluid.layers.Print
     :noindex:
 
-is_empty
---------
-
-..  autofunction:: paddle.fluid.layers.is_empty
-    :noindex:
-
 device
 ======
 
@@ -261,19 +255,6 @@ double_buffer
 ..  autofunction:: paddle.fluid.layers.double_buffer
     :noindex:
 
-random_data_generator
----------------------
-
-..  autofunction:: paddle.fluid.layers.random_data_generator
-    :noindex:
-
-Preprocessor
-------------
-
-..  autoclass:: paddle.fluid.layers.Preprocessor
-    :members:
-    :noindex:
-
 nn
 ==
 
@@ -613,30 +594,6 @@ roi_pool
 ..  autofunction:: paddle.fluid.layers.roi_pool
     :noindex:
 
-dice_loss
----------
-
-..  autofunction:: paddle.fluid.layers.dice_loss
-    :noindex:
-
-resize_bilinear
----------------
-
-..  autofunction:: paddle.fluid.layers.resize_bilinear
-    :noindex:
-
-gather
-------
-
-..  autofunction:: paddle.fluid.layers.gather
-    :noindex:
-
-random_crop
------------
-
-..  autofunction:: paddle.fluid.layers.random_crop
-    :noindex:
-
 ops
 ===
 
@@ -784,12 +741,6 @@ sum
 ..  autofunction:: paddle.fluid.layers.sum
     :noindex:
 
-shape
------
-
-..  autofunction:: paddle.fluid.layers.shape
-    :noindex:
-
 sigmoid
 -------
 
@@ -1039,3 +990,54 @@ zeros
 ..  autofunction:: paddle.fluid.layers.zeros
     :noindex:
 
+detection
+=========
+
+multi_box_head
+--------------
+
+..  autofunction:: paddle.fluid.layers.multi_box_head
+    :noindex:
+
+bipartite_match
+---------------
+
+..  autofunction:: paddle.fluid.layers.bipartite_match
+    :noindex:
+
+target_assign
+-------------
+
+..  autofunction:: paddle.fluid.layers.target_assign
+    :noindex:
+
+detection_output
+----------------
+
+..  autofunction:: paddle.fluid.layers.detection_output
+    :noindex:
+
+ssd_loss
+--------
+
+..  autofunction:: paddle.fluid.layers.ssd_loss
+    :noindex:
+
+detection_map
+-------------
+
+..  autofunction:: paddle.fluid.layers.detection_map
+    :noindex:
+
+iou_similarity
+--------------
+
+..  autofunction:: paddle.fluid.layers.iou_similarity
+    :noindex:
+
+box_coder
+---------
+
+..  autofunction:: paddle.fluid.layers.box_coder
+    :noindex:
+
diff --git a/doc/fluid/api/optimizer.rst b/doc/fluid/api/optimizer.rst
index 6ad44bb6905b6e3f2b6e4aeb3701ced5d18e2005..79a0995fce303518d989693976c4e92e05795ca2 100644
--- a/doc/fluid/api/optimizer.rst
+++ b/doc/fluid/api/optimizer.rst
@@ -89,13 +89,6 @@ DecayedAdagradOptimizer
     :members:
     :noindex:
 
-RMSPropOptimizer
-----------------
-
-..  autoclass:: paddle.fluid.optimizer.RMSPropOptimizer
-    :members:
-    :noindex:
-
 Adadelta
 --------
 
diff --git a/doc/fluid/api/profiler.rst b/doc/fluid/api/profiler.rst
index 39fda65863471a78895503184848a754828b71a1..74d102dcb0db35766c34e3d14939a8aa5861686b 100644
--- a/doc/fluid/api/profiler.rst
+++ b/doc/fluid/api/profiler.rst
@@ -23,15 +23,3 @@ profiler
 ..  autofunction:: paddle.fluid.profiler.profiler
     :noindex:
 
-start_profiler
---------------
-
-..  autofunction:: paddle.fluid.profiler.start_profiler
-    :noindex:
-
-stop_profiler
--------------
-
-..  autofunction:: paddle.fluid.profiler.stop_profiler
-    :noindex:
-
diff --git a/doc/v2/api/config/evaluators.rst b/doc/v2/api/config/evaluators.rst
index 9ac972fb193a2fb525edc507f7ba1303d2c8eabe..458d892e825a7a9bbe7843ad5c508bd5a31f5f0f 100644
--- a/doc/v2/api/config/evaluators.rst
+++ b/doc/v2/api/config/evaluators.rst
@@ -101,7 +101,7 @@ value_printer
     :noindex:
 
 Detection
-=====
+==========
 
 detection_map
 -------------
diff --git a/doc/v2/api/config/layer.rst b/doc/v2/api/config/layer.rst
index 1a6496968cae1fef88142ba9ca3f9e63a81b196d..5a0cfadfce84df41defdf518b7c3a6222d5b30a1 100644
--- a/doc/v2/api/config/layer.rst
+++ b/doc/v2/api/config/layer.rst
@@ -11,7 +11,7 @@ Data layer
 
 data
 ----
-..  autoclass:: paddle.v2.layer.data
+..  autofunction:: paddle.v2.layer.data
     :noindex:
 
 Fully Connected Layers
@@ -21,12 +21,12 @@ Fully Connected Layers
 
 fc
 --
-..  autoclass:: paddle.v2.layer.fc
+..  autofunction:: paddle.v2.layer.fc
     :noindex:
 
 selective_fc
 ------------
-..  autoclass:: paddle.v2.layer.selective_fc
+..  autofunction:: paddle.v2.layer.selective_fc
     :noindex:
 
 Conv Layers
@@ -34,34 +34,34 @@ Conv Layers
 
 conv_operator
 -------------
-..  autoclass:: paddle.v2.layer.conv_operator
+..  autofunction:: paddle.v2.layer.conv_operator
     :noindex:
 
 conv_projection
 ---------------
-..  autoclass:: paddle.v2.layer.conv_projection
+..  autofunction:: paddle.v2.layer.conv_projection
     :noindex:
 
 conv_shift
 ----------
-..  autoclass:: paddle.v2.layer.conv_shift
+..  autofunction:: paddle.v2.layer.conv_shift
     :noindex:
 
 img_conv
 --------
-..  autoclass:: paddle.v2.layer.img_conv
+..  autofunction:: paddle.v2.layer.img_conv
     :noindex:
 
 ..  _api_v2.layer_context_projection:
 
 context_projection
 ------------------
-..  autoclass:: paddle.v2.layer.context_projection
+..  autofunction:: paddle.v2.layer.context_projection
     :noindex:
 
 row_conv
 --------
-..  autoclass:: paddle.v2.layer.row_conv
+..  autofunction:: paddle.v2.layer.row_conv
     :noindex:
 
 Image Pooling Layer
@@ -69,27 +69,27 @@ Image Pooling Layer
 
 img_pool
 --------
-..  autoclass:: paddle.v2.layer.img_pool
+..  autofunction:: paddle.v2.layer.img_pool
     :noindex:
 
 spp
 ---
-..  autoclass:: paddle.v2.layer.spp
+..  autofunction:: paddle.v2.layer.spp
     :noindex:
 
 maxout
 ------
-..  autoclass:: paddle.v2.layer.maxout
+..  autofunction:: paddle.v2.layer.maxout
     :noindex:
 
 roi_pool
 --------
-..  autoclass:: paddle.v2.layer.roi_pool
+..  autofunction:: paddle.v2.layer.roi_pool
     :noindex:
 
 pad
 ----
-..  autoclass:: paddle.v2.layer.pad
+..  autofunction:: paddle.v2.layer.pad
     :noindex:
 
 Norm Layer
@@ -97,27 +97,27 @@ Norm Layer
 
 img_cmrnorm
 -----------
-..  autoclass:: paddle.v2.layer.img_cmrnorm
+..  autofunction:: paddle.v2.layer.img_cmrnorm
     :noindex:
 
 batch_norm
 ----------
-..  autoclass:: paddle.v2.layer.batch_norm
+..  autofunction:: paddle.v2.layer.batch_norm
     :noindex:
 
 sum_to_one_norm
 ---------------
-..  autoclass:: paddle.v2.layer.sum_to_one_norm
+..  autofunction:: paddle.v2.layer.sum_to_one_norm
     :noindex:
 
 cross_channel_norm
 ------------------
-..  autoclass:: paddle.v2.layer.cross_channel_norm
+..  autofunction:: paddle.v2.layer.cross_channel_norm
     :noindex:
 
 row_l2_norm
 -----------
-..  autoclass:: paddle.v2.layer.row_l2_norm
+..  autofunction:: paddle.v2.layer.row_l2_norm
     :noindex:
 
 Recurrent Layers
@@ -125,22 +125,22 @@ Recurrent Layers
 
 recurrent
 ---------
-..  autoclass:: paddle.v2.layer.recurrent
+..  autofunction:: paddle.v2.layer.recurrent
     :noindex:
 
 lstmemory
 ---------
-..  autoclass:: paddle.v2.layer.lstmemory
+..  autofunction:: paddle.v2.layer.lstmemory
     :noindex:
 
 grumemory
 ---------
-..  autoclass:: paddle.v2.layer.grumemory
+..  autofunction:: paddle.v2.layer.grumemory
     :noindex:
 
 gated_unit
 -----------
-..  autoclass:: paddle.v2.layer.gated_unit
+..  autofunction:: paddle.v2.layer.gated_unit
     :noindex:
 
 Recurrent Layer Group
@@ -148,32 +148,32 @@ Recurrent Layer Group
 
 memory
 ------
-..  autoclass:: paddle.v2.layer.memory
+..  autofunction:: paddle.v2.layer.memory
     :noindex:
 
 recurrent_group
 ---------------
-..  autoclass:: paddle.v2.layer.recurrent_group
+..  autofunction:: paddle.v2.layer.recurrent_group
     :noindex:
 
 lstm_step
 ---------
-..  autoclass:: paddle.v2.layer.lstm_step
+..  autofunction:: paddle.v2.layer.lstm_step
     :noindex:
 
 gru_step
 --------
-..  autoclass:: paddle.v2.layer.gru_step
+..  autofunction:: paddle.v2.layer.gru_step
     :noindex:
 
 beam_search
 ------------
-..  autoclass:: paddle.v2.layer.beam_search
+..  autofunction:: paddle.v2.layer.beam_search
     :noindex:
 
 get_output
 ----------
-..  autoclass:: paddle.v2.layer.get_output
+..  autofunction:: paddle.v2.layer.get_output
     :noindex:
 
 Mixed Layer
@@ -183,54 +183,54 @@ Mixed Layer
 
 mixed
 -----
-..  autoclass:: paddle.v2.layer.mixed
+..  autofunction:: paddle.v2.layer.mixed
     :noindex:
 
 ..  _api_v2.layer_embedding:
 
 embedding
 ---------
-..  autoclass:: paddle.v2.layer.embedding
+..  autofunction:: paddle.v2.layer.embedding
     :noindex:
 
 scaling_projection
 ------------------
-..  autoclass:: paddle.v2.layer.scaling_projection
+..  autofunction:: paddle.v2.layer.scaling_projection
     :noindex:
 
 dotmul_projection
 -----------------
-..  autoclass:: paddle.v2.layer.dotmul_projection
+..  autofunction:: paddle.v2.layer.dotmul_projection
     :noindex:
 
 dotmul_operator
 ---------------
-..  autoclass:: paddle.v2.layer.dotmul_operator
+..  autofunction:: paddle.v2.layer.dotmul_operator
     :noindex:
 
 full_matrix_projection
 ----------------------
-..  autoclass:: paddle.v2.layer.full_matrix_projection
+..  autofunction:: paddle.v2.layer.full_matrix_projection
     :noindex:
 
 identity_projection
 -------------------
-..  autoclass:: paddle.v2.layer.identity_projection
+..  autofunction:: paddle.v2.layer.identity_projection
     :noindex:
 
 slice_projection
 -------------------
-..  autoclass:: paddle.v2.layer.slice_projection
+..  autofunction:: paddle.v2.layer.slice_projection
     :noindex:
 
 table_projection
 ----------------
-..  autoclass:: paddle.v2.layer.table_projection
+..  autofunction:: paddle.v2.layer.table_projection
     :noindex:
 
 trans_full_matrix_projection
 ----------------------------
-..  autoclass:: paddle.v2.layer.trans_full_matrix_projection
+..  autofunction:: paddle.v2.layer.trans_full_matrix_projection
     :noindex:
 
 Aggregate Layers
@@ -245,51 +245,46 @@ AggregateLevel
 
 pooling
 -------
-..  autoclass:: paddle.v2.layer.pooling
+..  autofunction:: paddle.v2.layer.pooling
     :noindex:
 
 ..  _api_v2.layer_last_seq:
 
 last_seq
 --------
-..  autoclass:: paddle.v2.layer.last_seq
+..  autofunction:: paddle.v2.layer.last_seq
     :noindex:
 
 ..  _api_v2.layer_first_seq:
 
 first_seq
 ---------
-..  autoclass:: paddle.v2.layer.first_seq
+..  autofunction:: paddle.v2.layer.first_seq
     :noindex:
 
 sub_seq
 ---------
-..  autoclass:: paddle.v2.layer.sub_seq
+..  autofunction:: paddle.v2.layer.sub_seq
     :noindex:
 
 concat
 ------
-..  autoclass:: paddle.v2.layer.concat
+..  autofunction:: paddle.v2.layer.concat
     :noindex:
 
 seq_concat
 ----------
-..  autoclass:: paddle.v2.layer.seq_concat
+..  autofunction:: paddle.v2.layer.seq_concat
     :noindex:
 
 seq_slice
 ---------
-..  autoclass:: paddle.v2.layer.seq_slice
-    :noindex:
-
-kmax_sequence_score
--------------------
-..  autoclass:: paddle.v2.layer.kmax_sequence_score
+..  autofunction:: paddle.v2.layer.seq_slice
     :noindex:
 
 sub_nested_seq
 --------------
-..  autoclass:: paddle.v2.layer.sub_nested_seq
+..  autofunction:: paddle.v2.layer.sub_nested_seq
     :noindex:
 
 Reshaping Layers
@@ -297,7 +292,7 @@ Reshaping Layers
 
 block_expand
 ------------
-..  autoclass:: paddle.v2.layer.block_expand
+..  autofunction:: paddle.v2.layer.block_expand
     :noindex:
 
 ..  _api_v2.layer_expand:
@@ -309,22 +304,22 @@ ExpandLevel
 
 expand
 ------
-..  autoclass:: paddle.v2.layer.expand
+..  autofunction:: paddle.v2.layer.expand
     :noindex:
 
 repeat
 ------
-..  autoclass:: paddle.v2.layer.repeat
+..  autofunction:: paddle.v2.layer.repeat
     :noindex:
 
 rotate
 ------
-..  autoclass:: paddle.v2.layer.rotate
+..  autofunction:: paddle.v2.layer.rotate
     :noindex:
 
 seq_reshape
 -----------
-..  autoclass:: paddle.v2.layer.seq_reshape
+..  autofunction:: paddle.v2.layer.seq_reshape
     :noindex:
 
 Math Layers
@@ -332,94 +327,94 @@ Math Layers
 
 addto
 -----
-..  autoclass:: paddle.v2.layer.addto
+..  autofunction:: paddle.v2.layer.addto
     :noindex:
 
 linear_comb
 -----------
-..  autoclass:: paddle.v2.layer.linear_comb
+..  autofunction:: paddle.v2.layer.linear_comb
     :noindex:
 
 interpolation
 -------------
-..  autoclass:: paddle.v2.layer.interpolation
+..  autofunction:: paddle.v2.layer.interpolation
     :noindex:
 
 bilinear_interp
 ---------------
-..  autoclass:: paddle.v2.layer.bilinear_interp
+..  autofunction:: paddle.v2.layer.bilinear_interp
     :noindex:
 
 dropout
 --------
-..  autoclass:: paddle.v2.layer.dropout
+..  autofunction:: paddle.v2.layer.dropout
     :noindex:
 
 dot_prod
 ---------
-.. autoclass:: paddle.v2.layer.dot_prod
+.. autofunction:: paddle.v2.layer.dot_prod
     :noindex:
 
 out_prod
 --------
-.. autoclass:: paddle.v2.layer.out_prod
+.. autofunction:: paddle.v2.layer.out_prod
     :noindex:
 
 power
 -----
-..  autoclass:: paddle.v2.layer.power
+..  autofunction:: paddle.v2.layer.power
     :noindex:
 
 scaling
 -------
-..  autoclass:: paddle.v2.layer.scaling
+..  autofunction:: paddle.v2.layer.scaling
     :noindex:
 
 clip
 ----
-..  autoclass:: paddle.v2.layer.clip
+..  autofunction:: paddle.v2.layer.clip
     :noindex:
 
 resize
 ------
-..  autoclass:: paddle.v2.layer.resize
+..  autofunction:: paddle.v2.layer.resize
     :noindex:
 
 slope_intercept
 ---------------
-..  autoclass:: paddle.v2.layer.slope_intercept
+..  autofunction:: paddle.v2.layer.slope_intercept
     :noindex:
 
 tensor
 ------
-..  autoclass:: paddle.v2.layer.tensor
+..  autofunction:: paddle.v2.layer.tensor
     :noindex:
 
 ..  _api_v2.layer_cos_sim:
 
 cos_sim
 -------
-..  autoclass:: paddle.v2.layer.cos_sim
+..  autofunction:: paddle.v2.layer.cos_sim
     :noindex:
 
 l2_distance
 -----------
-..  autoclass:: paddle.v2.layer.l2_distance
+..  autofunction:: paddle.v2.layer.l2_distance
     :noindex:
 
 trans
 -----
-..  autoclass:: paddle.v2.layer.trans
+..  autofunction:: paddle.v2.layer.trans
     :noindex:
 
 scale_shift
 -----------
-..  autoclass:: paddle.v2.layer.scale_shift
+..  autofunction:: paddle.v2.layer.scale_shift
     :noindex:
 
 factorization_machine
 ---------------------
-..  autoclass:: paddle.v2.layer.factorization_machine
+..  autofunction:: paddle.v2.layer.factorization_machine
     :noindex:
 
 Sampling Layers
@@ -427,17 +422,17 @@ Sampling Layers
 
 maxid
 -----
-..  autoclass:: paddle.v2.layer.max_id
+..  autofunction:: paddle.v2.layer.max_id
     :noindex:
 
 sampling_id
 -----------
-..  autoclass:: paddle.v2.layer.sampling_id
+..  autofunction:: paddle.v2.layer.sampling_id
     :noindex:
 
 multiplex
 ---------
-..  autoclass:: paddle.v2.layer.multiplex
+..  autofunction:: paddle.v2.layer.multiplex
     :noindex:
 
 ..  _api_v2.layer_costs:
@@ -447,97 +442,97 @@ Cost Layers
 
 cross_entropy_cost
 ------------------
-..  autoclass:: paddle.v2.layer.cross_entropy_cost
+..  autofunction:: paddle.v2.layer.cross_entropy_cost
     :noindex:
 
 cross_entropy_with_selfnorm_cost
 --------------------------------
-..  autoclass:: paddle.v2.layer.cross_entropy_with_selfnorm_cost
+..  autofunction:: paddle.v2.layer.cross_entropy_with_selfnorm_cost
     :noindex:
 
 multi_binary_label_cross_entropy_cost
 -------------------------------------
-..  autoclass:: paddle.v2.layer.multi_binary_label_cross_entropy_cost
+..  autofunction:: paddle.v2.layer.multi_binary_label_cross_entropy_cost
     :noindex:
 
 classification_cost
 -------------------
-.. autoclass:: paddle.v2.layer.classification_cost
+.. autofunction:: paddle.v2.layer.classification_cost
    :noindex:
 
 huber_regression_cost
 -------------------------
-..  autoclass:: paddle.v2.layer.huber_regression_cost
+..  autofunction:: paddle.v2.layer.huber_regression_cost
     :noindex:
 
 huber_classification_cost
 -------------------------
-..  autoclass:: paddle.v2.layer.huber_classification_cost
+..  autofunction:: paddle.v2.layer.huber_classification_cost
     :noindex:
 
 lambda_cost
 -----------
-..  autoclass:: paddle.v2.layer.lambda_cost
+..  autofunction:: paddle.v2.layer.lambda_cost
     :noindex:
 
 square_error_cost
 -----------------
-..  autoclass:: paddle.v2.layer.square_error_cost
+..  autofunction:: paddle.v2.layer.square_error_cost
     :noindex:
 
 rank_cost
 ---------
-..  autoclass:: paddle.v2.layer.rank_cost
+..  autofunction:: paddle.v2.layer.rank_cost
     :noindex:
 
 sum_cost
 ---------
-..  autoclass:: paddle.v2.layer.sum_cost
+..  autofunction:: paddle.v2.layer.sum_cost
     :noindex:
 
 crf
 ---
-..  autoclass:: paddle.v2.layer.crf
+..  autofunction:: paddle.v2.layer.crf
     :noindex:
 
 crf_decoding
 ------------
-..  autoclass:: paddle.v2.layer.crf_decoding
+..  autofunction:: paddle.v2.layer.crf_decoding
     :noindex:
 
 ctc
 ---
-..  autoclass:: paddle.v2.layer.ctc
+..  autofunction:: paddle.v2.layer.ctc
     :noindex:
 
 warp_ctc
 --------
-..  autoclass:: paddle.v2.layer.warp_ctc
+..  autofunction:: paddle.v2.layer.warp_ctc
     :noindex:
 
 nce
 ---
-..  autoclass:: paddle.v2.layer.nce
+..  autofunction:: paddle.v2.layer.nce
     :noindex:
 
 hsigmoid
 ---------
-..  autoclass:: paddle.v2.layer.hsigmoid
+..  autofunction:: paddle.v2.layer.hsigmoid
     :noindex:
 
 smooth_l1_cost
 --------------
-..  autoclass:: paddle.v2.layer.smooth_l1_cost
+..  autofunction:: paddle.v2.layer.smooth_l1_cost
     :noindex:
 
 multibox_loss
 --------------
-..  autoclass:: paddle.v2.layer.multibox_loss
+..  autofunction:: paddle.v2.layer.multibox_loss
     :noindex:
 
 detection_output
 ----------------
-..  autoclass:: paddle.v2.layer.detection_output
+..  autofunction:: paddle.v2.layer.detection_output
     :noindex:
 
 Check Layer
@@ -545,7 +540,7 @@ Check Layer
 
 eos
 ---
-..  autoclass:: paddle.v2.layer.eos
+..  autofunction:: paddle.v2.layer.eos
     :noindex:
 
 Activation
@@ -553,5 +548,5 @@ Activation
 
 prelu
 --------
-..  autoclass:: paddle.v2.layer.prelu
+..  autofunction:: paddle.v2.layer.prelu
     :noindex:
diff --git a/doc/v2/api/index_en.rst b/doc/v2/api/index_en.rst
index b11cd449affd1dcd9d3f42492961469331350942..70c5c524aaf0a9ae003bf4340c3f268c225d4419 100644
--- a/doc/v2/api/index_en.rst
+++ b/doc/v2/api/index_en.rst
@@ -8,4 +8,3 @@ API
     model_configs.rst
     data.rst
     run_logic.rst
-    fluid/index.rst
diff --git a/doc/v2/build_and_install/pip_install_cn.rst b/doc/v2/build_and_install/pip_install_cn.rst
index 853bdb21bbcf07ae1742d2196dbcfe4668828b7b..095da19cd41d29bfa72ab23abd24bec45f925a86 100644
--- a/doc/v2/build_and_install/pip_install_cn.rst
+++ b/doc/v2/build_and_install/pip_install_cn.rst
@@ -60,6 +60,7 @@ paddlepaddle-gpu==0.11.0            使用CUDA 7.5和cuDNN 5编译的0.11.0版
     "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`_"
     "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
     "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cuda9.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
 
 .. _pip_dependency:
 
diff --git a/doc/v2/build_and_install/pip_install_en.rst b/doc/v2/build_and_install/pip_install_en.rst
index fecf6d3712feac3265100a6121901ba784f7d5cc..8406e4aa1fbb953c3b615b10d1bcb2c45974dde0 100644
--- a/doc/v2/build_and_install/pip_install_en.rst
+++ b/doc/v2/build_and_install/pip_install_en.rst
@@ -63,6 +63,7 @@ If the links below shows up the login form, just click "Log in as guest" to star
     "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
     "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
     "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cuda9.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
 
 .. _pip_dependency:
 
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 6bc770580640f242cfce6a9838f00210f785010a..6286dda4a54991b7a1042aed9886fdcb694198ba 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -84,7 +84,7 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
 cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
 
 if(WITH_DISTRIBUTE)
-  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr)
+  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr)
   set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
   set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 else()
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 4a6f53cba1f46214dbff3058b221f878ecf46613..e15232a77bb9c3e325b55737ea7abc55e3121708 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -330,8 +330,12 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
   }
 
   for (auto& op : ctx->ops_) {
-    VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);
+    VLOG(4) << place_ << " " << op->DebugStringEx(local_scope);
     op->Run(*local_scope, place_);
+    // NOTE! Please do not delete this line, it's usefull because the debug
+    // string before and after op.run are different, after run the output
+    // will have right shape which is usefull for debug.
+    VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);
 
     if (FLAGS_benchmark) {
       VLOG(2) << "Memory used after operator " + op->Type() + " running: "
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index c633a2f847683debce08c40b0c2ed6e58c0a7ad1..d22ac66c5c5f5c395849286f3f42e74ebd8f1f13 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -69,6 +69,19 @@ static DDim GetDims(const Scope& scope, const std::string& name,
   }
 }
 
+static int GetRowSize(const Scope& scope, const std::string& name) {
+  Variable* var = scope.FindVar(name);
+  if (var == nullptr) {
+    return -1;
+  }
+
+  if (var->IsType<SelectedRows>()) {
+    return var->Get<SelectedRows>().rows().size();
+  }
+
+  return -1;
+}
+
 static LoD GetLoD(const Scope& scope, const std::string& name) {
   Variable* var = scope.FindVar(name);
   auto default_lod = LoD({{}});
@@ -153,6 +166,10 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
     for (size_t i = 0; i < input.second.size(); ++i) {
       ss << input.second[i];
       if (scope) {
+        int row_size = GetRowSize(*scope, input.second[i]);
+        if (row_size >= 0) {
+          ss << "[row_size=" << row_size << "]";
+        }
         ss << "[" << GetDims(*scope, input.second[i], true) << "]";
         ss << "(" << GetLoD(*scope, input.second[i]) << ")";
       }
@@ -173,6 +190,10 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
     for (size_t i = 0; i < output.second.size(); ++i) {
       ss << output.second[i];
       if (scope) {
+        int row_size = GetRowSize(*scope, output.second[i]);
+        if (row_size >= 0) {
+          ss << "[row_size=" << row_size << "]";
+        }
         ss << "[" << GetDims(*scope, output.second[i], true) << "]";
         ss << "(" << GetLoD(*scope, output.second[i]) << ")";
       }
diff --git a/paddle/fluid/operators/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/batch_norm_mkldnn_op.cc
index 0e4a56d4a45a732cfcf43b09228bc0c44df5924c..8206cc9890160da756efb13c991020f09b20126a 100644
--- a/paddle/fluid/operators/batch_norm_mkldnn_op.cc
+++ b/paddle/fluid/operators/batch_norm_mkldnn_op.cc
@@ -19,10 +19,17 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-using Tensor = framework::Tensor;
+using batch_norm_bwd = mkldnn::batch_normalization_backward;
+using batch_norm_fwd = mkldnn::batch_normalization_forward;
+using framework::DataLayout;
+using framework::Tensor;
+using mkldnn::memory;
+using mkldnn::primitive;
+using mkldnn::reorder;
+using mkldnn::stream;
 using paddle::platform::MKLDNNDeviceContext;
 using paddle::platform::MKLDNNMemDesc;
-using mkldnn::memory;
+using platform::to_void_cast;
 
 template <typename T>
 using EigenArrayMap =
@@ -64,21 +71,12 @@ void run_batch_norm_op(Args &&... args) {
   mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
 }
 
-template <typename T>
-inline void *cast_const_to_void(const T *t) {
-  return static_cast<void *>(const_cast<T *>(t));
-}
 }  // namespace
 
 template <typename T>
 class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
-    auto data_layout_str = ctx.Attr<std::string>("data_layout");
-    auto data_layout = framework::StringToDataLayout(data_layout_str);
-    PADDLE_ENFORCE(data_layout == framework::DataLayout::kNCHW,
-                   "MKLDNN batch normalization handles only NCHW data layout");
-
     const float epsilon = ctx.Attr<float>("epsilon");
     const float momentum = ctx.Attr<float>("momentum");
     const bool is_test = ctx.Attr<bool>("is_test");
@@ -99,41 +97,53 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     const auto *scale = ctx.Input<Tensor>("Scale");
     const auto *shift = ctx.Input<Tensor>("Bias");
 
-    y->mutable_data<T>(ctx.GetPlace());
-    mean_out->mutable_data<T>(ctx.GetPlace());
-    variance_out->mutable_data<T>(ctx.GetPlace());
+    PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN &&
+                       x->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Input x tensor");
+
+    const T *x_data = x->data<T>();
+    const T *mean_data = mean->data<T>();
+    const T *variance_data = variance->data<T>();
+    T *y_data = y->mutable_data<T>(ctx.GetPlace());
+    T *mean_out_data = mean_out->mutable_data<T>(ctx.GetPlace());
+    T *variance_out_data = variance_out->mutable_data<T>(ctx.GetPlace());
+    T *batch_mean_data = nullptr;
+    T *batch_variance_data = nullptr;
 
     if (!is_test) {
-      batch_mean->mutable_data<T>(ctx.GetPlace());
-      batch_variance->mutable_data<T>(ctx.GetPlace());
+      batch_mean_data = batch_mean->mutable_data<T>(ctx.GetPlace());
+      batch_variance_data = batch_variance->mutable_data<T>(ctx.GetPlace());
     }
 
     auto propagation = is_test == true ? mkldnn::prop_kind::forward_scoring
                                        : mkldnn::prop_kind::forward_training;
 
-    auto dims = paddle::framework::vectorize2int(x->dims());
-
-    auto src_md =
-        MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
-    auto dst_md =
-        MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
-
-    auto src_pd = mkldnn::memory::primitive_desc{src_md, mkldnn_engine};
-    auto dst_pd = mkldnn::memory::primitive_desc{dst_md, mkldnn_engine};
-
-    auto src = mkldnn::memory{src_pd, cast_const_to_void(x->data<T>())};
-    auto dst = mkldnn::memory{dst_pd, y->data<T>()};
+    auto src_tz = paddle::framework::vectorize2int(x->dims());
+    auto scale_tz = paddle::framework::vectorize2int(scale->dims());
+    PADDLE_ENFORCE(scale_tz.size() == 1, "Dims of scale tensor is NOT 1");
+    const unsigned int ic = scale_tz[0];
 
     unsigned flags = mkldnn::use_scale_shift;
     if (is_test) flags |= mkldnn::use_global_stats;
 
+    // create mkldnn memory from input x tensor
+    auto src_memory =
+        memory({{{src_tz}, memory::data_type::f32, x->format()}, mkldnn_engine},
+               to_void_cast(x_data));
+
+    // create primitive descriptor for batch norm forward
     using bn_fwd_types = bn_type_traits<mkldnn::batch_normalization_forward>;
-    auto batch_norm_fwd_desc =
-        bn_fwd_types::op_desc{propagation, src_md, epsilon, flags};
-    auto batch_norm_fwd_pd =
-        bn_fwd_types::op_prim{batch_norm_fwd_desc, mkldnn_engine};
+    auto batch_norm_fwd_desc = bn_fwd_types::op_desc{
+        propagation, src_memory.get_primitive_desc().desc(), epsilon, flags};
+    std::shared_ptr<batch_norm_fwd::primitive_desc> batch_norm_fwd_pd =
+        std::shared_ptr<batch_norm_fwd::primitive_desc>(
+            new batch_norm_fwd::primitive_desc(batch_norm_fwd_desc,
+                                               mkldnn_engine));
 
-    const unsigned int ic = dims[1];
+    // Save the pd to be used in backward pass
+    const std::string key = ctx.op().Output("SavedMean");
+    const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd";
+    dev_ctx.SetBlob(key_batch_norm_fwd_pd, batch_norm_fwd_pd);
 
     // MKLDNN requires a single piece of memory for scale and shift/bias data
     const size_t scaleshift_size = 2 * ic;
@@ -143,73 +153,58 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     copy_to_weights(scale->data<T>(), scale->data<T>() + ic, shift->data<T>(),
                     shift->data<T>() + ic, &scaleshift_data);
 
-    auto scaleshift_memory = mkldnn::memory{
-        batch_norm_fwd_pd.weights_primitive_desc(), scaleshift_data.data()};
+    // crate mkldnn memory for weights(scale/shift)
+    auto scaleshift_memory = memory(batch_norm_fwd_pd->weights_primitive_desc(),
+                                    scaleshift_data.data());
 
-    if (is_test) {
-      auto mean_memory = mkldnn::memory{batch_norm_fwd_pd.mean_primitive_desc(),
-                                        cast_const_to_void(mean->data<T>())};
+    // create mkldnn memory for output y tensor
+    auto dst_memory = memory(batch_norm_fwd_pd->dst_primitive_desc(), y_data);
 
+    if (is_test) {
+      // create mkldnn memory for stats (as input)
+      auto mean_memory = memory(batch_norm_fwd_pd->mean_primitive_desc(),
+                                to_void_cast(mean_data));
       auto variance_memory =
-          mkldnn::memory{batch_norm_fwd_pd.variance_primitive_desc(),
-                         cast_const_to_void(variance->data<T>())};
+          memory(batch_norm_fwd_pd->variance_primitive_desc(),
+                 to_void_cast(variance_data));
 
       run_batch_norm_op<typename bn_fwd_types::op_type>(
-          batch_norm_fwd_pd, src, (const mkldnn::primitive::at &)mean_memory,
+          *batch_norm_fwd_pd, src_memory,
+          (const mkldnn::primitive::at &)mean_memory,
           (const mkldnn::primitive::at &)variance_memory, scaleshift_memory,
-          dst);
+          dst_memory);
     } else {
+      // create mkldnn memory for stats (as output)
       auto mean_memory =
-          mkldnn::memory{batch_norm_fwd_pd.mean_primitive_desc(),
-                         cast_const_to_void(batch_mean->data<T>())};
-
-      auto variance_memory =
-          mkldnn::memory{batch_norm_fwd_pd.variance_primitive_desc(),
-                         cast_const_to_void(batch_variance->data<T>())};
+          memory(batch_norm_fwd_pd->mean_primitive_desc(), batch_mean_data);
+      auto variance_memory = memory(
+          batch_norm_fwd_pd->variance_primitive_desc(), batch_variance_data);
 
-      run_batch_norm_op<bn_fwd_types::op_type>(batch_norm_fwd_pd, src,
-                                               scaleshift_memory, dst,
+      run_batch_norm_op<bn_fwd_types::op_type>(*batch_norm_fwd_pd, src_memory,
+                                               scaleshift_memory, dst_memory,
                                                mean_memory, variance_memory);
     }
 
     if (!is_test) {
-      const unsigned int in = dims[0];
-      const unsigned int sample_size = x->numel() / in / ic;
-
-      // saved_xx is use just in this batch of data
-      EigenVectorArrayMap<T> saved_mean_e(
-          batch_mean->mutable_data<T>(ctx.GetPlace()), ic);
-      EigenVectorArrayMap<T> saved_variance_e(
-          batch_variance->mutable_data<T>(ctx.GetPlace()), ic);
-      saved_mean_e.setZero();
-      saved_variance_e.setZero();
-
-      const unsigned int x_arr_size = in * ic;
-      ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, x_arr_size);
-      for (unsigned int nc = 0; nc < x_arr_size; ++nc) {
-        saved_mean_e(nc % ic) += x_arr.col(nc).sum();
-      }
-      saved_mean_e /= in * sample_size;
-      for (unsigned int nc = 0; nc < x_arr_size; ++nc) {
-        saved_variance_e(nc % ic) +=
-            (x_arr.col(nc) - saved_mean_e(nc % ic)).matrix().squaredNorm();
-      }
-      saved_variance_e /= in * sample_size;
-
-      ConstEigenVectorArrayMap<T> mean_arr{mean->data<T>(), ic};
-      ConstEigenVectorArrayMap<T> variance_arr{variance->data<T>(), ic};
-
-      EigenVectorArrayMap<T> running_mean_arr(
-          mean_out->mutable_data<T>(ctx.GetPlace()), ic);
-      EigenVectorArrayMap<T> running_var_arr(
-          variance_out->mutable_data<T>(ctx.GetPlace()), ic);
+      // mkldnn only compute stats for current batch
+      // so we need compute momentum stats via Eigen lib
+      EigenVectorArrayMap<T> batch_mean_e(batch_mean_data, ic);
+      EigenVectorArrayMap<T> batch_variance_e(batch_variance_data, ic);
+      ConstEigenVectorArrayMap<T> mean_e(mean_data, ic);
+      ConstEigenVectorArrayMap<T> variance_e{variance_data, ic};
+
+      EigenVectorArrayMap<T> running_mean_e(mean_out_data, ic);
+      EigenVectorArrayMap<T> running_variance_e(variance_out_data, ic);
 
       auto one_minus_momentum = 1. - momentum;
-      running_mean_arr =
-          mean_arr * momentum + saved_mean_e * one_minus_momentum;
-      running_var_arr =
-          variance_arr * momentum + saved_variance_e * one_minus_momentum;
+      running_mean_e = mean_e * momentum + batch_mean_e * one_minus_momentum;
+      running_variance_e =
+          variance_e * momentum + batch_variance_e * one_minus_momentum;
     }
+
+    y->set_layout(DataLayout::kMKLDNN);
+    y->set_format(
+        (memory::format)dst_memory.get_primitive_desc().desc().data.format);
   }
 };
 
@@ -217,11 +212,6 @@ template <typename T>
 class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
  public:
   void Compute(const paddle::framework::ExecutionContext &ctx) const override {
-    auto data_layout_str = ctx.Attr<std::string>("data_layout");
-    auto data_layout = framework::StringToDataLayout(data_layout_str);
-    PADDLE_ENFORCE(data_layout == framework::DataLayout::kNCHW,
-                   "MKLDNN batch normalization handles only NCHW data layout");
-
     auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
     auto mkldnn_engine = dev_ctx.GetEngine();
 
@@ -238,88 +228,132 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     auto *diff_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
     auto *diff_shift = ctx.Output<Tensor>(framework::GradVarName("Bias"));
 
-    diff_x->mutable_data<T>(ctx.GetPlace());
-    diff_scale->mutable_data<T>(ctx.GetPlace());
-    diff_shift->mutable_data<T>(ctx.GetPlace());
+    PADDLE_ENFORCE(diff_y->layout() == DataLayout::kMKLDNN &&
+                       diff_y->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Input diff_y tensor");
+
+    const T *x_data = x->data<T>();
+    const T *diff_y_data = diff_y->data<T>();
+    const T *batch_mean_data = batch_mean->data<T>();
+    const T *batch_variance_data = batch_variance->data<T>();
+    const T *scale_data = scale->data<T>();
+    const T *shift_data = shift->data<T>();
+    T *diff_x_data = diff_x->mutable_data<T>(ctx.GetPlace());
+    T *diff_scale_data = diff_scale->mutable_data<T>(ctx.GetPlace());
+    T *diff_shift_data = diff_shift->mutable_data<T>(ctx.GetPlace());
+
+    auto src_tz = paddle::framework::vectorize2int(x->dims());
+    auto diff_src_tz = src_tz;
+    auto dst_tz = src_tz;
+    auto diff_dst_tz = dst_tz;
+    auto scale_tz = paddle::framework::vectorize2int(scale->dims());
+    PADDLE_ENFORCE(scale_tz.size() == 1, "Dims of scale tensor is NOT 1");
+
+    const unsigned int ic = scale_tz[0];
+
+    // Retrieve bn_fwd_pd from device context
+    const std::string key = ctx.op().Input("SavedMean");
+    const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd";
+    auto batch_norm_fwd_pd =
+        std::static_pointer_cast<batch_norm_fwd::primitive_desc>(
+            dev_ctx.GetBlob(key_batch_norm_fwd_pd));
+    PADDLE_ENFORCE(batch_norm_fwd_pd != nullptr,
+                   "Fail to find batch_norm_fwd_pd in device context");
 
-    auto dims = paddle::framework::vectorize2int(x->dims());
-    unsigned flags = mkldnn::use_scale_shift | !mkldnn::use_global_stats;
+    using bn_bwd_types = bn_type_traits<mkldnn::batch_normalization_backward>;
 
-    auto src_md =
-        MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
-    auto dst_md =
-        MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
-    auto diff_src_md =
-        MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
-    auto diff_dst_md =
-        MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
+    // create mkldnn memory from input diff_y tensor
+    auto user_diff_dst_memory =
+        memory({{{diff_dst_tz}, memory::data_type::f32, diff_y->format()},
+                mkldnn_engine},
+               to_void_cast(diff_y_data));
 
-    using bn_bwd_types = bn_type_traits<mkldnn::batch_normalization_backward>;
-    using bn_fwd_types = bn_type_traits<mkldnn::batch_normalization_forward>;
+    // create mkldnn memory from input x tensor
+    auto src_memory =
+        memory({{{src_tz}, memory::data_type::f32, x->format()}, mkldnn_engine},
+               to_void_cast(x_data));
 
-    auto batch_norm_fwd_desc = bn_fwd_types::op_desc{
-        mkldnn::prop_kind::forward_training, src_md, epsilon, flags};
-    auto batch_norm_fwd_pd =
-        bn_fwd_types::op_prim{batch_norm_fwd_desc, mkldnn_engine};
+    // for diff_dst, try to use same format as dst in forward pass
+    auto diff_dst_pd = batch_norm_fwd_pd.get()->dst_primitive_desc();
+    auto diff_dst_md = diff_dst_pd.desc();
 
+    // create primitive descriptor for batch norm backward
+    unsigned flags = mkldnn::use_scale_shift;
     auto batch_norm_bwd_desc = bn_bwd_types::op_desc{
-        mkldnn::prop_kind::backward, diff_dst_md, dst_md, epsilon, flags};
+        mkldnn::prop_kind::backward, diff_dst_md,
+        src_memory.get_primitive_desc().desc(), epsilon, flags};
     auto batch_norm_bwd_pd = bn_bwd_types::op_prim{
-        batch_norm_bwd_desc, mkldnn_engine, batch_norm_fwd_pd};
-
-    auto src = mkldnn::memory{{src_md, mkldnn_engine},
-                              cast_const_to_void(x->data<T>())};
-
-    auto mean = mkldnn::memory{batch_norm_bwd_pd.mean_primitive_desc(),
-                               cast_const_to_void(batch_mean->data<T>())};
-
-    auto variance =
-        mkldnn::memory{batch_norm_bwd_pd.variance_primitive_desc(),
-                       cast_const_to_void(batch_variance->data<T>())};
-
-    auto diff_dst = mkldnn::memory{{diff_dst_md, mkldnn_engine},
-                                   cast_const_to_void(diff_y->data<T>())};
+        batch_norm_bwd_desc, mkldnn_engine, *batch_norm_fwd_pd};
+
+    // reorder user_diff_dst if it's not in preferred format
+    auto diff_dst_memory = user_diff_dst_memory;
+    primitive reorder_diff_dst;
+    bool is_diff_dst_reordered = false;
+    if (diff_dst_pd != user_diff_dst_memory.get_primitive_desc()) {
+      diff_dst_memory = memory(diff_dst_pd);
+      reorder_diff_dst = reorder(user_diff_dst_memory, diff_dst_memory);
+      is_diff_dst_reordered = true;
+    }
 
-    const unsigned int ic = dims[1];
+    // create mkldnn memory for input tensors (src/mean/variance)
+    auto mean_memory = memory(batch_norm_bwd_pd.mean_primitive_desc(),
+                              to_void_cast(batch_mean_data));
+    auto variance_memory = memory(batch_norm_bwd_pd.variance_primitive_desc(),
+                                  to_void_cast(batch_variance_data));
 
+    // MKLDNN requires a single piece of memory for scale and shift/bias data
     const size_t scaleshift_size = 2 * ic;
 
     std::vector<T> scaleshift_data;
     scaleshift_data.reserve(scaleshift_size);
-    copy_to_weights(scale->data<T>(), scale->data<T>() + ic, shift->data<T>(),
-                    shift->data<T>() + ic, &scaleshift_data);
+    copy_to_weights(scale_data, scale_data + ic, shift_data, shift_data + ic,
+                    &scaleshift_data);
 
-    auto scaleshift_memory = mkldnn::memory{
-        batch_norm_bwd_pd.weights_primitive_desc(), scaleshift_data.data()};
+    // create mkldnn memory for input tensors (scale/shift)
+    auto scaleshift_memory = memory(batch_norm_bwd_pd.weights_primitive_desc(),
+                                    scaleshift_data.data());
 
+    // create mkldnn memory for output diff weights (combined scale/shift)
     std::vector<T> diff_scaleshift_data;
     diff_scaleshift_data.reserve(scaleshift_size);
-    copy_to_weights(diff_scale->data<T>(), diff_scale->data<T>() + ic,
-                    diff_shift->data<T>(), diff_shift->data<T>() + ic,
-                    &diff_scaleshift_data);
-
     auto diff_scaleshift_memory =
-        mkldnn::memory{batch_norm_bwd_pd.diff_weights_primitive_desc(),
-                       diff_scaleshift_data.data()};
-
-    auto diff_src = mkldnn::memory{{diff_src_md, mkldnn_engine},
-                                   static_cast<void *>(diff_x->data<T>())};
-
-    run_batch_norm_op<bn_bwd_types::op_type>(
-        batch_norm_bwd_pd, src, mean, variance, diff_dst, scaleshift_memory,
-        diff_src, diff_scaleshift_memory);
-
+        memory(batch_norm_bwd_pd.diff_weights_primitive_desc(),
+               diff_scaleshift_data.data());
+
+    // here assume diff_src is in the same format of src
+    auto diff_src_memory = memory(src_memory.get_primitive_desc(), diff_x_data);
+
+    // finally create batch_norm backward primitive
+    auto batch_norm_bwd_prim =
+        batch_norm_bwd(batch_norm_bwd_pd, src_memory, mean_memory,
+                       variance_memory, diff_dst_memory, scaleshift_memory,
+                       diff_src_memory, diff_scaleshift_memory);
+
+    // execute optional reorder and batch_norm backward primitive
+    std::vector<primitive> pipeline;
+    if (is_diff_dst_reordered) pipeline.push_back(reorder_diff_dst);
+    pipeline.push_back(batch_norm_bwd_prim);
+    stream(stream::kind::eager).submit(pipeline).wait();
+
+    // copy back diff sacle/shift to output tensors (diff scale/shift)
+    diff_scaleshift_data.resize(scaleshift_size);
     auto it = std::begin(diff_scaleshift_data);
-    std::copy(it, std::next(it, ic), diff_scale->data<T>());
+    std::copy(it, std::next(it, ic), diff_scale_data);
     std::copy(std::next(it, ic), std::end(diff_scaleshift_data),
-              diff_shift->data<T>());
+              diff_shift_data);
+
+    // set layout/format of output tensors
+    diff_x->set_layout(DataLayout::kMKLDNN);
+    diff_x->set_format((memory::format)diff_src_memory.get_primitive_desc()
+                           .desc()
+                           .data.format);
   }
 };
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_KERNEL(batch_norm, MKLDNN, paddle::platform::CPUPlace,
+REGISTER_OP_KERNEL(batch_norm, MKLDNN, ::paddle::platform::CPUPlace,
                    ops::BatchNormMKLDNNOpKernel<float>);
-REGISTER_OP_KERNEL(batch_norm_grad, MKLDNN, paddle::platform::CPUPlace,
+REGISTER_OP_KERNEL(batch_norm_grad, MKLDNN, ::paddle::platform::CPUPlace,
                    ops::BatchNormMKLDNNGradOpKernel<float>);
diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc
index 92fbb9adaf6a6a335abee3c9443d4b1d6097021b..625ca2d7c4c70d1098b0fb28380d8d1eb24cb338 100644
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
@@ -110,19 +110,19 @@ class BatchNormOp : public framework::OperatorWithKernel {
                                          ctx.Input<Tensor>("Variance")->type()),
                       "Variance input should be of float type");
 
-    framework::LibraryType library_{framework::LibraryType::kPlain};
     // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+    framework::LibraryType library = framework::LibraryType::kPlain;
     framework::DataLayout layout = framework::DataLayout::kAnyLayout;
-
 #ifdef PADDLE_WITH_MKLDNN
-    if (library_ == framework::LibraryType::kPlain &&
+    if (library == framework::LibraryType::kPlain &&
         platform::CanMKLDNNBeUsed(ctx)) {
-      library_ = framework::LibraryType::kMKLDNN;
+      library = framework::LibraryType::kMKLDNN;
       layout = framework::DataLayout::kMKLDNN;
     }
 #endif
+
     return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
-                                   library_);
+                                   library);
   }
 };
 
@@ -370,19 +370,21 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
       PADDLE_THROW("can't find Y@GRAD");
     }
 
-    framework::LibraryType library_{framework::LibraryType::kPlain};
     // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
-    framework::DataLayout layout_ = framework::DataLayout::kAnyLayout;
+    framework::LibraryType library = framework::LibraryType::kPlain;
+    framework::DataLayout layout = framework::DataLayout::kAnyLayout;
+
 #ifdef PADDLE_WITH_MKLDNN
-    if (library_ == framework::LibraryType::kPlain &&
+    if (library == framework::LibraryType::kPlain &&
         platform::CanMKLDNNBeUsed(ctx)) {
-      library_ = framework::LibraryType::kMKLDNN;
-      layout_ = framework::DataLayout::kMKLDNN;
+      library = framework::LibraryType::kMKLDNN;
+      layout = framework::DataLayout::kMKLDNN;
     }
 #endif
+
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
-        layout_, library_);
+        layout, library);
   }
 };
 
diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc
index 63d371310d2a26a1460e527fc51923dfd6e0b8bc..6b06913d1c83f4534238ac3dd22ac4035c0f0fbf 100644
--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
@@ -18,6 +18,17 @@
 namespace paddle {
 namespace operators {
 
+using conv_bwd_data = mkldnn::convolution_backward_data;
+using conv_bwd_weights = mkldnn::convolution_backward_weights;
+using conv_fwd = mkldnn::convolution_forward;
+using framework::DataLayout;
+using mkldnn::memory;
+using mkldnn::primitive;
+using mkldnn::reorder;
+using mkldnn::stream;
+using platform::to_void_cast;
+using platform::GetMKLDNNFormat;
+
 template <typename T>
 class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
  public:
@@ -25,6 +36,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
                    "It must use CPUPlace.");
 
+    // Get unique name for index
+    const std::string key = ctx.op().Output("Output");
+    const std::string key_conv_pd = key + "@conv_pd";
+
     auto& dev_ctx =
         ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
     const auto& mkldnn_engine = dev_ctx.GetEngine();
@@ -33,10 +48,12 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto* filter = ctx.Input<Tensor>("Filter");
     auto* output = ctx.Output<Tensor>("Output");
 
-    // Get an unique name from "argument" name of "Output" variable
-    // This name will be used as key when saving info into device context
-    const std::string key = ctx.op().Output("Output");
-    const std::string key_conv_pd = key + "@conv_pd";
+    PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN &&
+                       input->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Input tensor");
+    PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN &&
+                       filter->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Filter tensor");
 
     std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
@@ -63,60 +80,86 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         paddle::framework::vectorize2int(filter->dims());
     std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
 
-    // TODO(pzelazko-intel): support more formats
-    auto src_md = platform::MKLDNNMemDesc(
-        src_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
-    auto weights_md =
-        platform::MKLDNNMemDesc(weights_tz, mkldnn::memory::data_type::f32,
-                                mkldnn::memory::format::oihw);
-    auto dst_md = platform::MKLDNNMemDesc(
-        dst_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
-
-    auto src_memory =
-        mkldnn::memory({src_md, mkldnn_engine},
-                       reinterpret_cast<void*>(const_cast<T*>(input_data)));
-    auto weights_memory =
-        mkldnn::memory({weights_md, mkldnn_engine},
-                       reinterpret_cast<void*>(const_cast<T*>(filter_data)));
-    auto dst_memory = mkldnn::memory({dst_md, mkldnn_engine}, output_data);
-
-    std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd =
-        ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings,
-                             mkldnn_engine);
-
-    // save conv_pd into global device context to be referred in backward path
-    dev_ctx.SetBlob(key_conv_pd, conv_pd);
+    // create mkldnn memory from input tensors (data/weights)
+    auto user_src_memory = memory(
+        {{{src_tz}, memory::data_type::f32, input->format()}, mkldnn_engine},
+        to_void_cast(input_data));
+    auto user_weights_memory =
+        memory({{{weights_tz}, memory::data_type::f32, filter->format()},
+                mkldnn_engine},
+               to_void_cast(filter_data));
+
+    /* create memory descriptor for convolution without specified format
+     * ('any') which lets a primitive (convolution in this case) choose
+     * the memory format preferred for best performance
+     */
+    auto src_md = platform::MKLDNNMemDesc(src_tz, memory::data_type::f32,
+                                          memory::format::any);
+    auto weights_md = platform::MKLDNNMemDesc(
+        weights_tz, memory::data_type::f32, memory::format::any);
+    auto dst_md = platform::MKLDNNMemDesc(dst_tz, memory::data_type::f32,
+                                          memory::format::any);
+
+    // create a conv primitive descriptor and save it for usage in backward
+    std::shared_ptr<conv_fwd::primitive_desc> conv_pd = ConvFwdPrimitiveDesc(
+        src_md, weights_md, dst_md, strides, paddings, mkldnn_engine);
+
+    // create reorder primitive if the input format is not the preferred one
+    auto src_memory = user_src_memory;
+    primitive reorder_src;
+    bool is_src_reordered = false;
+    if (memory::primitive_desc(conv_pd->src_primitive_desc()) !=
+        user_src_memory.get_primitive_desc()) {
+      src_memory = memory(conv_pd->src_primitive_desc());
+      reorder_src = reorder(user_src_memory, src_memory);
+      is_src_reordered = true;
+    }
+    auto weights_memory = user_weights_memory;
+    primitive reorder_weights;
+    bool is_weights_reordered = false;
+    if (memory::primitive_desc(conv_pd->weights_primitive_desc()) !=
+        user_weights_memory.get_primitive_desc()) {
+      weights_memory = memory(conv_pd->weights_primitive_desc());
+      reorder_weights = reorder(user_weights_memory, weights_memory);
+      is_weights_reordered = true;
+    }
+
+    // create memory primitive for conv dst
+    auto dst_memory = memory(conv_pd->dst_primitive_desc(), output_data);
 
     // create convolution op primitive
-    auto conv_prim = mkldnn::convolution_forward(*conv_pd, src_memory,
-                                                 weights_memory, dst_memory);
+    auto conv_prim = conv_fwd(*conv_pd, src_memory, weights_memory, dst_memory);
 
     // push primitive to stream and wait until it's executed
-    std::vector<mkldnn::primitive> pipeline{conv_prim};
-    mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+    std::vector<primitive> pipeline;
+    if (is_src_reordered) pipeline.push_back(reorder_src);
+    if (is_weights_reordered) pipeline.push_back(reorder_weights);
+    pipeline.push_back(conv_prim);
+    stream(stream::kind::eager).submit(pipeline).wait();
+
+    // Save conv_pd/src_memory/weights_memory for backward pass
+    dev_ctx.SetBlob(key_conv_pd, conv_pd);
+
+    output->set_layout(DataLayout::kMKLDNN);
+    output->set_format(GetMKLDNNFormat(dst_memory));
   }
 
  private:
-  std::unique_ptr<mkldnn::convolution_forward::primitive_desc>
-  ConvFwdPrimitiveDesc(const mkldnn::memory::desc& src,
-                       const mkldnn::memory::desc& weights,
-                       const mkldnn::memory::desc& dst,
-                       const std::vector<int>& strides,
-                       const std::vector<int>& paddings,
-                       const mkldnn::engine& engine) const {
-    mkldnn::memory::dims stride_dims = {strides[0], strides[1]};
-    mkldnn::memory::dims padding_dims = {paddings[0], paddings[1]};
-
-    auto conv_desc = mkldnn::convolution_forward::desc(
-        mkldnn::prop_kind::forward, mkldnn::convolution_direct, src, weights,
-        dst, stride_dims, padding_dims, padding_dims,
-        mkldnn::padding_kind::zero);
-
-    auto p_conv_pd =
-        new mkldnn::convolution_forward::primitive_desc(conv_desc, engine);
-
-    return std::unique_ptr<mkldnn::convolution_forward::primitive_desc>(
-        p_conv_pd);
+  std::unique_ptr<conv_fwd::primitive_desc> ConvFwdPrimitiveDesc(
+      const memory::desc& src, const memory::desc& weights,
+      const memory::desc& dst, const std::vector<int>& strides,
+      const std::vector<int>& paddings, const mkldnn::engine& engine) const {
+    memory::dims stride_dims = {strides[0], strides[1]};
+    memory::dims padding_dims = {paddings[0], paddings[1]};
+
+    auto conv_desc =
+        conv_fwd::desc(mkldnn::prop_kind::forward, mkldnn::convolution_direct,
+                       src, weights, dst, stride_dims, padding_dims,
+                       padding_dims, mkldnn::padding_kind::zero);
+
+    auto p_conv_pd = new conv_fwd::primitive_desc(conv_desc, engine);
+
+    return std::unique_ptr<conv_fwd::primitive_desc>(p_conv_pd);
   }
 };
 
@@ -139,6 +182,19 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
     Tensor* filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
 
+    PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN &&
+                       input->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Input tensor");
+    PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN &&
+                       filter->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Filter tensor");
+    PADDLE_ENFORCE(output->layout() == DataLayout::kMKLDNN &&
+                       output->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Output tensor");
+    PADDLE_ENFORCE(output_grad->layout() == DataLayout::kMKLDNN &&
+                       output_grad->format() != memory::format::format_undef,
+                   "Wrong layout/format set for output_grad tensor");
+
     if (!input_grad && !filter_grad) return;
 
     // Get an unique name from "argument" name of "Output" variable
@@ -167,108 +223,147 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
         paddle::framework::vectorize2int(filter->dims());
     std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
 
-    // TODO(pzelazko-intel): support more formats
-    auto src_md = platform::MKLDNNMemDesc(
-        src_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
-    auto diff_src_md = platform::MKLDNNMemDesc(
-        src_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
-    auto weights_md =
-        platform::MKLDNNMemDesc(weights_tz, mkldnn::memory::data_type::f32,
-                                mkldnn::memory::format::oihw);
-    auto diff_weights_md =
-        platform::MKLDNNMemDesc(weights_tz, mkldnn::memory::data_type::f32,
-                                mkldnn::memory::format::oihw);
-    auto diff_dst_md = platform::MKLDNNMemDesc(
-        dst_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
-
-    // create memory
-    auto diff_dst_memory = mkldnn::memory(
-        {diff_weights_md, mkldnn_engine},
-        reinterpret_cast<void*>(const_cast<T*>(output_grad_data)));
+    // create mkldnn memory from input tensors (input/weights/output_grad)
+    auto user_src_memory = memory(
+        {{{src_tz}, memory::data_type::f32, input->format()}, mkldnn_engine},
+        to_void_cast(input_data));
+    auto user_weights_memory =
+        memory({{{weights_tz}, memory::data_type::f32, filter->format()},
+                mkldnn_engine},
+               to_void_cast(filter_data));
+    auto user_diff_dst_memory =
+        memory({{{dst_tz}, memory::data_type::f32, output_grad->format()},
+                mkldnn_engine},
+               to_void_cast(output_grad_data));
+
+    /* create memory descriptor for conv backward without specified format
+     * ('any') which lets a primitive (conv backward in this case) choose
+     * the memory format preferred for best performance
+     */
+    auto src_md = platform::MKLDNNMemDesc(src_tz, memory::data_type::f32,
+                                          memory::format::any);
+    auto diff_src_md = platform::MKLDNNMemDesc(src_tz, memory::data_type::f32,
+                                               memory::format::any);
+    auto weights_md = platform::MKLDNNMemDesc(
+        weights_tz, memory::data_type::f32, memory::format::any);
+    auto diff_weights_md = platform::MKLDNNMemDesc(
+        weights_tz, memory::data_type::f32, memory::format::any);
+    auto diff_dst_md = platform::MKLDNNMemDesc(dst_tz, memory::data_type::f32,
+                                               memory::format::any);
+
     // Retrieve conv_pd from device context
-    auto conv_pd =
-        std::static_pointer_cast<mkldnn::convolution_forward::primitive_desc>(
-            dev_ctx.GetBlob(key_conv_pd));
+    auto conv_pd = std::static_pointer_cast<conv_fwd::primitive_desc>(
+        dev_ctx.GetBlob(key_conv_pd));
     PADDLE_ENFORCE(conv_pd != nullptr,
                    "Fail to find conv_pd in device context");
 
     // create backward conv primitive for weights
     if (filter_grad) {
-      // create primitive descriptor
-      mkldnn::convolution_backward_weights::primitive_desc conv_bwd_weights_pd =
-          ConvBwdWeightsPrimitiveDesc(src_md, diff_weights_md, diff_dst_md,
-                                      strides, paddings, *conv_pd,
-                                      mkldnn_engine);
-
-      // create memory
+      // create backward convolution primitive descriptor
+      auto conv_bwd_weights_desc = conv_bwd_weights::desc(
+          mkldnn::convolution_direct, src_md, diff_weights_md, diff_dst_md,
+          strides, paddings, paddings, mkldnn::padding_kind::zero);
+      auto conv_bwd_weights_pd = conv_bwd_weights::primitive_desc(
+          conv_bwd_weights_desc, mkldnn_engine, *conv_pd);
+
+      // create reorder primitive if the input format is not the preferred one
+      auto src_memory = user_src_memory;
+      primitive reorder_src;
+      bool is_src_reordered = false;
+      if (memory::primitive_desc(conv_bwd_weights_pd.src_primitive_desc()) !=
+          user_src_memory.get_primitive_desc()) {
+        src_memory = memory(conv_bwd_weights_pd.src_primitive_desc());
+        reorder_src = reorder(user_src_memory, src_memory);
+        is_src_reordered = true;
+      }
+
+      auto diff_dst_memory_4filter = user_diff_dst_memory;
+      primitive reorder_diff_dst_4filter;
+      bool is_diff_dst_reordered_4filter = false;
+      if (memory::primitive_desc(
+              conv_bwd_weights_pd.diff_dst_primitive_desc()) !=
+          user_diff_dst_memory.get_primitive_desc()) {
+        diff_dst_memory_4filter =
+            memory(conv_bwd_weights_pd.diff_dst_primitive_desc());
+        reorder_diff_dst_4filter =
+            reorder(user_diff_dst_memory, diff_dst_memory_4filter);
+        is_diff_dst_reordered_4filter = true;
+      }
+
+      // create mkldnn memory for output (i.e. diff weights)
       auto diff_weights_memory =
-          mkldnn::memory({diff_weights_md, mkldnn_engine},
-                         reinterpret_cast<void*>(filter_grad_data));
-      auto src_memory =
-          mkldnn::memory({src_md, mkldnn_engine},
-                         reinterpret_cast<void*>(const_cast<T*>(input_data)));
+          memory(conv_bwd_weights_pd.diff_weights_primitive_desc(),
+                 reinterpret_cast<void*>(filter_grad_data));
 
       // create backward conv primitive for weights
-      auto conv_bwd_weights_prim = mkldnn::convolution_backward_weights(
-          conv_bwd_weights_pd, src_memory, diff_dst_memory,
-          diff_weights_memory);
+      auto conv_bwd_weights_prim =
+          conv_bwd_weights(conv_bwd_weights_pd, src_memory,
+                           diff_dst_memory_4filter, diff_weights_memory);
 
       // push primitive and execute it
-      std::vector<mkldnn::primitive> pipeline{conv_bwd_weights_prim};
-      mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+      std::vector<primitive> pipeline;
+      if (is_src_reordered) pipeline.push_back(reorder_src);
+      if (is_diff_dst_reordered_4filter)
+        pipeline.push_back(reorder_diff_dst_4filter);
+      pipeline.push_back(conv_bwd_weights_prim);
+      stream(stream::kind::eager).submit(pipeline).wait();
+
+      filter_grad->set_layout(DataLayout::kMKLDNN);
+      filter_grad->set_format(GetMKLDNNFormat(diff_weights_memory));
     }
 
     if (input_grad) {
-      // create primitive descriptor
-      mkldnn::convolution_backward_data::primitive_desc conv_bwd_data_pd =
-          ConvBwdDataPrimitiveDesc(diff_src_md, weights_md, diff_dst_md,
-                                   strides, paddings, *conv_pd, mkldnn_engine);
-
-      // create memory
-      auto diff_src_memory = mkldnn::memory(
-          {diff_src_md, mkldnn_engine},
-          reinterpret_cast<void*>(const_cast<T*>(input_grad_data)));
-      auto weights_memory =
-          mkldnn::memory({weights_md, mkldnn_engine},
-                         reinterpret_cast<void*>(const_cast<T*>(filter_data)));
+      // create backward convolution primitive descriptor
+      auto conv_bwd_data_desc = conv_bwd_data::desc(
+          mkldnn::convolution_direct, diff_src_md, weights_md, diff_dst_md,
+          strides, paddings, paddings, mkldnn::padding_kind::zero);
+      auto conv_bwd_data_pd = conv_bwd_data::primitive_desc(
+          conv_bwd_data_desc, mkldnn_engine, *conv_pd);
+
+      // create reorder primitive if the input format is not the preferred one
+      auto weights_memory = user_weights_memory;
+      primitive reorder_weights;
+      bool is_weights_reordered = false;
+      if (memory::primitive_desc(conv_bwd_data_pd.weights_primitive_desc()) !=
+          user_weights_memory.get_primitive_desc()) {
+        weights_memory = memory(conv_bwd_data_pd.weights_primitive_desc());
+        reorder_weights = reorder(user_weights_memory, weights_memory);
+        is_weights_reordered = true;
+      }
+
+      auto diff_dst_memory_4data = user_diff_dst_memory;
+      primitive reorder_diff_dst_4data;
+      bool is_diff_dst_reordered_4data = false;
+      if (memory::primitive_desc(conv_bwd_data_pd.diff_dst_primitive_desc()) !=
+          user_diff_dst_memory.get_primitive_desc()) {
+        diff_dst_memory_4data =
+            memory(conv_bwd_data_pd.diff_dst_primitive_desc());
+        reorder_diff_dst_4data =
+            reorder(user_diff_dst_memory, diff_dst_memory_4data);
+        is_diff_dst_reordered_4data = true;
+      }
+
+      // create mkldnn memory for output (i.e. diff src)
+      auto diff_src_memory = memory(conv_bwd_data_pd.diff_src_primitive_desc(),
+                                    reinterpret_cast<void*>(input_grad_data));
 
       // create backward conv primitive for data
-      auto conv_bwd_data_prim = mkldnn::convolution_backward_data(
-          conv_bwd_data_pd, diff_dst_memory, weights_memory, diff_src_memory);
+      auto conv_bwd_data_prim =
+          conv_bwd_data(conv_bwd_data_pd, diff_dst_memory_4data, weights_memory,
+                        diff_src_memory);
 
-      // push primitive to stream and wait until it's executed
-      std::vector<mkldnn::primitive> pipeline{conv_bwd_data_prim};
-      mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+      // push primitive and execute it
+      std::vector<primitive> pipeline;
+      if (is_weights_reordered) pipeline.push_back(reorder_weights);
+      if (is_diff_dst_reordered_4data)
+        pipeline.push_back(reorder_diff_dst_4data);
+      pipeline.push_back(conv_bwd_data_prim);
+      stream(stream::kind::eager).submit(pipeline).wait();
+
+      input_grad->set_layout(DataLayout::kMKLDNN);
+      input_grad->set_format(GetMKLDNNFormat(diff_src_memory));
     }
   }  // Compute()
-
- private:
-  mkldnn::convolution_backward_weights::primitive_desc
-  ConvBwdWeightsPrimitiveDesc(
-      const mkldnn::memory::desc& src, const mkldnn::memory::desc& diff_weights,
-      const mkldnn::memory::desc& diff_dst, const std::vector<int>& strides,
-      const std::vector<int>& paddings,
-      const mkldnn::convolution_forward::primitive_desc& conv_pd,
-      const mkldnn::engine& engine) const {
-    auto conv_bwd_weights_desc = mkldnn::convolution_backward_weights::desc(
-        mkldnn::convolution_direct, src, diff_weights, diff_dst, strides,
-        paddings, paddings, mkldnn::padding_kind::zero);
-    return mkldnn::convolution_backward_weights::primitive_desc(
-        conv_bwd_weights_desc, engine, conv_pd);
-  }
-
-  mkldnn::convolution_backward_data::primitive_desc ConvBwdDataPrimitiveDesc(
-      const mkldnn::memory::desc& diff_src, const mkldnn::memory::desc& weights,
-      const mkldnn::memory::desc& diff_dst, const std::vector<int>& strides,
-      const std::vector<int>& paddings,
-      const mkldnn::convolution_forward::primitive_desc& conv_pd,
-      const mkldnn::engine& engine) const {
-    auto conv_bwd_data_desc = mkldnn::convolution_backward_data::desc(
-        mkldnn::convolution_direct, diff_src, weights, diff_dst, strides,
-        paddings, paddings, mkldnn::padding_kind::zero);
-    return mkldnn::convolution_backward_data::primitive_desc(conv_bwd_data_desc,
-                                                             engine, conv_pd);
-  }
 };
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index 27f1313116aad99d34fa8f1d3d6a1e7aced4d394..37153d58439a90190eb2ad82d5dcc145e22dfa48 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -75,9 +75,8 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
 framework::OpKernelType ConvOp::GetExpectedKernelType(
     const framework::ExecutionContext& ctx) const {
   framework::LibraryType library{framework::LibraryType::kPlain};
-
-  std::string data_format = ctx.Attr<std::string>("data_format");
   // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+  std::string data_format = ctx.Attr<std::string>("data_format");
   framework::DataLayout layout = framework::StringToDataLayout(data_format);
 
 #ifdef PADDLE_WITH_CUDA
diff --git a/paddle/fluid/operators/gen_nccl_id_op.cc b/paddle/fluid/operators/gen_nccl_id_op.cc
index 111e58844c83806af4ebe0aa9e2126a9ddec1d8a..f824eee4e7d1ef19c9a38fd5d3369265f9c549a0 100644
--- a/paddle/fluid/operators/gen_nccl_id_op.cc
+++ b/paddle/fluid/operators/gen_nccl_id_op.cc
@@ -67,6 +67,10 @@ class GenNCCLIdOp : public framework::OperatorBase {
       client->AsyncSendVar(ep, dev_ctx, *scope, NCCL_ID_VARNAME);
     }
     client->Wait();
+    for (auto& ep : endpoint_list) {
+      client->AsyncSendBatchBarrier(ep);
+    }
+    client->Wait();
     VLOG(3) << "sending completed...";
   }
 
diff --git a/paddle/fluid/operators/merge_ids_op.cc b/paddle/fluid/operators/merge_ids_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c6ec4ab047d5e91625e646fd26108d2e477cdce5
--- /dev/null
+++ b/paddle/fluid/operators/merge_ids_op.cc
@@ -0,0 +1,128 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/merge_ids_op.h"
+
+namespace paddle {
+namespace operators {
+
+class MergeIdsOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Ids", "(LoDTensor) the input ids with shape{batch_num, 1}");
+    AddInput(
+        "X",
+        "(LoDTensors) multi input tensor with shape{batch_num, N}, N is the "
+        "size of embedding table")
+        .AsDuplicable();
+    AddOutput("Out", "(LoDTensor) The merged outputs of the input tensors.");
+
+    AddComment(R"DOC(
+Merge multi LoDTensor's into one according to Ids's shard num.
+
+
+split_ids_op -> prefetch_op -> merge_ids_op
+
+
+merge_ids_op should be used after split_ids_op and prefetch_op, split_ids_op
+ will split input Ids into multiple tensors according to Id's shard number.
+prefetch_op will send them to parameter server to prefetch embedding value
+back. During split, the order of ids is disordered. In merge_ids_op we use
+the original Ids to restore the order of the fetched embedding value and
+ also pass the lod information to the merged output.
+
+
+Example:
+
+    Ids = [1,2,3,4,5,6] # 3 shared
+
+split_ids_op ->
+
+    Id0 = [3, 6] # id % 3 == 0
+    Id1 = [1, 4] # id % 3 == 1
+    Id2 = [2, 5] # id % 3 == 2
+
+prefetch_op ->
+
+    X0 = [[0.3 0.3]   # 3
+          [0.6 0.6]]  # 6
+    X1 = [[0.1 0.1]   # 1
+          [0.4 0.4]]  # 4
+    X2 = [[0.2 0.2]   # 2
+          [0.5 0.5]]  # 5
+
+merge_ids_op ->
+
+    Out = [[0.1 0.1]  # 1
+           [0.2 0.2]  # 2
+           [0.3 0.3]  # 3
+           [0.4 0.4]  # 4
+           [0.5 0.5]  # 5
+           [0.6 0.6]] # 6
+)DOC");
+  }
+};
+
+class MergeIdsOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Ids"), "MergeIdsOp must has input Ids.");
+    PADDLE_ENFORCE(ctx->HasInputs("X"), "MergeIdsOp must has input X.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "MergeIdsOp must has output Out.");
+
+    auto ids_var_type = ctx->GetInputsVarType("Ids").front();
+    auto ids_dims = ctx->GetInputDim("Ids");
+    if (ids_var_type == framework::proto::VarType::LOD_TENSOR) {
+      PADDLE_ENFORCE_EQ(ids_dims.size(), 2);
+      PADDLE_ENFORCE_EQ(ids_dims[1], 1);
+    }
+    auto x_var_type = ctx->GetInputsVarType("X");
+    for (auto &var_type : x_var_type) {
+      PADDLE_ENFORCE_EQ(var_type, framework::proto::VarType::LOD_TENSOR,
+                        "input X only support lod tensors");
+    }
+    ctx->ShareLoD("Ids", "Out");
+  }
+
+ private:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(
+            ctx.MultiInput<framework::Tensor>("X").front()->type()),
+        ctx.GetPlace());
+  }
+};
+
+class MergeIdsOpInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const override {
+    auto *input_var = block->Var(op_desc.Input("Ids")[0]);
+    for (auto &out_var : op_desc.Output("Out")) {
+      block->Var(out_var)->SetType(input_var->GetType());
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(merge_ids, ops::MergeIdsOp, ops::MergeIdsOpMaker,
+                  ops::MergeIdsOpInferVarType);
+REGISTER_OP_CPU_KERNEL(
+    merge_ids, ops::MergeIdsOpKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/fluid/operators/merge_ids_op.h b/paddle/fluid/operators/merge_ids_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..83712a8519c6817151e1922c606c0fdd4682a2db
--- /dev/null
+++ b/paddle/fluid/operators/merge_ids_op.h
@@ -0,0 +1,92 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class MergeIdsOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto place = ctx.GetPlace();
+    if (!platform::is_cpu_place(place)) {
+      PADDLE_THROW("MergeIds do not support GPU kernel");
+    }
+    VLOG(3) << "run in MergeIdsOpKernel";
+
+    const auto *ids_var = ctx.InputVar("Ids");
+    PADDLE_ENFORCE(ids_var->IsType<framework::LoDTensor>(),
+                   "only support to merge Ids of LoDTensor");
+
+    const auto &ids_tensor = ids_var->Get<framework::LoDTensor>();
+    const auto &ids_dims = ids_tensor.dims();
+    const int64_t *ids = ids_tensor.data<int64_t>();
+
+    auto x_tensors = ctx.MultiInput<framework::LoDTensor>("X");
+
+    auto *out = ctx.Output<framework::LoDTensor>("Out");
+
+    int batch_size = 0;
+    int embedding_size = 0;
+    for (auto &input : x_tensors) {
+      if (framework::product(input->dims()) != 0) {
+        if (embedding_size == 0) {
+          embedding_size = input->dims()[1];
+        }
+        PADDLE_ENFORCE_EQ(embedding_size, input->dims()[1],
+                          "embedding size of all input should be the same");
+        batch_size += input->dims()[0];
+      }
+    }
+    PADDLE_ENFORCE_EQ(
+        batch_size, ids_dims[0],
+        "the batch size of ids and merged embedding value should be the same");
+
+    const size_t shard_num = x_tensors.size();
+
+    if (shard_num == 1) {
+      VLOG(3) << "only one shard, we can copy the data directly";
+      TensorCopy(*x_tensors[0], place, out);
+    } else {
+      std::vector<int> in_indexs(shard_num, 0);
+      auto *out_data = out->mutable_data<T>(
+          framework::make_ddim({batch_size, embedding_size}), place);
+      // copy data from ins[shard_num] to out.
+      for (int i = 0; i < ids_dims[0]; ++i) {
+        int64_t id = ids[i];
+        size_t shard_id = static_cast<size_t>(id) % shard_num;
+        int index = in_indexs[shard_id];
+        memcpy(out_data + embedding_size * i,
+               x_tensors[shard_id]->data<T>() + index * embedding_size,
+               sizeof(T) * embedding_size);
+        in_indexs[shard_id] += 1;
+      }
+
+      for (size_t i = 0; i < shard_num; ++i) {
+        PADDLE_ENFORCE_EQ(in_indexs[i], x_tensors[i]->dims()[0],
+                          "after merge, all data in x_tensor should be used");
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc
index c202eed354c5f1a91e93e1c3919d1bfebc1bc401..40dc7c9a0b6a40f2419ace3ce7e0e5e82bc95c1a 100644
--- a/paddle/fluid/platform/cpu_info.cc
+++ b/paddle/fluid/platform/cpu_info.cc
@@ -59,9 +59,10 @@ inline size_t CpuTotalPhysicalMemory() {
 size_t CpuMaxAllocSize() {
   // For distributed systems, it requires configuring and limiting
   // the fraction of memory to use.
-  return std::min(static_cast<size_t>(FLAGS_fraction_of_cpu_memory_to_use *
-                                      CpuTotalPhysicalMemory()),
-                  FLAGS_initial_cpu_memory_in_mb * 1 << 20);
+  return std::min(
+      static_cast<size_t>(FLAGS_fraction_of_cpu_memory_to_use *
+                          CpuTotalPhysicalMemory()),
+      static_cast<size_t>(FLAGS_initial_cpu_memory_in_mb * 1 << 20));
 }
 
 size_t CpuMinChunkSize() {
diff --git a/python/paddle/batch.py b/python/paddle/batch.py
index d48c54fcbb66487617b1946bc69724870c8f879c..3c6a53db3c2287e8ef5931a06ca5dad455665ee0 100644
--- a/python/paddle/batch.py
+++ b/python/paddle/batch.py
@@ -15,7 +15,7 @@
 __all__ = ['batch']
 
 
-def batch(reader, batch_size, drop_last=False):
+def batch(reader, batch_size, drop_last=True):
     """
     Create a batched reader.
 
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index bbd35aaecba27ea9fd66b9be585a972690980ab8..f6438c82ac207d0e38d8be5e9d6252b28e72826e 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -382,7 +382,7 @@ class Operator(object):
         'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv',
         'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine',
         'ncclInit', 'channel_create', 'channel_close', 'channel_send',
-        'channel_recv', 'select'
+        'channel_recv', 'select', 'gen_nccl_id'
     }
 
     def __init__(self,
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index d5db75ebea4202bdc9afba251e3b24f760051123..80452a1e8b26cb9ca4bc1176349e5daf1fe09e2f 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -261,9 +261,10 @@ def embedding(input,
     return tmp
 
 
-# TODO(qijun): expose H0 and C0
 def dynamic_lstm(input,
                  size,
+                 h_0=None,
+                 c_0=None,
                  param_attr=None,
                  bias_attr=None,
                  use_peepholes=True,
@@ -324,6 +325,13 @@ def dynamic_lstm(input,
                          (T X 4D), where T is the total time steps in this
                          mini-batch, D is the hidden size.
         size(int): 4 * hidden size.
+        h_0(Variable): The initial hidden state is an optional input, default is zero.
+                       This is a tensor with shape (N x D), where N is the
+                       batch size and D is the hidden size.
+        c_0(Variable): The initial cell state is an optional input, default is zero.
+                       This is a tensor with shape (N x D), where N is the
+                       batch size. `h_0` and `c_0` can be NULL but only at the same time.
+
         param_attr(ParamAttr|None): The parameter attribute for the learnable
                                hidden-hidden weights.
 
@@ -387,12 +395,20 @@ def dynamic_lstm(input,
     cell = helper.create_tmp_variable(dtype)
     batch_gate = helper.create_tmp_variable(dtype)
     batch_cell_pre_act = helper.create_tmp_variable(dtype)
+    inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
+    batch_size = input.shape[0]
+    if h_0:
+        assert h_0.shape == (batch_size, size), \
+            'The shape of h0 should be (batch_size, %d)' % size
+        inputs['H0'] = h_0
+    if c_0:
+        assert c_0.shape == (batch_size, size), \
+            'The shape of c0 should be (batch_size, %d)' % size
+        inputs['C0'] = c_0
 
     helper.append_op(
         type='lstm',
-        inputs={'Input': input,
-                'Weight': weight,
-                'Bias': bias},
+        inputs=inputs,
         outputs={
             'Hidden': hidden,
             'Cell': cell,
@@ -677,11 +693,13 @@ def dynamic_gru(input,
         attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype)
     bias = helper.create_parameter(
         attr=helper.bias_attr, shape=[1, 3 * size], dtype=dtype, is_bias=True)
+    batch_size = input.shape[0]
     inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
     if h_0 != None:
         assert h_0.shape == (
-            size, size), 'The shape of h0 should be(%d, %d)' % (size, size)
-        inputs['h0'] = h_0
+            batch_size, size
+        ), 'The shape of h0 should be(batch_size, %d)' % size
+        inputs['H0'] = h_0
 
     hidden = helper.create_tmp_variable(dtype)
     batch_gate = helper.create_tmp_variable(dtype)
diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
index 2df3da9cca7042222317de626460909f018cb107..8e222d26907e8fe697b596a67e62cc9df84afe0e 100644
--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
@@ -96,10 +96,11 @@ def train(use_cuda, train_program, params_dirname):
     train_reader = paddle.batch(
         paddle.reader.shuffle(
             cifar10_small_test_set.train10(batch_size=10), buf_size=128 * 10),
-        batch_size=BATCH_SIZE)
+        batch_size=BATCH_SIZE,
+        drop_last=False)
 
     test_reader = paddle.batch(
-        paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE)
+        paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE, drop_last=False)
 
     def event_handler(event):
         if isinstance(event, fluid.EndStepEvent):
diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
index 224cca417e717bbcc54b58be6ac0219be207dea3..dbc7bc06c93157f271c79e85b6925468e861e57f 100644
--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
@@ -73,10 +73,11 @@ def train(use_cuda, train_program, params_dirname):
     train_reader = paddle.batch(
         paddle.reader.shuffle(
             cifar10_small_test_set.train10(batch_size=10), buf_size=128 * 10),
-        batch_size=BATCH_SIZE)
+        batch_size=BATCH_SIZE,
+        drop_last=False)
 
     test_reader = paddle.batch(
-        paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE)
+        paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE, drop_last=False)
 
     def event_handler(event):
         if isinstance(event, fluid.EndStepEvent):
diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py
index 113dda88ca974c9e6241f127091bd96fb2af4a70..8c74be0f08855c20f5aa3ecd75622a51e94a0304 100644
--- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py
+++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py
@@ -87,7 +87,9 @@ def train(use_cuda, train_program, params_dirname):
     def event_handler(event):
         if isinstance(event, fluid.EndEpochEvent):
             test_reader = paddle.batch(
-                paddle.dataset.imdb.test(word_dict), batch_size=BATCH_SIZE)
+                paddle.dataset.imdb.test(word_dict),
+                batch_size=BATCH_SIZE,
+                drop_last=False)
             avg_cost, acc = trainer.test(
                 reader=test_reader, feed_order=['words', 'label'])
 
@@ -113,7 +115,8 @@ def train(use_cuda, train_program, params_dirname):
     train_reader = paddle.batch(
         paddle.reader.shuffle(
             paddle.dataset.imdb.train(word_dict), buf_size=25000),
-        batch_size=BATCH_SIZE)
+        batch_size=BATCH_SIZE,
+        drop_last=False)
 
     trainer.train(
         num_epochs=1,
diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
index 8818cf96fa8f08036f9e23aae786f67b5614b2b9..be347cd5315668dde0454d7959dbf9bcfa465b5f 100644
--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
@@ -56,7 +56,7 @@ BATCH_SIZE = 200
 
 # fix the order of training data
 train_reader = paddle.batch(
-    paddle.dataset.uci_housing.train(), batch_size=BATCH_SIZE)
+    paddle.dataset.uci_housing.train(), batch_size=BATCH_SIZE, drop_last=False)
 
 # train_reader = paddle.batch(
 #     paddle.reader.shuffle(
diff --git a/python/paddle/fluid/tests/unittests/test_merge_ids_op.py b/python/paddle/fluid/tests/unittests/test_merge_ids_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..f209bdf30faffc0b2c7932b7b10f384d6d61a831
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_merge_ids_op.py
@@ -0,0 +1,38 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestMergeIdsOp(OpTest):
+    def setUp(self):
+        self.op_type = "merge_ids"
+        ids = np.array([[0], [2], [2], [3], [5], [5], [6]]).astype('int64')
+        x0 = np.array([[0.1, 0.2], [0.2, 0.3], [0.3, 0.4]]).astype('float32')
+        x1 = np.array([]).astype('float32')
+        x2 = np.array([[0.4, 0.5], [0.4, 0.5], [0.5, 0.6],
+                       [0.5, 0.6]]).astype('float32')
+        out = np.array([[0.1, 0.2], [0.4, 0.5], [0.4, 0.5], [0.2, 0.3],
+                        [0.5, 0.6], [0.5, 0.6], [0.3, 0.4]]).astype('float32')
+        self.inputs = {'Ids': ids, "X": [('x0', x0), ('x1', x1), ('x2', x2)]}
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 2480d4e76a1b5fd76b7dc8299c2f8fcae967145e..9c604170b8b53c9cbcf39b4978ae60ccad84648c 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -629,7 +629,7 @@ class DistributeTranspiler:
                 if op.type == LOOKUP_TABLE_TYPE:
                     continue_search_lookup_table_op = True
 
-                    op_index = list(all_ops).index(op)
+                    lookup_table_op_index = list(all_ops).index(op)
                     ids_name = op.input("Ids")
                     out_name = op.output("Out")
 
@@ -649,7 +649,7 @@ class DistributeTranspiler:
 
                     # insert split_ids_op
                     program.global_block().insert_op(
-                        index=op_index,
+                        index=lookup_table_op_index,
                         type="split_ids",
                         inputs={
                             'Ids': [
@@ -661,7 +661,7 @@ class DistributeTranspiler:
 
                     # insert prefetch_op
                     program.global_block().insert_op(
-                        index=op_index + 1,
+                        index=lookup_table_op_index + 1,
                         type="prefetch",
                         inputs={'X': prefetch_input_vars},
                         outputs={"Out": prefetch_output_vars},
@@ -672,16 +672,21 @@ class DistributeTranspiler:
 
                     # insert concat_op
                     program.global_block().insert_op(
-                        index=op_index + 2,
-                        type="concat",
-                        inputs={'X': prefetch_output_vars},
+                        index=lookup_table_op_index + 2,
+                        type="merge_ids",
+                        inputs={
+                            'Ids': [
+                                program.global_block().vars[varname]
+                                for varname in ids_name
+                            ],
+                            'X': prefetch_output_vars
+                        },
                         outputs={
                             "Out": [
                                 program.global_block().vars[varname]
                                 for varname in out_name
                             ]
-                        },
-                        attrs={"axis": 0})
+                        })
 
                     # delete lookup_table_op
                     delete_ops(program.global_block(), [op])
diff --git a/python/paddle/trainer_config_helpers/attrs.py b/python/paddle/trainer_config_helpers/attrs.py
index e6f87ce61b1d16d4f98f111626776aa52c2ec35b..4e3beaf639bad9fed2862a5477095b66ef4b9aee 100644
--- a/python/paddle/trainer_config_helpers/attrs.py
+++ b/python/paddle/trainer_config_helpers/attrs.py
@@ -240,14 +240,15 @@ class ExtraLayerAttribute(object):
     :type error_clipping_threshold: float
     :param drop_rate: Dropout rate. Dropout will create a mask on layer output.
                       The dropout rate is the zero rate of this mask. The
-                      details of what dropout is please refer to `here
-                      <https://www.cs.toronto.edu/~hinton/absps/
-                      JMLRdropout.pdf>`_.
+                      details of what dropout is please refer to `JMLRdropout
+                      <https://www.cs.toronto.edu/~hinton/absps/JMLRdropout.pdf
+                      >`_.
     :type drop_rate: float
     :param device: device ID of layer. device=-1, use CPU. device>=0, use GPU.
-                   The details allocation in parallel_nn please refer to `here
-                   <http://www.paddlepaddle.org/doc/ui/cmd_argument/
-                   use_case.html#case-2-specify-layers-in-different-devices>`_.
+                   The details allocation in parallel_nn please refer to `use_case
+                   <https://github.com/PaddlePaddle/Paddle/blob/develop/doc/v2
+                   /howto/cmd_parameter/use_case_en.md#case-2-specify-layers-in
+                   -different-devices>`_.
     :type device: int
     """
 
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index ebc31b23e0f5504b4bebccabe996b054c7fbce3b..e6a03759ef431086390e217eabcdff47e610346c 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -2556,7 +2556,7 @@ def img_conv_layer(input,
     the output will be obtained by concatenating the two results.
 
     The details of grouped convolution, please refer to:
-    `ImageNet Classification with Deep Convolutional Neural Networks
+    `ImageNet Classification With Deep Convolutional Neural Networks
     <http://www.cs.toronto.edu/~kriz/imagenet_classification_with_deep_convolutional.pdf>`_
     
     The example usage is:
@@ -5678,8 +5678,8 @@ def warp_ctc_layer(input,
     <https://github.com/baidu-research/warp-ctc>`_ library, which is used in
     `Deep Speech 2: End-toEnd Speech Recognition in English and Mandarin
     <https://arxiv.org/pdf/1512.02595v1.pdf>`_, to compute Connectionist Temporal
-    Classification (CTC) loss. Besides, another `warp-ctc
-    <https://github.com/gangliao/warp-ctc>`_ repository, which is forked from
+    Classification (CTC) loss. Besides, another `warp-ctc repository
+    <https://github.com/gangliao/warp-ctc>`_ , which is forked from
     the official one, is maintained to enable more compiling options. During the
     building process, PaddlePaddle will clone the source codes, build and
     install it to :code:`third_party/install/warpctc` directory.
diff --git a/python/paddle/v2/minibatch.py b/python/paddle/v2/minibatch.py
index d48c54fcbb66487617b1946bc69724870c8f879c..3c6a53db3c2287e8ef5931a06ca5dad455665ee0 100644
--- a/python/paddle/v2/minibatch.py
+++ b/python/paddle/v2/minibatch.py
@@ -15,7 +15,7 @@
 __all__ = ['batch']
 
 
-def batch(reader, batch_size, drop_last=False):
+def batch(reader, batch_size, drop_last=True):
     """
     Create a batched reader.