Merge remote-tracking branch 'ups/develop' into fix

85c3bfc1 · tensor-tang · 6602db5b · d07d9535 · 85c3bfc1 · 85c3bfc1
66 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -61,6 +61,7 @@ option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen"            OFF)
 option(WITH_ARM_FP16    "Use half precision support on armv8.2-a cpu"   OFF)
 option(WITH_FAST_BUNDLE_TEST    "Bundle tests that can be run in a single process together to reduce launch overhead"   OFF)
 option(WITH_CONTRIB     "Compile the third-party contributation"        OFF)
+option(WITH_ANAKIN      "Compile with Anakin library"                   OFF)
 option(WITH_GRPC     "Use grpc as the default rpc framework"            ${WITH_DISTRIBUTE})
 # CMAKE_BUILD_TYPE
@@ -193,7 +194,10 @@ set(EXTERNAL_LIBS
 if(WITH_GPU)
    include(cuda)
    include(tensorrt)
-endif(WITH_GPU)
+    include(external/anakin)
+else()
+  # Anakin is only usable in GPU builds, so switch it off here. The cache entry
+  # is written as BOOL to match the type declared by option(WITH_ANAKIN ...).
+  set(WITH_ANAKIN OFF CACHE BOOL "Anakin is valid only when GPU is set." FORCE)
+endif()
 if(WITH_AMD_GPU)
    find_package(HIP)

--- a/cmake/external/anakin.cmake
+++ b/cmake/external/anakin.cmake
+# Configuration for the prebuilt Anakin inference library. Nothing below runs
+# unless WITH_ANAKIN is enabled.
+if (NOT WITH_ANAKIN)
+  return()
+endif()
+# Directory that receives the downloaded Anakin release. FORCE rewrites the
+# cache entry on every configure run.
+set(ANAKIN_INSTALL_DIR "${THIRD_PARTY_PATH}/install/anakin" CACHE PATH
+  "Anakin install path." FORCE)
+# Header root and library directory both default to the install directory.
+# They are set without FORCE, so values already in the cache take precedence.
+set(ANAKIN_INCLUDE "${ANAKIN_INSTALL_DIR}" CACHE STRING "root of Anakin header files")
+set(ANAKIN_LIBRARY "${ANAKIN_INSTALL_DIR}" CACHE STRING "path of Anakin library")
+# Extra warning options, kept as a list. The -Wno-error=<name> entries stop the
+# named diagnostic from being treated as an error; -Wno-reorder disables that
+# warning entirely.
+set(ANAKIN_COMPILE_EXTRA_FLAGS -Wno-error=unused-variable -Wno-error=format-extra-args -Wno-error=comment -Wno-error=format -Wno-error=switch -Wno-error=return-type -Wno-error=non-virtual-dtor -Wno-reorder -Wno-error=cpp)
+# Location of the prebuilt Anakin release archive.
+set(ANAKIN_LIBRARY_URL "https://github.com/pangge/Anakin/releases/download/3.0/anakin_release_simple.tar.gz")
+# Helper for consuming Anakin: using the library currently requires nearly all
+# of its header directories to be on the include path.
+#
+# fetch_include_recursively(<root_dir>)
+#   Adds <root_dir> and every directory nested beneath it to the include path
+#   via include_directories(). Does nothing when <root_dir> is not a directory.
+function(fetch_include_recursively root_dir)
+    # Nothing to add or descend into when the path is not a directory.
+    if (NOT IS_DIRECTORY "${root_dir}")
+        return()
+    endif()
+    include_directories("${root_dir}")
+    # Paths are quoted so that a directory whose name contains spaces is still
+    # passed as a single argument.
+    file(GLOB ALL_SUB RELATIVE "${root_dir}" "${root_dir}/*")
+    foreach(sub ${ALL_SUB})
+        if (IS_DIRECTORY "${root_dir}/${sub}")
+            fetch_include_recursively("${root_dir}/${sub}")
+        endif()
+    endforeach()
+endfunction()
+# Download and unpack the prebuilt Anakin release at configure time.
+message(STATUS "Download Anakin library from ${ANAKIN_LIBRARY_URL}")
+# Start from an empty install directory so files left over from an earlier
+# download cannot end up on the include or link paths.
+file(REMOVE_RECURSE "${ANAKIN_INSTALL_DIR}")
+file(MAKE_DIRECTORY "${ANAKIN_INSTALL_DIR}")
+execute_process(COMMAND wget -q "${ANAKIN_LIBRARY_URL}"
+                WORKING_DIRECTORY "${ANAKIN_INSTALL_DIR}"
+                RESULT_VARIABLE ANAKIN_DOWNLOAD_RESULT)
+# RESULT_VARIABLE holds the exit code, or an error string when the command
+# could not be started; anything other than "0" is a failure.
+if (NOT "${ANAKIN_DOWNLOAD_RESULT}" STREQUAL "0")
+    message(FATAL_ERROR "Failed to download Anakin library from ${ANAKIN_LIBRARY_URL}: ${ANAKIN_DOWNLOAD_RESULT}")
+endif()
+execute_process(COMMAND ${CMAKE_COMMAND} -E tar xzf anakin_release_simple.tar.gz
+                WORKING_DIRECTORY "${ANAKIN_INSTALL_DIR}"
+                RESULT_VARIABLE ANAKIN_EXTRACT_RESULT)
+if (NOT "${ANAKIN_EXTRACT_RESULT}" STREQUAL "0")
+    message(FATAL_ERROR "Failed to extract anakin_release_simple.tar.gz in ${ANAKIN_INSTALL_DIR}: ${ANAKIN_EXTRACT_RESULT}")
+endif()
+# Expose the unpacked headers and libraries to the rest of the build.
+if (WITH_ANAKIN)
+    message(STATUS "Anakin for inference is enabled")
+    message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}")
+    fetch_include_recursively(${ANAKIN_INCLUDE})
+    link_directories(${ANAKIN_LIBRARY})
+endif()
--- a/doc/fluid/api/detection.rst
+++ b/doc/fluid/api/detection.rst
--- a/doc/fluid/api/gen_doc.sh
+++ b/doc/fluid/api/gen_doc.sh
 #!/bin/bash
-python gen_doc.py layers --submodules control_flow device io nn ops tensor > layers.rst
+python gen_doc.py layers --submodules control_flow device io nn ops tensor detection learning_rate_scheduler > layers.rst
 for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer
 do

--- a/doc/fluid/api/io.rst
+++ b/doc/fluid/api/io.rst
@@ -59,21 +59,3 @@ get_inference_program
 ..  autofunction:: paddle.fluid.io.get_inference_program
    :noindex:
-save_checkpoint
---------------
-..  autofunction:: paddle.fluid.io.save_checkpoint
-    :noindex:
-load_checkpoint
---------------
-..  autofunction:: paddle.fluid.io.load_checkpoint
-    :noindex:
-clean_checkpoint
----------------
-..  autofunction:: paddle.fluid.io.clean_checkpoint
-    :noindex:
--- a/doc/fluid/api/layers.rst
+++ b/doc/fluid/api/layers.rst
@@ -181,12 +181,6 @@ Print
 ..  autofunction:: paddle.fluid.layers.Print
    :noindex:
-is_empty
--------
-..  autofunction:: paddle.fluid.layers.is_empty
-    :noindex:
 device
 ======
@@ -261,19 +255,6 @@ double_buffer
 ..  autofunction:: paddle.fluid.layers.double_buffer
    :noindex:
-random_data_generator
---------------------
-..  autofunction:: paddle.fluid.layers.random_data_generator
-    :noindex:
-Preprocessor
------------
-..  autoclass:: paddle.fluid.layers.Preprocessor
-    :members:
-    :noindex:
 nn
 ==
@@ -613,30 +594,6 @@ roi_pool
 ..  autofunction:: paddle.fluid.layers.roi_pool
    :noindex:
-dice_loss
---------
-..  autofunction:: paddle.fluid.layers.dice_loss
-    :noindex:
-resize_bilinear
---------------
-..  autofunction:: paddle.fluid.layers.resize_bilinear
-    :noindex:
-gather
------
-..  autofunction:: paddle.fluid.layers.gather
-    :noindex:
-random_crop
-----------
-..  autofunction:: paddle.fluid.layers.random_crop
-    :noindex:
 ops
 ===
@@ -784,12 +741,6 @@ sum
 ..  autofunction:: paddle.fluid.layers.sum
    :noindex:
-shape
-----
-..  autofunction:: paddle.fluid.layers.shape
-    :noindex:
 sigmoid
 -------
@@ -1039,3 +990,93 @@ zeros
 ..  autofunction:: paddle.fluid.layers.zeros
    :noindex:
+detection
+=========
+multi_box_head
+--------------
+..  autofunction:: paddle.fluid.layers.multi_box_head
+    :noindex:
+bipartite_match
+---------------
+..  autofunction:: paddle.fluid.layers.bipartite_match
+    :noindex:
+target_assign
+-------------
+..  autofunction:: paddle.fluid.layers.target_assign
+    :noindex:
+detection_output
+----------------
+..  autofunction:: paddle.fluid.layers.detection_output
+    :noindex:
+ssd_loss
+--------
+..  autofunction:: paddle.fluid.layers.ssd_loss
+    :noindex:
+detection_map
+-------------
+..  autofunction:: paddle.fluid.layers.detection_map
+    :noindex:
+iou_similarity
+--------------
+..  autofunction:: paddle.fluid.layers.iou_similarity
+    :noindex:
+box_coder
+---------
+..  autofunction:: paddle.fluid.layers.box_coder
+    :noindex:
+learning_rate_scheduler
+=======================
+exponential_decay
+-----------------
+..  autofunction:: paddle.fluid.layers.exponential_decay
+    :noindex:
+natural_exp_decay
+-----------------
+..  autofunction:: paddle.fluid.layers.natural_exp_decay
+    :noindex:
+inverse_time_decay
+------------------
+..  autofunction:: paddle.fluid.layers.inverse_time_decay
+    :noindex:
+polynomial_decay
+----------------
+..  autofunction:: paddle.fluid.layers.polynomial_decay
+    :noindex:
+piecewise_decay
+---------------
+..  autofunction:: paddle.fluid.layers.piecewise_decay
+    :noindex:
+noam_decay
+----------
+..  autofunction:: paddle.fluid.layers.noam_decay
+    :noindex:
--- a/doc/fluid/api/optimizer.rst
+++ b/doc/fluid/api/optimizer.rst
@@ -89,13 +89,6 @@ DecayedAdagradOptimizer
    :members:
    :noindex:
-RMSPropOptimizer
----------------
-..  autoclass:: paddle.fluid.optimizer.RMSPropOptimizer
-    :members:
-    :noindex:
 Adadelta
 --------

--- a/doc/fluid/api/profiler.rst
+++ b/doc/fluid/api/profiler.rst
@@ -23,15 +23,3 @@ profiler
 ..  autofunction:: paddle.fluid.profiler.profiler
    :noindex:
-start_profiler
--------------
-..  autofunction:: paddle.fluid.profiler.start_profiler
-    :noindex:
-stop_profiler
-------------
-..  autofunction:: paddle.fluid.profiler.stop_profiler
-    :noindex:
--- a/doc/survey/dynamic_graph.md
+++ b/doc/survey/dynamic_graph.md
@@ -171,7 +171,7 @@ Pytorch chooses immediate evaluation. It avoids ever materializing a "forward gr
 ## What can fluid learn from them?
-TBD
+Please refer to `paddle/contrib/dynamic/`.
 # Appendix

--- a/doc/v2/api/config/evaluators.rst
+++ b/doc/v2/api/config/evaluators.rst
@@ -101,7 +101,7 @@ value_printer
    :noindex:
 Detection
-=====
+==========
 detection_map
 -------------

--- a/doc/v2/api/config/layer.rst
+++ b/doc/v2/api/config/layer.rst
@@ -11,7 +11,7 @@ Data layer
 data
 ----
-..  autoclass:: paddle.v2.layer.data
+..  autofunction:: paddle.v2.layer.data
    :noindex:
 Fully Connected Layers
@@ -21,12 +21,12 @@ Fully Connected Layers
 fc
 --
-..  autoclass:: paddle.v2.layer.fc
+..  autofunction:: paddle.v2.layer.fc
    :noindex:
 selective_fc
 ------------
-..  autoclass:: paddle.v2.layer.selective_fc
+..  autofunction:: paddle.v2.layer.selective_fc
    :noindex:
 Conv Layers
@@ -34,34 +34,34 @@ Conv Layers
 conv_operator
 -------------
-..  autoclass:: paddle.v2.layer.conv_operator
+..  autofunction:: paddle.v2.layer.conv_operator
    :noindex:
 conv_projection
 ---------------
-..  autoclass:: paddle.v2.layer.conv_projection
+..  autofunction:: paddle.v2.layer.conv_projection
    :noindex:
 conv_shift
 ----------
-..  autoclass:: paddle.v2.layer.conv_shift
+..  autofunction:: paddle.v2.layer.conv_shift
    :noindex:
 img_conv
 --------
-..  autoclass:: paddle.v2.layer.img_conv
+..  autofunction:: paddle.v2.layer.img_conv
    :noindex:
 ..  _api_v2.layer_context_projection:
 context_projection
 ------------------
-..  autoclass:: paddle.v2.layer.context_projection
+..  autofunction:: paddle.v2.layer.context_projection
    :noindex:
 row_conv
 --------
-..  autoclass:: paddle.v2.layer.row_conv
+..  autofunction:: paddle.v2.layer.row_conv
    :noindex:
 Image Pooling Layer
@@ -69,27 +69,27 @@ Image Pooling Layer
 img_pool
 --------
-..  autoclass:: paddle.v2.layer.img_pool
+..  autofunction:: paddle.v2.layer.img_pool
    :noindex:
 spp
 ---
-..  autoclass:: paddle.v2.layer.spp
+..  autofunction:: paddle.v2.layer.spp
    :noindex:
 maxout
 ------
-..  autoclass:: paddle.v2.layer.maxout
+..  autofunction:: paddle.v2.layer.maxout
    :noindex:
 roi_pool
 --------
-..  autoclass:: paddle.v2.layer.roi_pool
+..  autofunction:: paddle.v2.layer.roi_pool
    :noindex:
 pad
 ----
-..  autoclass:: paddle.v2.layer.pad
+..  autofunction:: paddle.v2.layer.pad
    :noindex:
 Norm Layer
@@ -97,27 +97,27 @@ Norm Layer
 img_cmrnorm
 -----------
-..  autoclass:: paddle.v2.layer.img_cmrnorm
+..  autofunction:: paddle.v2.layer.img_cmrnorm
    :noindex:
 batch_norm
 ----------
-..  autoclass:: paddle.v2.layer.batch_norm
+..  autofunction:: paddle.v2.layer.batch_norm
    :noindex:
 sum_to_one_norm
 ---------------
-..  autoclass:: paddle.v2.layer.sum_to_one_norm
+..  autofunction:: paddle.v2.layer.sum_to_one_norm
    :noindex:
 cross_channel_norm
 ------------------
-..  autoclass:: paddle.v2.layer.cross_channel_norm
+..  autofunction:: paddle.v2.layer.cross_channel_norm
    :noindex:
 row_l2_norm
 -----------
-..  autoclass:: paddle.v2.layer.row_l2_norm
+..  autofunction:: paddle.v2.layer.row_l2_norm
    :noindex:
 Recurrent Layers
@@ -125,22 +125,22 @@ Recurrent Layers
 recurrent
 ---------
-..  autoclass:: paddle.v2.layer.recurrent
+..  autofunction:: paddle.v2.layer.recurrent
    :noindex:
 lstmemory
 ---------
-..  autoclass:: paddle.v2.layer.lstmemory
+..  autofunction:: paddle.v2.layer.lstmemory
    :noindex:
 grumemory
 ---------
-..  autoclass:: paddle.v2.layer.grumemory
+..  autofunction:: paddle.v2.layer.grumemory
    :noindex:
 gated_unit
 -----------
-..  autoclass:: paddle.v2.layer.gated_unit
+..  autofunction:: paddle.v2.layer.gated_unit
    :noindex:
 Recurrent Layer Group
@@ -148,32 +148,32 @@ Recurrent Layer Group
 memory
 ------
-..  autoclass:: paddle.v2.layer.memory
+..  autofunction:: paddle.v2.layer.memory
    :noindex:
 recurrent_group
 ---------------
-..  autoclass:: paddle.v2.layer.recurrent_group
+..  autofunction:: paddle.v2.layer.recurrent_group
    :noindex:
 lstm_step
 ---------
-..  autoclass:: paddle.v2.layer.lstm_step
+..  autofunction:: paddle.v2.layer.lstm_step
    :noindex:
 gru_step
 --------
-..  autoclass:: paddle.v2.layer.gru_step
+..  autofunction:: paddle.v2.layer.gru_step
    :noindex:
 beam_search
 ------------
-..  autoclass:: paddle.v2.layer.beam_search
+..  autofunction:: paddle.v2.layer.beam_search
    :noindex:
 get_output
 ----------
-..  autoclass:: paddle.v2.layer.get_output
+..  autofunction:: paddle.v2.layer.get_output
    :noindex:
 Mixed Layer
@@ -183,54 +183,54 @@ Mixed Layer
 mixed
 -----
-..  autoclass:: paddle.v2.layer.mixed
+..  autofunction:: paddle.v2.layer.mixed
    :noindex:
 ..  _api_v2.layer_embedding:
 embedding
 ---------
-..  autoclass:: paddle.v2.layer.embedding
+..  autofunction:: paddle.v2.layer.embedding
    :noindex:
 scaling_projection
 ------------------
-..  autoclass:: paddle.v2.layer.scaling_projection
+..  autofunction:: paddle.v2.layer.scaling_projection
    :noindex:
 dotmul_projection
 -----------------
-..  autoclass:: paddle.v2.layer.dotmul_projection
+..  autofunction:: paddle.v2.layer.dotmul_projection
    :noindex:
 dotmul_operator
 ---------------
-..  autoclass:: paddle.v2.layer.dotmul_operator
+..  autofunction:: paddle.v2.layer.dotmul_operator
    :noindex:
 full_matrix_projection
 ----------------------
-..  autoclass:: paddle.v2.layer.full_matrix_projection
+..  autofunction:: paddle.v2.layer.full_matrix_projection
    :noindex:
 identity_projection
 -------------------
-..  autoclass:: paddle.v2.layer.identity_projection
+..  autofunction:: paddle.v2.layer.identity_projection
    :noindex:
 slice_projection
 -------------------
-..  autoclass:: paddle.v2.layer.slice_projection
+..  autofunction:: paddle.v2.layer.slice_projection
    :noindex:
 table_projection
 ----------------
-..  autoclass:: paddle.v2.layer.table_projection
+..  autofunction:: paddle.v2.layer.table_projection
    :noindex:
 trans_full_matrix_projection
 ----------------------------
-..  autoclass:: paddle.v2.layer.trans_full_matrix_projection
+..  autofunction:: paddle.v2.layer.trans_full_matrix_projection
    :noindex:
 Aggregate Layers
@@ -245,51 +245,46 @@ AggregateLevel
 pooling
 -------
-..  autoclass:: paddle.v2.layer.pooling
+..  autofunction:: paddle.v2.layer.pooling
    :noindex:
 ..  _api_v2.layer_last_seq:
 last_seq
 --------
-..  autoclass:: paddle.v2.layer.last_seq
+..  autofunction:: paddle.v2.layer.last_seq
    :noindex:
 ..  _api_v2.layer_first_seq:
 first_seq
 ---------
-..  autoclass:: paddle.v2.layer.first_seq
+..  autofunction:: paddle.v2.layer.first_seq
    :noindex:
 sub_seq
 ---------
-..  autoclass:: paddle.v2.layer.sub_seq
+..  autofunction:: paddle.v2.layer.sub_seq
    :noindex:
 concat
 ------
-..  autoclass:: paddle.v2.layer.concat
+..  autofunction:: paddle.v2.layer.concat
    :noindex:
 seq_concat
 ----------
-..  autoclass:: paddle.v2.layer.seq_concat
+..  autofunction:: paddle.v2.layer.seq_concat
    :noindex:
 seq_slice
 ---------
-..  autoclass:: paddle.v2.layer.seq_slice
+..  autofunction:: paddle.v2.layer.seq_slice
-    :noindex:
-kmax_sequence_score
-------------------
-..  autoclass:: paddle.v2.layer.kmax_sequence_score
    :noindex:
 sub_nested_seq
 --------------
-..  autoclass:: paddle.v2.layer.sub_nested_seq
+..  autofunction:: paddle.v2.layer.sub_nested_seq
    :noindex:
 Reshaping Layers
@@ -297,7 +292,7 @@ Reshaping Layers
 block_expand
 ------------
-..  autoclass:: paddle.v2.layer.block_expand
+..  autofunction:: paddle.v2.layer.block_expand
    :noindex:
 ..  _api_v2.layer_expand:
@@ -309,22 +304,22 @@ ExpandLevel
 expand
 ------
-..  autoclass:: paddle.v2.layer.expand
+..  autofunction:: paddle.v2.layer.expand
    :noindex:
 repeat
 ------
-..  autoclass:: paddle.v2.layer.repeat
+..  autofunction:: paddle.v2.layer.repeat
    :noindex:
 rotate
 ------
-..  autoclass:: paddle.v2.layer.rotate
+..  autofunction:: paddle.v2.layer.rotate
    :noindex:
 seq_reshape
 -----------
-..  autoclass:: paddle.v2.layer.seq_reshape
+..  autofunction:: paddle.v2.layer.seq_reshape
    :noindex:
 Math Layers
@@ -332,94 +327,94 @@ Math Layers
 addto
 -----
-..  autoclass:: paddle.v2.layer.addto
+..  autofunction:: paddle.v2.layer.addto
    :noindex:
 linear_comb
 -----------
-..  autoclass:: paddle.v2.layer.linear_comb
+..  autofunction:: paddle.v2.layer.linear_comb
    :noindex:
 interpolation
 -------------
-..  autoclass:: paddle.v2.layer.interpolation
+..  autofunction:: paddle.v2.layer.interpolation
    :noindex:
 bilinear_interp
 ---------------
-..  autoclass:: paddle.v2.layer.bilinear_interp
+..  autofunction:: paddle.v2.layer.bilinear_interp
    :noindex:
 dropout
 --------
-..  autoclass:: paddle.v2.layer.dropout
+..  autofunction:: paddle.v2.layer.dropout
    :noindex:
 dot_prod
 ---------
-.. autoclass:: paddle.v2.layer.dot_prod
+.. autofunction:: paddle.v2.layer.dot_prod
    :noindex:
 out_prod
 --------
-.. autoclass:: paddle.v2.layer.out_prod
+.. autofunction:: paddle.v2.layer.out_prod
    :noindex:
 power
 -----
-..  autoclass:: paddle.v2.layer.power
+..  autofunction:: paddle.v2.layer.power
    :noindex:
 scaling
 -------
-..  autoclass:: paddle.v2.layer.scaling
+..  autofunction:: paddle.v2.layer.scaling
    :noindex:
 clip
 ----
-..  autoclass:: paddle.v2.layer.clip
+..  autofunction:: paddle.v2.layer.clip
    :noindex:
 resize
 ------
-..  autoclass:: paddle.v2.layer.resize
+..  autofunction:: paddle.v2.layer.resize
    :noindex:
 slope_intercept
 ---------------
-..  autoclass:: paddle.v2.layer.slope_intercept
+..  autofunction:: paddle.v2.layer.slope_intercept
    :noindex:
 tensor
 ------
-..  autoclass:: paddle.v2.layer.tensor
+..  autofunction:: paddle.v2.layer.tensor
    :noindex:
 ..  _api_v2.layer_cos_sim:
 cos_sim
 -------
-..  autoclass:: paddle.v2.layer.cos_sim
+..  autofunction:: paddle.v2.layer.cos_sim
    :noindex:
 l2_distance
 -----------
-..  autoclass:: paddle.v2.layer.l2_distance
+..  autofunction:: paddle.v2.layer.l2_distance
    :noindex:
 trans
 -----
-..  autoclass:: paddle.v2.layer.trans
+..  autofunction:: paddle.v2.layer.trans
    :noindex:
 scale_shift
 -----------
-..  autoclass:: paddle.v2.layer.scale_shift
+..  autofunction:: paddle.v2.layer.scale_shift
    :noindex:
 factorization_machine
 ---------------------
-..  autoclass:: paddle.v2.layer.factorization_machine
+..  autofunction:: paddle.v2.layer.factorization_machine
    :noindex:
 Sampling Layers
@@ -427,17 +422,17 @@ Sampling Layers
 maxid
 -----
-..  autoclass:: paddle.v2.layer.max_id
+..  autofunction:: paddle.v2.layer.max_id
    :noindex:
 sampling_id
 -----------
-..  autoclass:: paddle.v2.layer.sampling_id
+..  autofunction:: paddle.v2.layer.sampling_id
    :noindex:
 multiplex
 ---------
-..  autoclass:: paddle.v2.layer.multiplex
+..  autofunction:: paddle.v2.layer.multiplex
    :noindex:
 ..  _api_v2.layer_costs:
@@ -447,97 +442,97 @@ Cost Layers
 cross_entropy_cost
 ------------------
-..  autoclass:: paddle.v2.layer.cross_entropy_cost
+..  autofunction:: paddle.v2.layer.cross_entropy_cost
    :noindex:
 cross_entropy_with_selfnorm_cost
 --------------------------------
-..  autoclass:: paddle.v2.layer.cross_entropy_with_selfnorm_cost
+..  autofunction:: paddle.v2.layer.cross_entropy_with_selfnorm_cost
    :noindex:
 multi_binary_label_cross_entropy_cost
 -------------------------------------
-..  autoclass:: paddle.v2.layer.multi_binary_label_cross_entropy_cost
+..  autofunction:: paddle.v2.layer.multi_binary_label_cross_entropy_cost
    :noindex:
 classification_cost
 -------------------
-.. autoclass:: paddle.v2.layer.classification_cost
+.. autofunction:: paddle.v2.layer.classification_cost
   :noindex:
 huber_regression_cost
 -------------------------
-..  autoclass:: paddle.v2.layer.huber_regression_cost
+..  autofunction:: paddle.v2.layer.huber_regression_cost
    :noindex:
 huber_classification_cost
 -------------------------
-..  autoclass:: paddle.v2.layer.huber_classification_cost
+..  autofunction:: paddle.v2.layer.huber_classification_cost
    :noindex:
 lambda_cost
 -----------
-..  autoclass:: paddle.v2.layer.lambda_cost
+..  autofunction:: paddle.v2.layer.lambda_cost
    :noindex:
 square_error_cost
 -----------------
-..  autoclass:: paddle.v2.layer.square_error_cost
+..  autofunction:: paddle.v2.layer.square_error_cost
    :noindex:
 rank_cost
 ---------
-..  autoclass:: paddle.v2.layer.rank_cost
+..  autofunction:: paddle.v2.layer.rank_cost
    :noindex:
 sum_cost
 ---------
-..  autoclass:: paddle.v2.layer.sum_cost
+..  autofunction:: paddle.v2.layer.sum_cost
    :noindex:
 crf
 ---
-..  autoclass:: paddle.v2.layer.crf
+..  autofunction:: paddle.v2.layer.crf
    :noindex:
 crf_decoding
 ------------
-..  autoclass:: paddle.v2.layer.crf_decoding
+..  autofunction:: paddle.v2.layer.crf_decoding
    :noindex:
 ctc
 ---
-..  autoclass:: paddle.v2.layer.ctc
+..  autofunction:: paddle.v2.layer.ctc
    :noindex:
 warp_ctc
 --------
-..  autoclass:: paddle.v2.layer.warp_ctc
+..  autofunction:: paddle.v2.layer.warp_ctc
    :noindex:
 nce
 ---
-..  autoclass:: paddle.v2.layer.nce
+..  autofunction:: paddle.v2.layer.nce
    :noindex:
 hsigmoid
 ---------
-..  autoclass:: paddle.v2.layer.hsigmoid
+..  autofunction:: paddle.v2.layer.hsigmoid
    :noindex:
 smooth_l1_cost
 --------------
-..  autoclass:: paddle.v2.layer.smooth_l1_cost
+..  autofunction:: paddle.v2.layer.smooth_l1_cost
    :noindex:
 multibox_loss
 --------------
-..  autoclass:: paddle.v2.layer.multibox_loss
+..  autofunction:: paddle.v2.layer.multibox_loss
    :noindex:
 detection_output
 ----------------
-..  autoclass:: paddle.v2.layer.detection_output
+..  autofunction:: paddle.v2.layer.detection_output
    :noindex:
 Check Layer
@@ -545,7 +540,7 @@ Check Layer
 eos
 ---
-..  autoclass:: paddle.v2.layer.eos
+..  autofunction:: paddle.v2.layer.eos
    :noindex:
 Activation
@@ -553,5 +548,5 @@ Activation
 prelu
 --------
-..  autoclass:: paddle.v2.layer.prelu
+..  autofunction:: paddle.v2.layer.prelu
    :noindex:
--- a/doc/v2/api/index_en.rst
+++ b/doc/v2/api/index_en.rst
@@ -8,4 +8,3 @@ API
    model_configs.rst
    data.rst
    run_logic.rst
-    fluid/index.rst
--- a/doc/v2/build_and_install/pip_install_cn.rst
+++ b/doc/v2/build_and_install/pip_install_cn.rst
@@ -60,6 +60,7 @@ paddlepaddle-gpu==0.11.0            使用CUDA 7.5和cuDNN 5编译的0.11.0版
    "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`_"
    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cuda9.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
 .. _pip_dependency:

--- a/doc/v2/build_and_install/pip_install_en.rst
+++ b/doc/v2/build_and_install/pip_install_en.rst
@@ -63,6 +63,7 @@ If the links below shows up the login form, just click "Log in as guest" to star
    "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cuda9.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
 .. _pip_dependency:

--- a/paddle/contrib/CMakeLists.txt
+++ b/paddle/contrib/CMakeLists.txt
@@ -14,3 +14,4 @@
 #
 add_subdirectory(inference)
+add_subdirectory(tape)
--- a/paddle/contrib/inference/CMakeLists.txt
+++ b/paddle/contrib/inference/CMakeLists.txt
@@ -17,48 +17,9 @@ if(APPLE)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move")
 endif(APPLE)
-set(ANAKIN_INCLUDE "" CACHE STRING "root of Anakin header files")
-set(ANAKIN_LIBRARY "" CACHE STRING "path of Anakin library")
 set(inference_deps paddle_inference_api paddle_fluid_api)
-# if anakin is set enable anakin api implementation
-if(ANAKIN_INCLUDE AND ANAKIN_LIBRARY)
-    set(ANAKIN_FOUND ON)
-else()
-    set(ANAKIN_FOUND OFF)
-endif()
-function(fetch_include_recursively root_dir) 
-    if (IS_DIRECTORY ${root_dir}) 
-        include_directories(${root_dir})
-    endif()
-    file(GLOB ALL_SUB RELATIVE ${root_dir} ${root_dir}/*)
-    foreach(sub ${ALL_SUB})
-        if (IS_DIRECTORY ${root_dir}/${sub})
-            fetch_include_recursively(${root_dir}/${sub})
-        endif()
-    endforeach()
-endfunction()
-if (ANAKIN_FOUND)
-    # Anakin's code style doesn't follow google c style.
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=unused-variable -Wno-error=format-extra-args -Wno-error=comment -Wno-error=format -Wno-error=switch -Wno-error=return-type -Wno-error=non-virtual-dtor -Wno-reorder -Wno-error=cpp")
-    message(STATUS "Anakin for inference is enabled")
-    message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}")
-    fetch_include_recursively(${ANAKIN_INCLUDE})
-    link_directories(${ANAKIN_LIBRARY})
-    nv_library(inference_anakin_api SHARED SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc)
-    target_link_libraries(inference_anakin_api anakin anakin_saber_common)
-    list(APPEND inference_deps inference_anakin_api)
-endif()
 function(inference_api_test TARGET_NAME)
    if (WITH_TESTING)
        set(options "")
@@ -79,7 +40,7 @@ function(inference_api_test TARGET_NAME)
 endfunction(inference_api_test)
 cc_library(paddle_inference_api
-    SRCS paddle_inference_api.cc paddle_inference_api_impl.cc 
+    SRCS paddle_inference_api.cc paddle_inference_api_impl.cc
    DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
 cc_test(test_paddle_inference_api
@@ -89,9 +50,17 @@ cc_test(test_paddle_inference_api
 inference_api_test(test_paddle_inference_api_impl
                    ARGS test_word2vec test_image_classification)
-if (ANAKIN_FOUND)
+if (WITH_ANAKIN)
+    # Because Anakin has no official library releases and its protobuf and CUDA versions do not match Paddle's,
+    # the Anakin library is not merged into our official inference library. To use the Anakin prediction API, one needs
+    # to compile libinference_anakin_api.a and build against anakin.so.
+    nv_library(inference_anakin_api SHARED SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc)
+    target_compile_options(inference_anakin_api BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
+    target_link_libraries(inference_anakin_api anakin anakin_saber_common)
    cc_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc
-    DEPS ${inference_deps})
+                                  ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin
+                                  DEPS inference_anakin_api)
+    target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
 endif()
 if(WITH_TESTING)

--- a/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc
+++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc
@@ -12,9 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include <cuda.h>
 #include "paddle/contrib/inference/paddle_inference_api_anakin_engine.h"
+#include <cuda.h>
 namespace paddle {

--- a/paddle/contrib/inference/paddle_inference_api_anakin_engine.h
+++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine.h
@@ -19,10 +19,9 @@ limitations under the License. */
 #pragma once
-// NOTE This header file do not have namespace.
-//#include <test/framework/net/paddle_api.h>
 #include "paddle/contrib/inference/paddle_inference_api.h"
+// from anakin
 #include "framework/core/net/net.h"
 #include "saber/saber_types.h"

--- a/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc
+++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc
@@ -12,17 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include <gflags/gflags.h>
 #include <glog/logging.h>
 #include <gtest/gtest.h>
-#include "gflags/gflags.h"
 #include "paddle/contrib/inference/paddle_inference_api.h"
+DEFINE_string(model, "", "Directory of the inference model.");
 namespace paddle {
 AnakinConfig GetConfig() {
  AnakinConfig config;
-  config.model_file = "./mobilenet_v2.anakin.bin";
+  config.model_file = FLAGS_model;
  config.device = 0;
  config.max_batch_size = 1;
  return config;

--- a/paddle/contrib/tape/CMakeLists.txt
+++ b/paddle/contrib/tape/CMakeLists.txt
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# On Apple platforms, keep -Wpessimizing-move from being promoted to an error.
+if(APPLE)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move")
+endif()
+# tape_variable: built from variable.cc on top of the Fluid core modules.
+cc_library(tape_variable SRCS variable.cc DEPS ${FLUID_CORE_MODULES})
+# tape: built from tape.cc; additionally depends on the operator libraries.
+cc_library(tape SRCS tape.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB} tape_variable)
+# Unit test for the tape library.
+cc_test(test_tape
+        SRCS test_tape.cc
+        DEPS tape tape_variable)
--- a/paddle/contrib/tape/README.md
+++ b/paddle/contrib/tape/README.md
+# Dynamic Graph on Fluid
+PaddlePaddle Fluid is targeting autodiff without a tape, which, however, is very
+challenging, and we are still a long way from there. DyNet and PyTorch provide a good design
+idea, the *tape*, that significantly eases the challenge.  Also, DyNet provides
+a C++ API that is as convenient as Python but with higher efficiency and could
+conveniently integrate with industrial/production systems. This package, `tape`,
+combines the good of
+1. tape from PyTorch and DyNet
+2. C++ API and core from DyNet
+3. rich set of operators from PaddlePaddle
+## Overview
+We can implement a DyNet-like tape (see this [survey](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/survey/dynamic_graph.md))
+by wrapping Paddle Fluid's `Operator` and `Variable`.
+The user API is straightforward since
+1. it is imperative. And it uses host language's control flow logic.
+1. it avoids extra concepts such as `Scope` and `Executor`.
+All of these benefits come at the cost of just adding one line `reset_global_tape`
+at every iteration.
+## Code Structure
+In short, the `Tape` contains a vector of `OpHandle`s. And an `OpHandle` contains its
+`type`, the pointers to the `Variable`s, and necessary attributes.
+```c++
+class Variable {
+public:
+  VariableHandle Grad(); // returns its gradient variable
+private:
+  framework::VarDesc desc_; // compile time infershape, necessary for lazy execution
+  framework::Variable var_; // run time variable, holds data memory
+};
+using VariableHandle = shared_ptr<Variable>;
+struct OpHandle {
+  string type_;
+  map<string, vector<VariableHandle>> inputs_;
+  map<string, vector<VariableHandle>> outputs_;
+  AttributeMap attrs_;
+};
+class Tape {
+public:
+  void AddOp(OpHandle); // add op
+  void Forward();       // execute the tape_
+  void Backward();      // execute the backward of the tape_
+private:
+  vector<OpHandle> tape_;
+};
+```
+We use `Function` to represent layers. It takes care of parameter
+initialization and calls `AddOp` on the tape when it is invoked.
+```c++
+class Linear {
+ public:
+  Linear(int in_dim, int out_dim, const std::string &act)
+      : w_(new Variable("LinearWeight")),
+        b_(new Variable("LinearBias")),
+        act_(act) {
+    Tape init_tape;
+    std::string initializer = "fill_constant";
+    framework::AttributeMap attrs;
+    attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
+    attrs["shape"] = std::vector<int>{in_dim, out_dim};
+    attrs["value"] = 1.0f;
+    init_tape.AddOp(initializer, {}, {{"Out", {w_}}}, attrs);
+    attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
+    attrs["shape"] = std::vector<int>{out_dim};
+    attrs["value"] = 1.0f;
+    init_tape.AddOp(initializer, {}, {{"Out", {b_}}}, attrs);
+    init_tape.Forward();
+  }
+  VariableHandle operator()(VariableHandle input) {
+    VariableHandle pre_bias(new Variable("linear"));
+    get_global_tape().AddOp("mul",
+                            {{"X", {input}}, {"Y", {w_}}},
+                            {{"Out", {pre_bias}}},
+                            {{"x_num_col_dims", 1}, {"y_num_col_dims", 1}});
+    VariableHandle pre_act(new Variable("linear"));
+    get_global_tape().AddOp("elementwise_add",
+                            {{"X", {pre_bias}}, {"Y", {b_}}},
+                            {{"Out", {pre_act}}},
+                            {{"axis", 1}});
+    VariableHandle post_act(new Variable("linear"));
+    get_global_tape().AddOp(act_,
+                            {{"X", {pre_act}}},
+                            {{"Out", {post_act}}},
+                            {});
+    return post_act;
+  }
+  std::vector<VariableHandle> Params() { return {w_, b_}; }
+ private:
+  VariableHandle w_;
+  VariableHandle b_;
+  std::string act_;
+};
+```
+## User API
+```c++
+// Model function
+paddle::tape::Linear linear1(3, 3, "relu"); // init weight and bias
+paddle::tape::Linear linear2(3, 3, "relu"); // init weight and bias
+paddle::tape::Mean mean;
+// Optimizer
+paddle::tape::SGD sgd(0.001);
+// Data Feeder
+paddle::tape::Fill data_feeder(...);
+VariableHandle input(new paddle::tape::Variable("input"));
+VariableHandle label(new paddle::tape::Variable("label"));
+for (int i = 0; i < 2; ++i) {
+  reset_global_tape();
+  data_feeder(input, label);
+  auto loss = softmax(linear2(linear1(input)), label); // compile time InferShape & InferVarType
+  LOG(INFO) << loss.value(); // Run forward up to loss
+  // Run backward, store gradient of w at w->Grad()
+  get_global_tape().Backward(loss);
+  // Update w
+  sgd(linear1.Params());
+  sgd(linear2.Params());
+}
+```
+<details>
+  <summary></summary>
+digraph G {
+	subgraph cluster_0 {
+                node [shape=record,style=filled];
+		style=filled;
+		color=lightgrey;
+                linear1 [label="{type: mul | {input | {<before_mul1>X: before_mul1 |<weight1> Y: weight1}} |  {output |<before_bias1> Out: before_bias1}}"];
+                elementwise_add1 [label="{type: elementwise_add | {input | {<before_bias1>X: before_bias1 |<bias1> Y: bias1}} |  {output |<before_act1> Out: before_act1}}"];
+                relu1 [label="{type: relu | {input | {<before_act1>X: before_act1 }} |  {output |<after_act1> Out: after_act1}}"];
+		linear1 -> elementwise_add1->relu1;
+		label = "forward tape";
+	}
+        linear1:before_mul1->before_mul1
+        linear1:weight1->weight1
+        linear1:before_bias1->before_bias1
+        elementwise_add1:bias1->bias1
+        elementwise_add1:before_bias1->before_bias1
+        elementwise_add1:before_act1->before_act1
+        relu1:before_act1->before_act1
+        relu1:after_act1->after_act1
+	subgraph cluster_1 {
+                node [shape=record,style=filled];
+		style=filled;
+		color=lightgrey;
+                linear1_grad [label="{type: mul_grad | {input | {<before_mul1>X: before_mul1 |<weight1> Y: weight1|<before_bias1_grad> Out_grad: before_bias1_grad}} |  {output |{<before_mul1_grad>X_grad: before_mul1_grad |<weight1_grad> Y_grad: weight1_grad}}}"];
+                elementwise_add1_grad [label="{type: elementwise_add_grad | {input | <before_act1_grad> Out_grad: before_act1_grad} |  {output |{<before_bias1_grad>X_grad: before_bias1_grad |<bias1_grad> Y_grad: bias1_grad}}}"];
+                relu1_grad [label="{type: relu_grad |  {input |<after_act1_grad> Out_grad: after_act1_grad} | {output | {<before_act1_grad>X_grad: before_act1_grad }}}"];
+		linear1_grad -> elementwise_add1_grad ->relu1_grad [dir=back];
+                label = "backward tape";
+	}
+        relu1_grad:after_act1_grad->after_act1_grad
+        relu1_grad:before_act1_grad->before_act1_grad
+        elementwise_add1_grad:before_act1_grad->before_act1_grad
+        elementwise_add1_grad:before_bias1_grad->before_bias1_grad
+        elementwise_add1_grad:bias1_grad->bias1_grad
+        linear1_grad:before_mul1->before_mul1
+        linear1_grad:weight1->weight1
+        linear1_grad:before_bias1_grad->before_bias1_grad
+        linear1_grad:before_mul1_grad->before_mul1_grad
+        linear1_grad:weight1_grad->weight1_grad
+	subgraph cluster_2 {
+                node [shape=record];
+                label = "Linear1";
+                weight1
+                bias1
+	}
+        weight1 -> weight1_grad [ label="Grad()", style="dashed" ];
+        bias1 -> bias1_grad [ label="Grad()", style="dashed"];
+}
+</details>
+![Image](https://github.com/tonyyang-svail/Paddle/blob/cpp_tap/paddle/contrib/tape/computation_graph.png)
+## Code Reuse
+We want to stay close to Paddle Fluid as much as possible.
+### Reuse All Operators
+As all Ops are registered at `OpInfoMap`, the effort of adding a new `Function`
+is about 10 lines of code, similar to exposing an operator to Python.
+### Reuse Compile Time InferShape and InferVarType
+Because all the symbolic information is stored in `tape::Variable::desc_` instead
+of `ProgramDesc.block.vars`, we create a temporary `BlockDesc` to do `InferShape` and
+`InferVarType` every time we `AddOp` to the tape.
+### Reuse Operator::Run
+We use smart pointers, instead of `Scope`, to manage memory, so we create a temporary
+`Scope` for every `Operator::Run()`.
+## Possible Feature
+### Release Memory on Backward
+We can release memory aggressively. During backward, we can delete the OpHandle once
+we have finished its backward pass. Since every variable is managed by a smart pointer, its
+memory is automatically released when its `ref_count` goes to 0.
+### Kernel Fusion
+As a symbolic representation of the Tape is constructed first before the actual
+execution, it would be possible to perform graph optimization. One use case is kernel
+fusion.
--- a/paddle/contrib/tape/computation_graph.png
+++ b/paddle/contrib/tape/computation_graph.png
--- a/paddle/contrib/tape/function.h
+++ b/paddle/contrib/tape/function.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <string>
+#include "paddle/contrib/tape/tape.h"
+#include "paddle/contrib/tape/variable.h"
+#include "paddle/fluid/framework/type_defs.h"
+namespace paddle {
+namespace tape {
+// Empty placeholder type; none of the callables declared in this header
+// derive from it.
+class Function {};
+// Callable that records an initializer operator on the global tape.
+class Fill {
+ public:
+  // `initializer` is the operator type to record; `attrs` are the attributes
+  // handed to that operator on every call.
+  Fill(const std::string &initializer, const framework::AttributeMap &attrs)
+      : initializer_(initializer), attrs_(attrs) {}
+  // Appends the initializer op to the global tape with no inputs and `var`
+  // bound to its "Out" slot.
+  void operator()(VariableHandle var) {
+    const VariableHandleMap no_inputs;
+    VariableHandleMap outputs;
+    outputs["Out"].push_back(var);
+    get_global_tape().AddOp(initializer_, no_inputs, outputs, attrs_);
+  }
+ private:
+  const std::string initializer_;
+  const framework::AttributeMap attrs_;
+};
+// Callable that records a "mean" operator on the global tape.
+class Mean {
+ public:
+  // Creates a fresh output variable, records mean(X = var) -> Out on the
+  // global tape and returns the output handle.
+  VariableHandle operator()(VariableHandle var) {
+    VariableHandle result(new Variable("mean"));
+    VariableHandleMap inputs;
+    inputs["X"].push_back(var);
+    VariableHandleMap outputs;
+    outputs["Out"].push_back(result);
+    get_global_tape().AddOp(
+        "mean", inputs, outputs, framework::AttributeMap());
+    return result;
+  }
+};
+// Layer recorded as three ops on the global tape: "mul" with the weight,
+// "elementwise_add" with the bias, then the operator named by `act`.
+class Linear {
+ public:
+  // Creates the weight (shape {in_dim, out_dim}) and bias (shape {out_dim})
+  // and fills both with the constant 1.0 by running a private tape.
+  Linear(int in_dim, int out_dim, const std::string &act)
+      : w_(new Variable("LinearWeight")),
+        b_(new Variable("LinearBias")),
+        act_(act) {
+    const std::string fill_op = "fill_constant";
+    framework::AttributeMap weight_attrs;
+    weight_attrs["dtype"] =
+        paddle::framework::proto::VarType::Type::VarType_Type_FP32;
+    weight_attrs["shape"] = std::vector<int>{in_dim, out_dim};
+    weight_attrs["value"] = 1.0f;
+    // The bias uses the same dtype and value; only the shape differs.
+    framework::AttributeMap bias_attrs = weight_attrs;
+    bias_attrs["shape"] = std::vector<int>{out_dim};
+    Tape setup_tape;
+    setup_tape.AddOp(fill_op, {}, {{"Out", {w_}}}, weight_attrs);
+    setup_tape.AddOp(fill_op, {}, {{"Out", {b_}}}, bias_attrs);
+    setup_tape.Forward();
+  }
+  // Records the layer's ops for `input` on the global tape and returns the
+  // handle of the activation output.
+  VariableHandle operator()(VariableHandle input) {
+    Tape &tape = get_global_tape();
+    VariableHandle product(new Variable("linear"));
+    tape.AddOp("mul",
+               {{"X", {input}}, {"Y", {w_}}},
+               {{"Out", {product}}},
+               {{"x_num_col_dims", 1}, {"y_num_col_dims", 1}});
+    VariableHandle biased(new Variable("linear"));
+    tape.AddOp("elementwise_add",
+               {{"X", {product}}, {"Y", {b_}}},
+               {{"Out", {biased}}},
+               {{"axis", 1}});
+    VariableHandle activated(new Variable("linear"));
+    tape.AddOp(act_, {{"X", {biased}}}, {{"Out", {activated}}}, {});
+    return activated;
+  }
+  // Trainable parameters, in the order {weight, bias}.
+  std::vector<VariableHandle> Params() { return {w_, b_}; }
+ private:
+  VariableHandle w_;
+  VariableHandle b_;
+  std::string act_;
+};
+// Applies an "sgd" operator update to one parameter at a time.
+class SGD {
+ public:
+  // Stores `learning_rate` in a one-element FP32 variable by running a
+  // private tape with a single fill_constant op. The constructor is explicit
+  // so a bare float is never silently converted into an optimizer.
+  explicit SGD(float learning_rate) : learning_rate_(new Variable("sgd")) {
+    Tape init_tape;
+    std::string initializer = "fill_constant";
+    framework::AttributeMap attrs;
+    attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
+    attrs["shape"] = std::vector<int>{1};
+    attrs["value"] = learning_rate;
+    init_tape.AddOp(initializer, {}, {{"Out", {learning_rate_}}}, attrs);
+    init_tape.Forward();
+  }
+  // Updates `input` in place from `input->Grad()`. Enforces that the global
+  // tape has already run Backward(), then executes the update immediately on
+  // a temporary tape.
+  void operator()(VariableHandle input) {
+    PADDLE_ENFORCE(get_global_tape().HasBeenBackwarded(),
+                   "optimization must happen after the backward");
+    Tape temp_tape;
+    temp_tape.AddOp("sgd",
+                    {{"Param", {input}},
+                     {"LearningRate", {learning_rate_}},
+                     {"Grad", {input->Grad()}}},
+                    {{"ParamOut", {input}}},
+                    {});
+    temp_tape.Forward();
+  }
+ private:
+  VariableHandle learning_rate_;
+};
+}
+}
--- a/paddle/contrib/tape/tape.cc
+++ b/paddle/contrib/tape/tape.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/contrib/tape/tape.h"
+#include <list>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/dim.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/pybind/pybind.h"
+namespace paddle {
+namespace tape {
+// borrowed from
+// https://stackoverflow.com/questions/874134/find-if-string-ends-with-another-string-in-c
+// Returns true when `value` ends with `ending`; an empty `ending` always
+// matches. Implemented with std::string::compare so the helper needs nothing
+// beyond <string>.
+inline bool ends_with(std::string const &value, std::string const &ending) {
+  if (ending.size() > value.size()) return false;
+  return value.compare(
+             value.size() - ending.size(), ending.size(), ending) == 0;
+}
+// Streams a VarDesc as name[type][data_type]{dim,dim,...,} for logging.
+std::ostream &operator<<(std::ostream &os, const framework::VarDesc &var_desc) {
+  os << var_desc.Name();
+  os << "[" << var_desc.GetType() << "]";
+  os << "[" << var_desc.GetDataType() << "]";
+  os << "{";
+  // Every dimension is followed by a comma, including the last one.
+  for (const auto &dim : var_desc.GetShape()) {
+    os << dim << ",";
+  }
+  return os << "}";
+}
+// Renders an op as "<type> <param>:(<var desc>) ..." with inputs listed
+// before outputs. `attrs` is accepted but not included in the text.
+std::string to_string(const std::string &type,
+                      const VariableHandleMap &in_vars,
+                      const VariableHandleMap &out_vars,
+                      const framework::AttributeMap &attrs) {
+  std::stringstream out;
+  out << type << " ";
+  for (const VariableHandleMap *slots : {&in_vars, &out_vars}) {
+    for (const auto &slot : *slots) {
+      for (const auto &handle : slot.second) {
+        out << slot.first << ":(" << handle->Desc() << ") ";
+      }
+    }
+  }
+  return out.str();
+}
+// Builds an OpDesc whose input/output name maps mirror the given handle
+// maps, replacing every variable handle by its name.
+framework::OpDesc CreateOpDesc(const std::string &type,
+                               const VariableHandleMap &in_vars,
+                               const VariableHandleMap &out_vars,
+                               const framework::AttributeMap &attrs) {
+  // A parameter appears in the result only if it has at least one variable.
+  auto to_names = [](const VariableHandleMap &handles) {
+    framework::VariableNameMap names;
+    for (const auto &slot : handles) {
+      for (const auto &handle : slot.second) {
+        names[slot.first].emplace_back(handle->Name());
+      }
+    }
+    return names;
+  };
+  framework::VariableNameMap inputs = to_names(in_vars);
+  framework::VariableNameMap outputs = to_names(out_vars);
+  return framework::OpDesc(type, inputs, outputs, attrs);
+}
+// Runs compile-time InferShape/InferVarType for one op inside a throwaway
+// block and writes the inferred descriptions back into `out_vars`.
+void InferShapeAndVarType(const std::string &type,
+                          const VariableHandleMap &in_vars,
+                          VariableHandleMap *out_vars,
+                          const framework::AttributeMap &attrs) {
+  framework::OpDesc op_desc = CreateOpDesc(type, in_vars, *out_vars, attrs);
+  // Temporary program/block that exists only for this inference pass.
+  framework::ProgramDesc program_desc;
+  framework::BlockDesc *block_desc = program_desc.MutableBlock(0);
+  PADDLE_ENFORCE(block_desc);
+  // Copy each variable's description into the block (inputs, then outputs).
+  auto publish = [block_desc](const VariableHandleMap &handles) {
+    for (auto &slot : handles) {
+      for (auto &handle : slot.second) {
+        *block_desc->Var(handle->Name())->Proto() =
+            *handle->MutableDesc()->Proto();
+      }
+    }
+  };
+  publish(in_vars);
+  publish(*out_vars);
+  LOG(INFO) << "- " << to_string(type, in_vars, *out_vars, attrs);
+  op_desc.InferShape(*block_desc);
+  op_desc.InferVarType(block_desc);
+  // Pull the inferred descriptions back into the output handles.
+  for (auto &slot : *out_vars) {
+    for (auto &handle : slot.second) {
+      *handle->MutableDesc()->Proto() =
+          *block_desc->Var(handle->Name())->Proto();
+    }
+  }
+  LOG(INFO) << "+ " << to_string(type, in_vars, *out_vars, attrs);
+}
+// Infers the output descriptions for the op, then appends it to the tape.
+// `out_vars` is taken by value; its handles are shared with the caller.
+void Tape::AddOp(const std::string &type,
+                 const VariableHandleMap &in_vars,
+                 VariableHandleMap out_vars,
+                 const framework::AttributeMap &attrs) {
+  InferShapeAndVarType(type, in_vars, &out_vars, attrs);
+  tape_.push_back(OpHandle(type, in_vars, out_vars, attrs));
+}
+// Temporary Scope for Operator::Run()
+// Exposes the variables of one op to the operator under their tape names.
+// NOTE(review): `vars_` is not declared here; presumably it is the
+// name -> owning-pointer map inherited from framework::Scope — confirm.
+class ScopeWrapper : public framework::Scope {
+ public:
+  // Registers every input and output variable under its name. A name that is
+  // already present is left untouched, so the first registration wins.
+  ScopeWrapper(const VariableHandleMap &in_vars,
+               const VariableHandleMap &out_vars) {
+    for (auto &v : in_vars) {
+      for (auto &vv : v.second) {
+        if (!vars_.count(vv->Name())) {
+          // Stores the raw pointer returned by Var(); ownership is given up
+          // again in the destructor.
+          vars_[vv->Name()].reset(vv->Var());
+        }
+      }
+    }
+    for (auto &v : out_vars) {
+      for (auto &vv : v.second) {
+        if (!vars_.count(vv->Name())) {
+          vars_[vv->Name()].reset(vv->Var());
+        }
+      }
+    }
+  }
+  // Releases every stored pointer without deleting it, so the variables are
+  // not destroyed together with this wrapper.
+  ~ScopeWrapper() {
+    for (auto &pair : vars_) {
+      pair.second.release();
+    }
+  }
+};
+// Executes every op that has not been run yet on CPU. Must not be called
+// after Backward() has completed on this tape.
+void Tape::Forward() {
+  LOG(INFO) << "Starting forward -------------------------";
+  PADDLE_ENFORCE(!has_been_backwarded_);
+  // current_position_ is a member, so a later call resumes after the ops that
+  // were already executed.
+  for (; current_position_ < tape_.size(); ++current_position_) {
+    OpHandle &pending = tape_[current_position_];
+    // Create Output Tensor, this is only necessary for OpWithKernel
+    for (auto &slot : pending.outputs_) {
+      for (auto &handle : slot.second) {
+        handle->InitializeVariable();
+      }
+    }
+    framework::OpDesc op_desc = CreateOpDesc(
+        pending.type_, pending.inputs_, pending.outputs_, pending.attrs_);
+    ScopeWrapper scope(pending.inputs_, pending.outputs_);
+    framework::OpRegistry::CreateOp(op_desc)->Run(scope, platform::CPUPlace());
+  }
+  LOG(INFO) << "Finishing forward -------------------------";
+}
+// Runs the pending forward ops, then builds a gradient tape by walking the
+// forward tape in reverse and executes it. The gradient of `target` is
+// seeded with a one-element FP32 constant 1.0. May be called once per tape.
+void Tape::Backward(VariableHandle target) {
+  PADDLE_ENFORCE(!has_been_backwarded_);
+  Forward();
+  // TODO(tonyyang-svail): check output of last op is target
+  backward_tape_.reset(new Tape());
+  framework::AttributeMap attrs;
+  // FIXME(tonyyang-svail): Need to infer_data_type
+  attrs["dtype"] = framework::proto::VarType::Type::VarType_Type_FP32;
+  attrs["shape"] = std::vector<int>{1};
+  attrs["value"] = 1.0f;
+  backward_tape_->AddOp(
+      "fill_constant", {}, {{"Out", {target->Grad()}}}, attrs);
+  for (auto it = tape_.rbegin(); it != tape_.rend(); ++it) {
+    framework::OpDesc op_desc =
+        CreateOpDesc(it->type_, it->inputs_, it->outputs_, it->attrs_);
+    std::unordered_map<std::string, std::string> grad_to_var;
+    std::vector<std::unique_ptr<framework::OpDesc>> grad_op_descs =
+        framework::OpInfoMap::Instance()
+            .Get(op_desc.Type())
+            .GradOpMaker()(op_desc, {}, &grad_to_var, {});
+    // Name -> handle lookup for the forward op. It depends only on the
+    // forward op, so it is built once and shared by all of its grad ops.
+    std::unordered_map<std::string, VariableHandle> name2var;
+    for (auto &param2vars : it->inputs_) {
+      for (auto &a : param2vars.second) {
+        name2var[a->Name()] = a;
+      }
+    }
+    for (auto &param2vars : it->outputs_) {
+      for (auto &a : param2vars.second) {
+        name2var[a->Name()] = a;
+      }
+    }
+    for (auto &grad_op_desc : grad_op_descs) {
+      VariableHandleMap in_vars;
+      VariableHandleMap out_vars;
+      // Resolve inputs first, then outputs, in a fixed order.
+      const std::pair<const framework::VariableNameMap *, VariableHandleMap *>
+          loop_over[] = {{&grad_op_desc->Inputs(), &in_vars},
+                         {&grad_op_desc->Outputs(), &out_vars}};
+      for (auto &each : loop_over) {
+        auto &vmp = *each.first;
+        auto &vhm = *each.second;
+        for (auto &p2a : vmp) {
+          for (auto &argu : p2a.second) {
+            auto known = name2var.find(argu);
+            if (known != name2var.end()) {
+              // A forward variable reused directly by the grad op.
+              vhm[p2a.first].push_back(known->second);
+              continue;
+            }
+            // Otherwise the argument must be "<forward name><grad suffix>";
+            // map it to the gradient handle of that forward variable.
+            PADDLE_ENFORCE(ends_with(argu, framework::kGradVarSuffix),
+                           argu.c_str());
+            std::string name = argu.substr(
+                0, argu.size() - std::strlen(framework::kGradVarSuffix));
+            auto base = name2var.find(name);
+            PADDLE_ENFORCE(base != name2var.end(), name.c_str());
+            vhm[p2a.first].push_back(base->second->Grad());
+          }
+        }
+      }
+      backward_tape_->AddOp(grad_op_desc->Type(),
+                            in_vars,
+                            out_vars,
+                            grad_op_desc->GetAttrMap());
+    }
+    // TODO(tonyyang-svail): how to fill empty grad?
+    // TODO(tonyyang-svail): Sum var grad is necessary
+  }
+  backward_tape_->Forward();
+  has_been_backwarded_ = true;
+}
+// Returns the process-wide tape, created on first use.
+Tape &get_global_tape() {
+  static Tape global_tape;
+  return global_tape;
+}
+// Replaces the global tape with a freshly constructed, empty one.
+void reset_global_tape() {
+  Tape &global_tape = get_global_tape();
+  global_tape = Tape();
+}
+}
+}
--- a/paddle/contrib/tape/tape.h
+++ b/paddle/contrib/tape/tape.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+#include "paddle/contrib/tape/variable.h"
+namespace paddle {
+namespace tape {
+using VariableHandleMap = std::map<std::string, std::vector<VariableHandle>>;
+// One recorded operator: its type, the variable handles bound to each input
+// and output parameter name, and its attributes.
+struct OpHandle {
+  // Copies all four arguments into the handle.
+  OpHandle(const std::string &type,
+           const VariableHandleMap &in_vars,
+           const VariableHandleMap &out_vars,
+           const framework::AttributeMap &attrs)
+      : type_(type), inputs_(in_vars), outputs_(out_vars), attrs_(attrs) {}
+  std::string type_;               // operator type name
+  VariableHandleMap inputs_;       // parameter name -> input variables
+  VariableHandleMap outputs_;      // parameter name -> output variables
+  framework::AttributeMap attrs_;  // operator attributes
+};
+// Records operators as they are added and replays them on demand.
+class Tape {
+ public:
+  // Infers the output descriptions of the op and appends it to the tape.
+  void AddOp(const std::string &type,
+             const VariableHandleMap &in_vars,
+             VariableHandleMap out_vars,
+             const framework::AttributeMap &attrs);
+  // Runs every op that has not been executed yet.
+  void Forward();
+  // Runs Forward(), then builds and runs the gradient tape seeded at `target`.
+  void Backward(VariableHandle target);
+  // True once Backward() has completed on this tape. Const because it only
+  // reads state, which also lets it be called through a const reference.
+  bool HasBeenBackwarded() const { return has_been_backwarded_; }
+ private:
+  bool has_been_backwarded_ = false;
+  size_t current_position_ = 0;  // index of the next op Forward() will run
+  std::vector<OpHandle> tape_;
+  std::shared_ptr<Tape> backward_tape_;  // created by Backward()
+};
+Tape &get_global_tape();
+void reset_global_tape();
+}
+}
--- a/paddle/contrib/tape/test_tape.cc
+++ b/paddle/contrib/tape/test_tape.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "gtest/gtest.h"
+#include "paddle/contrib/tape/function.h"
+using namespace paddle::tape;
+// Smoke test: two training iterations of a two-layer MLP with a mean loss.
+// It contains no value assertions; it only checks that the sequence runs.
+TEST(Tape, TestMLP) {
+  LOG(INFO) << "TestMLP";
+  Linear linear1(3, 3, "relu");
+  Linear linear2(3, 3, "relu");
+  Mean mean;
+  SGD sgd(0.001);
+  // The input is a 3x3 FP32 tensor filled with 1.0.
+  std::string initializer = "fill_constant";
+  paddle::framework::AttributeMap attrs;
+  attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
+  attrs["shape"] = std::vector<int>{3, 3};
+  attrs["value"] = 1.0f;
+  Fill filler(initializer, attrs);
+  for (int i = 0; i < 2; ++i) {
+    // Start each iteration from an empty global tape.
+    reset_global_tape();
+    VariableHandle input(new Variable("input"));
+    filler(input);
+    auto loss = mean(linear2(linear1(input)));
+    get_global_tape().Backward(loss);
+    // SGD enforces that Backward() has run, so the updates come last.
+    for (auto w : linear1.Params()) {
+      sgd(w);
+    }
+    for (auto w : linear2.Params()) {
+      sgd(w);
+    }
+  }
+}
+// Custom gtest entry point: initializes the DeviceContextPool with a single
+// CPU place before any test runs.
+int main(int argc, char** argv) {
+  std::vector<paddle::platform::Place> places;
+  places.emplace_back(paddle::platform::CPUPlace());
+  paddle::platform::DeviceContextPool::Init(places);
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
--- a/paddle/contrib/tape/variable.cc
+++ b/paddle/contrib/tape/variable.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/contrib/tape/variable.h"
+namespace paddle {
+namespace tape {
+// Instantiates the runtime holder that matches the described variable type.
+// Only LOD_TENSOR and SELECTED_ROWS are supported; any other type throws.
+void Variable::InitializeVariable() {
+  LOG(INFO) << "Initialzing " << desc_.Name() << " as " << desc_.GetType();
+  const framework::proto::VarType::Type kind = desc_.GetType();
+  if (kind == framework::proto::VarType::LOD_TENSOR) {
+    var_.GetMutable<framework::LoDTensor>();
+    return;
+  }
+  if (kind == framework::proto::VarType::SELECTED_ROWS) {
+    var_.GetMutable<framework::SelectedRows>();
+    return;
+  }
+  PADDLE_THROW("Variable type %d is not in [LOD_TENSOR, SELECTED_ROWS]",
+               kind);
+}
+}
+}
--- a/paddle/contrib/tape/variable.h
+++ b/paddle/contrib/tape/variable.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <memory>
+#include "paddle/fluid/framework/operator.h"  // framework::kGradVarSuffix
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/variable.h"
+namespace paddle {
+namespace tape {
+class Variable;
+using VariableHandle = std::shared_ptr<Variable>;
+/*
+ * Combination of
+ *     framework::VarDesc desc_;   compile-time description (name, type, shape)
+ *     framework::Variable var_;   run-time holder of the actual data
+ */
+class Variable {
+ public:
+  // Creates a variable named `pre_fix` followed by a unique counter value.
+  // Explicit so a string is never silently converted into a Variable.
+  explicit Variable(const std::string &pre_fix)
+      : desc_(pre_fix + std::to_string(count())) {}
+  // With `is_grad` set, the name is `pre_fix` plus the gradient suffix and no
+  // counter value is consumed; otherwise it behaves like the constructor
+  // above.
+  Variable(const std::string &pre_fix, bool is_grad)
+      : desc_(pre_fix + (is_grad ? framework::kGradVarSuffix
+                                 : std::to_string(count()))) {}
+  ~Variable() { LOG(INFO) << "Deleting " << Name(); }
+  // Instantiate LoDTensor/SelectedRow
+  void InitializeVariable();
+  // Returns the gradient variable, creating it on first use. Only a weak
+  // reference is kept here, so the gradient lives as long as callers hold
+  // the returned handle; a new one is created if it has already expired.
+  VariableHandle Grad() {
+    VariableHandle grad = grad_.lock();
+    if (!grad) {
+      grad.reset(new Variable(desc_.Name(), true));
+      grad_ = grad;
+    }
+    return grad;
+  }
+  // Stochastic Gradient Descent with Momentum
+  //  VariableHandle Momentum ();
+  //  void init(const std::string& initializer,
+  //            const framework::AttributeMap& attrs);
+  // void value() {};
+  const framework::VarDesc& Desc() const { return desc_; }
+  framework::VarDesc* MutableDesc() { return &desc_; }
+  // TODO(tonyyang-svail): No need to expose name
+  std::string Name() const { return desc_.Name(); }
+  framework::Variable* Var() { return &var_; }
+ private:
+  // Returns a process-wide increasing id. Static because it touches no
+  // instance state and is called from member initializers.
+  static int count() {
+    static int counter = 0;
+    return counter++;
+  }
+  framework::VarDesc desc_;
+  framework::Variable var_;
+  std::weak_ptr<Variable> grad_;
+};
+}
+}
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -84,7 +84,7 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
 cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
 if(WITH_DISTRIBUTE)
-  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr)
+  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr)
  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
  set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 else()

--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -330,8 +330,12 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
  }
  for (auto& op : ctx->ops_) {
-    VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);
+    VLOG(4) << place_ << " " << op->DebugStringEx(local_scope);
    op->Run(*local_scope, place_);
+    // NOTE! Please do not delete this line. It is useful because the debug
+    // string differs before and after op.run: after the run the outputs
+    // have their right shapes, which is useful for debugging.
+    VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);
    if (FLAGS_benchmark) {
      VLOG(2) << "Memory used after operator " + op->Type() + " running: "

--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -69,6 +69,19 @@ static DDim GetDims(const Scope& scope, const std::string& name,
  }
 }
+// Returns the number of entries in rows() of the SelectedRows variable called
+// `name` in `scope`. Returns -1 when no such variable is found or when the
+// variable does not hold a SelectedRows, so callers can use a negative result
+// to mean "not applicable".
+static int GetRowSize(const Scope& scope, const std::string& name) {
+  Variable* found = scope.FindVar(name);
+  if (found != nullptr && found->IsType<SelectedRows>()) {
+    return found->Get<SelectedRows>().rows().size();
+  }
+  return -1;
+}
 static LoD GetLoD(const Scope& scope, const std::string& name) {
  Variable* var = scope.FindVar(name);
  auto default_lod = LoD({{}});
@@ -85,6 +98,7 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
 }
 void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
+  VLOG(10) << "- " << DebugStringEx(&scope);
  if (platform::is_gpu_place(place)) {
 #ifndef PADDLE_WITH_CUDA
    PADDLE_THROW("Cannot run operator on place %s", place);
@@ -94,6 +108,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
 #endif
  }
  RunImpl(scope, place);
+  VLOG(10) << "+ " << DebugStringEx(&scope);
 }
 bool OperatorBase::HasInputs(const std::string& name) const {
@@ -153,6 +168,10 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
    for (size_t i = 0; i < input.second.size(); ++i) {
      ss << input.second[i];
      if (scope) {
+        int row_size = GetRowSize(*scope, input.second[i]);
+        if (row_size >= 0) {
+          ss << "[row_size=" << row_size << "]";
+        }
        ss << "[" << GetDims(*scope, input.second[i], true) << "]";
        ss << "(" << GetLoD(*scope, input.second[i]) << ")";
      }
@@ -173,6 +192,10 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
    for (size_t i = 0; i < output.second.size(); ++i) {
      ss << output.second[i];
      if (scope) {
+        int row_size = GetRowSize(*scope, output.second[i]);
+        if (row_size >= 0) {
+          ss << "[row_size=" << row_size << "]";
+        }
        ss << "[" << GetDims(*scope, output.second[i], true) << "]";
        ss << "(" << GetLoD(*scope, output.second[i]) << ")";
      }

--- a/paddle/fluid/framework/reader.h
+++ b/paddle/fluid/framework/reader.h
@@ -35,14 +35,15 @@ class ReaderBase {
 class DecoratedReader : public ReaderBase {
 public:
-  explicit DecoratedReader(ReaderBase* reader) : ReaderBase(), reader_(reader) {
+  explicit DecoratedReader(const std::shared_ptr<ReaderBase>& reader)
+      : ReaderBase(), reader_(reader) {
    PADDLE_ENFORCE_NOT_NULL(reader_);
  }
  void ReInit() override { reader_->ReInit(); }
 protected:
-  ReaderBase* reader_;
+  std::shared_ptr<ReaderBase> reader_;
 };
 class FileReader : public ReaderBase {
@@ -64,7 +65,7 @@ class ReaderHolder {
 public:
  void Reset(ReaderBase* reader) { reader_.reset(reader); }
-  ReaderBase* Get() const { return reader_.get(); }
+  std::shared_ptr<ReaderBase> Get() const { return reader_; }
  void ReadNext(std::vector<LoDTensor>* out) {
    PADDLE_ENFORCE_NOT_NULL(reader_);
@@ -76,7 +77,7 @@ class ReaderHolder {
  }
 private:
-  std::unique_ptr<ReaderBase> reader_;
+  std::shared_ptr<ReaderBase> reader_;
 };
 }  // namespace framework

--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -81,6 +81,9 @@ class Scope {
  // Rename variable to a new name and return the new name
  std::string Rename(const std::string& origin_name) const;
+ protected:
+  mutable std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
 private:
  // Call Scope::NewScope for a sub-scope.
  explicit Scope(Scope const* parent) : parent_(parent) {}
@@ -93,8 +96,6 @@ class Scope {
  // Caller doesn't own the returned Variable.
  Variable* FindVarLocally(const std::string& name) const;
-  mutable std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
  // Scope in `kids_` are owned by this class.
  mutable std::list<Scope*> kids_;
  Scope const* parent_{nullptr};

--- a/paddle/fluid/operators/batch_norm_mkldnn_op.cc
+++ b/paddle/fluid/operators/batch_norm_mkldnn_op.cc
@@ -19,10 +19,17 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-using Tensor = framework::Tensor;
+using batch_norm_bwd = mkldnn::batch_normalization_backward;
+using batch_norm_fwd = mkldnn::batch_normalization_forward;
+using framework::DataLayout;
+using framework::Tensor;
+using mkldnn::memory;
+using mkldnn::primitive;
+using mkldnn::reorder;
+using mkldnn::stream;
 using paddle::platform::MKLDNNDeviceContext;
 using paddle::platform::MKLDNNMemDesc;
-using mkldnn::memory;
+using platform::to_void_cast;
 template <typename T>
 using EigenArrayMap =
@@ -64,21 +71,12 @@ void run_batch_norm_op(Args &&... args) {
  mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
 }
-template <typename T>
-inline void *cast_const_to_void(const T *t) {
-  return static_cast<void *>(const_cast<T *>(t));
-}
 }  // namespace
 template <typename T>
 class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto data_layout_str = ctx.Attr<std::string>("data_layout");
-    auto data_layout = framework::StringToDataLayout(data_layout_str);
-    PADDLE_ENFORCE(data_layout == framework::DataLayout::kNCHW,
-                   "MKLDNN batch normalization handles only NCHW data layout");
    const float epsilon = ctx.Attr<float>("epsilon");
    const float momentum = ctx.Attr<float>("momentum");
    const bool is_test = ctx.Attr<bool>("is_test");
@@ -99,41 +97,53 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    const auto *scale = ctx.Input<Tensor>("Scale");
    const auto *shift = ctx.Input<Tensor>("Bias");
-    y->mutable_data<T>(ctx.GetPlace());
+    PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN &&
-    mean_out->mutable_data<T>(ctx.GetPlace());
+                       x->format() != memory::format::format_undef,
-    variance_out->mutable_data<T>(ctx.GetPlace());
+                   "Wrong layout/format set for Input x tensor");
+    const T *x_data = x->data<T>();
+    const T *mean_data = mean->data<T>();
+    const T *variance_data = variance->data<T>();
+    T *y_data = y->mutable_data<T>(ctx.GetPlace());
+    T *mean_out_data = mean_out->mutable_data<T>(ctx.GetPlace());
+    T *variance_out_data = variance_out->mutable_data<T>(ctx.GetPlace());
+    T *batch_mean_data = nullptr;
+    T *batch_variance_data = nullptr;
    if (!is_test) {
-      batch_mean->mutable_data<T>(ctx.GetPlace());
+      batch_mean_data = batch_mean->mutable_data<T>(ctx.GetPlace());
-      batch_variance->mutable_data<T>(ctx.GetPlace());
+      batch_variance_data = batch_variance->mutable_data<T>(ctx.GetPlace());
    }
    auto propagation = is_test == true ? mkldnn::prop_kind::forward_scoring
                                       : mkldnn::prop_kind::forward_training;
-    auto dims = paddle::framework::vectorize2int(x->dims());
+    auto src_tz = paddle::framework::vectorize2int(x->dims());
+    auto scale_tz = paddle::framework::vectorize2int(scale->dims());
-    auto src_md =
+    PADDLE_ENFORCE(scale_tz.size() == 1, "Dims of scale tensor is NOT 1");
-        MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
+    const unsigned int ic = scale_tz[0];
-    auto dst_md =
-        MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
-    auto src_pd = mkldnn::memory::primitive_desc{src_md, mkldnn_engine};
-    auto dst_pd = mkldnn::memory::primitive_desc{dst_md, mkldnn_engine};
-    auto src = mkldnn::memory{src_pd, cast_const_to_void(x->data<T>())};
-    auto dst = mkldnn::memory{dst_pd, y->data<T>()};
    unsigned flags = mkldnn::use_scale_shift;
    if (is_test) flags |= mkldnn::use_global_stats;
+    // create mkldnn memory from input x tensor
+    auto src_memory =
+        memory({{{src_tz}, memory::data_type::f32, x->format()}, mkldnn_engine},
+               to_void_cast(x_data));
+    // create primitive descriptor for batch norm forward
    using bn_fwd_types = bn_type_traits<mkldnn::batch_normalization_forward>;
-    auto batch_norm_fwd_desc =
+    auto batch_norm_fwd_desc = bn_fwd_types::op_desc{
-        bn_fwd_types::op_desc{propagation, src_md, epsilon, flags};
+        propagation, src_memory.get_primitive_desc().desc(), epsilon, flags};
-    auto batch_norm_fwd_pd =
+    std::shared_ptr<batch_norm_fwd::primitive_desc> batch_norm_fwd_pd =
-        bn_fwd_types::op_prim{batch_norm_fwd_desc, mkldnn_engine};
+        std::shared_ptr<batch_norm_fwd::primitive_desc>(
+            new batch_norm_fwd::primitive_desc(batch_norm_fwd_desc,
+                                               mkldnn_engine));
-    const unsigned int ic = dims[1];
+    // Save the pd to be used in backward pass
+    const std::string key = ctx.op().Output("SavedMean");
+    const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd";
+    dev_ctx.SetBlob(key_batch_norm_fwd_pd, batch_norm_fwd_pd);
    // MKLDNN requires a single piece of memory for scale and shift/bias data
    const size_t scaleshift_size = 2 * ic;
@@ -143,73 +153,58 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    copy_to_weights(scale->data<T>(), scale->data<T>() + ic, shift->data<T>(),
                    shift->data<T>() + ic, &scaleshift_data);
-    auto scaleshift_memory = mkldnn::memory{
+    // create mkldnn memory for weights (scale/shift)
-        batch_norm_fwd_pd.weights_primitive_desc(), scaleshift_data.data()};
+    auto scaleshift_memory = memory(batch_norm_fwd_pd->weights_primitive_desc(),
+                                    scaleshift_data.data());
-    if (is_test) {
+    // create mkldnn memory for output y tensor
-      auto mean_memory = mkldnn::memory{batch_norm_fwd_pd.mean_primitive_desc(),
+    auto dst_memory = memory(batch_norm_fwd_pd->dst_primitive_desc(), y_data);
-                                        cast_const_to_void(mean->data<T>())};
+    if (is_test) {
+      // create mkldnn memory for stats (as input)
+      auto mean_memory = memory(batch_norm_fwd_pd->mean_primitive_desc(),
+                                to_void_cast(mean_data));
      auto variance_memory =
-          mkldnn::memory{batch_norm_fwd_pd.variance_primitive_desc(),
+          memory(batch_norm_fwd_pd->variance_primitive_desc(),
-                         cast_const_to_void(variance->data<T>())};
+                 to_void_cast(variance_data));
      run_batch_norm_op<typename bn_fwd_types::op_type>(
-          batch_norm_fwd_pd, src, (const mkldnn::primitive::at &)mean_memory,
+          *batch_norm_fwd_pd, src_memory,
+          (const mkldnn::primitive::at &)mean_memory,
          (const mkldnn::primitive::at &)variance_memory, scaleshift_memory,
-          dst);
+          dst_memory);
    } else {
+      // create mkldnn memory for stats (as output)
      auto mean_memory =
-          mkldnn::memory{batch_norm_fwd_pd.mean_primitive_desc(),
+          memory(batch_norm_fwd_pd->mean_primitive_desc(), batch_mean_data);
-                         cast_const_to_void(batch_mean->data<T>())};
+      auto variance_memory = memory(
+          batch_norm_fwd_pd->variance_primitive_desc(), batch_variance_data);
-      auto variance_memory =
-          mkldnn::memory{batch_norm_fwd_pd.variance_primitive_desc(),
-                         cast_const_to_void(batch_variance->data<T>())};
-      run_batch_norm_op<bn_fwd_types::op_type>(batch_norm_fwd_pd, src,
+      run_batch_norm_op<bn_fwd_types::op_type>(*batch_norm_fwd_pd, src_memory,
-                                               scaleshift_memory, dst,
+                                               scaleshift_memory, dst_memory,
                                               mean_memory, variance_memory);
    }
    if (!is_test) {
-      const unsigned int in = dims[0];
+      // mkldnn only computes stats for the current batch,
-      const unsigned int sample_size = x->numel() / in / ic;
+      // so we need to compute the momentum stats via the Eigen lib
+      EigenVectorArrayMap<T> batch_mean_e(batch_mean_data, ic);
-      // saved_xx is use just in this batch of data
+      EigenVectorArrayMap<T> batch_variance_e(batch_variance_data, ic);
-      EigenVectorArrayMap<T> saved_mean_e(
+      ConstEigenVectorArrayMap<T> mean_e(mean_data, ic);
-          batch_mean->mutable_data<T>(ctx.GetPlace()), ic);
+      ConstEigenVectorArrayMap<T> variance_e{variance_data, ic};
-      EigenVectorArrayMap<T> saved_variance_e(
-          batch_variance->mutable_data<T>(ctx.GetPlace()), ic);
+      EigenVectorArrayMap<T> running_mean_e(mean_out_data, ic);
-      saved_mean_e.setZero();
+      EigenVectorArrayMap<T> running_variance_e(variance_out_data, ic);
-      saved_variance_e.setZero();
-      const unsigned int x_arr_size = in * ic;
-      ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, x_arr_size);
-      for (unsigned int nc = 0; nc < x_arr_size; ++nc) {
-        saved_mean_e(nc % ic) += x_arr.col(nc).sum();
-      }
-      saved_mean_e /= in * sample_size;
-      for (unsigned int nc = 0; nc < x_arr_size; ++nc) {
-        saved_variance_e(nc % ic) +=
-            (x_arr.col(nc) - saved_mean_e(nc % ic)).matrix().squaredNorm();
-      }
-      saved_variance_e /= in * sample_size;
-      ConstEigenVectorArrayMap<T> mean_arr{mean->data<T>(), ic};
-      ConstEigenVectorArrayMap<T> variance_arr{variance->data<T>(), ic};
-      EigenVectorArrayMap<T> running_mean_arr(
-          mean_out->mutable_data<T>(ctx.GetPlace()), ic);
-      EigenVectorArrayMap<T> running_var_arr(
-          variance_out->mutable_data<T>(ctx.GetPlace()), ic);
      auto one_minus_momentum = 1. - momentum;
-      running_mean_arr =
+      running_mean_e = mean_e * momentum + batch_mean_e * one_minus_momentum;
-          mean_arr * momentum + saved_mean_e * one_minus_momentum;
+      running_variance_e =
-      running_var_arr =
+          variance_e * momentum + batch_variance_e * one_minus_momentum;
-          variance_arr * momentum + saved_variance_e * one_minus_momentum;
    }
+    y->set_layout(DataLayout::kMKLDNN);
+    y->set_format(
+        (memory::format)dst_memory.get_primitive_desc().desc().data.format);
  }
 };
@@ -217,11 +212,6 @@ template <typename T>
 class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
 public:
  void Compute(const paddle::framework::ExecutionContext &ctx) const override {
-    auto data_layout_str = ctx.Attr<std::string>("data_layout");
-    auto data_layout = framework::StringToDataLayout(data_layout_str);
-    PADDLE_ENFORCE(data_layout == framework::DataLayout::kNCHW,
-                   "MKLDNN batch normalization handles only NCHW data layout");
    auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
    auto mkldnn_engine = dev_ctx.GetEngine();
@@ -238,88 +228,132 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
    auto *diff_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
    auto *diff_shift = ctx.Output<Tensor>(framework::GradVarName("Bias"));
-    diff_x->mutable_data<T>(ctx.GetPlace());
+    PADDLE_ENFORCE(diff_y->layout() == DataLayout::kMKLDNN &&
-    diff_scale->mutable_data<T>(ctx.GetPlace());
+                       diff_y->format() != memory::format::format_undef,
-    diff_shift->mutable_data<T>(ctx.GetPlace());
+                   "Wrong layout/format set for Input diff_y tensor");
+    const T *x_data = x->data<T>();
+    const T *diff_y_data = diff_y->data<T>();
+    const T *batch_mean_data = batch_mean->data<T>();
+    const T *batch_variance_data = batch_variance->data<T>();
+    const T *scale_data = scale->data<T>();
+    const T *shift_data = shift->data<T>();
+    T *diff_x_data = diff_x->mutable_data<T>(ctx.GetPlace());
+    T *diff_scale_data = diff_scale->mutable_data<T>(ctx.GetPlace());
+    T *diff_shift_data = diff_shift->mutable_data<T>(ctx.GetPlace());
+    auto src_tz = paddle::framework::vectorize2int(x->dims());
+    auto diff_src_tz = src_tz;
+    auto dst_tz = src_tz;
+    auto diff_dst_tz = dst_tz;
+    auto scale_tz = paddle::framework::vectorize2int(scale->dims());
+    PADDLE_ENFORCE(scale_tz.size() == 1, "Dims of scale tensor is NOT 1");
+    const unsigned int ic = scale_tz[0];
+    // Retrieve bn_fwd_pd from device context
+    const std::string key = ctx.op().Input("SavedMean");
+    const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd";
+    auto batch_norm_fwd_pd =
+        std::static_pointer_cast<batch_norm_fwd::primitive_desc>(
+            dev_ctx.GetBlob(key_batch_norm_fwd_pd));
+    PADDLE_ENFORCE(batch_norm_fwd_pd != nullptr,
+                   "Fail to find batch_norm_fwd_pd in device context");
-    auto dims = paddle::framework::vectorize2int(x->dims());
+    using bn_bwd_types = bn_type_traits<mkldnn::batch_normalization_backward>;
-    unsigned flags = mkldnn::use_scale_shift | !mkldnn::use_global_stats;
-    auto src_md =
+    // create mkldnn memory from input diff_y tensor
-        MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
+    auto user_diff_dst_memory =
-    auto dst_md =
+        memory({{{diff_dst_tz}, memory::data_type::f32, diff_y->format()},
-        MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
+                mkldnn_engine},
-    auto diff_src_md =
+               to_void_cast(diff_y_data));
-        MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
-    auto diff_dst_md =
-        MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
-    using bn_bwd_types = bn_type_traits<mkldnn::batch_normalization_backward>;
+    // create mkldnn memory from input x tensor
-    using bn_fwd_types = bn_type_traits<mkldnn::batch_normalization_forward>;
+    auto src_memory =
+        memory({{{src_tz}, memory::data_type::f32, x->format()}, mkldnn_engine},
+               to_void_cast(x_data));
-    auto batch_norm_fwd_desc = bn_fwd_types::op_desc{
+    // for diff_dst, try to use same format as dst in forward pass
-        mkldnn::prop_kind::forward_training, src_md, epsilon, flags};
+    auto diff_dst_pd = batch_norm_fwd_pd.get()->dst_primitive_desc();
-    auto batch_norm_fwd_pd =
+    auto diff_dst_md = diff_dst_pd.desc();
-        bn_fwd_types::op_prim{batch_norm_fwd_desc, mkldnn_engine};
+    // create primitive descriptor for batch norm backward
+    unsigned flags = mkldnn::use_scale_shift;
    auto batch_norm_bwd_desc = bn_bwd_types::op_desc{
-        mkldnn::prop_kind::backward, diff_dst_md, dst_md, epsilon, flags};
+        mkldnn::prop_kind::backward, diff_dst_md,
+        src_memory.get_primitive_desc().desc(), epsilon, flags};
    auto batch_norm_bwd_pd = bn_bwd_types::op_prim{
-        batch_norm_bwd_desc, mkldnn_engine, batch_norm_fwd_pd};
+        batch_norm_bwd_desc, mkldnn_engine, *batch_norm_fwd_pd};
-    auto src = mkldnn::memory{{src_md, mkldnn_engine},
+    // reorder user_diff_dst if it's not in preferred format
-                              cast_const_to_void(x->data<T>())};
+    auto diff_dst_memory = user_diff_dst_memory;
+    primitive reorder_diff_dst;
-    auto mean = mkldnn::memory{batch_norm_bwd_pd.mean_primitive_desc(),
+    bool is_diff_dst_reordered = false;
-                               cast_const_to_void(batch_mean->data<T>())};
+    if (diff_dst_pd != user_diff_dst_memory.get_primitive_desc()) {
+      diff_dst_memory = memory(diff_dst_pd);
-    auto variance =
+      reorder_diff_dst = reorder(user_diff_dst_memory, diff_dst_memory);
-        mkldnn::memory{batch_norm_bwd_pd.variance_primitive_desc(),
+      is_diff_dst_reordered = true;
-                       cast_const_to_void(batch_variance->data<T>())};
+    }
-    auto diff_dst = mkldnn::memory{{diff_dst_md, mkldnn_engine},
-                                   cast_const_to_void(diff_y->data<T>())};
-    const unsigned int ic = dims[1];
+    // create mkldnn memory for input tensors (src/mean/variance)
+    auto mean_memory = memory(batch_norm_bwd_pd.mean_primitive_desc(),
+                              to_void_cast(batch_mean_data));
+    auto variance_memory = memory(batch_norm_bwd_pd.variance_primitive_desc(),
+                                  to_void_cast(batch_variance_data));
+    // MKLDNN requires a single piece of memory for scale and shift/bias data
    const size_t scaleshift_size = 2 * ic;
    std::vector<T> scaleshift_data;
    scaleshift_data.reserve(scaleshift_size);
-    copy_to_weights(scale->data<T>(), scale->data<T>() + ic, shift->data<T>(),
+    copy_to_weights(scale_data, scale_data + ic, shift_data, shift_data + ic,
-                    shift->data<T>() + ic, &scaleshift_data);
+                    &scaleshift_data);
-    auto scaleshift_memory = mkldnn::memory{
+    // create mkldnn memory for input tensors (scale/shift)
-        batch_norm_bwd_pd.weights_primitive_desc(), scaleshift_data.data()};
+    auto scaleshift_memory = memory(batch_norm_bwd_pd.weights_primitive_desc(),
+                                    scaleshift_data.data());
+    // create mkldnn memory for output diff weights (combined scale/shift)
    std::vector<T> diff_scaleshift_data;
    diff_scaleshift_data.reserve(scaleshift_size);
-    copy_to_weights(diff_scale->data<T>(), diff_scale->data<T>() + ic,
-                    diff_shift->data<T>(), diff_shift->data<T>() + ic,
-                    &diff_scaleshift_data);
    auto diff_scaleshift_memory =
-        mkldnn::memory{batch_norm_bwd_pd.diff_weights_primitive_desc(),
+        memory(batch_norm_bwd_pd.diff_weights_primitive_desc(),
-                       diff_scaleshift_data.data()};
+               diff_scaleshift_data.data());
-    auto diff_src = mkldnn::memory{{diff_src_md, mkldnn_engine},
+    // here assume diff_src is in the same format of src
-                                   static_cast<void *>(diff_x->data<T>())};
+    auto diff_src_memory = memory(src_memory.get_primitive_desc(), diff_x_data);
-    run_batch_norm_op<bn_bwd_types::op_type>(
+    // finally create batch_norm backward primitive
-        batch_norm_bwd_pd, src, mean, variance, diff_dst, scaleshift_memory,
+    auto batch_norm_bwd_prim =
-        diff_src, diff_scaleshift_memory);
+        batch_norm_bwd(batch_norm_bwd_pd, src_memory, mean_memory,
+                       variance_memory, diff_dst_memory, scaleshift_memory,
+                       diff_src_memory, diff_scaleshift_memory);
+    // execute optional reorder and batch_norm backward primitive
+    std::vector<primitive> pipeline;
+    if (is_diff_dst_reordered) pipeline.push_back(reorder_diff_dst);
+    pipeline.push_back(batch_norm_bwd_prim);
+    stream(stream::kind::eager).submit(pipeline).wait();
+    // copy back diff scale/shift to output tensors (diff scale/shift)
+    diff_scaleshift_data.resize(scaleshift_size);
    auto it = std::begin(diff_scaleshift_data);
-    std::copy(it, std::next(it, ic), diff_scale->data<T>());
+    std::copy(it, std::next(it, ic), diff_scale_data);
    std::copy(std::next(it, ic), std::end(diff_scaleshift_data),
-              diff_shift->data<T>());
+              diff_shift_data);
+    // set layout/format of output tensors
+    diff_x->set_layout(DataLayout::kMKLDNN);
+    diff_x->set_format((memory::format)diff_src_memory.get_primitive_desc()
+                           .desc()
+                           .data.format);
  }
 };
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;
-REGISTER_OP_KERNEL(batch_norm, MKLDNN, paddle::platform::CPUPlace,
+REGISTER_OP_KERNEL(batch_norm, MKLDNN, ::paddle::platform::CPUPlace,
                   ops::BatchNormMKLDNNOpKernel<float>);
-REGISTER_OP_KERNEL(batch_norm_grad, MKLDNN, paddle::platform::CPUPlace,
+REGISTER_OP_KERNEL(batch_norm_grad, MKLDNN, ::paddle::platform::CPUPlace,
                   ops::BatchNormMKLDNNGradOpKernel<float>);
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
@@ -110,19 +110,19 @@ class BatchNormOp : public framework::OperatorWithKernel {
                                         ctx.Input<Tensor>("Variance")->type()),
                      "Variance input should be of float type");
-    framework::LibraryType library_{framework::LibraryType::kPlain};
    // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+    framework::LibraryType library = framework::LibraryType::kPlain;
    framework::DataLayout layout = framework::DataLayout::kAnyLayout;
 #ifdef PADDLE_WITH_MKLDNN
-    if (library_ == framework::LibraryType::kPlain &&
+    if (library == framework::LibraryType::kPlain &&
        platform::CanMKLDNNBeUsed(ctx)) {
-      library_ = framework::LibraryType::kMKLDNN;
+      library = framework::LibraryType::kMKLDNN;
      layout = framework::DataLayout::kMKLDNN;
    }
 #endif
    return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
-                                   library_);
+                                   library);
  }
 };
@@ -370,19 +370,21 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
      PADDLE_THROW("can't find Y@GRAD");
    }
-    framework::LibraryType library_{framework::LibraryType::kPlain};
    // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
-    framework::DataLayout layout_ = framework::DataLayout::kAnyLayout;
+    framework::LibraryType library = framework::LibraryType::kPlain;
+    framework::DataLayout layout = framework::DataLayout::kAnyLayout;
 #ifdef PADDLE_WITH_MKLDNN
-    if (library_ == framework::LibraryType::kPlain &&
+    if (library == framework::LibraryType::kPlain &&
        platform::CanMKLDNNBeUsed(ctx)) {
-      library_ = framework::LibraryType::kMKLDNN;
+      library = framework::LibraryType::kMKLDNN;
-      layout_ = framework::DataLayout::kMKLDNN;
+      layout = framework::DataLayout::kMKLDNN;
    }
 #endif
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
-        layout_, library_);
+        layout, library);
  }
 };

--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -75,9 +75,8 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
 framework::OpKernelType ConvOp::GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const {
  framework::LibraryType library{framework::LibraryType::kPlain};
-  std::string data_format = ctx.Attr<std::string>("data_format");
  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+  std::string data_format = ctx.Attr<std::string>("data_format");
  framework::DataLayout layout = framework::StringToDataLayout(data_format);
 #ifdef PADDLE_WITH_CUDA

--- a/paddle/fluid/operators/gen_nccl_id_op.cc
+++ b/paddle/fluid/operators/gen_nccl_id_op.cc
@@ -67,6 +67,10 @@ class GenNCCLIdOp : public framework::OperatorBase {
      client->AsyncSendVar(ep, dev_ctx, *scope, NCCL_ID_VARNAME);
    }
    client->Wait();
+    for (auto& ep : endpoint_list) {
+      client->AsyncSendBatchBarrier(ep);
+    }
+    client->Wait();
    VLOG(3) << "sending completed...";
  }

--- a/paddle/fluid/operators/mean_iou_op.cc
+++ b/paddle/fluid/operators/mean_iou_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/mean_iou_op.h"
+namespace paddle {
+namespace operators {
+// Operator definition for mean_iou: checks that the required inputs and
+// outputs are present and infers the output shapes.
+class MeanIoUOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Output shapes: OutMeanIou is {1}; OutWrong and OutCorrect are
+  // {num_classes}, where num_classes is read from the op attributes.
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Predictions"),
+                   "Input (Predictions) of MeanIoU op should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Labels"),
+                   "Input (Labels) of MeanIoU op should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("OutMeanIou"),
+                   "Output (OutMeanIou) of MeanIoU op should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("OutWrong"),
+                   "Output (OutWrong) of MeanIoU op should not be null.");
+    // The message must name the output actually being checked.
+    PADDLE_ENFORCE(ctx->HasOutput("OutCorrect"),
+                   "Output (OutCorrect) of MeanIoU op should not be null.");
+    int64_t num_classes =
+        static_cast<int64_t>(ctx->Attrs().Get<int>("num_classes"));
+    ctx->SetOutputDim("OutMeanIou", {1});
+    ctx->SetOutputDim("OutWrong", {num_classes});
+    ctx->SetOutputDim("OutCorrect", {num_classes});
+  }
+ protected:
+  // The kernel is selected by the element type of Input(Predictions) and the
+  // place the op runs on.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("Predictions")->type()),
+        ctx.GetPlace());
+  }
+};
+// Declares the inputs, outputs, attributes and user-facing documentation of
+// the mean_iou operator.
+class MeanIoUOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Predictions",
+             "(Tensor), A Tensor of prediction results for semantic labels"
+             " with type int32 or int64. The rank should be greater than 1.");
+    // Adjacent string literals are concatenated verbatim, so the separating
+    // space has to be written explicitly.
+    AddInput(
+        "Labels",
+        "(Tensor), A Tensor of ground truth labels with type int32 or int64. "
+        "Its shape should be the same as Input(Predictions).");
+    // The three In* inputs are optional lists carrying state accumulated over
+    // previous batches.
+    AddInput("InWrongs",
+             "(vector<Tensor>), A list of Tensor with shape "
+             "[num_classes]. They are used to collect wrong number among "
+             "batches. Empty list is also valid here.")
+        .AsDuplicable()
+        .AsDispensable();
+    AddInput(
+        "InCorrects",
+        "(vector<Tensor>), A list of Tensor with shape "
+        "[num_classes]. They are used to collect correct number among batches. "
+        "Empty list is also valid here.")
+        .AsDuplicable()
+        .AsDispensable();
+    AddInput("InMeanIou",
+             "(vector<Tensor>), A list of Tensor that Output(mean_iou) should "
+             "be added to. Empty list is also valid here.")
+        .AsDuplicable()
+        .AsDispensable();
+    // OutMeanIou is a single (non-duplicable) tensor of shape [1].
+    AddOutput("OutMeanIou",
+              "(Tensor), A Tensor representing the"
+              " mean intersection-over-union with shape [1].");
+    AddOutput("OutWrong", "(Tensor), A Tensor with shape [num_classes]. ");
+    AddOutput("OutCorrect", "(Tensor), A Tensor with shape [num_classes]. ");
+    AddAttr<int>("num_classes", "(int), The possible number of labels.");
+    AddComment(R"DOC(
+mean-IOU Operator.
+Mean Intersection-Over-Union is a common evaluation metric for
+semantic image segmentation, which first computes the IOU for each
+semantic class and then computes the average over classes. 
+IOU is defined as follows: 
+    IOU = true_positive / (true_positive + false_positive + false_negative).
+It is based on pixel level area while "IOU Similarity Operator" 
+is based on area of rectangle.
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+// Register the mean_iou operator with its maker. EmptyGradOpMaker is passed
+// as the gradient maker (presumably meaning no gradient op is generated).
+REGISTER_OPERATOR(mean_iou, ops::MeanIoUOp, ops::MeanIoUOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+// CPU kernels are instantiated for int, int32_t and int64_t element types.
+// NOTE(review): int and int32_t are presumably the same type on the supported
+// platforms, which would make one of these instantiations redundant - confirm.
+REGISTER_OP_CPU_KERNEL(mean_iou, ops::MeanIoUKernel<int>,
+                       ops::MeanIoUKernel<int32_t>,
+                       ops::MeanIoUKernel<int64_t>);
--- a/paddle/fluid/operators/mean_iou_op.cu
+++ b/paddle/fluid/operators/mean_iou_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/mean_iou_op.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/gpu_info.h"
+namespace paddle {
+namespace operators {
+using platform::PADDLE_CUDA_NUM_THREADS;
+// Loop helper for 1-D kernels: index `i` starts at the thread's global index
+// (blockIdx.x * blockDim.x + threadIdx.x) and advances by the total number of
+// launched threads (blockDim.x * gridDim.x) while it is below `n`.
+#define CUDA_1D_KERNEL_LOOP(i, n)                              \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+       i += blockDim.x * gridDim.x)
+// Accumulates per-class "wrong" and "correct" counters for `count` elements.
+// Each block first counts into dynamic shared memory and then folds its
+// partial counters into the global `wrong` / `correct` arrays with atomics.
+// The launch must provide at least 2 * num_classes ints of dynamic shared
+// memory. Class ids are used directly as indices and are not range-checked.
+template <typename T>
+__global__ void CountCUDAKernel(const int num_classes, const int count,
+                                const T* predictions, const T* labels,
+                                int* wrong, int* correct) {
+  // Shared layout: [0, num_classes) wrong counters,
+  //                [num_classes, 2 * num_classes) correct counters.
+  extern __shared__ int block_cache[];
+  int* wrong_c = block_cache;
+  int* correct_c = block_cache + num_classes;
+  // Zero the block-local counters cooperatively.
+  const int cache_len = num_classes * 2;
+  for (int k = threadIdx.x; k < cache_len; k += blockDim.x) {
+    block_cache[k] = 0;
+  }
+  __syncthreads();
+  CUDA_1D_KERNEL_LOOP(i, count) {
+    const T pred = predictions[i];
+    const T label = labels[i];
+    if (pred != label) {
+      // A mismatch counts as wrong for both the predicted and the true class.
+      atomicAdd(wrong_c + pred, 1);
+      atomicAdd(wrong_c + label, 1);
+    } else {
+      atomicAdd(correct_c + pred, 1);
+    }
+  }
+  // All block-local counting must finish before the counters are published.
+  __syncthreads();
+  for (int k = threadIdx.x; k < num_classes; k += blockDim.x) {
+    atomicAdd(wrong + k, wrong_c[k]);
+    atomicAdd(correct + k, correct_c[k]);
+  }
+}
+// Computes per-class IoU = correct / (wrong + correct) into `ious` and adds
+// the mean over classes with a non-zero denominator to iou[0].
+// `valid_count_c` is block-shared and thread 0 of every block updates iou[0],
+// so this appears to be written for a single-block launch (the caller in this
+// file launches it with a grid of 1).
+__global__ void ComputeIoUCUDAKernel(const int num_classes, int* wrong,
+                                     int* correct, float* ious, float* iou) {
+  // Number of classes that have at least one counted sample.
+  __shared__ int valid_count_c;
+  if (threadIdx.x == 0) {
+    valid_count_c = 0;
+  }
+  __syncthreads();
+  CUDA_1D_KERNEL_LOOP(i, num_classes) {
+    int wrong_n = wrong[i];
+    int correct_n = correct[i];
+    int denominator = wrong_n + correct_n;
+    if (denominator > 0) {
+      atomicAdd(&valid_count_c, 1);
+      ious[i] = static_cast<float>(correct_n) / denominator;
+    } else {
+      // Classes without samples contribute 0 and are not counted as valid.
+      ious[i] = 0;
+    }
+  }
+  // Make every per-class IoU and the valid count visible before reducing.
+  __syncthreads();
+  if (threadIdx.x == 0) {
+    float iou_sum = 0;
+    for (int i = 0; i < num_classes; ++i) {
+      iou_sum += ious[i];
+    }
+    // NOTE(review): valid_count_c is 0 when no class has any sample, which
+    // makes this a float division by zero - confirm callers never hit that.
+    iou[0] += iou_sum / valid_count_c;
+  }
+}
+// CUDA kernel functor for mean_iou. It zeroes the outputs, adds the optional
+// accumulated inputs (InMeanIou / InWrongs / InCorrects), counts wrong and
+// correct predictions per class on the device, and finally adds the mean IoU
+// of the current counters to OutMeanIou.
+template <typename T>
+class MeanIoUCUDAOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& place = *ctx.template device_context<platform::CUDADeviceContext>()
+                       .eigen_device();
+    // get input and output tensor
+    auto* predictions = ctx.Input<Tensor>("Predictions");
+    auto* labels = ctx.Input<Tensor>("Labels");
+    auto* out_mean_iou = ctx.Output<Tensor>("OutMeanIou");
+    auto* out_wrong = ctx.Output<Tensor>("OutWrong");
+    auto* out_correct = ctx.Output<Tensor>("OutCorrect");
+    int num_classes = static_cast<int>(ctx.Attr<int>("num_classes"));
+    // Get data ptr
+    const T* predictions_data = predictions->data<T>();
+    const T* labels_data = labels->data<T>();
+    int* out_wrong_data = out_wrong->mutable_data<int>(ctx.GetPlace());
+    int* out_correct_data = out_correct->mutable_data<int>(ctx.GetPlace());
+    float* out_mean_iou_data =
+        out_mean_iou->mutable_data<float>(ctx.GetPlace());
+    // Get Eigen tensor
+    auto out_mean_iou_t = EigenTensor<float, 1>::From(*out_mean_iou);
+    auto out_wrong_t = EigenTensor<int, 1>::From(*out_wrong);
+    auto out_correct_t = EigenTensor<int, 1>::From(*out_correct);
+    // Temporary tensor holding the per-class IoU values
+    Tensor ious;
+    float* ious_data = ious.mutable_data<float>(
+        {static_cast<int64_t>(num_classes)}, ctx.GetPlace());
+    auto ious_t = EigenTensor<float, 1>::From(ious);
+    // Init out_wrong, out_correct and out_mean_iou
+    out_wrong_t.device(place) = out_wrong_t.constant(0);
+    out_correct_t.device(place) = out_correct_t.constant(0);
+    out_mean_iou_t.device(place) = out_mean_iou_t.constant(0.0f);
+    // collect pre wrong, correct and mean_iou
+    // size_t indices match the unsigned size() of the input lists and the
+    // CPU kernel's loops.
+    auto in_mean_ious = ctx.MultiInput<Tensor>("InMeanIou");
+    for (size_t i = 0; i < in_mean_ious.size(); ++i) {
+      out_mean_iou_t.device(place) +=
+          EigenTensor<float, 1>::From(*in_mean_ious[i]);
+    }
+    auto in_wrongs = ctx.MultiInput<Tensor>("InWrongs");
+    for (size_t i = 0; i < in_wrongs.size(); ++i) {
+      out_wrong_t.device(place) += EigenTensor<int, 1>::From(*in_wrongs[i]);
+    }
+    auto in_corrects = ctx.MultiInput<Tensor>("InCorrects");
+    for (size_t i = 0; i < in_corrects.size(); ++i) {
+      out_correct_t.device(place) += EigenTensor<int, 1>::From(*in_corrects[i]);
+    }
+    // compute
+    auto stream = ctx.cuda_device_context().stream();
+    int block = PADDLE_CUDA_NUM_THREADS;
+    int grid = (predictions->numel() + block - 1) / block;
+    // Dynamic shared memory for the per-block wrong/correct counters.
+    int cache_size = (num_classes * 2 + 1) * sizeof(int);
+    CountCUDAKernel<T><<<grid, block, cache_size, stream>>>(
+        num_classes, predictions->numel(), predictions_data, labels_data,
+        out_wrong_data, out_correct_data);
+    ctx.device_context().Wait();
+    // Single block: the reduction kernel keeps its valid-class counter in
+    // block-shared memory.
+    ComputeIoUCUDAKernel<<<1, block, 0, stream>>>(num_classes, out_wrong_data,
+                                                  out_correct_data, ious_data,
+                                                  out_mean_iou_data);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(mean_iou, ops::MeanIoUCUDAOpKernel<int>,
+                        ops::MeanIoUCUDAOpKernel<int64_t>,
+                        ops::MeanIoUCUDAOpKernel<int32_t>);
--- a/paddle/fluid/operators/mean_iou_op.h
+++ b/paddle/fluid/operators/mean_iou_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <algorithm>
+#include "paddle/fluid/framework/op_registry.h"
+namespace paddle {
+namespace operators {
+// Short aliases for the framework tensor type and its Eigen view. The Eigen
+// view defaults to row-major layout and Eigen::DenseIndex indices.
+using Tensor = framework::Tensor;
+template <typename T, int D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+// CPU kernel for mean_iou. It zeroes the outputs, adds the optional
+// accumulated inputs (InMeanIou / InWrongs / InCorrects), counts wrong and
+// correct predictions per class, and adds the mean of the per-class
+// correct / (wrong + correct) ratios to OutMeanIou.
+template <typename T>
+class MeanIoUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& place = *ctx.template device_context<platform::CPUDeviceContext>()
+                       .eigen_device();
+    // get input and output tensor
+    auto* predictions = ctx.Input<Tensor>("Predictions");
+    auto* labels = ctx.Input<Tensor>("Labels");
+    auto* out_mean_iou = ctx.Output<Tensor>("OutMeanIou");
+    auto* out_wrong = ctx.Output<Tensor>("OutWrong");
+    auto* out_correct = ctx.Output<Tensor>("OutCorrect");
+    int num_classes = static_cast<int>(ctx.Attr<int>("num_classes"));
+    // get data ptr
+    const T* predictions_data = predictions->data<T>();
+    const T* labels_data = labels->data<T>();
+    float* out_mean_iou_data =
+        out_mean_iou->mutable_data<float>(ctx.GetPlace());
+    int* out_wrong_data = out_wrong->mutable_data<int>(ctx.GetPlace());
+    int* out_correct_data = out_correct->mutable_data<int>(ctx.GetPlace());
+    // get eigen tensor
+    auto out_mean_iou_t = EigenTensor<float, 1>::From(*out_mean_iou);
+    auto out_wrong_t = EigenTensor<int, 1>::From(*out_wrong);
+    auto out_correct_t = EigenTensor<int, 1>::From(*out_correct);
+    // Tmp tensor
+    Tensor denominator;
+    Tensor valid_count;
+    Tensor iou_sum;
+    // get data ptr of tmp tensor
+    int* denominator_data = denominator.mutable_data<int>(
+        {static_cast<int64_t>(num_classes)}, ctx.GetPlace());
+    int* valid_count_data = valid_count.mutable_data<int>({1}, ctx.GetPlace());
+    float* iou_sum_data = iou_sum.mutable_data<float>({1}, ctx.GetPlace());
+    // get eigen tensor of tmp tensor
+    auto denominator_t = EigenTensor<int, 1>::From(denominator);
+    auto valid_count_t = EigenTensor<int, 1>::From(valid_count);
+    auto iou_sum_t = EigenTensor<float, 1>::From(iou_sum);
+    // init out_wrong, out_correct and out_mean_iou
+    out_wrong_t = out_wrong_t.constant(0);
+    out_correct_t = out_correct_t.constant(0);
+    out_mean_iou_t = out_mean_iou_t.constant(0);
+    // collect pre wrong, correct and mean_iou
+    auto in_mean_ious = ctx.MultiInput<Tensor>("InMeanIou");
+    for (size_t i = 0; i < in_mean_ious.size(); ++i) {
+      out_mean_iou_t.device(place) +=
+          EigenTensor<float, 1>::From(*in_mean_ious[i]);
+    }
+    auto in_wrongs = ctx.MultiInput<Tensor>("InWrongs");
+    for (size_t i = 0; i < in_wrongs.size(); ++i) {
+      out_wrong_t.device(place) += EigenTensor<int, 1>::From(*in_wrongs[i]);
+    }
+    auto in_corrects = ctx.MultiInput<Tensor>("InCorrects");
+    for (size_t i = 0; i < in_corrects.size(); ++i) {
+      out_correct_t.device(place) += EigenTensor<int, 1>::From(*in_corrects[i]);
+    }
+    // compute
+    // A match increments the class's correct counter; a mismatch increments
+    // the wrong counter of both the true and the predicted class.
+    // NOTE(review): class ids index the counters directly without a range
+    // check against num_classes - confirm inputs are validated upstream.
+    for (int64_t i = 0; i < predictions->numel(); ++i) {
+      if (predictions_data[i] == labels_data[i]) {
+        out_correct_data[predictions_data[i]] += 1;
+      } else {
+        out_wrong_data[labels_data[i]] += 1;
+        out_wrong_data[predictions_data[i]] += 1;
+      }
+    }
+    denominator_t = out_wrong_t + out_correct_t;
+    // Number of classes that have at least one counted sample.
+    valid_count_t =
+        (denominator_t > denominator_t.constant(0.0f)).cast<int>().sum();
+    // Replace zero denominators by 1 so the element-wise division below is
+    // well defined for those classes.
+    for (int i = 0; i < num_classes; ++i) {
+      if (denominator_data[i] == 0) {
+        denominator_data[i] = 1;
+      }
+    }
+    iou_sum_t =
+        (out_correct_t.cast<float>() / denominator_t.cast<float>()).sum();
+    // NOTE(review): valid_count_data[0] is 0 when no class has any sample,
+    // which makes this a float division by zero - confirm this cannot occur.
+    out_mean_iou_data[0] += (iou_sum_data[0] / valid_count_data[0]);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/merge_ids_op.cc
+++ b/paddle/fluid/operators/merge_ids_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/merge_ids_op.h"
+namespace paddle {
+namespace operators {
+// Declares the inputs, output and user-facing documentation of the merge_ids
+// operator: Ids (single input), X (duplicable input) and Out.
+class MergeIdsOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Ids", "(LoDTensor) the input ids with shape{batch_num, 1}");
+    AddInput(
+        "X",
+        "(LoDTensors) multi input tensor with shape{batch_num, N}, N is the "
+        "size of embedding table")
+        .AsDuplicable();
+    AddOutput("Out", "(LoDTensor) The merged outputs of the input tensors.");
+    AddComment(R"DOC(
+Merge multi LoDTensor's into one according to Ids's shard num.
+split_ids_op -> prefetch_op -> merge_ids_op
+merge_ids_op should be used after split_ids_op and prefetch_op, split_ids_op
+ will split input Ids into multiple tensors according to Id's shard number.
+prefetch_op will send them to parameter server to prefetch embedding value
+back. During split, the order of ids is disordered. In merge_ids_op we use
+the original Ids to restore the order of the fetched embedding value and
+ also pass the lod information to the merged output.
+Example:
+    Ids = [1,2,3,4,5,6] # 3 shared
+split_ids_op ->
+    Id0 = [3, 6] # id % 3 == 0
+    Id1 = [1, 4] # id % 3 == 1
+    Id2 = [2, 5] # id % 3 == 2
+prefetch_op ->
+    X0 = [[0.3 0.3]   # 3
+          [0.6 0.6]]  # 6
+    X1 = [[0.1 0.1]   # 1
+          [0.4 0.4]]  # 4
+    X2 = [[0.2 0.2]   # 2
+          [0.5 0.5]]  # 5
+merge_ids_op ->
+    Out = [[0.1 0.1]  # 1
+           [0.2 0.2]  # 2
+           [0.3 0.3]  # 3
+           [0.4 0.4]  # 4
+           [0.5 0.5]  # 5
+           [0.6 0.6]] # 6
+)DOC");
+  }
+};
+// Operator definition for merge_ids: validates the inputs/outputs and shares
+// the LoD of Ids with Out.
+class MergeIdsOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Requires Ids, X and Out to be present. When Ids is a LoDTensor it must be
+  // 2-D with a second dimension of 1; every X must be a LoDTensor.
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Ids"), "MergeIdsOp must has input Ids.");
+    PADDLE_ENFORCE(ctx->HasInputs("X"), "MergeIdsOp must has input X.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "MergeIdsOp must has output Out.");
+    auto ids_type = ctx->GetInputsVarType("Ids").front();
+    auto ids_dims = ctx->GetInputDim("Ids");
+    const bool ids_is_lod_tensor =
+        (ids_type == framework::proto::VarType::LOD_TENSOR);
+    if (ids_is_lod_tensor) {
+      PADDLE_ENFORCE_EQ(ids_dims.size(), 2);
+      PADDLE_ENFORCE_EQ(ids_dims[1], 1);
+    }
+    auto x_types = ctx->GetInputsVarType("X");
+    for (auto &var_type : x_types) {
+      PADDLE_ENFORCE_EQ(var_type, framework::proto::VarType::LOD_TENSOR,
+                        "input X only support lod tensors");
+    }
+    ctx->ShareLoD("Ids", "Out");
+  }
+ private:
+  // The kernel is selected by the element type of the first X input and the
+  // place the op runs on.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    const auto *first_x = ctx.MultiInput<framework::Tensor>("X").front();
+    return framework::OpKernelType(framework::ToDataType(first_x->type()),
+                                   ctx.GetPlace());
+  }
+};
+// Variable-type inference for merge_ids: every Out variable receives the
+// variable type of the first Ids input.
+class MergeIdsOpInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const override {
+    auto *ids_var = block->Var(op_desc.Input("Ids")[0]);
+    for (auto &out_name : op_desc.Output("Out")) {
+      auto *out_desc = block->Var(out_name);
+      out_desc->SetType(ids_var->GetType());
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+// Register the merge_ids operator with its maker and var-type inference.
+REGISTER_OPERATOR(merge_ids, ops::MergeIdsOp, ops::MergeIdsOpMaker,
+                  ops::MergeIdsOpInferVarType);
+// Only a CPU kernel with float elements is registered.
+REGISTER_OP_CPU_KERNEL(
+    merge_ids, ops::MergeIdsOpKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/fluid/operators/merge_ids_op.h
+++ b/paddle/fluid/operators/merge_ids_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
+namespace paddle {
+namespace operators {
+// CPU kernel for merge_ids. Rows of the sharded X inputs are written to Out
+// in the order given by Ids: the row for Ids[i] comes from shard
+// (Ids[i] % shard_num), and each shard is consumed front to back.
+// Throws (via PADDLE_ENFORCE / PADDLE_THROW) when run on a non-CPU place,
+// when Ids is not a LoDTensor, when embedding sizes differ, or when the row
+// counts of the shards do not match the ids assigned to them.
+template <typename DeviceContext, typename T>
+class MergeIdsOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto place = ctx.GetPlace();
+    if (!platform::is_cpu_place(place)) {
+      PADDLE_THROW("MergeIds do not support GPU kernel");
+    }
+    VLOG(3) << "run in MergeIdsOpKernel";
+    const auto *ids_var = ctx.InputVar("Ids");
+    PADDLE_ENFORCE(ids_var->IsType<framework::LoDTensor>(),
+                   "only support to merge Ids of LoDTensor");
+    const auto &ids_tensor = ids_var->Get<framework::LoDTensor>();
+    const auto &ids_dims = ids_tensor.dims();
+    const int64_t *ids = ids_tensor.data<int64_t>();
+    auto x_tensors = ctx.MultiInput<framework::LoDTensor>("X");
+    auto *out = ctx.Output<framework::LoDTensor>("Out");
+    // Sum the rows of all non-empty shards and make sure they agree on the
+    // embedding width.
+    int batch_size = 0;
+    int embedding_size = 0;
+    for (auto &input : x_tensors) {
+      if (framework::product(input->dims()) != 0) {
+        if (embedding_size == 0) {
+          embedding_size = input->dims()[1];
+        }
+        PADDLE_ENFORCE_EQ(embedding_size, input->dims()[1],
+                          "embedding size of all input should be the same");
+        batch_size += input->dims()[0];
+      }
+    }
+    PADDLE_ENFORCE_EQ(
+        batch_size, ids_dims[0],
+        "the batch size of ids and merged embedding value should be the same");
+    const size_t shard_num = x_tensors.size();
+    if (shard_num == 1) {
+      VLOG(3) << "only one shard, we can copy the data directly";
+      TensorCopy(*x_tensors[0], place, out);
+    } else {
+      // in_indexs[s] is the next unread row of shard s.
+      std::vector<int> in_indexs(shard_num, 0);
+      auto *out_data = out->mutable_data<T>(
+          framework::make_ddim({batch_size, embedding_size}), place);
+      // copy data from ins[shard_num] to out.
+      for (int64_t i = 0; i < ids_dims[0]; ++i) {
+        int64_t id = ids[i];
+        size_t shard_id = static_cast<size_t>(id) % shard_num;
+        int index = in_indexs[shard_id];
+        // Check before copying: without this, a shard holding fewer rows than
+        // the ids mapped to it is read past its end, and the mismatch is only
+        // reported by the post-loop check after the invalid read.
+        PADDLE_ENFORCE_LT(
+            index, x_tensors[shard_id]->dims()[0],
+            "more ids map to a shard than the rows its x_tensor provides");
+        memcpy(out_data + embedding_size * i,
+               x_tensors[shard_id]->data<T>() + index * embedding_size,
+               sizeof(T) * embedding_size);
+        in_indexs[shard_id] += 1;
+      }
+      for (size_t i = 0; i < shard_num; ++i) {
+        PADDLE_ENFORCE_EQ(in_indexs[i], x_tensors[i]->dims()[0],
+                          "after merge, all data in x_tensor should be used");
+      }
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/reader/create_batch_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_batch_reader_op.cc
@@ -20,7 +20,7 @@ namespace reader {
 class BatchReader : public framework::DecoratedReader {
 public:
-  BatchReader(ReaderBase* reader, int batch_size)
+  BatchReader(const std::shared_ptr<ReaderBase>& reader, int batch_size)
      : DecoratedReader(reader), batch_size_(batch_size) {
    buffer_.reserve(batch_size_);
  }

--- a/paddle/fluid/operators/reader/create_custom_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_custom_reader_op.cc
@@ -22,7 +22,8 @@ namespace reader {
 class CustomReader : public framework::DecoratedReader {
 public:
-  CustomReader(ReaderBase* reader, const framework::BlockDesc& sub_block,
+  CustomReader(const std::shared_ptr<ReaderBase>& reader,
+               const framework::BlockDesc& sub_block,
               const std::vector<std::string>& source_var_names,
               const std::vector<std::string>& sink_var_names)
      : DecoratedReader(reader),

--- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
@@ -34,7 +34,8 @@ static constexpr size_t kChannelSize = 1;  // kCacheSize - 2
 class DoubleBufferReader : public framework::DecoratedReader {
 public:
  explicit DoubleBufferReader(
-      ReaderBase* reader, platform::Place target_place = platform::CPUPlace())
+      const std::shared_ptr<ReaderBase>& reader,
+      platform::Place target_place = platform::CPUPlace())
      : DecoratedReader(reader), place_(target_place) {
    cpu_tensor_cache_.resize(kCacheSize);
    gpu_tensor_cache_.resize(kCacheSize);

--- a/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc
@@ -21,7 +21,7 @@ namespace reader {
 class MultiPassReader : public framework::DecoratedReader {
 public:
-  MultiPassReader(ReaderBase* reader, int pass_num)
+  MultiPassReader(const std::shared_ptr<ReaderBase>& reader, int pass_num)
      : DecoratedReader(reader), pass_num_(pass_num), pass_count_(0) {}
  void ReadNext(std::vector<framework::LoDTensor>* out) override {

--- a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
@@ -23,7 +23,8 @@ namespace reader {
 class ShuffleReader : public framework::DecoratedReader {
 public:
-  ShuffleReader(ReaderBase* reader, size_t buffer_size, size_t seed = 0)
+  ShuffleReader(const std::shared_ptr<ReaderBase>& reader, size_t buffer_size,
+                size_t seed = 0)
      : DecoratedReader(reader), buffer_size_(buffer_size), seed_(seed) {
    VLOG(10) << "Create shuffle reader of " << reader_;
    if (seed_ == 0) {

--- a/paddle/fluid/operators/reader/create_threaded_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_threaded_reader_op.cc
@@ -21,7 +21,8 @@ namespace reader {
 class ThreadedReader : public framework::DecoratedReader {
 public:
-  explicit ThreadedReader(ReaderBase* reader) : DecoratedReader(reader) {}
+  explicit ThreadedReader(const std::shared_ptr<ReaderBase>& reader)
+      : DecoratedReader(reader) {}
  void ReadNext(std::vector<framework::LoDTensor>* out) override {
    std::lock_guard<std::mutex> lock(mutex_);

--- a/paddle/fluid/platform/cpu_info.cc
+++ b/paddle/fluid/platform/cpu_info.cc
@@ -21,12 +21,17 @@ limitations under the License. */
 #include <unistd.h>
 #endif
+#include <algorithm>
 #include "gflags/gflags.h"
 DEFINE_double(fraction_of_cpu_memory_to_use, 1,
              "Default use 100% of CPU memory for PaddlePaddle,"
              "reserve the rest for page tables, etc");
+// Size cap, in megabytes, applied in CpuMaxAllocSize().
+DEFINE_uint64(
+    initial_cpu_memory_in_mb, 500,
+    "Default initial 500MB of CPU memory for PaddlePaddle, in MB unit.");
 DEFINE_double(
    fraction_of_cuda_pinned_memory_to_use, 0.5,
    "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle,"
@@ -54,7 +59,10 @@ inline size_t CpuTotalPhysicalMemory() {
 size_t CpuMaxAllocSize() {
   // For distributed systems, it requires configuring and limiting
   // the fraction of memory to use.
-  return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory();
+  return std::min(
+      static_cast<size_t>(FLAGS_fraction_of_cpu_memory_to_use *
+                          CpuTotalPhysicalMemory()),
+      static_cast<size_t>(FLAGS_initial_cpu_memory_in_mb * (1 << 20)));
 }
 size_t CpuMinChunkSize() {

--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
@@ -322,7 +322,6 @@ class DeviceTracerImpl : public DeviceTracer {
    DisableActivity();
    dynload::cuptiUnsubscribe(subscriber_);
    CUPTI_CALL(dynload::cuptiGetTimestamp(&end_ns_));
-    PADDLE_ENFORCE(dynload::cuptiFinalize());
    enabled_ = false;
  }

--- a/paddle/fluid/platform/dynload/cupti.h
+++ b/paddle/fluid/platform/dynload/cupti.h
@@ -72,7 +72,6 @@ extern void *cupti_dso_handle;
  __macro(cuptiGetResultString);              \
  __macro(cuptiActivityGetNumDroppedRecords); \
  __macro(cuptiActivityFlushAll);             \
-  __macro(cuptiFinalize);                     \
  __macro(cuptiSubscribe);                    \
  __macro(cuptiUnsubscribe);                  \
  __macro(cuptiEnableCallback);               \

--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -132,7 +132,8 @@ EOF
        -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
        -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \
        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
-        -DWITH_CONTRIB=${WITH_CONTRIB:-ON}
+        -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \
+        -DWITH_ANAKIN=ON
 }
 function abort(){

--- a/python/paddle/batch.py
+++ b/python/paddle/batch.py
@@ -15,7 +15,7 @@
 __all__ = ['batch']
-def batch(reader, batch_size, drop_last=False):
+def batch(reader, batch_size, drop_last=True):
    """
    Create a batched reader.

--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -382,7 +382,7 @@ class Operator(object):
        'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv',
        'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine',
        'ncclInit', 'channel_create', 'channel_close', 'channel_send',
-        'channel_recv', 'select'
+        'channel_recv', 'select', 'gen_nccl_id'
    }
    def __init__(self,

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -25,68 +25,20 @@ import utils
 import random
 __all__ = [
-    'fc',
+    'fc', 'embedding', 'dynamic_lstm', 'dynamic_lstmp', 'dynamic_gru',
-    'embedding',
+    'gru_unit', 'linear_chain_crf', 'crf_decoding', 'cos_sim', 'cross_entropy',
-    'dynamic_lstm',
+    'square_error_cost', 'chunk_eval', 'sequence_conv', 'conv2d',
-    'dynamic_lstmp',
+    'sequence_pool', 'sequence_softmax', 'softmax', 'pool2d', 'batch_norm',
-    'dynamic_gru',
+    'beam_search_decode', 'conv2d_transpose', 'sequence_expand', 'lstm_unit',
-    'gru_unit',
+    'reduce_sum', 'reduce_mean', 'reduce_max', 'reduce_min', 'reduce_prod',
-    'linear_chain_crf',
+    'sequence_first_step', 'sequence_last_step', 'dropout', 'split',
-    'crf_decoding',
+    'ctc_greedy_decoder', 'edit_distance', 'l2_normalize', 'matmul', 'topk',
-    'cos_sim',
+    'warpctc', 'sequence_reshape', 'transpose', 'im2sequence', 'nce',
-    'cross_entropy',
+    'beam_search', 'row_conv', 'multiplex', 'layer_norm',
-    'square_error_cost',
+    'softmax_with_cross_entropy', 'smooth_l1', 'one_hot',
-    'chunk_eval',
+    'autoincreased_step_counter', 'reshape', 'lod_reset', 'lrn', 'pad',
-    'sequence_conv',
+    'label_smooth', 'roi_pool', 'dice_loss', 'image_resize',
-    'conv2d',
+    'image_resize_short', 'resize_bilinear', 'gather', 'random_crop', 'mean_iou'
-    'sequence_pool',
-    'sequence_softmax',
-    'softmax',
-    'pool2d',
-    'batch_norm',
-    'beam_search_decode',
-    'conv2d_transpose',
-    'sequence_expand',
-    'lstm_unit',
-    'reduce_sum',
-    'reduce_mean',
-    'reduce_max',
-    'reduce_min',
-    'reduce_prod',
-    'sequence_first_step',
-    'sequence_last_step',
-    'dropout',
-    'split',
-    'ctc_greedy_decoder',
-    'edit_distance',
-    'l2_normalize',
-    'matmul',
-    'topk',
-    'warpctc',
-    'sequence_reshape',
-    'transpose',
-    'im2sequence',
-    'nce',
-    'beam_search',
-    'row_conv',
-    'multiplex',
-    'layer_norm',
-    'softmax_with_cross_entropy',
-    'smooth_l1',
-    'one_hot',
-    'autoincreased_step_counter',
-    'reshape',
-    'lod_reset',
-    'lrn',
-    'pad',
-    'label_smooth',
-    'roi_pool',
-    'dice_loss',
-    'image_resize',
-    'image_resize_short',
-    'resize_bilinear',
-    'gather',
-    'random_crop',
 ]
@@ -261,9 +213,10 @@ def embedding(input,
    return tmp
-# TODO(qijun): expose H0 and C0
 def dynamic_lstm(input,
                 size,
+                 h_0=None,
+                 c_0=None,
                 param_attr=None,
                 bias_attr=None,
                 use_peepholes=True,
@@ -324,6 +277,13 @@ def dynamic_lstm(input,
                         (T X 4D), where T is the total time steps in this
                         mini-batch, D is the hidden size.
        size(int): 4 * hidden size.
+        h_0(Variable): The initial hidden state is an optional input, default is zero.
+                       This is a tensor with shape (N x D), where N is the
+                       batch size and D is the hidden size.
+        c_0(Variable): The initial cell state is an optional input, default is zero.
+                       This is a tensor with shape (N x D), where N is the
+                       batch size and D is the hidden size. `h_0` and `c_0`
+                       must either both be provided or both be None.
        param_attr(ParamAttr|None): The parameter attribute for the learnable
                               hidden-hidden weights.
@@ -387,12 +347,20 @@ def dynamic_lstm(input,
    cell = helper.create_tmp_variable(dtype)
    batch_gate = helper.create_tmp_variable(dtype)
    batch_cell_pre_act = helper.create_tmp_variable(dtype)
+    inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
+    batch_size = input.shape[0]
+    if h_0:
+        assert h_0.shape == (batch_size, size), \
+            'The shape of h0 should be (batch_size, %d)' % size
+        inputs['H0'] = h_0
+    if c_0:
+        assert c_0.shape == (batch_size, size), \
+            'The shape of c0 should be (batch_size, %d)' % size
+        inputs['C0'] = c_0
    helper.append_op(
        type='lstm',
-        inputs={'Input': input,
+        inputs=inputs,
-                'Weight': weight,
-                'Bias': bias},
        outputs={
            'Hidden': hidden,
            'Cell': cell,
@@ -677,11 +645,13 @@ def dynamic_gru(input,
        attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype)
    bias = helper.create_parameter(
        attr=helper.bias_attr, shape=[1, 3 * size], dtype=dtype, is_bias=True)
+    batch_size = input.shape[0]
    inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
    if h_0 != None:
        assert h_0.shape == (
-            size, size), 'The shape of h0 should be(%d, %d)' % (size, size)
+            batch_size, size
-        inputs['h0'] = h_0
+        ), 'The shape of h0 should be(batch_size, %d)' % size
+        inputs['H0'] = h_0
    hidden = helper.create_tmp_variable(dtype)
    batch_gate = helper.create_tmp_variable(dtype)
@@ -4213,6 +4183,7 @@ def gather(input, index):
        output (Variable): The output is a tensor with the same rank as input.
    Examples:
        .. code-block:: python
            output = fluid.layers.gather(x, index)
@@ -4277,3 +4248,53 @@ def random_crop(x, shape, seed=None):
                 "SeedOut": seed_out},
        attrs={"shape": shape})
    return out
+def mean_iou(input, label, num_classes):
+    """
+    Mean Intersection-Over-Union is a common evaluation metric for
+    semantic image segmentation, which first computes the IOU for each
+    semantic class and then computes the average over classes.
+    IOU is defined as follows:
+
+    .. math::
+
+        IOU = \\frac{true\\_positive}{true\\_positive + false\\_positive + false\\_negative}.
+
+    The predictions are accumulated in a confusion matrix and mean-IOU
+    is then calculated from it.
+
+    Args:
+        input (Variable): A Tensor of prediction results for semantic labels with type int32 or int64.
+        label (Variable): A Tensor of ground truth labels with type int32 or int64.
+                          Its shape should be the same as input.
+        num_classes (int): The number of semantic classes. It is passed to the
+                           operator as the `num_classes` attribute and is the
+                           length of the `out_wrong` and `out_correct` outputs.
+
+    Returns:
+        mean_iou (Variable): A Tensor representing the mean intersection-over-union with shape [1].
+        out_wrong(Variable): A Tensor with shape [num_classes]. The wrong numbers of each class.
+        out_correct(Variable): A Tensor with shape [num_classes]. The correct numbers of each class.
+
+    Examples:
+
+        .. code-block:: python
+
+            iou, wrongs, corrects = fluid.layers.mean_iou(predict, label, num_classes)
+    """
+    helper = LayerHelper('mean_iou', **locals())
+    # NOTE(review): `dtype` is not used below; the call is kept because
+    # input_dtype() may validate the inputs -- confirm before removing.
+    dtype = helper.input_dtype()
+    out_mean_iou = helper.create_tmp_variable(dtype='float32')
+    out_wrong = helper.create_tmp_variable(dtype='int32')
+    out_correct = helper.create_tmp_variable(dtype='int32')
+    # The slot names use the capitalised spelling that the mean_iou operator
+    # unit test (test_mean_iou.py) feeds and checks: Predictions / Labels /
+    # OutMeanIou / OutWrong / OutCorrect. Lowercase names do not match them.
+    helper.append_op(
+        type="mean_iou",
+        inputs={"Predictions": input,
+                "Labels": label},
+        outputs={
+            "OutMeanIou": out_mean_iou,
+            "OutWrong": out_wrong,
+            "OutCorrect": out_correct
+        },
+        attrs={"num_classes": num_classes})
+    return out_mean_iou, out_wrong, out_correct
--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
@@ -96,10 +96,11 @@ def train(use_cuda, train_program, params_dirname):
    train_reader = paddle.batch(
        paddle.reader.shuffle(
            cifar10_small_test_set.train10(batch_size=10), buf_size=128 * 10),
-        batch_size=BATCH_SIZE)
+        batch_size=BATCH_SIZE,
+        drop_last=False)
    test_reader = paddle.batch(
-        paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE)
+        paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE, drop_last=False)
    def event_handler(event):
        if isinstance(event, fluid.EndStepEvent):

--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
@@ -73,10 +73,11 @@ def train(use_cuda, train_program, params_dirname):
    train_reader = paddle.batch(
        paddle.reader.shuffle(
            cifar10_small_test_set.train10(batch_size=10), buf_size=128 * 10),
-        batch_size=BATCH_SIZE)
+        batch_size=BATCH_SIZE,
+        drop_last=False)
    test_reader = paddle.batch(
-        paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE)
+        paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE, drop_last=False)
    def event_handler(event):
        if isinstance(event, fluid.EndStepEvent):

--- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py
+++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py
@@ -87,7 +87,9 @@ def train(use_cuda, train_program, params_dirname):
    def event_handler(event):
        if isinstance(event, fluid.EndEpochEvent):
            test_reader = paddle.batch(
-                paddle.dataset.imdb.test(word_dict), batch_size=BATCH_SIZE)
+                paddle.dataset.imdb.test(word_dict),
+                batch_size=BATCH_SIZE,
+                drop_last=False)
            avg_cost, acc = trainer.test(
                reader=test_reader, feed_order=['words', 'label'])
@@ -113,7 +115,8 @@ def train(use_cuda, train_program, params_dirname):
    train_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.imdb.train(word_dict), buf_size=25000),
-        batch_size=BATCH_SIZE)
+        batch_size=BATCH_SIZE,
+        drop_last=False)
    trainer.train(
        num_epochs=1,

--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
@@ -56,7 +56,7 @@ BATCH_SIZE = 200
 # fix the order of training data
 train_reader = paddle.batch(
-    paddle.dataset.uci_housing.train(), batch_size=BATCH_SIZE)
+    paddle.dataset.uci_housing.train(), batch_size=BATCH_SIZE, drop_last=False)
 # train_reader = paddle.batch(
 #     paddle.reader.shuffle(

--- a/python/paddle/fluid/tests/unittests/test_mean_iou.py
+++ b/python/paddle/fluid/tests/unittests/test_mean_iou.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import division
+import unittest
+import numpy as np
+from op_test import OpTest
+def compute_mean_iou(predictions, labels, num_classes, in_wrongs, in_corrects,
+                     in_mean_ious):
+    """NumPy reference implementation used to check the mean_iou operator.
+
+    The per-class wrong/correct counters start from the sums of the arrays
+    in `in_wrongs` / `in_corrects` (lists of (name, array) pairs) and are
+    then updated from the element-wise comparison of `predictions` and
+    `labels`. The mean is taken of correct / (wrong + correct) over the
+    classes whose denominator is non-zero, and every value in
+    `in_mean_ious` is added to that mean.
+
+    Returns:
+        The tuple (mean_iou, out_wrong, out_correct).
+    """
+    assert predictions.shape == labels.shape
+    flat_pred = predictions.flatten()
+    flat_label = labels.flatten()
+
+    # Seed the counters with the previously accumulated statistics.
+    out_wrong = np.zeros([num_classes], dtype="int32")
+    for _, prev_wrong in in_wrongs:
+        out_wrong += prev_wrong
+    out_correct = np.zeros([num_classes], dtype="int32")
+    for _, prev_correct in in_corrects:
+        out_correct += prev_correct
+
+    # A hit counts once for its class; a miss counts as wrong for both the
+    # predicted class and the ground-truth class. np.add.at accumulates
+    # repeated indices, which plain fancy-index `+=` would not.
+    hit = flat_pred == flat_label
+    miss = np.logical_not(hit)
+    np.add.at(out_correct, flat_pred[hit], 1)
+    np.add.at(out_wrong, flat_pred[miss], 1)
+    np.add.at(out_wrong, flat_label[miss], 1)
+
+    totals = out_wrong + out_correct
+    valid_count = (totals != 0).sum()
+    # Replace non-positive denominators by 1 so the division is defined;
+    # those classes are excluded from the mean through valid_count.
+    safe_totals = totals.astype("float64")
+    safe_totals[totals <= 0] = 1.0
+
+    mean_iou = (out_correct / safe_totals).sum() / valid_count
+    for _, prev_mean_iou in in_mean_ious:
+        mean_iou += prev_mean_iou
+    return mean_iou, out_wrong, out_correct
+class TestMeanIOUOp(OpTest):
+    """Checks the mean_iou operator output against compute_mean_iou."""
+
+    def config(self):
+        # Default case: 10 classes, 128x128 inputs, and no previously
+        # accumulated wrong/correct/mean-iou inputs.
+        self.num_classes = 10
+        self.image_size = [128, 128]
+        self.in_wrong_num = 0
+        self.in_correct_num = 0
+        self.in_mean_iou_num = 0
+
+    def setUp(self):
+        self.config()
+        self.op_type = "mean_iou"
+        class_count = self.num_classes
+
+        # Random class ids in [0, num_classes) for predictions and labels.
+        predictions = np.random.randint(0, class_count,
+                                        self.image_size).astype("int32")
+        labels = np.random.randint(0, class_count,
+                                   self.image_size).astype("int32")
+
+        # Optional accumulated inputs, each given as (name, array) pairs.
+        in_wrongs = [("in_wrong_%d" % idx, np.random.randint(
+            0, 10, [class_count]).astype("int32"))
+                     for idx in range(self.in_wrong_num)]
+        in_corrects = [("in_correct_%d" % idx, np.random.randint(
+            0, 10, [class_count]).astype("int32"))
+                       for idx in range(self.in_correct_num)]
+        in_mean_ious = [("in_mean_iou_%d" % idx, np.random.uniform(
+            0, 1, [1]).astype("float32"))
+                        for idx in range(self.in_mean_iou_num)]
+
+        self.inputs = {
+            'Predictions': predictions,
+            'Labels': labels,
+            'InWrongs': in_wrongs,
+            'InCorrects': in_corrects,
+            'InMeanIou': in_mean_ious
+        }
+        self.attrs = {'num_classes': long(self.num_classes)}
+
+        # Expected outputs come from the NumPy reference implementation.
+        expected = compute_mean_iou(predictions, labels, self.num_classes,
+                                    in_wrongs, in_corrects, in_mean_ious)
+        self.outputs = {
+            'OutMeanIou': expected[0],
+            'OutWrong': expected[1],
+            'OutCorrect': expected[2]
+        }
+
+    def test_check_output(self):
+        self.check_output()
+class TestCase1(TestMeanIOUOp):
+    """Variant with 5 classes, 100x128 inputs and two accumulated inputs of each kind."""
+
+    def config(self):
+        self.image_size = [100, 128]
+        self.num_classes = 5
+        # Two previously accumulated inputs for each optional slot.
+        self.in_mean_iou_num = 2
+        self.in_correct_num = 2
+        self.in_wrong_num = 2
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_merge_ids_op.py
+++ b/python/paddle/fluid/tests/unittests/test_merge_ids_op.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import numpy as np
+from op_test import OpTest
+class TestMergeIdsOp(OpTest):
+    """Checks the merge_ids operator on a fixed set of ids and three X inputs."""
+
+    def setUp(self):
+        self.op_type = "merge_ids"
+
+        id_column = np.array(
+            [[0], [2], [2], [3], [5], [5], [6]], dtype='int64')
+        # Three X inputs; the second one is empty.
+        rows_x0 = np.array(
+            [[0.1, 0.2], [0.2, 0.3], [0.3, 0.4]], dtype='float32')
+        rows_x1 = np.array([], dtype='float32')
+        rows_x2 = np.array(
+            [[0.4, 0.5], [0.4, 0.5], [0.5, 0.6], [0.5, 0.6]],
+            dtype='float32')
+        # Expected output: one row per entry of id_column.
+        expected = np.array(
+            [[0.1, 0.2], [0.4, 0.5], [0.4, 0.5], [0.2, 0.3], [0.5, 0.6],
+             [0.5, 0.6], [0.3, 0.4]],
+            dtype='float32')
+
+        self.inputs = {
+            'Ids': id_column,
+            "X": [('x0', rows_x0), ('x1', rows_x1), ('x2', rows_x2)]
+        }
+        self.outputs = {'Out': expected}
+
+    def test_check_output(self):
+        self.check_output()
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -629,7 +629,7 @@ class DistributeTranspiler:
                if op.type == LOOKUP_TABLE_TYPE:
                    continue_search_lookup_table_op = True
-                    op_index = list(all_ops).index(op)
+                    lookup_table_op_index = list(all_ops).index(op)
                    ids_name = op.input("Ids")
                    out_name = op.output("Out")
@@ -649,7 +649,7 @@ class DistributeTranspiler:
                    # insert split_ids_op
                    program.global_block().insert_op(
-                        index=op_index,
+                        index=lookup_table_op_index,
                        type="split_ids",
                        inputs={
                            'Ids': [
@@ -661,7 +661,7 @@ class DistributeTranspiler:
                    # insert prefetch_op
                    program.global_block().insert_op(
-                        index=op_index + 1,
+                        index=lookup_table_op_index + 1,
                        type="prefetch",
                        inputs={'X': prefetch_input_vars},
                        outputs={"Out": prefetch_output_vars},
@@ -672,16 +672,21 @@ class DistributeTranspiler:
                    # insert concat_op
                    program.global_block().insert_op(
-                        index=op_index + 2,
+                        index=lookup_table_op_index + 2,
-                        type="concat",
+                        type="merge_ids",
-                        inputs={'X': prefetch_output_vars},
+                        inputs={
+                            'Ids': [
+                                program.global_block().vars[varname]
+                                for varname in ids_name
+                            ],
+                            'X': prefetch_output_vars
+                        },
                        outputs={
                            "Out": [
                                program.global_block().vars[varname]
                                for varname in out_name
                            ]
-                        },
+                        })
-                        attrs={"axis": 0})
                    # delete lookup_table_op
                    delete_ops(program.global_block(), [op])

--- a/python/paddle/trainer_config_helpers/attrs.py
+++ b/python/paddle/trainer_config_helpers/attrs.py
@@ -240,14 +240,15 @@ class ExtraLayerAttribute(object):
    :type error_clipping_threshold: float
    :param drop_rate: Dropout rate. Dropout will create a mask on layer output.
                      The dropout rate is the zero rate of this mask. The
-                      details of what dropout is please refer to `here
+                      details of what dropout is please refer to `JMLRdropout
-                      <https://www.cs.toronto.edu/~hinton/absps/
+                      <https://www.cs.toronto.edu/~hinton/absps/JMLRdropout.pdf
-                      JMLRdropout.pdf>`_.
+                      >`_.
    :type drop_rate: float
    :param device: device ID of layer. device=-1, use CPU. device>=0, use GPU.
-                   The details allocation in parallel_nn please refer to `here
+                   The details allocation in parallel_nn please refer to `use_case
-                   <http://www.paddlepaddle.org/doc/ui/cmd_argument/
+                   <https://github.com/PaddlePaddle/Paddle/blob/develop/doc/v2
-                   use_case.html#case-2-specify-layers-in-different-devices>`_.
+                   /howto/cmd_parameter/use_case_en.md#case-2-specify-layers-in
+                   -different-devices>`_.
    :type device: int
    """

--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -2556,7 +2556,7 @@ def img_conv_layer(input,
    the output will be obtained by concatenating the two results.
    The details of grouped convolution, please refer to:
-    `ImageNet Classification with Deep Convolutional Neural Networks
+    `ImageNet Classification With Deep Convolutional Neural Networks
    <http://www.cs.toronto.edu/~kriz/imagenet_classification_with_deep_convolutional.pdf>`_
    The example usage is:
@@ -5678,8 +5678,8 @@ def warp_ctc_layer(input,
    <https://github.com/baidu-research/warp-ctc>`_ library, which is used in
    `Deep Speech 2: End-toEnd Speech Recognition in English and Mandarin
    <https://arxiv.org/pdf/1512.02595v1.pdf>`_, to compute Connectionist Temporal
-    Classification (CTC) loss. Besides, another `warp-ctc
+    Classification (CTC) loss. Besides, another `warp-ctc repository
-    <https://github.com/gangliao/warp-ctc>`_ repository, which is forked from
+    <https://github.com/gangliao/warp-ctc>`_ , which is forked from
    the official one, is maintained to enable more compiling options. During the
    building process, PaddlePaddle will clone the source codes, build and
    install it to :code:`third_party/install/warpctc` directory.

--- a/python/paddle/v2/minibatch.py
+++ b/python/paddle/v2/minibatch.py
@@ -15,7 +15,7 @@
 __all__ = ['batch']
-def batch(reader, batch_size, drop_last=False):
+def batch(reader, batch_size, drop_last=True):
    """
    Create a batched reader.