Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into argmin_argmax

fa002cd6 · sneaxiy · e0f883e6 · 9169b3b8 · fa002cd6 · fa002cd6
94 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -61,6 +61,7 @@ option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen"            OFF)
 option(WITH_ARM_FP16    "Use half precision support on armv8.2-a cpu"   OFF)
 option(WITH_FAST_BUNDLE_TEST    "Bundle tests that can be run in a single process together to reduce launch overhead"   OFF)
 option(WITH_CONTRIB     "Compile the third-party contributation"        OFF)
+option(WITH_ANAKIN      "Compile with Anakin library"                   OFF)
 option(WITH_GRPC     "Use grpc as the default rpc framework"            ${WITH_DISTRIBUTE})

 # CMAKE_BUILD_TYPE
@@ -193,7 +194,10 @@ set(EXTERNAL_LIBS
 if(WITH_GPU)
    include(cuda)
    include(tensorrt)
-endif(WITH_GPU)
+    include(external/anakin)
+else()
+  # Anakin is only valid for GPU builds, so switch it off otherwise. WITH_ANAKIN is
+  # declared with option(), so the forced cache entry keeps the BOOL type.
+  set(WITH_ANAKIN OFF CACHE BOOL "Anakin is valid only when GPU is set." FORCE)
+endif()

 if(WITH_AMD_GPU)
    find_package(HIP)

--- a/benchmark/fluid/fluid_benchmark.py
+++ b/benchmark/fluid/fluid_benchmark.py
@@ -188,7 +188,7 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
        print_train_time(start_time, time.time(), num_samples)
        print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses))),
        # evaluation
-        if not args.no_test and batch_acc:
+        if not args.no_test and batch_acc and not args.use_reader_op:
            pass_test_acc = test(exe, infer_prog, test_reader, feeder,
                                 batch_acc)
            print(", Test Accuracy: %f" % pass_test_acc)
@@ -285,11 +285,12 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
            batch_id += 1

        print_train_time(start_time, time.time(), num_samples)
-        if not args.no_test and batch_acc:
+        if not args.no_test and batch_acc and not args.use_reader_op:
+            # record io is not implemented for the test reader yet, so
+            # skip the test when args.use_reader_op is set
            test_acc = test(startup_exe, infer_prog, test_reader, feeder,
                            batch_acc)
            print("Pass: %d, Test Accuracy: %f\n" % (pass_id, test_acc))
-        exit(0)


 def print_arguments(args):

--- a/benchmark/fluid/models/resnet.py
+++ b/benchmark/fluid/models/resnet.py
@@ -199,7 +199,10 @@ def get_model(args):
    batched_train_reader = paddle.batch(
        paddle.reader.shuffle(
            train_reader, buf_size=5120),
-        batch_size=args.batch_size * args.gpus)
-    batched_test_reader = paddle.batch(train_reader, batch_size=args.batch_size)
+        batch_size=args.batch_size * args.gpus,
+        drop_last=True)
+    batched_test_reader = paddle.batch(
+        train_reader, batch_size=args.batch_size, drop_last=True)

-    return avg_cost, inference_program, optimizer, batched_train_reader, batched_test_reader, batch_acc
+    return avg_cost, inference_program, optimizer, batched_train_reader,\
+                   batched_test_reader, batch_acc
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -118,6 +118,10 @@ endif()
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}")

+if(WITH_DISTRIBUTE)
+  add_definitions(-DPADDLE_WITH_DISTRIBUTE)
+endif()
+
 if(WITH_GOLANG)
  # we need to symlink Paddle directory into GOPATH. If we
  # don't do it and we have code that depends on Paddle, go

--- a/cmake/external/anakin.cmake
+++ b/cmake/external/anakin.cmake
+# Nothing in this file applies unless Anakin support was requested.
+if (NOT WITH_ANAKIN)
+  return()
+endif()
+
+# Directory the Anakin release is unpacked into. The header root and the library
+# path both default to this same directory.
+set(ANAKIN_INSTALL_DIR
+    "${THIRD_PARTY_PATH}/install/anakin"
+    CACHE PATH "Anakin install path." FORCE)
+set(ANAKIN_INCLUDE
+    "${ANAKIN_INSTALL_DIR}"
+    CACHE STRING "root of Anakin header files")
+set(ANAKIN_LIBRARY
+    "${ANAKIN_INSTALL_DIR}"
+    CACHE STRING "path of Anakin library")
+
+# Extra compiler flags intended for targets that build against Anakin; they relax
+# several warnings from errors and disable the reorder warning.
+set(ANAKIN_COMPILE_EXTRA_FLAGS
+    -Wno-error=unused-variable
+    -Wno-error=format-extra-args
+    -Wno-error=comment
+    -Wno-error=format
+    -Wno-error=switch
+    -Wno-error=return-type
+    -Wno-error=non-virtual-dtor
+    -Wno-reorder
+    -Wno-error=cpp)
+
+# Location of the Anakin release archive.
+set(ANAKIN_LIBRARY_URL
+    "https://github.com/pangge/Anakin/releases/download/3.0/anakin_release_simple.tar.gz")
+
+# Helper for Anakin: to use it, one currently needs to put nearly all of its header
+# directories on the include path, so this walks a directory tree recursively.
+#
+# Arguments:
+#   root_dir - directory that is passed to include_directories() together with every
+#              directory below it; a path that is not a directory is ignored.
+function(fetch_include_recursively root_dir)
+    # Expansions are quoted so that a path stays one argument even when it contains
+    # a semicolon or is empty.
+    if (NOT IS_DIRECTORY "${root_dir}")
+        return()
+    endif()
+    include_directories("${root_dir}")
+
+    # Entries are globbed relative to root_dir and re-joined below, so only the direct
+    # children are inspected at each level.
+    file(GLOB ALL_SUB RELATIVE "${root_dir}" "${root_dir}/*")
+    foreach(sub ${ALL_SUB})
+        if (IS_DIRECTORY "${root_dir}/${sub}")
+            fetch_include_recursively("${root_dir}/${sub}")
+        endif()
+    endforeach()
+endfunction()
+
+# Download and unpack the Anakin release at configure time. Each external command is
+# checked so that a failed download or extraction stops the configuration right away
+# instead of showing up later as missing headers or libraries.
+message(STATUS "Download Anakin library from ${ANAKIN_LIBRARY_URL}")
+
+# Start from an empty install directory so files of an older download cannot linger.
+file(REMOVE_RECURSE "${ANAKIN_INSTALL_DIR}")
+file(MAKE_DIRECTORY "${ANAKIN_INSTALL_DIR}")
+
+# WORKING_DIRECTORY replaces the former "cd ...;" prefix, which kept running the next
+# command in the wrong directory when the cd itself failed.
+execute_process(
+    COMMAND wget -q "${ANAKIN_LIBRARY_URL}"
+    WORKING_DIRECTORY "${ANAKIN_INSTALL_DIR}"
+    RESULT_VARIABLE ANAKIN_DOWNLOAD_RESULT)
+if (NOT "${ANAKIN_DOWNLOAD_RESULT}" STREQUAL "0")
+    message(FATAL_ERROR "Failed to download Anakin library from ${ANAKIN_LIBRARY_URL} (wget: ${ANAKIN_DOWNLOAD_RESULT})")
+endif()
+
+execute_process(
+    COMMAND tar xzf anakin_release_simple.tar.gz
+    WORKING_DIRECTORY "${ANAKIN_INSTALL_DIR}"
+    RESULT_VARIABLE ANAKIN_EXTRACT_RESULT)
+if (NOT "${ANAKIN_EXTRACT_RESULT}" STREQUAL "0")
+    message(FATAL_ERROR "Failed to extract anakin_release_simple.tar.gz in ${ANAKIN_INSTALL_DIR} (tar: ${ANAKIN_EXTRACT_RESULT})")
+endif()
+
+# No WITH_ANAKIN check is needed here: this file returns at its top when the option
+# is off.
+message(STATUS "Anakin for inference is enabled")
+message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}")
+fetch_include_recursively(${ANAKIN_INCLUDE})
+link_directories(${ANAKIN_LIBRARY})
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -29,6 +29,8 @@ IF(NOT ${CBLAS_FOUND})
        "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
        CACHE FILEPATH "openblas library." FORCE)

+    ADD_DEFINITIONS(-DPADDLE_USE_OPENBLAS)
+
    SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
    SET(OPENBLAS_COMMIT "v0.2.20")


--- a/doc/fluid/api/detection.rst
+++ b/doc/fluid/api/detection.rst
--- a/doc/fluid/api/gen_doc.sh
+++ b/doc/fluid/api/gen_doc.sh
 #!/bin/bash
-python gen_doc.py layers --submodules control_flow device io nn ops tensor > layers.rst
+python gen_doc.py layers --submodules control_flow device io nn ops tensor detection learning_rate_scheduler > layers.rst

 for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer
 do

--- a/doc/fluid/api/io.rst
+++ b/doc/fluid/api/io.rst
@@ -59,21 +59,3 @@ get_inference_program
 ..  autofunction:: paddle.fluid.io.get_inference_program
    :noindex:

-save_checkpoint
---------------
-
-..  autofunction:: paddle.fluid.io.save_checkpoint
-    :noindex:
-
-load_checkpoint
---------------
-
-..  autofunction:: paddle.fluid.io.load_checkpoint
-    :noindex:
-
-clean_checkpoint
----------------
-
-..  autofunction:: paddle.fluid.io.clean_checkpoint
-    :noindex:
-
--- a/doc/fluid/api/layers.rst
+++ b/doc/fluid/api/layers.rst
@@ -181,12 +181,6 @@ Print
 ..  autofunction:: paddle.fluid.layers.Print
    :noindex:

-is_empty
--------
-
-..  autofunction:: paddle.fluid.layers.is_empty
-    :noindex:
-
 device
 ======

@@ -261,19 +255,6 @@ double_buffer
 ..  autofunction:: paddle.fluid.layers.double_buffer
    :noindex:

-random_data_generator
---------------------
-
-..  autofunction:: paddle.fluid.layers.random_data_generator
-    :noindex:
-
-Preprocessor
------------
-
-..  autoclass:: paddle.fluid.layers.Preprocessor
-    :members:
-    :noindex:
-
 nn
 ==

@@ -613,30 +594,6 @@ roi_pool
 ..  autofunction:: paddle.fluid.layers.roi_pool
    :noindex:

-dice_loss
---------
-
-..  autofunction:: paddle.fluid.layers.dice_loss
-    :noindex:
-
-resize_bilinear
---------------
-
-..  autofunction:: paddle.fluid.layers.resize_bilinear
-    :noindex:
-
-gather
------
-
-..  autofunction:: paddle.fluid.layers.gather
-    :noindex:
-
-random_crop
-----------
-
-..  autofunction:: paddle.fluid.layers.random_crop
-    :noindex:
-
 ops
 ===

@@ -784,12 +741,6 @@ sum
 ..  autofunction:: paddle.fluid.layers.sum
    :noindex:

-shape
-----
-
-..  autofunction:: paddle.fluid.layers.shape
-    :noindex:
-
 sigmoid
 -------

@@ -1039,3 +990,93 @@ zeros
 ..  autofunction:: paddle.fluid.layers.zeros
    :noindex:

+detection
+=========
+
+multi_box_head
+--------------
+
+..  autofunction:: paddle.fluid.layers.multi_box_head
+    :noindex:
+
+bipartite_match
+---------------
+
+..  autofunction:: paddle.fluid.layers.bipartite_match
+    :noindex:
+
+target_assign
+-------------
+
+..  autofunction:: paddle.fluid.layers.target_assign
+    :noindex:
+
+detection_output
+----------------
+
+..  autofunction:: paddle.fluid.layers.detection_output
+    :noindex:
+
+ssd_loss
+--------
+
+..  autofunction:: paddle.fluid.layers.ssd_loss
+    :noindex:
+
+detection_map
+-------------
+
+..  autofunction:: paddle.fluid.layers.detection_map
+    :noindex:
+
+iou_similarity
+--------------
+
+..  autofunction:: paddle.fluid.layers.iou_similarity
+    :noindex:
+
+box_coder
+---------
+
+..  autofunction:: paddle.fluid.layers.box_coder
+    :noindex:
+
+learning_rate_scheduler
+=======================
+
+exponential_decay
+-----------------
+
+..  autofunction:: paddle.fluid.layers.exponential_decay
+    :noindex:
+
+natural_exp_decay
+-----------------
+
+..  autofunction:: paddle.fluid.layers.natural_exp_decay
+    :noindex:
+
+inverse_time_decay
+------------------
+
+..  autofunction:: paddle.fluid.layers.inverse_time_decay
+    :noindex:
+
+polynomial_decay
+----------------
+
+..  autofunction:: paddle.fluid.layers.polynomial_decay
+    :noindex:
+
+piecewise_decay
+---------------
+
+..  autofunction:: paddle.fluid.layers.piecewise_decay
+    :noindex:
+
+noam_decay
+----------
+
+..  autofunction:: paddle.fluid.layers.noam_decay
+    :noindex:
+
--- a/doc/fluid/api/optimizer.rst
+++ b/doc/fluid/api/optimizer.rst
@@ -89,13 +89,6 @@ DecayedAdagradOptimizer
    :members:
    :noindex:

-RMSPropOptimizer
----------------
-
-..  autoclass:: paddle.fluid.optimizer.RMSPropOptimizer
-    :members:
-    :noindex:
-
 Adadelta
 --------


--- a/doc/fluid/api/profiler.rst
+++ b/doc/fluid/api/profiler.rst
@@ -23,15 +23,3 @@ profiler
 ..  autofunction:: paddle.fluid.profiler.profiler
    :noindex:

-start_profiler
--------------
-
-..  autofunction:: paddle.fluid.profiler.start_profiler
-    :noindex:
-
-stop_profiler
-------------
-
-..  autofunction:: paddle.fluid.profiler.stop_profiler
-    :noindex:
-
--- a/doc/survey/dynamic_graph.md
+++ b/doc/survey/dynamic_graph.md
@@ -171,7 +171,7 @@ Pytorch chooses immediate evaluation. It avoids ever materializing a "forward gr

 ## What can fluid learn from them?

-TBD
+Please refer to `paddle/contrib/dynamic/`.

 # Appendix


--- a/doc/v2/api/config/evaluators.rst
+++ b/doc/v2/api/config/evaluators.rst
@@ -101,7 +101,7 @@ value_printer
    :noindex:

 Detection
-=====
+==========

 detection_map
 -------------

--- a/doc/v2/api/config/layer.rst
+++ b/doc/v2/api/config/layer.rst
@@ -11,7 +11,7 @@ Data layer

 data
 ----
-..  autoclass:: paddle.v2.layer.data
+..  autofunction:: paddle.v2.layer.data
    :noindex:

 Fully Connected Layers
@@ -21,12 +21,12 @@ Fully Connected Layers

 fc
 --
-..  autoclass:: paddle.v2.layer.fc
+..  autofunction:: paddle.v2.layer.fc
    :noindex:

 selective_fc
 ------------
-..  autoclass:: paddle.v2.layer.selective_fc
+..  autofunction:: paddle.v2.layer.selective_fc
    :noindex:

 Conv Layers
@@ -34,34 +34,34 @@ Conv Layers

 conv_operator
 -------------
-..  autoclass:: paddle.v2.layer.conv_operator
+..  autofunction:: paddle.v2.layer.conv_operator
    :noindex:

 conv_projection
 ---------------
-..  autoclass:: paddle.v2.layer.conv_projection
+..  autofunction:: paddle.v2.layer.conv_projection
    :noindex:

 conv_shift
 ----------
-..  autoclass:: paddle.v2.layer.conv_shift
+..  autofunction:: paddle.v2.layer.conv_shift
    :noindex:

 img_conv
 --------
-..  autoclass:: paddle.v2.layer.img_conv
+..  autofunction:: paddle.v2.layer.img_conv
    :noindex:

 ..  _api_v2.layer_context_projection:

 context_projection
 ------------------
-..  autoclass:: paddle.v2.layer.context_projection
+..  autofunction:: paddle.v2.layer.context_projection
    :noindex:

 row_conv
 --------
-..  autoclass:: paddle.v2.layer.row_conv
+..  autofunction:: paddle.v2.layer.row_conv
    :noindex:

 Image Pooling Layer
@@ -69,27 +69,27 @@ Image Pooling Layer

 img_pool
 --------
-..  autoclass:: paddle.v2.layer.img_pool
+..  autofunction:: paddle.v2.layer.img_pool
    :noindex:

 spp
 ---
-..  autoclass:: paddle.v2.layer.spp
+..  autofunction:: paddle.v2.layer.spp
    :noindex:

 maxout
 ------
-..  autoclass:: paddle.v2.layer.maxout
+..  autofunction:: paddle.v2.layer.maxout
    :noindex:

 roi_pool
 --------
-..  autoclass:: paddle.v2.layer.roi_pool
+..  autofunction:: paddle.v2.layer.roi_pool
    :noindex:

 pad
 ----
-..  autoclass:: paddle.v2.layer.pad
+..  autofunction:: paddle.v2.layer.pad
    :noindex:

 Norm Layer
@@ -97,27 +97,27 @@ Norm Layer

 img_cmrnorm
 -----------
-..  autoclass:: paddle.v2.layer.img_cmrnorm
+..  autofunction:: paddle.v2.layer.img_cmrnorm
    :noindex:

 batch_norm
 ----------
-..  autoclass:: paddle.v2.layer.batch_norm
+..  autofunction:: paddle.v2.layer.batch_norm
    :noindex:

 sum_to_one_norm
 ---------------
-..  autoclass:: paddle.v2.layer.sum_to_one_norm
+..  autofunction:: paddle.v2.layer.sum_to_one_norm
    :noindex:

 cross_channel_norm
 ------------------
-..  autoclass:: paddle.v2.layer.cross_channel_norm
+..  autofunction:: paddle.v2.layer.cross_channel_norm
    :noindex:

 row_l2_norm
 -----------
-..  autoclass:: paddle.v2.layer.row_l2_norm
+..  autofunction:: paddle.v2.layer.row_l2_norm
    :noindex:

 Recurrent Layers
@@ -125,22 +125,22 @@ Recurrent Layers

 recurrent
 ---------
-..  autoclass:: paddle.v2.layer.recurrent
+..  autofunction:: paddle.v2.layer.recurrent
    :noindex:

 lstmemory
 ---------
-..  autoclass:: paddle.v2.layer.lstmemory
+..  autofunction:: paddle.v2.layer.lstmemory
    :noindex:

 grumemory
 ---------
-..  autoclass:: paddle.v2.layer.grumemory
+..  autofunction:: paddle.v2.layer.grumemory
    :noindex:

 gated_unit
 -----------
-..  autoclass:: paddle.v2.layer.gated_unit
+..  autofunction:: paddle.v2.layer.gated_unit
    :noindex:

 Recurrent Layer Group
@@ -148,32 +148,32 @@ Recurrent Layer Group

 memory
 ------
-..  autoclass:: paddle.v2.layer.memory
+..  autofunction:: paddle.v2.layer.memory
    :noindex:

 recurrent_group
 ---------------
-..  autoclass:: paddle.v2.layer.recurrent_group
+..  autofunction:: paddle.v2.layer.recurrent_group
    :noindex:

 lstm_step
 ---------
-..  autoclass:: paddle.v2.layer.lstm_step
+..  autofunction:: paddle.v2.layer.lstm_step
    :noindex:

 gru_step
 --------
-..  autoclass:: paddle.v2.layer.gru_step
+..  autofunction:: paddle.v2.layer.gru_step
    :noindex:

 beam_search
 ------------
-..  autoclass:: paddle.v2.layer.beam_search
+..  autofunction:: paddle.v2.layer.beam_search
    :noindex:

 get_output
 ----------
-..  autoclass:: paddle.v2.layer.get_output
+..  autofunction:: paddle.v2.layer.get_output
    :noindex:

 Mixed Layer
@@ -183,54 +183,54 @@ Mixed Layer

 mixed
 -----
-..  autoclass:: paddle.v2.layer.mixed
+..  autofunction:: paddle.v2.layer.mixed
    :noindex:

 ..  _api_v2.layer_embedding:

 embedding
 ---------
-..  autoclass:: paddle.v2.layer.embedding
+..  autofunction:: paddle.v2.layer.embedding
    :noindex:

 scaling_projection
 ------------------
-..  autoclass:: paddle.v2.layer.scaling_projection
+..  autofunction:: paddle.v2.layer.scaling_projection
    :noindex:

 dotmul_projection
 -----------------
-..  autoclass:: paddle.v2.layer.dotmul_projection
+..  autofunction:: paddle.v2.layer.dotmul_projection
    :noindex:

 dotmul_operator
 ---------------
-..  autoclass:: paddle.v2.layer.dotmul_operator
+..  autofunction:: paddle.v2.layer.dotmul_operator
    :noindex:

 full_matrix_projection
 ----------------------
-..  autoclass:: paddle.v2.layer.full_matrix_projection
+..  autofunction:: paddle.v2.layer.full_matrix_projection
    :noindex:

 identity_projection
 -------------------
-..  autoclass:: paddle.v2.layer.identity_projection
+..  autofunction:: paddle.v2.layer.identity_projection
    :noindex:

 slice_projection
 -------------------
-..  autoclass:: paddle.v2.layer.slice_projection
+..  autofunction:: paddle.v2.layer.slice_projection
    :noindex:

 table_projection
 ----------------
-..  autoclass:: paddle.v2.layer.table_projection
+..  autofunction:: paddle.v2.layer.table_projection
    :noindex:

 trans_full_matrix_projection
 ----------------------------
-..  autoclass:: paddle.v2.layer.trans_full_matrix_projection
+..  autofunction:: paddle.v2.layer.trans_full_matrix_projection
    :noindex:

 Aggregate Layers
@@ -245,51 +245,46 @@ AggregateLevel

 pooling
 -------
-..  autoclass:: paddle.v2.layer.pooling
+..  autofunction:: paddle.v2.layer.pooling
    :noindex:

 ..  _api_v2.layer_last_seq:

 last_seq
 --------
-..  autoclass:: paddle.v2.layer.last_seq
+..  autofunction:: paddle.v2.layer.last_seq
    :noindex:

 ..  _api_v2.layer_first_seq:

 first_seq
 ---------
-..  autoclass:: paddle.v2.layer.first_seq
+..  autofunction:: paddle.v2.layer.first_seq
    :noindex:

 sub_seq
 ---------
-..  autoclass:: paddle.v2.layer.sub_seq
+..  autofunction:: paddle.v2.layer.sub_seq
    :noindex:

 concat
 ------
-..  autoclass:: paddle.v2.layer.concat
+..  autofunction:: paddle.v2.layer.concat
    :noindex:

 seq_concat
 ----------
-..  autoclass:: paddle.v2.layer.seq_concat
+..  autofunction:: paddle.v2.layer.seq_concat
    :noindex:

 seq_slice
 ---------
-..  autoclass:: paddle.v2.layer.seq_slice
-    :noindex:
-
-kmax_sequence_score
-------------------
-..  autoclass:: paddle.v2.layer.kmax_sequence_score
+..  autofunction:: paddle.v2.layer.seq_slice
    :noindex:

 sub_nested_seq
 --------------
-..  autoclass:: paddle.v2.layer.sub_nested_seq
+..  autofunction:: paddle.v2.layer.sub_nested_seq
    :noindex:

 Reshaping Layers
@@ -297,7 +292,7 @@ Reshaping Layers

 block_expand
 ------------
-..  autoclass:: paddle.v2.layer.block_expand
+..  autofunction:: paddle.v2.layer.block_expand
    :noindex:

 ..  _api_v2.layer_expand:
@@ -309,22 +304,22 @@ ExpandLevel

 expand
 ------
-..  autoclass:: paddle.v2.layer.expand
+..  autofunction:: paddle.v2.layer.expand
    :noindex:

 repeat
 ------
-..  autoclass:: paddle.v2.layer.repeat
+..  autofunction:: paddle.v2.layer.repeat
    :noindex:

 rotate
 ------
-..  autoclass:: paddle.v2.layer.rotate
+..  autofunction:: paddle.v2.layer.rotate
    :noindex:

 seq_reshape
 -----------
-..  autoclass:: paddle.v2.layer.seq_reshape
+..  autofunction:: paddle.v2.layer.seq_reshape
    :noindex:

 Math Layers
@@ -332,94 +327,94 @@ Math Layers

 addto
 -----
-..  autoclass:: paddle.v2.layer.addto
+..  autofunction:: paddle.v2.layer.addto
    :noindex:

 linear_comb
 -----------
-..  autoclass:: paddle.v2.layer.linear_comb
+..  autofunction:: paddle.v2.layer.linear_comb
    :noindex:

 interpolation
 -------------
-..  autoclass:: paddle.v2.layer.interpolation
+..  autofunction:: paddle.v2.layer.interpolation
    :noindex:

 bilinear_interp
 ---------------
-..  autoclass:: paddle.v2.layer.bilinear_interp
+..  autofunction:: paddle.v2.layer.bilinear_interp
    :noindex:

 dropout
 --------
-..  autoclass:: paddle.v2.layer.dropout
+..  autofunction:: paddle.v2.layer.dropout
    :noindex:

 dot_prod
 ---------
-.. autoclass:: paddle.v2.layer.dot_prod
+.. autofunction:: paddle.v2.layer.dot_prod
    :noindex:

 out_prod
 --------
-.. autoclass:: paddle.v2.layer.out_prod
+.. autofunction:: paddle.v2.layer.out_prod
    :noindex:

 power
 -----
-..  autoclass:: paddle.v2.layer.power
+..  autofunction:: paddle.v2.layer.power
    :noindex:

 scaling
 -------
-..  autoclass:: paddle.v2.layer.scaling
+..  autofunction:: paddle.v2.layer.scaling
    :noindex:

 clip
 ----
-..  autoclass:: paddle.v2.layer.clip
+..  autofunction:: paddle.v2.layer.clip
    :noindex:

 resize
 ------
-..  autoclass:: paddle.v2.layer.resize
+..  autofunction:: paddle.v2.layer.resize
    :noindex:

 slope_intercept
 ---------------
-..  autoclass:: paddle.v2.layer.slope_intercept
+..  autofunction:: paddle.v2.layer.slope_intercept
    :noindex:

 tensor
 ------
-..  autoclass:: paddle.v2.layer.tensor
+..  autofunction:: paddle.v2.layer.tensor
    :noindex:

 ..  _api_v2.layer_cos_sim:

 cos_sim
 -------
-..  autoclass:: paddle.v2.layer.cos_sim
+..  autofunction:: paddle.v2.layer.cos_sim
    :noindex:

 l2_distance
 -----------
-..  autoclass:: paddle.v2.layer.l2_distance
+..  autofunction:: paddle.v2.layer.l2_distance
    :noindex:

 trans
 -----
-..  autoclass:: paddle.v2.layer.trans
+..  autofunction:: paddle.v2.layer.trans
    :noindex:

 scale_shift
 -----------
-..  autoclass:: paddle.v2.layer.scale_shift
+..  autofunction:: paddle.v2.layer.scale_shift
    :noindex:

 factorization_machine
 ---------------------
-..  autoclass:: paddle.v2.layer.factorization_machine
+..  autofunction:: paddle.v2.layer.factorization_machine
    :noindex:

 Sampling Layers
@@ -427,17 +422,17 @@ Sampling Layers

 maxid
 -----
-..  autoclass:: paddle.v2.layer.max_id
+..  autofunction:: paddle.v2.layer.max_id
    :noindex:

 sampling_id
 -----------
-..  autoclass:: paddle.v2.layer.sampling_id
+..  autofunction:: paddle.v2.layer.sampling_id
    :noindex:

 multiplex
 ---------
-..  autoclass:: paddle.v2.layer.multiplex
+..  autofunction:: paddle.v2.layer.multiplex
    :noindex:

 ..  _api_v2.layer_costs:
@@ -447,97 +442,97 @@ Cost Layers

 cross_entropy_cost
 ------------------
-..  autoclass:: paddle.v2.layer.cross_entropy_cost
+..  autofunction:: paddle.v2.layer.cross_entropy_cost
    :noindex:

 cross_entropy_with_selfnorm_cost
 --------------------------------
-..  autoclass:: paddle.v2.layer.cross_entropy_with_selfnorm_cost
+..  autofunction:: paddle.v2.layer.cross_entropy_with_selfnorm_cost
    :noindex:

 multi_binary_label_cross_entropy_cost
 -------------------------------------
-..  autoclass:: paddle.v2.layer.multi_binary_label_cross_entropy_cost
+..  autofunction:: paddle.v2.layer.multi_binary_label_cross_entropy_cost
    :noindex:

 classification_cost
 -------------------
-.. autoclass:: paddle.v2.layer.classification_cost
+.. autofunction:: paddle.v2.layer.classification_cost
   :noindex:

 huber_regression_cost
 -------------------------
-..  autoclass:: paddle.v2.layer.huber_regression_cost
+..  autofunction:: paddle.v2.layer.huber_regression_cost
    :noindex:

 huber_classification_cost
 -------------------------
-..  autoclass:: paddle.v2.layer.huber_classification_cost
+..  autofunction:: paddle.v2.layer.huber_classification_cost
    :noindex:

 lambda_cost
 -----------
-..  autoclass:: paddle.v2.layer.lambda_cost
+..  autofunction:: paddle.v2.layer.lambda_cost
    :noindex:

 square_error_cost
 -----------------
-..  autoclass:: paddle.v2.layer.square_error_cost
+..  autofunction:: paddle.v2.layer.square_error_cost
    :noindex:

 rank_cost
 ---------
-..  autoclass:: paddle.v2.layer.rank_cost
+..  autofunction:: paddle.v2.layer.rank_cost
    :noindex:

 sum_cost
 ---------
-..  autoclass:: paddle.v2.layer.sum_cost
+..  autofunction:: paddle.v2.layer.sum_cost
    :noindex:

 crf
 ---
-..  autoclass:: paddle.v2.layer.crf
+..  autofunction:: paddle.v2.layer.crf
    :noindex:

 crf_decoding
 ------------
-..  autoclass:: paddle.v2.layer.crf_decoding
+..  autofunction:: paddle.v2.layer.crf_decoding
    :noindex:

 ctc
 ---
-..  autoclass:: paddle.v2.layer.ctc
+..  autofunction:: paddle.v2.layer.ctc
    :noindex:

 warp_ctc
 --------
-..  autoclass:: paddle.v2.layer.warp_ctc
+..  autofunction:: paddle.v2.layer.warp_ctc
    :noindex:

 nce
 ---
-..  autoclass:: paddle.v2.layer.nce
+..  autofunction:: paddle.v2.layer.nce
    :noindex:

 hsigmoid
 ---------
-..  autoclass:: paddle.v2.layer.hsigmoid
+..  autofunction:: paddle.v2.layer.hsigmoid
    :noindex:

 smooth_l1_cost
 --------------
-..  autoclass:: paddle.v2.layer.smooth_l1_cost
+..  autofunction:: paddle.v2.layer.smooth_l1_cost
    :noindex:

 multibox_loss
 --------------
-..  autoclass:: paddle.v2.layer.multibox_loss
+..  autofunction:: paddle.v2.layer.multibox_loss
    :noindex:

 detection_output
 ----------------
-..  autoclass:: paddle.v2.layer.detection_output
+..  autofunction:: paddle.v2.layer.detection_output
    :noindex:

 Check Layer
@@ -545,7 +540,7 @@ Check Layer

 eos
 ---
-..  autoclass:: paddle.v2.layer.eos
+..  autofunction:: paddle.v2.layer.eos
    :noindex:

 Activation
@@ -553,5 +548,5 @@ Activation

 prelu
 --------
-..  autoclass:: paddle.v2.layer.prelu
+..  autofunction:: paddle.v2.layer.prelu
    :noindex:
--- a/doc/v2/api/index_en.rst
+++ b/doc/v2/api/index_en.rst
@@ -8,4 +8,3 @@ API
    model_configs.rst
    data.rst
    run_logic.rst
-    fluid/index.rst
--- a/doc/v2/build_and_install/pip_install_cn.rst
+++ b/doc/v2/build_and_install/pip_install_cn.rst
@@ -60,6 +60,7 @@ paddlepaddle-gpu==0.11.0            使用CUDA 7.5和cuDNN 5编译的0.11.0版
    "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`_"
    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cuda9.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"

 .. _pip_dependency:


--- a/doc/v2/build_and_install/pip_install_en.rst
+++ b/doc/v2/build_and_install/pip_install_en.rst
@@ -63,6 +63,7 @@ If the links below shows up the login form, just click "Log in as guest" to star
    "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cuda9.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"

 .. _pip_dependency:


--- a/paddle/contrib/CMakeLists.txt
+++ b/paddle/contrib/CMakeLists.txt
@@ -14,3 +14,4 @@
 #

 add_subdirectory(inference)
+add_subdirectory(tape)
--- a/paddle/contrib/inference/CMakeLists.txt
+++ b/paddle/contrib/inference/CMakeLists.txt
@@ -17,48 +17,9 @@ if(APPLE)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move")
 endif(APPLE)

-set(ANAKIN_INCLUDE "" CACHE STRING "root of Anakin header files")
-set(ANAKIN_LIBRARY "" CACHE STRING "path of Anakin library")
-

 set(inference_deps paddle_inference_api paddle_fluid_api)

-# if anakin is set enable anakin api implementation
-if(ANAKIN_INCLUDE AND ANAKIN_LIBRARY)
-    set(ANAKIN_FOUND ON)
-else()
-    set(ANAKIN_FOUND OFF)
-endif()
-
-function(fetch_include_recursively root_dir) 
-    if (IS_DIRECTORY ${root_dir}) 
-        include_directories(${root_dir})
-    endif()
-
-    file(GLOB ALL_SUB RELATIVE ${root_dir} ${root_dir}/*)
-    foreach(sub ${ALL_SUB})
-        if (IS_DIRECTORY ${root_dir}/${sub})
-            fetch_include_recursively(${root_dir}/${sub})
-        endif()
-    endforeach()
-endfunction()
-
-if (ANAKIN_FOUND)
-    # Anakin's code style doesn't follow google c style.
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=unused-variable -Wno-error=format-extra-args -Wno-error=comment -Wno-error=format -Wno-error=switch -Wno-error=return-type -Wno-error=non-virtual-dtor -Wno-reorder -Wno-error=cpp")
-
-    message(STATUS "Anakin for inference is enabled")
-    message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}")
-    fetch_include_recursively(${ANAKIN_INCLUDE})
-
-    link_directories(${ANAKIN_LIBRARY})
-
-    nv_library(inference_anakin_api SHARED SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc)
-    target_link_libraries(inference_anakin_api anakin anakin_saber_common)
-    list(APPEND inference_deps inference_anakin_api)
-endif()
-
-
 function(inference_api_test TARGET_NAME)
    if (WITH_TESTING)
        set(options "")
@@ -79,7 +40,7 @@ function(inference_api_test TARGET_NAME)
 endfunction(inference_api_test)

 cc_library(paddle_inference_api
-    SRCS paddle_inference_api.cc paddle_inference_api_impl.cc 
+    SRCS paddle_inference_api.cc paddle_inference_api_impl.cc
    DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})

 cc_test(test_paddle_inference_api
@@ -89,9 +50,17 @@ cc_test(test_paddle_inference_api
 inference_api_test(test_paddle_inference_api_impl
                    ARGS test_word2vec test_image_classification)

-if (ANAKIN_FOUND)
+if (WITH_ANAKIN)
+    # Anakin has no official library releases, and its protobuf and CUDA versions do not match Paddle's,
+    # so the Anakin library is not merged into our official inference library. To use the Anakin prediction API,
+    # one needs to build the inference_anakin_api library and link against anakin.so.
+    nv_library(inference_anakin_api SHARED SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc)
+    target_compile_options(inference_anakin_api BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
+    target_link_libraries(inference_anakin_api anakin anakin_saber_common)
    cc_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc
-    DEPS ${inference_deps})
+                                  ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin
+                                  DEPS inference_anakin_api)
+    target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
 endif()

 if(WITH_TESTING)

--- a/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc
+++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc
@@ -12,9 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include <cuda.h>
-
 #include "paddle/contrib/inference/paddle_inference_api_anakin_engine.h"
+#include <cuda.h>

 namespace paddle {


--- a/paddle/contrib/inference/paddle_inference_api_anakin_engine.h
+++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine.h
@@ -19,10 +19,9 @@ limitations under the License. */

 #pragma once

-// NOTE This header file do not have namespace.
-//#include <test/framework/net/paddle_api.h>
 #include "paddle/contrib/inference/paddle_inference_api.h"

+// from anakin
 #include "framework/core/net/net.h"
 #include "saber/saber_types.h"


--- a/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc
+++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc
@@ -12,17 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include <gflags/gflags.h>
 #include <glog/logging.h>
 #include <gtest/gtest.h>

-#include "gflags/gflags.h"
 #include "paddle/contrib/inference/paddle_inference_api.h"

+DEFINE_string(model, "", "Directory of the inference model.");
+
 namespace paddle {

 AnakinConfig GetConfig() {
  AnakinConfig config;
-  config.model_file = "./mobilenet_v2.anakin.bin";
+  config.model_file = FLAGS_model;
  config.device = 0;
  config.max_batch_size = 1;
  return config;

--- a/paddle/contrib/tape/CMakeLists.txt
+++ b/paddle/contrib/tape/CMakeLists.txt
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# On Apple platforms, do not turn pessimizing-move diagnostics into errors.
+# NOTE(review): CMAKE_CXX_FLAGS is a directory-scoped variable, so this flag
+# applies to every target defined below, not to a single target.
+if(APPLE)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move")
+endif(APPLE)
+
+# tape_variable: the tape Variable wrapper (variable.cc).
+cc_library(tape_variable SRCS variable.cc DEPS ${FLUID_CORE_MODULES})
+# tape: the tape itself (tape.cc); additionally depends on the operator library.
+cc_library(tape SRCS tape.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB} tape_variable)
+
+# Unit test built from test_tape.cc.
+cc_test(test_tape
+        SRCS test_tape.cc
+        DEPS tape tape_variable)
--- a/paddle/contrib/tape/README.md
+++ b/paddle/contrib/tape/README.md
+# Dynamic Graph on Fluid
+
+PaddlePaddle Fluid targets autodiff without a tape, which, however, is very
+challenging, and we are still a long way from there. DyNet and PyTorch provide a good design
+idea, the *tape*, that significantly eases the challenge.  Also, DyNet provides
+a C++ API that is as convenient as Python but with higher efficiency and could
+conveniently integrate with industrial/production systems. This package, `tape`,
+combines the good of
+
+1. tape from PyTorch and DyNet
+2. C++ API and core from DyNet
+3. rich set of operators from PaddlePaddle
+
+## Overview
+
+We can implement a DyNet-like tape (see this [survey](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/survey/dynamic_graph.md))
+by wrapping Paddle Fluid's `Operator` and `Variable`.
+
+The user API is straightforward because
+
+1. it is imperative and uses the host language's control flow logic.
+1. it avoids extra concepts such as `Scope` and `Executor`.
+
+All of these benefits come at the cost of just adding one line `reset_global_tape`
+at every iteration.
+
+## Code Structure
+
+In short, the `Tape` contains a vector of `OpHandle`s. And an `OpHandle` contains its
+`type`, the pointers to the `Variable`s, and necessary attributes.
+
+```c++
+class Variable {
+public:
+  VariableHandle Grad(); // returns its gradient variable
+private:
+  framework::VarDesc desc_; // compile time infershape, necessary for lazy execution
+  framework::Variable var_; // run time variable, holds data memory
+};
+
+using VariableHandle = shared_ptr<Variable>;
+
+struct OpHandle {
+  string type_;
+  map<string, vector<VariableHandle>> inputs_;
+  map<string, vector<VariableHandle>> outputs_;
+  AttributeMap attrs_;
+};
+
+class Tape {
+public:
+  void AddOp(OpHandle); // add op
+  void Forward();       // execute the tape_
+  void Backward();      // execute the backward of the tape_
+private:
+  vector<OpHandle> tape_;
+};
+```
+
+We use `Function` to represent layers. It takes care of parameter
+initialization and calls `AddOp` on the Tape when it is invoked.
+
+```c++
+class Linear {
+ public:
+  Linear(int in_dim, int out_dim, const std::string &act)
+      : w_(new Variable("LinearWeight")),
+        b_(new Variable("LinearBias")),
+        act_(act) {
+    Tape init_tape;
+
+    std::string initializer = "fill_constant";
+    framework::AttributeMap attrs;
+    attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
+    attrs["shape"] = std::vector<int>{in_dim, out_dim};
+    attrs["value"] = 1.0f;
+    init_tape.AddOp(initializer, {}, {{"Out", {w_}}}, attrs);
+
+    attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
+    attrs["shape"] = std::vector<int>{out_dim};
+    attrs["value"] = 1.0f;
+    init_tape.AddOp(initializer, {}, {{"Out", {b_}}}, attrs);
+
+    init_tape.Forward();
+  }
+
+  VariableHandle operator()(VariableHandle input) {
+    VariableHandle pre_bias(new Variable("linear"));
+    get_global_tape().AddOp("mul",
+                            {{"X", {input}}, {"Y", {w_}}},
+                            {{"Out", {pre_bias}}},
+                            {{"x_num_col_dims", 1}, {"y_num_col_dims", 1}});
+    VariableHandle pre_act(new Variable("linear"));
+    get_global_tape().AddOp("elementwise_add",
+                            {{"X", {pre_bias}}, {"Y", {b_}}},
+                            {{"Out", {pre_act}}},
+                            {{"axis", 1}});
+    VariableHandle post_act(new Variable("linear"));
+    get_global_tape().AddOp(act_,
+                            {{"X", {pre_act}}},
+                            {{"Out", {post_act}}},
+                            {});
+    return post_act;
+  }
+
+  std::vector<VariableHandle> Params() { return {w_, b_}; }
+
+ private:
+  VariableHandle w_;
+  VariableHandle b_;
+  std::string act_;
+};
+```
+
+## User API
+
+```c++
+// Model function
+paddle::tape::Linear linear1(3, 3, "relu"); // init weight and bias
+paddle::tape::Linear linear2(3, 3, "relu"); // init weight and bias
+paddle::tape::Mean mean;
+
+// Optimizer
+paddle::tape::SGD sgd(0.001);
+
+// Data Feeder
+paddle::tape::Fill data_feeder(...);
+VariableHandle input(new paddle::tape::Variable("input"));
+VariableHandle label(new paddle::tape::Variable("label"));
+
+for (int i = 0; i < 2; ++i) {
+  reset_global_tape();
+
+  data_feeder(input, label);
+
+  auto loss = softmax(linear2(linear1(input)), label); // compile time InferShape & InferVarType
+  LOG(INFO) << loss.value(); // Run forward up to loss
+
+  // Run backward, store gradient of w at w->Grad()
+  get_global_tape().Backward(loss);
+
+  // Update w
+  sgd(linear1.Params());
+  sgd(linear2.Params());
+}
+```
+
+<details>
+  <summary></summary>
+digraph G {
+
+	subgraph cluster_0 {
+                node [shape=record,style=filled];
+		style=filled;
+		color=lightgrey;
+                linear1 [label="{type: mul | {input | {<before_mul1>X: before_mul1 |<weight1> Y: weight1}} |  {output |<before_bias1> Out: before_bias1}}"];
+                elementwise_add1 [label="{type: elementwise_add | {input | {<before_bias1>X: before_bias1 |<bias1> Y: bias1}} |  {output |<before_act1> Out: before_act1}}"];
+                relu1 [label="{type: relu | {input | {<before_act1>X: before_act1 }} |  {output |<after_act1> Out: after_act1}}"];
+
+		linear1 -> elementwise_add1->relu1;
+		label = "forward tape";
+	}
+
+        linear1:before_mul1->before_mul1
+        linear1:weight1->weight1
+        linear1:before_bias1->before_bias1
+
+        elementwise_add1:bias1->bias1
+        elementwise_add1:before_bias1->before_bias1
+        elementwise_add1:before_act1->before_act1
+
+        relu1:before_act1->before_act1
+        relu1:after_act1->after_act1
+
+	subgraph cluster_1 {
+                node [shape=record,style=filled];
+		style=filled;
+		color=lightgrey;
+                linear1_grad [label="{type: mul_grad | {input | {<before_mul1>X: before_mul1 |<weight1> Y: weight1|<before_bias1_grad> Out_grad: before_bias1_grad}} |  {output |{<before_mul1_grad>X_grad: before_mul1_grad |<weight1_grad> Y_grad: weight1_grad}}}"];
+
+                elementwise_add1_grad [label="{type: elementwise_add_grad | {input | <before_act1_grad> Out_grad: before_act1_grad} |  {output |{<before_bias1_grad>X_grad: before_bias1_grad |<bias1_grad> Y_grad: bias1_grad}}}"];
+
+                relu1_grad [label="{type: relu_grad |  {input |<after_act1_grad> Out_grad: after_act1_grad} | {output | {<before_act1_grad>X_grad: before_act1_grad }}}"];
+
+		linear1_grad -> elementwise_add1_grad ->relu1_grad [dir=back];
+                label = "backward tape";
+	}
+
+        relu1_grad:after_act1_grad->after_act1_grad
+        relu1_grad:before_act1_grad->before_act1_grad
+
+        elementwise_add1_grad:before_act1_grad->before_act1_grad
+        elementwise_add1_grad:before_bias1_grad->before_bias1_grad
+        elementwise_add1_grad:bias1_grad->bias1_grad
+
+        linear1_grad:before_mul1->before_mul1
+        linear1_grad:weight1->weight1
+        linear1_grad:before_bias1_grad->before_bias1_grad
+        linear1_grad:before_mul1_grad->before_mul1_grad
+        linear1_grad:weight1_grad->weight1_grad
+
+
+	subgraph cluster_2 {
+                node [shape=record];
+                label = "Linear1";
+                weight1
+                bias1
+	}
+
+        weight1 -> weight1_grad [ label="Grad()", style="dashed" ];
+        bias1 -> bias1_grad [ label="Grad()", style="dashed"];
+
+	
+
+}
+</details>
+
+![Image](https://github.com/tonyyang-svail/Paddle/blob/cpp_tap/paddle/contrib/tape/computation_graph.png)
+
+## Code Reuse
+
+We want to stay close to Paddle Fluid as much as possible.
+
+### Reuse All Operators
+
+As all ops are registered in `OpInfoMap`, adding a new `Function`
+takes about 10 lines of code, similar to exposing an operator to Python.
+
+### Reuse Compile Time InferShape and InferVarType
+
+Because all the symbolic information is stored in `tape::Variable::desc_` instead
+of `ProgramDesc.block.vars`, we create a temporary `BlockDesc` to do `InferShape` and
+`InferVarType` every time we `AddOp` to the tape.
+
+### Reuse Operator::Run
+
+We use smart pointers, instead of `Scope`, to manage memory, so we create a temporary
+`Scope` for every `Operator::Run()`.
+
+## Possible Feature
+
+### Release Memory on Backward
+
+We can release memory aggressively. During backward, we can delete an `OpHandle` once
+we have finished its backward pass. Since all variables are managed by smart pointers, their
+memory is automatically released when the `ref_count` goes to 0.
+
+### Kernel Fusion
+
+As a symbolic representation of the Tape is constructed first before the actual
+execution, it would be possible to perform graph optimization. One use case is kernel
+fusion.
--- a/paddle/contrib/tape/computation_graph.png
+++ b/paddle/contrib/tape/computation_graph.png
--- a/paddle/contrib/tape/function.h
+++ b/paddle/contrib/tape/function.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+
+#include "paddle/contrib/tape/tape.h"
+#include "paddle/contrib/tape/variable.h"
+#include "paddle/fluid/framework/type_defs.h"
+
+namespace paddle {
+namespace tape {
+
+// Empty placeholder type; none of the classes in this header derive from it.
+class Function {};
+
+// Functor that records an initializer op on the global tape.
+//
+// `initializer` is the operator type (e.g. "fill_constant") and `attrs` its
+// attributes; both are fixed at construction time.
+class Fill {
+ public:
+  Fill(const std::string &initializer, const framework::AttributeMap &attrs)
+      : initializer_(initializer), attrs_(attrs) {}
+
+  // Appends the initializer op, with `var` as its "Out" output and no inputs,
+  // to the global tape.
+  void operator()(VariableHandle var) {
+    get_global_tape().AddOp(initializer_, {}, {{"Out", {var}}}, attrs_);
+  }
+
+ private:
+  const std::string initializer_;
+  const framework::AttributeMap attrs_;
+};
+
+// Functor that records a "mean" op on the global tape.
+class Mean {
+ public:
+  // Adds a "mean" op with `var` as input "X" and returns the handle of the
+  // freshly created output variable "Out".
+  VariableHandle operator()(VariableHandle var) {
+    VariableHandle out(new Variable("mean"));
+    get_global_tape().AddOp("mean", {{"X", {var}}}, {{"Out", {out}}}, {});
+    return out;
+  }
+};
+
+// Fully connected layer: activation(input * w + b).
+//
+// The weight and bias are created and initialized in the constructor; calling
+// the object records the layer's ops on the global tape.
+class Linear {
+ public:
+  // Creates weight `w_` of shape {in_dim, out_dim} and bias `b_` of shape
+  // {out_dim}, both filled with the constant 1.0, and runs the initialization
+  // immediately on a local tape. `act` is the operator type of the activation
+  // (e.g. "relu").
+  Linear(int in_dim, int out_dim, const std::string &act)
+      : w_(new Variable("LinearWeight")),
+        b_(new Variable("LinearBias")),
+        act_(act) {
+    // Private tape, so that initialization does not end up on the global tape.
+    Tape init_tape;
+
+    std::string initializer = "fill_constant";
+    framework::AttributeMap attrs;
+    attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
+    attrs["shape"] = std::vector<int>{in_dim, out_dim};
+    attrs["value"] = 1.0f;
+    init_tape.AddOp(initializer, {}, {{"Out", {w_}}}, attrs);
+
+    attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
+    attrs["shape"] = std::vector<int>{out_dim};
+    attrs["value"] = 1.0f;
+    init_tape.AddOp(initializer, {}, {{"Out", {b_}}}, attrs);
+
+    init_tape.Forward();
+  }
+
+  // Records "mul", "elementwise_add" and the activation op on the global tape
+  // and returns the handle of the activation output. This call only appends
+  // to the tape; it does not run it.
+  VariableHandle operator()(VariableHandle input) {
+    VariableHandle pre_bias(new Variable("linear"));
+    get_global_tape().AddOp("mul",
+                            {{"X", {input}}, {"Y", {w_}}},
+                            {{"Out", {pre_bias}}},
+                            {{"x_num_col_dims", 1}, {"y_num_col_dims", 1}});
+    VariableHandle pre_act(new Variable("linear"));
+    get_global_tape().AddOp("elementwise_add",
+                            {{"X", {pre_bias}}, {"Y", {b_}}},
+                            {{"Out", {pre_act}}},
+                            {{"axis", 1}});
+    VariableHandle post_act(new Variable("linear"));
+    get_global_tape().AddOp(
+        act_, {{"X", {pre_act}}}, {{"Out", {post_act}}}, {});
+    return post_act;
+  }
+
+  // Trainable parameters of the layer: {weight, bias}.
+  std::vector<VariableHandle> Params() { return {w_, b_}; }
+
+ private:
+  VariableHandle w_;
+  VariableHandle b_;
+  std::string act_;
+};
+
+// Stochastic gradient descent optimizer built on the "sgd" operator.
+class SGD {
+ public:
+  // Stores `learning_rate` in a variable of shape {1} by running a
+  // "fill_constant" op on a local tape.
+  SGD(float learning_rate) : learning_rate_(new Variable("sgd")) {
+    Tape init_tape;
+
+    std::string initializer = "fill_constant";
+    framework::AttributeMap attrs;
+    attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
+    attrs["shape"] = std::vector<int>{1};
+    attrs["value"] = learning_rate;
+    init_tape.AddOp(initializer, {}, {{"Out", {learning_rate_}}}, attrs);
+
+    init_tape.Forward();
+  }
+
+  // Applies one SGD step to `input` using `input->Grad()`. The update is
+  // written back to `input` itself ("ParamOut" is the same handle as "Param")
+  // and is executed immediately on a temporary tape. Fails if Backward() has
+  // not been run on the global tape.
+  void operator()(VariableHandle input) {
+    PADDLE_ENFORCE(get_global_tape().HasBeenBackwarded(),
+                   "optimization must happen after the backward");
+    Tape temp_tape;
+    temp_tape.AddOp("sgd",
+                    {{"Param", {input}},
+                     {"LearningRate", {learning_rate_}},
+                     {"Grad", {input->Grad()}}},
+                    {{"ParamOut", {input}}},
+                    {});
+    temp_tape.Forward();
+  }
+
+ private:
+  VariableHandle learning_rate_;
+};
+}
+}
--- a/paddle/contrib/tape/tape.cc
+++ b/paddle/contrib/tape/tape.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/contrib/tape/tape.h"
+
+#include <list>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/dim.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/pybind/pybind.h"
+
+namespace paddle {
+namespace tape {
+
+// Suffix test, originally borrowed from
+// https://stackoverflow.com/questions/874134/find-if-string-ends-with-another-string-in-c
+//
+// Returns true when `value` ends with `ending`. An empty `ending` matches any
+// `value`; an `ending` longer than `value` never matches.
+inline bool ends_with(std::string const &value, std::string const &ending) {
+  const std::size_t suffix_len = ending.size();
+  if (value.size() < suffix_len) {
+    return false;
+  }
+  // Compare the last `suffix_len` characters of `value` against `ending`.
+  return value.compare(value.size() - suffix_len, suffix_len, ending) == 0;
+}
+
+// Debug printer for a VarDesc: `name[type][data type]{d0,d1,...,}`.
+// Every dimension is followed by a comma, including the last one.
+std::ostream &operator<<(std::ostream &os, const framework::VarDesc &var_desc) {
+  os << var_desc.Name();
+  os << "[" << var_desc.GetType() << "]";
+  os << "[" << var_desc.GetDataType() << "]";
+  os << "{";
+  for (auto &i : var_desc.GetShape()) {
+    os << i << ",";
+  }
+  os << "}";
+  return os;
+}
+
+// Formats an op for logging: the op type followed by one `param:(desc) ` entry
+// for every input and then every output variable.
+// NOTE: `attrs` is accepted for symmetry with the other helpers but is not
+// included in the output.
+std::string to_string(const std::string &type,
+                      const VariableHandleMap &in_vars,
+                      const VariableHandleMap &out_vars,
+                      const framework::AttributeMap &attrs) {
+  std::stringstream ss;
+  ss << type << " ";
+  for (auto &param_name : in_vars) {
+    for (auto &var : param_name.second) {
+      ss << param_name.first << ":(" << var->Desc() << ") ";
+    }
+  }
+  for (auto &param_name : out_vars) {
+    for (auto &var : param_name.second) {
+      ss << param_name.first << ":(" << var->Desc() << ") ";
+    }
+  }
+  return ss.str();
+}
+
+// Builds a framework::OpDesc from tape-level handles by replacing every
+// VariableHandle with the name of its variable; `type` and `attrs` are passed
+// through unchanged.
+framework::OpDesc CreateOpDesc(const std::string &type,
+                               const VariableHandleMap &in_vars,
+                               const VariableHandleMap &out_vars,
+                               const framework::AttributeMap &attrs) {
+  framework::VariableNameMap inputs;
+  for (auto &param_name : in_vars) {
+    for (auto &var : param_name.second) {
+      inputs[param_name.first].emplace_back(var->Name());
+    }
+  }
+  framework::VariableNameMap outputs;
+  for (auto &param_name : out_vars) {
+    for (auto &var : param_name.second) {
+      outputs[param_name.first].emplace_back(var->Name());
+    }
+  }
+  return framework::OpDesc(type, inputs, outputs, attrs);
+}
+
+// Runs compile-time InferShape and InferVarType for one op and stores the
+// results in the descriptions of the output variables.
+//
+// The variable descriptions live in the tape Variables rather than in a
+// program, so they are copied into a temporary block, the inference is run on
+// that block, and the (possibly updated) output descriptions are copied back.
+// The op is logged before ("- ") and after ("+ ") inference.
+void InferShapeAndVarType(const std::string &type,
+                          const VariableHandleMap &in_vars,
+                          VariableHandleMap *out_vars,
+                          const framework::AttributeMap &attrs) {
+  framework::OpDesc op_desc = CreateOpDesc(type, in_vars, *out_vars, attrs);
+
+  // Create a temporary block for compile-time
+  framework::ProgramDesc program_desc;
+  framework::BlockDesc *block_desc = program_desc.MutableBlock(0);
+  PADDLE_ENFORCE(block_desc);
+
+  // Mirror the current descriptions of all inputs and outputs into the block.
+  for (auto &param_name : in_vars) {
+    for (auto &var : param_name.second) {
+      *block_desc->Var(var->Name())->Proto() = *var->MutableDesc()->Proto();
+    }
+  }
+  for (auto &param_name : *out_vars) {
+    for (auto &var : param_name.second) {
+      *block_desc->Var(var->Name())->Proto() = *var->MutableDesc()->Proto();
+    }
+  }
+
+  LOG(INFO) << "- " << to_string(type, in_vars, *out_vars, attrs);
+  op_desc.InferShape(*block_desc);
+  op_desc.InferVarType(block_desc);
+  // Copy the inferred descriptions back into the output variables.
+  for (auto &param_name : *out_vars) {
+    for (auto &var : param_name.second) {
+      *var->MutableDesc()->Proto() = *block_desc->Var(var->Name())->Proto();
+    }
+  }
+  LOG(INFO) << "+ " << to_string(type, in_vars, *out_vars, attrs);
+}
+
+// Records an op on the tape without executing it.
+//
+// Shape and variable-type inference is run first, which updates the
+// descriptions of the variables referenced by `out_vars`. `out_vars` is taken
+// by value, but it holds shared handles, so the caller's variables are the
+// ones being updated.
+void Tape::AddOp(const std::string &type,
+                 const VariableHandleMap &in_vars,
+                 VariableHandleMap out_vars,
+                 const framework::AttributeMap &attrs) {
+  InferShapeAndVarType(type, in_vars, &out_vars, attrs);
+  tape_.emplace_back(type, in_vars, out_vars, attrs);
+}
+
+// Temporary Scope for Operator::Run()
+//
+// Exposes the framework::Variables held by the given handles under their
+// names, so that an operator can look them up through a Scope.
+// NOTE(review): `vars_` is not declared in this class; presumably it is the
+// name -> owning-pointer map inherited from framework::Scope -- confirm.
+class ScopeWrapper : public framework::Scope {
+ public:
+  // Registers every input and output variable once, keyed by name. The
+  // pointers come from Variable::Var(), i.e. the objects remain owned by the
+  // tape Variables, not by this scope.
+  ScopeWrapper(const VariableHandleMap &in_vars,
+               const VariableHandleMap &out_vars) {
+    for (auto &v : in_vars) {
+      for (auto &vv : v.second) {
+        if (!vars_.count(vv->Name())) {
+          vars_[vv->Name()].reset(vv->Var());
+        }
+      }
+    }
+    for (auto &v : out_vars) {
+      for (auto &vv : v.second) {
+        if (!vars_.count(vv->Name())) {
+          vars_[vv->Name()].reset(vv->Var());
+        }
+      }
+    }
+  }
+
+  // Give up the borrowed pointers before the map is destroyed, so that the
+  // variables are not deleted together with this temporary scope.
+  ~ScopeWrapper() {
+    for (auto &pair : vars_) {
+      pair.second.release();
+    }
+  }
+};
+
+// Executes, on the CPU, every recorded op that has not been run yet.
+//
+// Execution resumes at `current_position_`, so calling Forward() again after
+// more ops were added only runs the new ones. Must not be called once
+// Backward() has completed on this tape.
+void Tape::Forward() {
+  LOG(INFO) << "Starting forward -------------------------";
+  PADDLE_ENFORCE(!has_been_backwarded_);
+  while (current_position_ < tape_.size()) {
+    OpHandle &op = tape_[current_position_];
+
+    // Create Output Tensor, this is only necessary for OpWithKernel
+    for (auto &param2var : op.outputs_) {
+      for (auto &var : param2var.second) {
+        var->InitializeVariable();
+      }
+    }
+
+    framework::OpDesc op_desc =
+        CreateOpDesc(op.type_, op.inputs_, op.outputs_, op.attrs_);
+    // The scope only needs to live for the duration of this single Run().
+    ScopeWrapper scope(op.inputs_, op.outputs_);
+    framework::OpRegistry::CreateOp(op_desc)->Run(scope, platform::CPUPlace());
+    current_position_++;
+  }
+
+  LOG(INFO) << "Finishing forward -------------------------";
+}
+
+// Runs the forward tape, then builds and executes the matching backward tape.
+//
+// The gradient of `target` is seeded with a constant 1.0 of shape {1}. The
+// recorded ops are then visited in reverse order; for each one the registered
+// GradOpMaker produces the gradient op descriptions, whose argument names are
+// mapped back to VariableHandles and appended to `backward_tape_`. Finally the
+// backward tape is executed. May be called only once per tape.
+void Tape::Backward(VariableHandle target) {
+  PADDLE_ENFORCE(!has_been_backwarded_);
+
+  Forward();
+
+  // TODO(tonyyang-svail): check output of last op is target
+  backward_tape_.reset(new Tape());
+
+  framework::AttributeMap attrs;
+
+  // FIXME(tonyyang-svail): Need to infer_data_type
+  attrs["dtype"] = framework::proto::VarType::Type::VarType_Type_FP32;
+  attrs["shape"] = std::vector<int>{1};
+  attrs["value"] = 1.0f;
+  backward_tape_->AddOp(
+      "fill_constant", {}, {{"Out", {target->Grad()}}}, attrs);
+
+  for (auto it = tape_.rbegin(); it != tape_.rend(); ++it) {
+    framework::OpDesc op_desc =
+        CreateOpDesc(it->type_, it->inputs_, it->outputs_, it->attrs_);
+    std::unordered_map<std::string, std::string> grad_to_var;
+    std::vector<std::unique_ptr<framework::OpDesc>> grad_op_descs =
+        framework::OpInfoMap::Instance()
+            .Get(op_desc.Type())
+            .GradOpMaker()(op_desc, {}, &grad_to_var, {});
+
+    // Name -> handle lookup for the arguments of the forward op. It depends
+    // only on the forward op, so it is built once per op rather than once per
+    // generated gradient op.
+    std::unordered_map<std::string, VariableHandle> name2var;
+    for (auto &param2vars : it->inputs_) {
+      for (auto &a : param2vars.second) {
+        name2var[a->Name()] = a;
+      }
+    }
+    for (auto &param2vars : it->outputs_) {
+      for (auto &a : param2vars.second) {
+        name2var[a->Name()] = a;
+      }
+    }
+
+    for (auto &grad_op_desc : grad_op_descs) {
+      VariableHandleMap in_vars;
+      VariableHandleMap out_vars;
+      std::map<const framework::VariableNameMap *, VariableHandleMap *>
+          loop_over{{&grad_op_desc->Inputs(), &in_vars},
+                    {&grad_op_desc->Outputs(), &out_vars}};
+      for (auto &each : loop_over) {
+        auto &vmp = *each.first;
+        auto &vhm = *each.second;
+        for (auto &p2a : vmp) {
+          for (auto &argu : p2a.second) {
+            auto found = name2var.find(argu);
+            if (found != name2var.end()) {
+              // The argument is one of the forward op's own variables.
+              vhm[p2a.first].push_back(found->second);
+            } else {
+              // Otherwise it must be "<forward name><grad suffix>"; resolve it
+              // to the gradient of the corresponding forward variable. The
+              // names are passed as arguments of an explicit "%s" format so
+              // that they are never interpreted as format strings.
+              PADDLE_ENFORCE(ends_with(argu, framework::kGradVarSuffix),
+                             "%s",
+                             argu.c_str());
+              std::string name = argu.substr(
+                  0, argu.size() - std::strlen(framework::kGradVarSuffix));
+              auto forward_var = name2var.find(name);
+              PADDLE_ENFORCE(
+                  forward_var != name2var.end(), "%s", name.c_str());
+              vhm[p2a.first].push_back(forward_var->second->Grad());
+            }
+          }
+        }
+      }
+
+      backward_tape_->AddOp(grad_op_desc->Type(),
+                            in_vars,
+                            out_vars,
+                            grad_op_desc->GetAttrMap());
+    }
+
+    // TODO(tonyyang-svail): how to fill empty grad?
+    // TODO(tonyyang-svail): Sum var grad is necessary
+  }
+
+  backward_tape_->Forward();
+  has_been_backwarded_ = true;
+}
+
+// Returns the process-wide tape, created on first use.
+Tape &get_global_tape() {
+  static Tape T;
+  return T;
+}
+
+// Replaces the global tape with a fresh, empty one, discarding all recorded
+// ops and the backward state.
+void reset_global_tape() { get_global_tape() = Tape(); }
+}
+}
--- a/paddle/contrib/tape/tape.h
+++ b/paddle/contrib/tape/tape.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "paddle/contrib/tape/variable.h"
+
+namespace paddle {
+namespace tape {
+
+using VariableHandleMap = std::map<std::string, std::vector<VariableHandle>>;
+
+// One recorded operator: its type, its input and output variables grouped by
+// parameter name, and its attributes. All arguments are copied.
+struct OpHandle {
+  OpHandle(const std::string &type,
+           const VariableHandleMap &in_vars,
+           const VariableHandleMap &out_vars,
+           const framework::AttributeMap &attrs)
+      : type_(type), inputs_(in_vars), outputs_(out_vars), attrs_(attrs) {}
+
+  std::string type_;               // operator type, e.g. "mul"
+  VariableHandleMap inputs_;       // parameter name -> input variables
+  VariableHandleMap outputs_;      // parameter name -> output variables
+  framework::AttributeMap attrs_;  // operator attributes
+};
+
+// A list of recorded operators that is executed lazily.
+//
+// Ops are appended with AddOp() and only run when Forward() or Backward() is
+// called.
+class Tape {
+ public:
+  // Appends an op to the tape. `out_vars` is taken by value but holds shared
+  // handles, so the variables it refers to are the caller's.
+  void AddOp(const std::string &type,
+             const VariableHandleMap &in_vars,
+             VariableHandleMap out_vars,
+             const framework::AttributeMap &attrs);
+  // Runs the ops that have not been executed yet.
+  void Forward();
+  // Runs Forward() and then the backward pass starting from `target`.
+  void Backward(VariableHandle target);
+
+  // True once Backward() has completed on this tape.
+  bool HasBeenBackwarded() const { return has_been_backwarded_; }
+
+ private:
+  bool has_been_backwarded_ = false;
+  // Index of the next op in `tape_` to execute.
+  size_t current_position_ = 0;
+
+  std::vector<OpHandle> tape_;
+  // Tape holding the gradient ops; created by Backward().
+  std::shared_ptr<Tape> backward_tape_;
+};
+
+Tape &get_global_tape();
+
+void reset_global_tape();
+}
+}
--- a/paddle/contrib/tape/test_tape.cc
+++ b/paddle/contrib/tape/test_tape.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "gtest/gtest.h"
+#include "paddle/contrib/tape/function.h"
+
+using namespace paddle::tape;
+
+// Smoke test: trains a two-layer MLP for two iterations on a constant 3x3
+// input. It contains no assertions on values; it only checks that building
+// the tape, running backward and applying SGD complete without failing.
+TEST(Tape, TestMLP) {
+  LOG(INFO) << "TestMLP";
+  Linear linear1(3, 3, "relu");
+  Linear linear2(3, 3, "relu");
+  Mean mean;
+
+  SGD sgd(0.001);
+
+  std::string initializer = "fill_constant";
+  paddle::framework::AttributeMap attrs;
+  attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
+  attrs["shape"] = std::vector<int>{3, 3};
+  attrs["value"] = 1.0f;
+  Fill filler(initializer, attrs);
+
+  for (int i = 0; i < 2; ++i) {
+    // Start every iteration from an empty global tape.
+    reset_global_tape();
+
+    VariableHandle input(new Variable("input"));
+    filler(input);
+
+    auto loss = mean(linear2(linear1(input)));
+
+    get_global_tape().Backward(loss);
+
+    for (auto w : linear1.Params()) {
+      sgd(w);
+    }
+    for (auto w : linear2.Params()) {
+      sgd(w);
+    }
+  }
+}
+
+// Custom test entry point: the device context pool is initialized with a
+// single CPU place before the tests run.
+int main(int argc, char** argv) {
+  std::vector<paddle::platform::Place> places;
+  places.emplace_back(paddle::platform::CPUPlace());
+  paddle::platform::DeviceContextPool::Init(places);
+
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
--- a/paddle/contrib/tape/variable.cc
+++ b/paddle/contrib/tape/variable.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/contrib/tape/variable.h"
+
+namespace paddle {
+namespace tape {
+
+// Instantiates the runtime object held by `var_` according to the type
+// recorded in `desc_`: a LoDTensor or a SelectedRows. Any other variable type
+// is rejected with PADDLE_THROW.
+void Variable::InitializeVariable() {
+  framework::proto::VarType::Type var_type = desc_.GetType();
+  LOG(INFO) << "Initializing " << desc_.Name() << " as " << var_type;
+  if (var_type == framework::proto::VarType::LOD_TENSOR) {
+    var_.GetMutable<framework::LoDTensor>();
+  } else if (var_type == framework::proto::VarType::SELECTED_ROWS) {
+    var_.GetMutable<framework::SelectedRows>();
+  } else {
+    PADDLE_THROW("Variable type %d is not in [LOD_TENSOR, SELECTED_ROWS]",
+                 var_type);
+  }
+}
+}
+}
--- a/paddle/contrib/tape/variable.h
+++ b/paddle/contrib/tape/variable.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <memory>
+
+#include "paddle/fluid/framework/operator.h"  // framework::kGradVarSuffix
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/variable.h"
+
+namespace paddle {
+namespace tape {
+
+class Variable;
+using VariableHandle = std::shared_ptr<Variable>;
+
+/*
+ * Combination of
+ *     framework::VarDesc desc_;
+ *     framework::Variable var_;
+ */
+class Variable {
+ public:
+  // Creates a variable named `pre_fix` followed by a running counter, so that
+  // every variable created this way gets a distinct name.
+  Variable(const std::string &pre_fix)
+      : desc_(pre_fix + std::to_string(count())) {}
+
+  // With `is_grad` set, the name is `pre_fix` + framework::kGradVarSuffix and
+  // no counter is appended; otherwise this behaves like the constructor above.
+  Variable(const std::string &pre_fix, bool is_grad)
+      : desc_(pre_fix + (is_grad ? framework::kGradVarSuffix
+                                 : std::to_string(count()))) {}
+
+  ~Variable() { LOG(INFO) << "Deleting " << Name(); }
+
+  // Instantiate LoDTensor/SelectedRow
+  void InitializeVariable();
+
+  // Returns the gradient variable, creating it on demand. Only a weak
+  // reference is kept here, so the gradient lives as long as some caller holds
+  // the returned handle; once all handles are gone, the next call creates a
+  // new gradient variable.
+  VariableHandle Grad() {
+    VariableHandle grad = grad_.lock();
+    if (!grad) {
+      grad.reset(new Variable(desc_.Name(), true));
+      grad_ = grad;
+    }
+    return grad;
+  }
+
+  // Stochastic Gradient Descent with Momentum
+  //  VariableHandle Momentum ();
+
+  //  void init(const std::string& initializer,
+  //            const framework::AttributeMap& attrs);
+
+  // void value() {};
+
+  const framework::VarDesc& Desc() const { return desc_; }
+  framework::VarDesc* MutableDesc() { return &desc_; }
+
+  // TODO(tonyyang-svail): No need to expose name
+  std::string Name() const { return desc_.Name(); }
+
+  // Pointer to the runtime variable; it remains owned by this object.
+  framework::Variable* Var() { return &var_; }
+
+ private:
+  // Returns the next value of a counter shared by all Variables.
+  static int count() {
+    static int counter = 0;
+    return counter++;
+  }
+
+  framework::VarDesc desc_;  // compile-time description (name, type, shape)
+  framework::Variable var_;  // run-time variable holding the data
+
+  std::weak_ptr<Variable> grad_;
+};
+}
+}
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -83,8 +83,13 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)

 cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)

-cc_library(executor SRCS executor.cc DEPS op_registry device_context scope
-framework_proto glog lod_rank_table feed_fetch_method)
+set(EXECUTOR_DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method)
+if(WITH_DISTRIBUTE)  # distributed builds also link the gRPC stack
+  list(APPEND EXECUTOR_DEPS sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr)
+  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+  set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS}")
+endif()
+cc_library(executor SRCS executor.cc DEPS ${EXECUTOR_DEPS})


 cc_library(parallel_executor SRCS parallel_executor.cc DEPS ssa_graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor)

--- a/paddle/fluid/framework/details/ssa_graph_checker.h
+++ b/paddle/fluid/framework/details/ssa_graph_checker.h
@@ -19,7 +19,7 @@
 namespace paddle {
 namespace framework {
 namespace details {
-class SSAGraph;
+struct SSAGraph;

 class SSAGraghBuilderWithChecker : public SSAGraphBuilder {
 public:

--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -20,6 +20,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/reader.h"
+#ifdef PADDLE_WITH_DISTRIBUTE
+#include "paddle/fluid/operators/detail/grpc_client.h"
+#endif
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"

@@ -44,6 +47,14 @@ ExecutorPrepareContext::~ExecutorPrepareContext() {

 Executor::Executor(const platform::Place& place) : place_(place) {}

+#ifdef PADDLE_WITH_DISTRIBUTE
+void Executor::Complete() {
+  // Have the GRPCClient instance send its completion message.
+  namespace rpc = ::paddle::operators::detail;
+  rpc::RPCClient::GetInstance<rpc::GRPCClient>()->SendComplete();
+}
+#endif
+
 void InitializeVariable(Variable* var, proto::VarType::Type var_type) {
  if (var_type == proto::VarType::LOD_TENSOR) {
    var->GetMutable<LoDTensor>();
@@ -319,8 +330,12 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
  }

  for (auto& op : ctx->ops_) {
-    VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);
+    VLOG(4) << place_ << " " << op->DebugStringEx(local_scope);
    op->Run(*local_scope, place_);
+    // NOTE: Please do not delete this line. It is useful because the debug
+    // strings before and after op->Run differ: after the run, the outputs
+    // have their final shapes, which helps debugging.
+    VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);

    if (FLAGS_benchmark) {
      VLOG(2) << "Memory used after operator " + op->Type() + " running: "

--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@@ -44,6 +44,13 @@ class Executor {

  explicit Executor(const platform::Place& place);

+#ifdef PADDLE_WITH_DISTRIBUTE
+  /*
+   * Sending signal to pserver to mark current trainer stop.
+   */
+  void Complete();
+#endif
+
  /* @Brief
   * Runtime evaluation of the given ProgramDesc under certain Scope
   *

--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -69,6 +69,19 @@ static DDim GetDims(const Scope& scope, const std::string& name,
  }
 }

+// Returns the row count of the SelectedRows variable `name` in `scope`.
+// Yields -1 when the variable is absent or holds some other type.
+static int GetRowSize(const Scope& scope, const std::string& name) {
+  Variable* found = scope.FindVar(name);
+  const bool is_selected_rows =
+      found != nullptr && found->IsType<SelectedRows>();
+  if (!is_selected_rows) {
+    return -1;
+  }
+  // rows().size() is unsigned; narrow explicitly to the int return type.
+  return static_cast<int>(found->Get<SelectedRows>().rows().size());
+}
+
 static LoD GetLoD(const Scope& scope, const std::string& name) {
  Variable* var = scope.FindVar(name);
  auto default_lod = LoD({{}});
@@ -85,6 +98,7 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
 }

 void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
+  VLOG(10) << "- " << DebugStringEx(&scope);
  if (platform::is_gpu_place(place)) {
 #ifndef PADDLE_WITH_CUDA
    PADDLE_THROW("Cannot run operator on place %s", place);
@@ -94,6 +108,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
 #endif
  }
  RunImpl(scope, place);
+  VLOG(10) << "+ " << DebugStringEx(&scope);
 }

 bool OperatorBase::HasInputs(const std::string& name) const {
@@ -153,6 +168,10 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
    for (size_t i = 0; i < input.second.size(); ++i) {
      ss << input.second[i];
      if (scope) {
+        int row_size = GetRowSize(*scope, input.second[i]);
+        if (row_size >= 0) {
+          ss << "[row_size=" << row_size << "]";
+        }
        ss << "[" << GetDims(*scope, input.second[i], true) << "]";
        ss << "(" << GetLoD(*scope, input.second[i]) << ")";
      }
@@ -173,6 +192,10 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
    for (size_t i = 0; i < output.second.size(); ++i) {
      ss << output.second[i];
      if (scope) {
+        int row_size = GetRowSize(*scope, output.second[i]);
+        if (row_size >= 0) {
+          ss << "[row_size=" << row_size << "]";
+        }
        ss << "[" << GetDims(*scope, output.second[i], true) << "]";
        ss << "(" << GetLoD(*scope, output.second[i]) << ")";
      }

--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -145,9 +145,9 @@ void ParallelExecutor::BCastParamsToGPUs(
    auto &dims = main_tensor.dims();
    if (paddle::platform::is_gpu_place(main_tensor.place())) {
 #ifdef PADDLE_WITH_CUDA
+      std::vector<void *> buffers;
      size_t numel = main_tensor.numel();
      ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
-      platform::NCCLGroupGuard guard;
      for (size_t i = 0; i < member_->places_.size(); ++i) {
        auto place = member_->places_[i];
        void *buffer;
@@ -159,11 +159,21 @@ void ParallelExecutor::BCastParamsToGPUs(
          t->Resize(dims);
          buffer = t->mutable_data(place, main_tensor.type());
        }
-        auto &nccl_ctx = member_->nccl_ctxs_->at(place);
-        platform::dynload::ncclBcast(buffer, numel, data_type, 0,
-                                     nccl_ctx.comm_, nccl_ctx.stream());
+        buffers.push_back(buffer);
      }
-      member_->nccl_ctxs_->WaitAll();
+
+      PADDLE_ENFORCE_EQ(member_->places_.size(), buffers.size(),
+                        "variables' buffer size to bcast NOT equal to places");
+      {
+        platform::NCCLGroupGuard guard;
+        for (size_t i = 0; i < member_->places_.size(); ++i) {
+          auto &nccl_ctx = member_->nccl_ctxs_->at(member_->places_[i]);
+          platform::dynload::ncclBcast(buffers[i], numel, data_type, 0,
+                                       nccl_ctx.comm_, nccl_ctx.stream());
+        }
+        member_->nccl_ctxs_->WaitAll();
+      }
+
 #else
      PADDLE_THROW("Not compiled with CUDA");
 #endif

--- a/paddle/fluid/framework/reader.h
+++ b/paddle/fluid/framework/reader.h
@@ -35,14 +35,15 @@ class ReaderBase {

 class DecoratedReader : public ReaderBase {
 public:
-  explicit DecoratedReader(ReaderBase* reader) : ReaderBase(), reader_(reader) {
+  explicit DecoratedReader(const std::shared_ptr<ReaderBase>& reader)
+      : ReaderBase(), reader_(reader) {
    PADDLE_ENFORCE_NOT_NULL(reader_);
  }

  void ReInit() override { reader_->ReInit(); }

 protected:
-  ReaderBase* reader_;
+  std::shared_ptr<ReaderBase> reader_;
 };

 class FileReader : public ReaderBase {
@@ -64,7 +65,7 @@ class ReaderHolder {
 public:
  void Reset(ReaderBase* reader) { reader_.reset(reader); }

-  ReaderBase* Get() const { return reader_.get(); }
+  std::shared_ptr<ReaderBase> Get() const { return reader_; }

  void ReadNext(std::vector<LoDTensor>* out) {
    PADDLE_ENFORCE_NOT_NULL(reader_);
@@ -76,7 +77,7 @@ class ReaderHolder {
  }

 private:
-  std::unique_ptr<ReaderBase> reader_;
+  std::shared_ptr<ReaderBase> reader_;
 };

 }  // namespace framework

--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -81,6 +81,9 @@ class Scope {
  // Rename variable to a new name and return the new name
  std::string Rename(const std::string& origin_name) const;

+ protected:
+  mutable std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
+
 private:
  // Call Scope::NewScope for a sub-scope.
  explicit Scope(Scope const* parent) : parent_(parent) {}
@@ -93,8 +96,6 @@ class Scope {
  // Caller doesn't own the returned Variable.
  Variable* FindVarLocally(const std::string& name) const;

-  mutable std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
-
  // Scope in `kids_` are owned by this class.
  mutable std::list<Scope*> kids_;
  Scope const* parent_{nullptr};

--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
@@ -20,16 +20,20 @@ limitations under the License. */
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/pybind/pybind.h"

 DEFINE_string(devices, "", "The devices to be used which is joined by comma.");
 DEFINE_bool(init_p2p, false, "Whether to init p2p.");
+DEFINE_int32(math_num_threads, 1,
+             "Number of threads used to run math functions.");

 namespace paddle {
 namespace inference {

 void Init(const std::vector<std::string> argv) {
  framework::InitGflags(argv);
+  operators::math::SetNumThreads(FLAGS_math_num_threads);
  // init devices
  std::vector<int> devices;
  std::string token;

--- a/paddle/fluid/operators/batch_norm_mkldnn_op.cc
+++ b/paddle/fluid/operators/batch_norm_mkldnn_op.cc
@@ -19,10 +19,17 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-using Tensor = framework::Tensor;
+using batch_norm_bwd = mkldnn::batch_normalization_backward;
+using batch_norm_fwd = mkldnn::batch_normalization_forward;
+using framework::DataLayout;
+using framework::Tensor;
+using mkldnn::memory;
+using mkldnn::primitive;
+using mkldnn::reorder;
+using mkldnn::stream;
 using paddle::platform::MKLDNNDeviceContext;
 using paddle::platform::MKLDNNMemDesc;
-using mkldnn::memory;
+using platform::to_void_cast;

 template <typename T>
 using EigenArrayMap =
@@ -64,21 +71,12 @@ void run_batch_norm_op(Args &&... args) {
  mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
 }

-template <typename T>
-inline void *cast_const_to_void(const T *t) {
-  return static_cast<void *>(const_cast<T *>(t));
-}
 }  // namespace

 template <typename T>
 class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto data_layout_str = ctx.Attr<std::string>("data_layout");
-    auto data_layout = framework::StringToDataLayout(data_layout_str);
-    PADDLE_ENFORCE(data_layout == framework::DataLayout::kNCHW,
-                   "MKLDNN batch normalization handles only NCHW data layout");
-
    const float epsilon = ctx.Attr<float>("epsilon");
    const float momentum = ctx.Attr<float>("momentum");
    const bool is_test = ctx.Attr<bool>("is_test");
@@ -99,41 +97,53 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    const auto *scale = ctx.Input<Tensor>("Scale");
    const auto *shift = ctx.Input<Tensor>("Bias");

-    y->mutable_data<T>(ctx.GetPlace());
-    mean_out->mutable_data<T>(ctx.GetPlace());
-    variance_out->mutable_data<T>(ctx.GetPlace());
+    PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN &&
+                       x->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Input x tensor");
+
+    const T *x_data = x->data<T>();
+    const T *mean_data = mean->data<T>();
+    const T *variance_data = variance->data<T>();
+    T *y_data = y->mutable_data<T>(ctx.GetPlace());
+    T *mean_out_data = mean_out->mutable_data<T>(ctx.GetPlace());
+    T *variance_out_data = variance_out->mutable_data<T>(ctx.GetPlace());
+    T *batch_mean_data = nullptr;
+    T *batch_variance_data = nullptr;

    if (!is_test) {
-      batch_mean->mutable_data<T>(ctx.GetPlace());
-      batch_variance->mutable_data<T>(ctx.GetPlace());
+      batch_mean_data = batch_mean->mutable_data<T>(ctx.GetPlace());
+      batch_variance_data = batch_variance->mutable_data<T>(ctx.GetPlace());
    }

    auto propagation = is_test == true ? mkldnn::prop_kind::forward_scoring
                                       : mkldnn::prop_kind::forward_training;

-    auto dims = paddle::framework::vectorize2int(x->dims());
-
-    auto src_md =
-        MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
-    auto dst_md =
-        MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
-
-    auto src_pd = mkldnn::memory::primitive_desc{src_md, mkldnn_engine};
-    auto dst_pd = mkldnn::memory::primitive_desc{dst_md, mkldnn_engine};
-
-    auto src = mkldnn::memory{src_pd, cast_const_to_void(x->data<T>())};
-    auto dst = mkldnn::memory{dst_pd, y->data<T>()};
+    auto src_tz = paddle::framework::vectorize2int(x->dims());
+    auto scale_tz = paddle::framework::vectorize2int(scale->dims());
+    PADDLE_ENFORCE(scale_tz.size() == 1, "Dims of scale tensor is NOT 1");
+    const unsigned int ic = scale_tz[0];

    unsigned flags = mkldnn::use_scale_shift;
    if (is_test) flags |= mkldnn::use_global_stats;

+    // create mkldnn memory from input x tensor
+    auto src_memory =
+        memory({{{src_tz}, memory::data_type::f32, x->format()}, mkldnn_engine},
+               to_void_cast(x_data));
+
+    // create primitive descriptor for batch norm forward
    using bn_fwd_types = bn_type_traits<mkldnn::batch_normalization_forward>;
-    auto batch_norm_fwd_desc =
-        bn_fwd_types::op_desc{propagation, src_md, epsilon, flags};
-    auto batch_norm_fwd_pd =
-        bn_fwd_types::op_prim{batch_norm_fwd_desc, mkldnn_engine};
+    auto batch_norm_fwd_desc = bn_fwd_types::op_desc{
+        propagation, src_memory.get_primitive_desc().desc(), epsilon, flags};
+    std::shared_ptr<batch_norm_fwd::primitive_desc> batch_norm_fwd_pd =
+        std::shared_ptr<batch_norm_fwd::primitive_desc>(
+            new batch_norm_fwd::primitive_desc(batch_norm_fwd_desc,
+                                               mkldnn_engine));

-    const unsigned int ic = dims[1];
+    // Save the pd to be used in backward pass
+    const std::string key = ctx.op().Output("SavedMean");
+    const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd";
+    dev_ctx.SetBlob(key_batch_norm_fwd_pd, batch_norm_fwd_pd);

    // MKLDNN requires a single piece of memory for scale and shift/bias data
    const size_t scaleshift_size = 2 * ic;
@@ -143,73 +153,58 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    copy_to_weights(scale->data<T>(), scale->data<T>() + ic, shift->data<T>(),
                    shift->data<T>() + ic, &scaleshift_data);

-    auto scaleshift_memory = mkldnn::memory{
-        batch_norm_fwd_pd.weights_primitive_desc(), scaleshift_data.data()};
+    // create mkldnn memory for weights (scale/shift)
+    auto scaleshift_memory = memory(batch_norm_fwd_pd->weights_primitive_desc(),
+                                    scaleshift_data.data());

-    if (is_test) {
-      auto mean_memory = mkldnn::memory{batch_norm_fwd_pd.mean_primitive_desc(),
-                                        cast_const_to_void(mean->data<T>())};
+    // create mkldnn memory for output y tensor
+    auto dst_memory = memory(batch_norm_fwd_pd->dst_primitive_desc(), y_data);

+    if (is_test) {
+      // create mkldnn memory for stats (as input)
+      auto mean_memory = memory(batch_norm_fwd_pd->mean_primitive_desc(),
+                                to_void_cast(mean_data));
      auto variance_memory =
-          mkldnn::memory{batch_norm_fwd_pd.variance_primitive_desc(),
-                         cast_const_to_void(variance->data<T>())};
+          memory(batch_norm_fwd_pd->variance_primitive_desc(),
+                 to_void_cast(variance_data));

      run_batch_norm_op<typename bn_fwd_types::op_type>(
-          batch_norm_fwd_pd, src, (const mkldnn::primitive::at &)mean_memory,
+          *batch_norm_fwd_pd, src_memory,
+          (const mkldnn::primitive::at &)mean_memory,
          (const mkldnn::primitive::at &)variance_memory, scaleshift_memory,
-          dst);
+          dst_memory);
    } else {
+      // create mkldnn memory for stats (as output)
      auto mean_memory =
-          mkldnn::memory{batch_norm_fwd_pd.mean_primitive_desc(),
-                         cast_const_to_void(batch_mean->data<T>())};
-
-      auto variance_memory =
-          mkldnn::memory{batch_norm_fwd_pd.variance_primitive_desc(),
-                         cast_const_to_void(batch_variance->data<T>())};
+          memory(batch_norm_fwd_pd->mean_primitive_desc(), batch_mean_data);
+      auto variance_memory = memory(
+          batch_norm_fwd_pd->variance_primitive_desc(), batch_variance_data);

-      run_batch_norm_op<bn_fwd_types::op_type>(batch_norm_fwd_pd, src,
-                                               scaleshift_memory, dst,
+      run_batch_norm_op<bn_fwd_types::op_type>(*batch_norm_fwd_pd, src_memory,
+                                               scaleshift_memory, dst_memory,
                                               mean_memory, variance_memory);
    }

    if (!is_test) {
-      const unsigned int in = dims[0];
-      const unsigned int sample_size = x->numel() / in / ic;
-
-      // saved_xx is use just in this batch of data
-      EigenVectorArrayMap<T> saved_mean_e(
-          batch_mean->mutable_data<T>(ctx.GetPlace()), ic);
-      EigenVectorArrayMap<T> saved_variance_e(
-          batch_variance->mutable_data<T>(ctx.GetPlace()), ic);
-      saved_mean_e.setZero();
-      saved_variance_e.setZero();
-
-      const unsigned int x_arr_size = in * ic;
-      ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, x_arr_size);
-      for (unsigned int nc = 0; nc < x_arr_size; ++nc) {
-        saved_mean_e(nc % ic) += x_arr.col(nc).sum();
-      }
-      saved_mean_e /= in * sample_size;
-      for (unsigned int nc = 0; nc < x_arr_size; ++nc) {
-        saved_variance_e(nc % ic) +=
-            (x_arr.col(nc) - saved_mean_e(nc % ic)).matrix().squaredNorm();
-      }
-      saved_variance_e /= in * sample_size;
-
-      ConstEigenVectorArrayMap<T> mean_arr{mean->data<T>(), ic};
-      ConstEigenVectorArrayMap<T> variance_arr{variance->data<T>(), ic};
-
-      EigenVectorArrayMap<T> running_mean_arr(
-          mean_out->mutable_data<T>(ctx.GetPlace()), ic);
-      EigenVectorArrayMap<T> running_var_arr(
-          variance_out->mutable_data<T>(ctx.GetPlace()), ic);
+      // mkldnn only computes stats for the current batch,
+      // so we need to compute the momentum stats via the Eigen lib
+      EigenVectorArrayMap<T> batch_mean_e(batch_mean_data, ic);
+      EigenVectorArrayMap<T> batch_variance_e(batch_variance_data, ic);
+      ConstEigenVectorArrayMap<T> mean_e(mean_data, ic);
+      ConstEigenVectorArrayMap<T> variance_e{variance_data, ic};
+
+      EigenVectorArrayMap<T> running_mean_e(mean_out_data, ic);
+      EigenVectorArrayMap<T> running_variance_e(variance_out_data, ic);

      auto one_minus_momentum = 1. - momentum;
-      running_mean_arr =
-          mean_arr * momentum + saved_mean_e * one_minus_momentum;
-      running_var_arr =
-          variance_arr * momentum + saved_variance_e * one_minus_momentum;
+      running_mean_e = mean_e * momentum + batch_mean_e * one_minus_momentum;
+      running_variance_e =
+          variance_e * momentum + batch_variance_e * one_minus_momentum;
    }
+
+    y->set_layout(DataLayout::kMKLDNN);
+    y->set_format(
+        (memory::format)dst_memory.get_primitive_desc().desc().data.format);
  }
 };

@@ -217,11 +212,6 @@ template <typename T>
 class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
 public:
  void Compute(const paddle::framework::ExecutionContext &ctx) const override {
-    auto data_layout_str = ctx.Attr<std::string>("data_layout");
-    auto data_layout = framework::StringToDataLayout(data_layout_str);
-    PADDLE_ENFORCE(data_layout == framework::DataLayout::kNCHW,
-                   "MKLDNN batch normalization handles only NCHW data layout");
-
    auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
    auto mkldnn_engine = dev_ctx.GetEngine();

@@ -238,88 +228,132 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
    auto *diff_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
    auto *diff_shift = ctx.Output<Tensor>(framework::GradVarName("Bias"));

-    diff_x->mutable_data<T>(ctx.GetPlace());
-    diff_scale->mutable_data<T>(ctx.GetPlace());
-    diff_shift->mutable_data<T>(ctx.GetPlace());
+    PADDLE_ENFORCE(diff_y->layout() == DataLayout::kMKLDNN &&
+                       diff_y->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Input diff_y tensor");
+
+    const T *x_data = x->data<T>();
+    const T *diff_y_data = diff_y->data<T>();
+    const T *batch_mean_data = batch_mean->data<T>();
+    const T *batch_variance_data = batch_variance->data<T>();
+    const T *scale_data = scale->data<T>();
+    const T *shift_data = shift->data<T>();
+    T *diff_x_data = diff_x->mutable_data<T>(ctx.GetPlace());
+    T *diff_scale_data = diff_scale->mutable_data<T>(ctx.GetPlace());
+    T *diff_shift_data = diff_shift->mutable_data<T>(ctx.GetPlace());
+
+    auto src_tz = paddle::framework::vectorize2int(x->dims());
+    auto diff_src_tz = src_tz;
+    auto dst_tz = src_tz;
+    auto diff_dst_tz = dst_tz;
+    auto scale_tz = paddle::framework::vectorize2int(scale->dims());
+    PADDLE_ENFORCE(scale_tz.size() == 1, "Dims of scale tensor is NOT 1");
+
+    const unsigned int ic = scale_tz[0];
+
+    // Retrieve bn_fwd_pd from device context
+    const std::string key = ctx.op().Input("SavedMean");
+    const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd";
+    auto batch_norm_fwd_pd =
+        std::static_pointer_cast<batch_norm_fwd::primitive_desc>(
+            dev_ctx.GetBlob(key_batch_norm_fwd_pd));
+    PADDLE_ENFORCE(batch_norm_fwd_pd != nullptr,
+                   "Fail to find batch_norm_fwd_pd in device context");

-    auto dims = paddle::framework::vectorize2int(x->dims());
-    unsigned flags = mkldnn::use_scale_shift | !mkldnn::use_global_stats;
+    using bn_bwd_types = bn_type_traits<mkldnn::batch_normalization_backward>;

-    auto src_md =
-        MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
-    auto dst_md =
-        MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
-    auto diff_src_md =
-        MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
-    auto diff_dst_md =
-        MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
+    // create mkldnn memory from input diff_y tensor
+    auto user_diff_dst_memory =
+        memory({{{diff_dst_tz}, memory::data_type::f32, diff_y->format()},
+                mkldnn_engine},
+               to_void_cast(diff_y_data));

-    using bn_bwd_types = bn_type_traits<mkldnn::batch_normalization_backward>;
-    using bn_fwd_types = bn_type_traits<mkldnn::batch_normalization_forward>;
+    // create mkldnn memory from input x tensor
+    auto src_memory =
+        memory({{{src_tz}, memory::data_type::f32, x->format()}, mkldnn_engine},
+               to_void_cast(x_data));

-    auto batch_norm_fwd_desc = bn_fwd_types::op_desc{
-        mkldnn::prop_kind::forward_training, src_md, epsilon, flags};
-    auto batch_norm_fwd_pd =
-        bn_fwd_types::op_prim{batch_norm_fwd_desc, mkldnn_engine};
+    // for diff_dst, try to use same format as dst in forward pass
+    auto diff_dst_pd = batch_norm_fwd_pd.get()->dst_primitive_desc();
+    auto diff_dst_md = diff_dst_pd.desc();

+    // create primitive descriptor for batch norm backward
+    unsigned flags = mkldnn::use_scale_shift;
    auto batch_norm_bwd_desc = bn_bwd_types::op_desc{
-        mkldnn::prop_kind::backward, diff_dst_md, dst_md, epsilon, flags};
+        mkldnn::prop_kind::backward, diff_dst_md,
+        src_memory.get_primitive_desc().desc(), epsilon, flags};
    auto batch_norm_bwd_pd = bn_bwd_types::op_prim{
-        batch_norm_bwd_desc, mkldnn_engine, batch_norm_fwd_pd};
-
-    auto src = mkldnn::memory{{src_md, mkldnn_engine},
-                              cast_const_to_void(x->data<T>())};
-
-    auto mean = mkldnn::memory{batch_norm_bwd_pd.mean_primitive_desc(),
-                               cast_const_to_void(batch_mean->data<T>())};
-
-    auto variance =
-        mkldnn::memory{batch_norm_bwd_pd.variance_primitive_desc(),
-                       cast_const_to_void(batch_variance->data<T>())};
-
-    auto diff_dst = mkldnn::memory{{diff_dst_md, mkldnn_engine},
-                                   cast_const_to_void(diff_y->data<T>())};
+        batch_norm_bwd_desc, mkldnn_engine, *batch_norm_fwd_pd};
+
+    // reorder user_diff_dst if it's not in preferred format
+    auto diff_dst_memory = user_diff_dst_memory;
+    primitive reorder_diff_dst;
+    bool is_diff_dst_reordered = false;
+    if (diff_dst_pd != user_diff_dst_memory.get_primitive_desc()) {
+      diff_dst_memory = memory(diff_dst_pd);
+      reorder_diff_dst = reorder(user_diff_dst_memory, diff_dst_memory);
+      is_diff_dst_reordered = true;
+    }

-    const unsigned int ic = dims[1];
+    // create mkldnn memory for input tensors (src/mean/variance)
+    auto mean_memory = memory(batch_norm_bwd_pd.mean_primitive_desc(),
+                              to_void_cast(batch_mean_data));
+    auto variance_memory = memory(batch_norm_bwd_pd.variance_primitive_desc(),
+                                  to_void_cast(batch_variance_data));

+    // MKLDNN requires a single piece of memory for scale and shift/bias data
    const size_t scaleshift_size = 2 * ic;

    std::vector<T> scaleshift_data;
    scaleshift_data.reserve(scaleshift_size);
-    copy_to_weights(scale->data<T>(), scale->data<T>() + ic, shift->data<T>(),
-                    shift->data<T>() + ic, &scaleshift_data);
+    copy_to_weights(scale_data, scale_data + ic, shift_data, shift_data + ic,
+                    &scaleshift_data);

-    auto scaleshift_memory = mkldnn::memory{
-        batch_norm_bwd_pd.weights_primitive_desc(), scaleshift_data.data()};
+    // create mkldnn memory for input tensors (scale/shift)
+    auto scaleshift_memory = memory(batch_norm_bwd_pd.weights_primitive_desc(),
+                                    scaleshift_data.data());

+    // create mkldnn memory for output diff weights (combined scale/shift);
+    // size (not just reserve) the buffer: MKLDNN writes through data().
+    std::vector<T> diff_scaleshift_data(scaleshift_size);
-    copy_to_weights(diff_scale->data<T>(), diff_scale->data<T>() + ic,
-                    diff_shift->data<T>(), diff_shift->data<T>() + ic,
-                    &diff_scaleshift_data);
-
    auto diff_scaleshift_memory =
-        mkldnn::memory{batch_norm_bwd_pd.diff_weights_primitive_desc(),
-                       diff_scaleshift_data.data()};
-
-    auto diff_src = mkldnn::memory{{diff_src_md, mkldnn_engine},
-                                   static_cast<void *>(diff_x->data<T>())};
-
-    run_batch_norm_op<bn_bwd_types::op_type>(
-        batch_norm_bwd_pd, src, mean, variance, diff_dst, scaleshift_memory,
-        diff_src, diff_scaleshift_memory);
-
+        memory(batch_norm_bwd_pd.diff_weights_primitive_desc(),
+               diff_scaleshift_data.data());
+
+    // here assume diff_src is in the same format of src
+    auto diff_src_memory = memory(src_memory.get_primitive_desc(), diff_x_data);
+
+    // finally create batch_norm backward primitive
+    auto batch_norm_bwd_prim =
+        batch_norm_bwd(batch_norm_bwd_pd, src_memory, mean_memory,
+                       variance_memory, diff_dst_memory, scaleshift_memory,
+                       diff_src_memory, diff_scaleshift_memory);
+
+    // execute optional reorder and batch_norm backward primitive
+    std::vector<primitive> pipeline;
+    if (is_diff_dst_reordered) pipeline.push_back(reorder_diff_dst);
+    pipeline.push_back(batch_norm_bwd_prim);
+    stream(stream::kind::eager).submit(pipeline).wait();
+
+    // copy back diff scale/shift to output tensors (diff scale/shift)
+    diff_scaleshift_data.resize(scaleshift_size);
    auto it = std::begin(diff_scaleshift_data);
-    std::copy(it, std::next(it, ic), diff_scale->data<T>());
+    std::copy(it, std::next(it, ic), diff_scale_data);
    std::copy(std::next(it, ic), std::end(diff_scaleshift_data),
-              diff_shift->data<T>());
+              diff_shift_data);
+
+    // set layout/format of output tensors
+    diff_x->set_layout(DataLayout::kMKLDNN);
+    diff_x->set_format((memory::format)diff_src_memory.get_primitive_desc()
+                           .desc()
+                           .data.format);
  }
 };
 }  // namespace operators
 }  // namespace paddle

 namespace ops = paddle::operators;
-REGISTER_OP_KERNEL(batch_norm, MKLDNN, paddle::platform::CPUPlace,
+REGISTER_OP_KERNEL(batch_norm, MKLDNN, ::paddle::platform::CPUPlace,
                   ops::BatchNormMKLDNNOpKernel<float>);
-REGISTER_OP_KERNEL(batch_norm_grad, MKLDNN, paddle::platform::CPUPlace,
+REGISTER_OP_KERNEL(batch_norm_grad, MKLDNN, ::paddle::platform::CPUPlace,
                   ops::BatchNormMKLDNNGradOpKernel<float>);
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
@@ -110,19 +110,19 @@ class BatchNormOp : public framework::OperatorWithKernel {
                                         ctx.Input<Tensor>("Variance")->type()),
                      "Variance input should be of float type");

-    framework::LibraryType library_{framework::LibraryType::kPlain};
    // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+    framework::LibraryType library = framework::LibraryType::kPlain;
    framework::DataLayout layout = framework::DataLayout::kAnyLayout;
-
 #ifdef PADDLE_WITH_MKLDNN
-    if (library_ == framework::LibraryType::kPlain &&
+    if (library == framework::LibraryType::kPlain &&
        platform::CanMKLDNNBeUsed(ctx)) {
-      library_ = framework::LibraryType::kMKLDNN;
+      library = framework::LibraryType::kMKLDNN;
      layout = framework::DataLayout::kMKLDNN;
    }
 #endif
+
    return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
-                                   library_);
+                                   library);
  }
 };

@@ -370,19 +370,21 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
      PADDLE_THROW("can't find Y@GRAD");
    }

-    framework::LibraryType library_{framework::LibraryType::kPlain};
    // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
-    framework::DataLayout layout_ = framework::DataLayout::kAnyLayout;
+    framework::LibraryType library = framework::LibraryType::kPlain;
+    framework::DataLayout layout = framework::DataLayout::kAnyLayout;
+
 #ifdef PADDLE_WITH_MKLDNN
-    if (library_ == framework::LibraryType::kPlain &&
+    if (library == framework::LibraryType::kPlain &&
        platform::CanMKLDNNBeUsed(ctx)) {
-      library_ = framework::LibraryType::kMKLDNN;
-      layout_ = framework::DataLayout::kMKLDNN;
+      library = framework::LibraryType::kMKLDNN;
+      layout = framework::DataLayout::kMKLDNN;
    }
 #endif
+
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
-        layout_, library_);
+        layout, library);
  }
 };


--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -75,9 +75,8 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
 framework::OpKernelType ConvOp::GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const {
  framework::LibraryType library{framework::LibraryType::kPlain};
-
-  std::string data_format = ctx.Attr<std::string>("data_format");
  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+  std::string data_format = ctx.Attr<std::string>("data_format");
  framework::DataLayout layout = framework::StringToDataLayout(data_format);

 #ifdef PADDLE_WITH_CUDA

--- a/paddle/fluid/operators/detail/grpc_client.cc
+++ b/paddle/fluid/operators/detail/grpc_client.cc
@@ -34,6 +34,12 @@ void GRPCClient::InitEventLoop() {
  client_thread_.reset(new std::thread(std::bind(&GRPCClient::Proceed, this)));
 }

+void GRPCClient::SendComplete() {
+  for (auto& keyed_channel : channels_) {  // key is passed as the endpoint
+    AsyncSendComplete(keyed_channel.first);
+  }
+}
+
 GRPCClient::~GRPCClient() {
  Wait();
  cq_.Shutdown();
@@ -210,6 +216,19 @@ void GRPCClient::AsyncSendFetchBarrier(const std::string& ep,
  req_count_++;
 }

+void GRPCClient::AsyncSendComplete(const std::string& ep, int64_t time_out) {
+  // Queues one COMPLETE_MESSAGE for `ep`; the call is registered on cq_.
+  auto* proc = new BatchBarrierProcessor(GetChannel(ep));
+  proc->Prepare(time_out);
+
+  sendrecv::VariableMessage msg;
+  msg.set_varname(COMPLETE_MESSAGE);
+  auto call = proc->stub_->AsyncSendVariable(proc->context_.get(), msg, &cq_);
+  // `proc` itself is handed over as the tag of this call.
+  call->Finish(&proc->reply_, &proc->status_, reinterpret_cast<void*>(proc));
+  ++req_count_;
+}
+
 void GRPCClient::Wait() {
  std::unique_lock<std::mutex> lk(sync_mutex_);
  sync_cond_.wait(lk, [this] { return req_count_ == 0; });

--- a/paddle/fluid/operators/detail/grpc_client.h
+++ b/paddle/fluid/operators/detail/grpc_client.h
@@ -195,6 +195,8 @@ class GRPCClient : public RPCClient {

  void Wait() override;

+  void SendComplete() override;
+
 protected:
  void InitImpl() override;

@@ -204,6 +206,9 @@ class GRPCClient : public RPCClient {

  void Proceed();

+  void AsyncSendComplete(const std::string& ep,
+                         int64_t time_out = RPCClient::rpc_time_out);
+
  std::shared_ptr<grpc::Channel> GetChannel(const std::string& ep);

 private:

--- a/paddle/fluid/operators/detail/grpc_server.cc
+++ b/paddle/fluid/operators/detail/grpc_server.cc
@@ -162,16 +162,19 @@ class RequestPrefetch final : public RequestBase {

  void Process() override {
    // prefetch process...
-    std::string varname = request_->OutVarname();
-    VLOG(3) << "RequestPrefetch " << varname;
+    std::string in_var_name = request_->Varname();
+    std::string out_var_name = request_->OutVarname();
+    VLOG(3) << "RequestPrefetch, in_var_name: " << in_var_name
+            << " out_var_name: " << out_var_name;

    auto scope = request_->GetMutableLocalScope();
-    auto invar = scope->FindVar(varname);
-    framework::Variable* outvar = nullptr;
+    auto invar = scope->FindVar(in_var_name);
+    // out var must be created in local scope!
+    framework::Variable* outvar = scope->Var(out_var_name);

-    request_handler_->Handle(varname, scope, invar, &outvar);
+    request_handler_->Handle(in_var_name, scope, invar, &outvar, out_var_name);

-    SerializeToByteBuffer(varname, outvar, *request_handler_->dev_ctx(),
+    SerializeToByteBuffer(out_var_name, outvar, *request_handler_->dev_ctx(),
                          &reply_);
    Finish(reply_, &responder_);
  }
@@ -287,7 +290,7 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name,
  } else if (rpc_name == kRequestPrefetch) {
    b = new RequestPrefetch(&service_, cq.get(), handler, req_id);
  } else {
-    PADDLE_ENFORCE(false, "not surpported rpc");
+    PADDLE_ENFORCE(false, "not supported rpc");
  }

  reqs[req_id] = b;

--- a/paddle/fluid/operators/detail/request_handler.h
+++ b/paddle/fluid/operators/detail/request_handler.h
@@ -40,6 +40,7 @@ constexpr char kRequestPrefetch[] = "RequestPrefetch";
 #define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV"
 #define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV"
 #define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV"
+#define COMPLETE_MESSAGE "COMPLETE@RECV"

 class RPCServer;

@@ -60,9 +61,12 @@ class RequestHandler {
  void SetDevCtx(const platform::DeviceContext* dev_ctx) { dev_ctx_ = dev_ctx; }
  void SetProgram(framework::ProgramDesc* program) { program_ = program; }
  void SetExecutor(framework::Executor* executor) { executor_ = executor; }
+
+  // Used for dist lookup table prefetch
  void SetPrefetchPreparedCtx(
-      std::unique_ptr<framework::ExecutorPrepareContext> prepared) {
-    prefetch_ctx_.reset(prepared.release());
+      std::unordered_map<
+          std::string, std::shared_ptr<framework::ExecutorPrepareContext>>* g) {
+    prefetch_var_name_to_prepared_ctx_ = g;
  }

  // Used for async.
@@ -78,9 +82,6 @@ class RequestHandler {
  bool sync_mode() { return sync_mode_; }
  framework::Scope* scope() { return scope_; }
  const platform::DeviceContext* dev_ctx() { return dev_ctx_; }
-  framework::ExecutorPrepareContext* prefetch_ctx() {
-    return prefetch_ctx_.get();
-  }
  framework::ProgramDesc* program() { return program_; }
  framework::Executor* executor() { return executor_; }

@@ -99,8 +100,8 @@ class RequestHandler {
  //           *request_handler_->dev_ctx(), &reply_);
  //    }
  virtual bool Handle(const std::string& varname, framework::Scope* scope,
-                      framework::Variable* var,
-                      framework::Variable** outvar) = 0;
+                      framework::Variable* var, framework::Variable** outvar,
+                      const std::string& out_var_name = "") = 0;

 protected:
  const bool sync_mode_;
@@ -109,12 +110,17 @@ class RequestHandler {
  framework::Executor* executor_;
  framework::Scope* scope_;
  framework::ProgramDesc* program_;
-  std::unique_ptr<framework::ExecutorPrepareContext> prefetch_ctx_;
+
+  // used for distribute lookup table prefetch
+  std::unordered_map<std::string,
+                     std::shared_ptr<framework::ExecutorPrepareContext>>*
+      prefetch_var_name_to_prepared_ctx_;

  // Used for async.
  std::unordered_map<std::string,
                     std::shared_ptr<framework::ExecutorPrepareContext>>*
      grad_to_prepared_ctx_;
+
  RPCServer* rpc_server_;
 };


--- a/paddle/fluid/operators/detail/request_handler_impl.cc
+++ b/paddle/fluid/operators/detail/request_handler_impl.cc
@@ -30,7 +30,8 @@ namespace detail {
 bool RequestSendHandler::Handle(const std::string& varname,
                                framework::Scope* scope,
                                framework::Variable* invar,
-                                framework::Variable** outvar) {
+                                framework::Variable** outvar,
+                                const std::string& out_var_name) {
  VLOG(4) << "RequestSendHandler:" << varname;

  // Async
@@ -49,6 +50,9 @@ bool RequestSendHandler::Handle(const std::string& varname,
  if (varname == BATCH_BARRIER_MESSAGE) {
    VLOG(3) << "sync: recv batch barrier message";
    rpc_server_->IncreaseBatchBarrier(kRequestSend);
+  } else if (varname == COMPLETE_MESSAGE) {
+    VLOG(3) << "sync: recv complete message";
+    rpc_server_->DecreaseClientNum();
  } else {
    VLOG(3) << "sync: received var_name: " << varname;
    if (sync_mode_) {
@@ -79,7 +83,8 @@ void RequestSendHandler::ResetSparseVarRecorder() {
 bool RequestGetHandler::Handle(const std::string& varname,
                               framework::Scope* scope,
                               framework::Variable* invar,
-                               framework::Variable** outvar) {
+                               framework::Variable** outvar,
+                               const std::string& out_var_name) {
  VLOG(4) << "RequestGetHandler:" << varname;

  if (varname != FETCH_BARRIER_MESSAGE) {
@@ -102,13 +107,19 @@ bool RequestGetHandler::Handle(const std::string& varname,
+// Runs the prepared prefetch context registered for `varname` against `scope`.
+// `*outvar` must already point at a variable; it is initialised here with the
+// type that block 0 declares for `out_var_name`. Always returns true.
 bool RequestPrefetchHandler::Handle(const std::string& varname,
                                     framework::Scope* scope,
                                     framework::Variable* invar,
-                                    framework::Variable** outvar) {
+                                    framework::Variable** outvar,
+                                    const std::string& out_var_name) {
   VLOG(4) << "RequestPrefetchHandler " << varname;
 
-  auto var_desc = program_->Block(0).FindVar(varname);
-  *outvar = scope->FindVar(varname);
+  auto var_desc = program_->Block(0).FindVar(out_var_name);
   InitializeVariable(*outvar, var_desc->GetType());
-  executor_->RunPreparedContext(prefetch_ctx_.get(), scope);
+  // Look the context up with at(): operator[] would insert a null entry for an
+  // unknown varname and is not safe to call concurrently on a shared map.
+  executor_->RunPreparedContext(
+      prefetch_var_name_to_prepared_ctx_->at(varname).get(), scope);
 
   return true;
 }

--- a/paddle/fluid/operators/detail/request_handler_impl.h
+++ b/paddle/fluid/operators/detail/request_handler_impl.h
@@ -39,7 +39,8 @@ class RequestSendHandler final : public RequestHandler {
  explicit RequestSendHandler(bool sync_mode) : RequestHandler(sync_mode) {}
  virtual ~RequestSendHandler() {}
  bool Handle(const std::string& varname, framework::Scope* scope,
-              framework::Variable* var, framework::Variable** outvar) override;
+              framework::Variable* var, framework::Variable** outvar,
+              const std::string& out_var_name = "") override;
  void ResetSparseVarRecorder();

 private:
@@ -52,7 +53,8 @@ class RequestGetHandler final : public RequestHandler {
  explicit RequestGetHandler(bool sync_mode) : RequestHandler(sync_mode) {}
  virtual ~RequestGetHandler() {}
  bool Handle(const std::string& varname, framework::Scope* scope,
-              framework::Variable* var, framework::Variable** outvar) override;
+              framework::Variable* var, framework::Variable** outvar,
+              const std::string& out_var_name = "") override;
 };

 class RequestPrefetchHandler final : public RequestHandler {
@@ -60,7 +62,8 @@ class RequestPrefetchHandler final : public RequestHandler {
  explicit RequestPrefetchHandler(bool sync_mode) : RequestHandler(sync_mode) {}
  virtual ~RequestPrefetchHandler() {}
  bool Handle(const std::string& varname, framework::Scope* scope,
-              framework::Variable* var, framework::Variable** outvar) override;
+              framework::Variable* var, framework::Variable** outvar,
+              const std::string& out_var_name = "") override;
 };

 }  // namespace detail

--- a/paddle/fluid/operators/detail/rpc_client.h
+++ b/paddle/fluid/operators/detail/rpc_client.h
@@ -53,6 +53,11 @@ class RPCClient {
  virtual void AsyncSendFetchBarrier(const std::string& ep,
                                     int64_t time_out = rpc_time_out) = 0;

+  // SendComplete tells all the servers that the current trainer has no more
+  // data to train, so that the pserver can reduce its barrier count and
+  // continue to train with the other trainers.
+  virtual void SendComplete() = 0;
+
  virtual void Wait() = 0;

  static constexpr int64_t rpc_time_out = 120 * 1000;

--- a/paddle/fluid/operators/detail/rpc_server.cc
+++ b/paddle/fluid/operators/detail/rpc_server.cc
@@ -43,7 +43,7 @@ void RPCServer::SavePort() const {

 void RPCServer::WaitBarrier(const std::string& rpc_name) {
  std::unique_lock<std::mutex> lock(this->mutex_);
-  barrier_cond_.wait(lock, [=] {
+  barrier_cond_.wait(lock, [this, &rpc_name] {
    return (barrier_counter_[rpc_name] >= client_num_ || exit_flag_.load());
  });

@@ -53,19 +53,23 @@ void RPCServer::WaitBarrier(const std::string& rpc_name) {
 void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) {
   VLOG(3) << "RPCServer begin IncreaseBatchBarrier " << rpc_name;
   int b = 0;
-  {
-    std::unique_lock<std::mutex> lock(mutex_);
-    b = ++barrier_counter_[rpc_name];
-  }
-
-  VLOG(3) << "RPCServer IncreaseBatchBarrier " << rpc_name
-          << ", barrier_count:" << b << ", fan_in" << client_num_;
-
+  std::unique_lock<std::mutex> lock(mutex_);  // guards counter and client_num_
+  b = ++barrier_counter_[rpc_name];
   if (b >= client_num_) {
+    lock.unlock();  // release first so woken waiters can take the mutex
     barrier_cond_.notify_all();
+    // No re-lock: ~unique_lock does nothing once the mutex has been released.
   }
 }

+void RPCServer::DecreaseClientNum() {
+  {  // shrink the fan-in under mutex_, then wake barrier waiters unlocked
+    std::lock_guard<std::mutex> guard(mutex_);
+    --client_num_;
+  }
+  barrier_cond_.notify_all();
+}
+
 void RPCServer::ResetBarrierCounter() {
  VLOG(3) << "RPCServer ResetBarrierCounter ";
  std::unique_lock<std::mutex> lock(mutex_);

--- a/paddle/fluid/operators/detail/rpc_server.h
+++ b/paddle/fluid/operators/detail/rpc_server.h
@@ -60,7 +60,7 @@ class RPCServer {
  void SetCond(const std::string& rpc_name);
  void WaitCond(const std::string& rpc_name);
  void IncreaseBatchBarrier(const std::string rpc_name);
-
+  void DecreaseClientNum();
  void ResetBarrierCounter();

 protected:
@@ -79,8 +79,7 @@ class RPCServer {
  std::string bind_address_;
  std::atomic<int> exit_flag_;
  int selected_port_;
-
-  const int client_num_;
+  int client_num_;

  std::unordered_map<std::string, RequestHandler*> rpc_call_map_;
  std::unordered_map<std::string, int> rpc_thread_num_;

--- a/paddle/fluid/operators/detail/rpc_server_test.cc
+++ b/paddle/fluid/operators/detail/rpc_server_test.cc
@@ -98,11 +98,17 @@ void StartServer() {
  framework::Executor exe(place);
  platform::CPUDeviceContext ctx(place);
  auto* block = AppendPrefetchBlcok(&program);
-  auto prepared = exe.Prepare(program, block->ID());
+  std::string in_var_name("ids");
+  std::vector<int> prefetch_block_ids{block->ID()};
+  auto prepared = exe.Prepare(program, prefetch_block_ids);
  InitTensorsOnServer(&scope, &place, 10);

+  std::unordered_map<std::string,
+                     std::shared_ptr<framework::ExecutorPrepareContext>>
+      prefetch_var_name_to_prepared;
+  prefetch_var_name_to_prepared[in_var_name] = prepared[0];
  g_req_handler->SetProgram(&program);
-  g_req_handler->SetPrefetchPreparedCtx(std::move(prepared));
+  g_req_handler->SetPrefetchPreparedCtx(&prefetch_var_name_to_prepared);
  g_req_handler->SetDevCtx(&ctx);
  g_req_handler->SetScope(&scope);
  g_req_handler->SetExecutor(&exe);

--- a/paddle/fluid/operators/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise_op.h
@@ -66,40 +66,41 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
        .SetDefault(-1)
        .EqualGreaterThan(-1);
    AddComment(string::Sprintf(R"DOC(
-Limited Elementwise %s Operator.
+Limited Elementwise %s Operator

 The equation is:

 $$%s$$

-$X$ is a tensor of any dimension and the dimensions of tensor $Y$ must be
-smaller than or equal to the dimensions of $X$.
+- $X$: a tensor of any dimension. 
+- $Y$: a tensor whose dimensions must be less than or equal to the dimensions of $X$.

 There are two cases for this operator:
-1. The shape of $Y$ is same with $X$;
-2. The shape of $Y$ is a congiguous subsequencet of $X$. The trailing dimensions
-   of size 1 for $Y$ will be ignored for the consideration of subsequence.

+1. The shape of $Y$ is the same with $X$.
+2. The shape of $Y$ is a continuous subsequence of $X$.

 For case 2:

-$Y$ will be broadcasted to match the shape of $X$ and axis should be
-set to index of the start dimension to broadcast $Y$ onto $X$.
+1. Broadcast $Y$ to match the shape of $X$, where $axis$ is the start dimension index 
+   for broadcasting $Y$ onto $X$. 
+2. If $axis$ is -1 (default), $axis = rank(X) - rank(Y)$.
+3. The trailing dimensions of size 1 for $Y$ will be ignored for the consideration of 
+   subsequence, such as shape(Y) = (2, 1) => (2).

-If axis is -1, it is treated as axis=rank(X)-rank(Y).
+For example:

-For example
  .. code-block:: python

    shape(X) = (2, 3, 4, 5), shape(Y) = (,)
    shape(X) = (2, 3, 4, 5), shape(Y) = (5,)
-    shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5)
+    shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5), with axis=-1(default) or axis=2
    shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
    shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0
    shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1), with axis=0

-Either of the inputs $X$ and $Y$ or none can carry the LoD (Level of Details)
-information. However, the output only shares the LoD information with input $X$.
+The inputs $X$ and $Y$ can carry the different LoD information. 
+But the output only shares the LoD information with the input $X$.

 )DOC",
                               GetName(), GetEquation()));

--- a/paddle/fluid/operators/gen_nccl_id_op.cc
+++ b/paddle/fluid/operators/gen_nccl_id_op.cc
@@ -67,6 +67,10 @@ class GenNCCLIdOp : public framework::OperatorBase {
      client->AsyncSendVar(ep, dev_ctx, *scope, NCCL_ID_VARNAME);
    }
    client->Wait();
+    for (auto& ep : endpoint_list) {
+      client->AsyncSendBatchBarrier(ep);
+    }
+    client->Wait();
    VLOG(3) << "sending completed...";
  }


--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -96,19 +96,22 @@ static int64_t GetTimestamp() {
  return tp.tv_sec * 1000 + tp.tv_usec / 1000;
 }

-void ListenAndServOp::RunSyncLoop(framework::Executor *executor,
-                                  framework::ProgramDesc *program,
-                                  framework::Scope *recv_scope,
-                                  framework::BlockDesc *prefetch_block) const {
+void ListenAndServOp::RunSyncLoop(
+    framework::Executor *executor, framework::ProgramDesc *program,
+    framework::Scope *recv_scope,
+    const std::vector<int> &prefetch_block_id_list) const {
  size_t num_blocks = program->Size();
  PADDLE_ENFORCE_GE(num_blocks, 2,
                    "server program should have at least 2 blocks");

-  std::vector<int> block_list;
-  for (size_t blkid = 1; blkid < num_blocks; ++blkid) {
-    block_list.push_back(blkid);
+  std::vector<int> optimize_block_id_list;
+  for (int blkid = 1; blkid < num_blocks; ++blkid) {
+    if (std::find(prefetch_block_id_list.begin(), prefetch_block_id_list.end(),
+                  blkid) == prefetch_block_id_list.end()) {
+      optimize_block_id_list.push_back(blkid);
+    }
  }
-  auto optimize_prepared = executor->Prepare(*program, block_list);
+  auto optimize_prepared = executor->Prepare(*program, optimize_block_id_list);
  // Insert placeholder for block0 which holds current op itself.
  optimize_prepared.insert(
      optimize_prepared.begin(),
@@ -135,16 +138,17 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor,
    std::vector<size_t> parallel_blkids;
    parallel_blkids.push_back(1);
    double ts = GetTimestamp();
-    for (size_t blkid = 2; blkid < num_blocks; ++blkid) {
-      if (blkid != static_cast<size_t>(prefetch_block->ID())) {
-        if (program->Block(blkid).Parent() != last_parent_blkid) {
-          ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared,
-                                program, recv_scope);
-          parallel_blkids.clear();
-          last_parent_blkid = program->Block(blkid).Parent();
-        }
-        parallel_blkids.push_back(blkid);
+    for (size_t i = 1; i < optimize_block_id_list.size(); ++i) {
+      // skip the first optimize block because it is already in the
+      // parallel_blkids.
+      int blkid = optimize_block_id_list[i];
+      if (program->Block(blkid).Parent() != last_parent_blkid) {
+        ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared,
+                              program, recv_scope);
+        parallel_blkids.clear();
+        last_parent_blkid = program->Block(blkid).Parent();
      }
+      parallel_blkids.push_back(blkid);
    }
    ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared, program,
                          recv_scope);
@@ -210,18 +214,19 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
  }  // while(true)
 }

-static void FillRequestCtx(detail::RequestHandler *h, framework::Scope *scope,
-                           platform::DeviceContext *dev_ctx,
-                           framework::Executor *executor,
-                           framework::ProgramDesc *program,
-                           framework::ExecutorPrepareContext *prefetch_ctx,
-                           detail::RPCServer *rpc_server) {
+static void FillRequestCtx(
+    detail::RequestHandler *h, framework::Scope *scope,
+    platform::DeviceContext *dev_ctx, framework::Executor *executor,
+    framework::ProgramDesc *program,
+    std::unordered_map<std::string,
+                       std::shared_ptr<framework::ExecutorPrepareContext>>
+        *prefetch_ctx,
+    detail::RPCServer *rpc_server) {
  h->SetScope(scope);
  h->SetDevCtx(dev_ctx);
  h->SetExecutor(executor);
  h->SetProgram(program);
-  h->SetPrefetchPreparedCtx(
-      std::unique_ptr<framework::ExecutorPrepareContext>(prefetch_ctx));
+  h->SetPrefetchPreparedCtx(prefetch_ctx);
  h->SetRPCServer(rpc_server);
 }

@@ -255,17 +260,42 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
                            request_prefetch_handler_.get());

  auto *optimize_block = Attr<framework::BlockDesc *>(kOptimizeBlock);
-  auto *prefetch_block = Attr<framework::BlockDesc *>(kPrefetchBlock);
  auto *program = optimize_block->Program();
  framework::Executor executor(dev_place);

  // prepare for prefetch
-  VLOG(3) << "prefetch block id is " << prefetch_block->ID();
-  auto prefetch_prepared = executor.Prepare(*program, prefetch_block->ID());
+  std::vector<int> prefetch_block_id_list;
+  std::unordered_map<int, std::string> block_id_to_prefetch_var_name;
+
+  auto prefetch_var_name_to_block_id_str =
+      Attr<std::vector<std::string>>(kPrefetchVarNameToBlockId);
+  for (const auto &prefetch_var_name_and_id :
+       prefetch_var_name_to_block_id_str) {
+    // Each entry is split on ':' into exactly two pieces: var name, block id.
+    std::vector<std::string> pieces;
+    split(prefetch_var_name_and_id, ':', &pieces);
+    PADDLE_ENFORCE_EQ(pieces.size(), 2);  // must hold before indexing pieces
+    VLOG(3) << "after split, prefetch_var = " << pieces[0]
+            << ", id=" << pieces[1];
+    int block_id = std::stoi(pieces[1]);
+    prefetch_block_id_list.push_back(block_id);
+    block_id_to_prefetch_var_name[block_id] = pieces[0];
+  }
+
+  auto prefetch_prepared = executor.Prepare(*program, prefetch_block_id_list);
+
+  std::unordered_map<std::string,
+                     std::shared_ptr<framework::ExecutorPrepareContext>>
+      prefetch_var_name_to_prepared_ctx;
+  for (size_t i = 0; i < prefetch_block_id_list.size(); ++i) {
+    auto block_id = prefetch_block_id_list[i];
+    auto prefetch_var_name = block_id_to_prefetch_var_name[block_id];
+    prefetch_var_name_to_prepared_ctx[prefetch_var_name] = prefetch_prepared[i];
+  }

  auto f = std::bind(FillRequestCtx, std::placeholders::_1, &recv_scope,
-                     &dev_ctx, &executor, program, prefetch_prepared.release(),
-                     rpc_service_.get());
+                     &dev_ctx, &executor, program,
+                     &prefetch_var_name_to_prepared_ctx, rpc_service_.get());

  f(request_send_handler_.get());
  f(request_get_handler_.get());
@@ -283,7 +313,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
  // Write to a file of server selected port for python use.
  SavePort();
  if (sync_mode) {
-    RunSyncLoop(&executor, program, &recv_scope, prefetch_block);
+    RunSyncLoop(&executor, program, &recv_scope, prefetch_block_id_list);
  } else {
    RunAsyncLoop(&executor, program);
  }
@@ -309,8 +339,9 @@ class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<bool>("sync_mode", "if works at sync_mode or not").SetDefault(true);
    AddAttr<framework::BlockDesc *>(kOptimizeBlock,
                                    "BlockID to run on server side.");
-    AddAttr<framework::BlockDesc *>(kPrefetchBlock,
-                                    "prefetch block to run on server side.");
+    AddAttr<std::vector<std::string>>(kPrefetchVarNameToBlockId,
+                                      "prefetch blocks to run on server side.")
+        .SetDefault({});
    AddAttr<int>("Fanin", "How many clients send to this server.")
        .SetDefault(1);
  }

--- a/paddle/fluid/operators/listen_and_serv_op.h
+++ b/paddle/fluid/operators/listen_and_serv_op.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <atomic>
 #include <set>
 #include <string>
+#include <vector>

 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/lod_tensor.h"
@@ -30,7 +31,7 @@ namespace paddle {
 namespace operators {

 constexpr char kOptimizeBlock[] = "OptimizeBlock";
-constexpr char kPrefetchBlock[] = "PrefetchBlock";
+constexpr char kPrefetchVarNameToBlockId[] = "prefetch_var_name_to_block_id";

 void RunServer(std::shared_ptr<detail::RPCServer> service);

@@ -46,7 +47,7 @@ class ListenAndServOp : public framework::OperatorBase {
  void RunSyncLoop(framework::Executor* executor,
                   framework::ProgramDesc* program,
                   framework::Scope* recv_scope,
-                   framework::BlockDesc* prefetch_block) const;
+                   const std::vector<int>& prefetch_block_id_list) const;

  void RunAsyncLoop(framework::Executor* executor,
                    framework::ProgramDesc* program) const;

--- a/paddle/fluid/operators/math/blas.h
+++ b/paddle/fluid/operators/math/blas.h
@@ -20,13 +20,16 @@
 #ifdef PADDLE_WITH_MKLML
 #include <mkl_cblas.h>
 #include <mkl_lapacke.h>
+#include <mkl_service.h>
 #include <mkl_vml_functions.h>
 #endif

 #ifdef PADDLE_USE_OPENBLAS
 #include <cblas.h>
+#ifdef LAPACK_FOUND
 #include <lapacke.h>
 #endif
+#endif

 #ifndef LAPACK_FOUND
 extern "C" {
@@ -46,6 +49,18 @@ namespace paddle {
 namespace operators {
 namespace math {

+static void SetNumThreads(int num_threads) {  // BLAS backends get at least 1
+#if defined(PADDLE_USE_OPENBLAS)
+  const int clamped_threads = num_threads < 1 ? 1 : num_threads;
+  openblas_set_num_threads(clamped_threads);
+#elif defined(PADDLE_WITH_MKLML)
+  const int clamped_threads = num_threads < 1 ? 1 : num_threads;
+  mkl_set_num_threads(clamped_threads);
+#else
+  PADDLE_ENFORCE(false, "To be implemented.");
+#endif
+}
+
 /**
 * Matrix Descriptor of a memory buffer.
 *

--- a/paddle/fluid/operators/math/math_function.h
+++ b/paddle/fluid/operators/math/math_function.h
@@ -21,8 +21,10 @@ limitations under the License. */

 #ifdef PADDLE_USE_OPENBLAS
 #include <cblas.h>
+#ifdef LAPACK_FOUND
 #include <lapacke.h>
 #endif
+#endif

 #ifndef LAPACK_FOUND
 extern "C" {

--- a/paddle/fluid/operators/mean_iou_op.cc
+++ b/paddle/fluid/operators/mean_iou_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/mean_iou_op.h"
+
+namespace paddle {
+namespace operators {
+
+// Operator definition for mean_iou: validates the required inputs/outputs and
+// declares the shapes of the three outputs.
+class MeanIoUOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  // Requires Predictions, Labels and all three outputs to be present, then
+  // sets OutMeanIou to shape [1] and OutWrong / OutCorrect to [num_classes].
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Predictions"),
+                   "Input (Predictions) of MeanIoU op should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Labels"),
+                   "Input (Labels) of MeanIoU op should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("OutMeanIou"),
+                   "Output (OutMeanIou) of MeanIoU op should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("OutWrong"),
+                   "Output (OutWrong) of MeanIoU op should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("OutCorrect"),
+                   "Output (OutCorrect) of MeanIoU op should not be null.");
+
+    int64_t num_classes =
+        static_cast<int64_t>(ctx->Attrs().Get<int>("num_classes"));
+
+    ctx->SetOutputDim("OutMeanIou", {1});
+    ctx->SetOutputDim("OutWrong", {num_classes});
+    ctx->SetOutputDim("OutCorrect", {num_classes});
+  }
+
+ protected:
+  // The kernel is selected by the element type of Predictions and the place
+  // the op runs on.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("Predictions")->type()),
+        ctx.GetPlace());
+  }
+};
+
+// Declares the inputs, outputs, attributes and user-facing documentation of
+// the mean_iou operator.
+class MeanIoUOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Predictions",
+             "(Tensor), A Tensor of prediction results for semantic labels"
+             " with type int32 or int64. The rank should be greater than 1.");
+    AddInput(
+        "Labels",
+        "(Tensor), A Tensor of ground truth labels with type int32 or int64. "
+        "Its shape should be the same as Input(Predictions).");
+    // The three In* inputs are optional lists carrying state from earlier
+    // batches; they are summed into the corresponding outputs.
+    AddInput("InWrongs",
+             "(vector<Tensor>), A list of Tensor with shape "
+             "[num_classes]. They are used to collect wrong number among "
+             "batches. Empty list is also valid here.")
+        .AsDuplicable()
+        .AsDispensable();
+    AddInput(
+        "InCorrects",
+        "(vector<Tensor>), A list of Tensor with shape "
+        "[num_classes]. They are used to collect correct number among batches. "
+        "Empty list is also valid here.")
+        .AsDuplicable()
+        .AsDispensable();
+    AddInput("InMeanIou",
+             "(vector<Tensor>), A list of Tensor that Output(mean_iou) should "
+             "be added to. Empty list is also valid here.")
+        .AsDuplicable()
+        .AsDispensable();
+    // OutMeanIou is a single tensor (not duplicable), so it is documented as
+    // a plain Tensor.
+    AddOutput("OutMeanIou",
+              "(Tensor), A Tensor representing the"
+              " mean intersection-over-union with shape [1].");
+    AddOutput("OutWrong", "(Tensor), A Tensor with shape [num_classes]. ");
+    AddOutput("OutCorrect", "(Tensor), A Tensor with shape [num_classes]. ");
+    AddAttr<int>("num_classes", "(int), The possible number of labels.");
+
+    AddComment(R"DOC(
+mean-IOU Operator.
+Mean Intersection-Over-Union is a common evaluation metric for
+semantic image segmentation, which first computes the IOU for each
+semantic class and then computes the average over classes. 
+IOU is defined as follows: 
+    IOU = true_positive / (true_positive + false_positive + false_negative).
+It is based on pixel level area while "IOU Similarity Operator" 
+is based on area of rectangle.
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+// mean_iou has no gradient, hence EmptyGradOpMaker.
+REGISTER_OPERATOR(mean_iou, ops::MeanIoUOp, ops::MeanIoUOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+// The template argument is the element type of Predictions / Labels.
+// int32_t is a typedef of int on the common ILP32/LP64/LLP64 data models, so
+// listing both would instantiate and register the same kernel twice.
+REGISTER_OP_CPU_KERNEL(mean_iou, ops::MeanIoUKernel<int32_t>,
+                       ops::MeanIoUKernel<int64_t>);
--- a/paddle/fluid/operators/mean_iou_op.cu
+++ b/paddle/fluid/operators/mean_iou_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/mean_iou_op.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/gpu_info.h"
+
+namespace paddle {
+namespace operators {
+
+using platform::PADDLE_CUDA_NUM_THREADS;
+
+// Grid-stride loop header: `i` starts at the thread's global 1-D index and
+// advances by the total number of launched threads until it reaches `n`.
+// Must be followed by the loop body.
+#define CUDA_1D_KERNEL_LOOP(i, n)                              \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+       i += blockDim.x * gridDim.x)
+
+// Tallies per-class hits and misses for `count` (prediction, label) pairs.
+// Each block counts into dynamic shared memory laid out as
+// [wrong(num_classes) | correct(num_classes)] and afterwards adds its tallies
+// to the global `wrong` / `correct` arrays with atomicAdd.
+template <typename T>
+__global__ void CountCUDAKernel(const int num_classes, const int count,
+                                const T* predictions, const T* labels,
+                                int* wrong, int* correct) {
+  extern __shared__ int block_cache[];
+  int* const block_wrong = block_cache;
+  int* const block_correct = block_cache + num_classes;
+
+  // Zero both halves of the per-block cache.
+  const int cache_len = num_classes * 2;
+  for (int idx = threadIdx.x; idx < cache_len; idx += blockDim.x) {
+    block_cache[idx] = 0;
+  }
+  __syncthreads();
+
+  CUDA_1D_KERNEL_LOOP(i, count) {
+    const T pred = predictions[i];
+    const T label = labels[i];
+    if (pred != label) {
+      // A mismatch counts against both the predicted and the true class.
+      atomicAdd(block_wrong + pred, 1);
+      atomicAdd(block_wrong + label, 1);
+    } else {
+      atomicAdd(block_correct + pred, 1);
+    }
+  }
+
+  // Every thread of the block must finish counting before the flush.
+  __syncthreads();
+
+  for (int cls = threadIdx.x; cls < num_classes; cls += blockDim.x) {
+    atomicAdd(wrong + cls, block_wrong[cls]);
+    atomicAdd(correct + cls, block_correct[cls]);
+  }
+}
+
+// Turns per-class wrong/correct counts into per-class IoU values (`ious`) and
+// adds their mean over the classes that occurred at least once to iou[0].
+//
+// The valid-class counter lives in per-block shared memory and
+// __syncthreads() only orders threads of one block, so the result is only
+// meaningful when the kernel is launched with a single block.
+__global__ void ComputeIoUCUDAKernel(const int num_classes, int* wrong,
+                                     int* correct, float* ious, float* iou) {
+  __shared__ int valid_count_c;
+  if (threadIdx.x == 0) {
+    valid_count_c = 0;
+  }
+  __syncthreads();
+  CUDA_1D_KERNEL_LOOP(i, num_classes) {
+    int wrong_n = wrong[i];
+    int correct_n = correct[i];
+    int denominator = wrong_n + correct_n;
+    if (denominator > 0) {
+      atomicAdd(&valid_count_c, 1);
+      ious[i] = static_cast<float>(correct_n) / denominator;
+    } else {
+      // Classes that never occurred contribute nothing to the sum.
+      ious[i] = 0;
+    }
+  }
+  __syncthreads();
+  // With no valid class the mean is undefined; leave iou[0] untouched instead
+  // of accumulating 0 / 0 (NaN) into it.
+  if (threadIdx.x == 0 && valid_count_c > 0) {
+    float iou_sum = 0;
+    for (int i = 0; i < num_classes; ++i) {
+      iou_sum += ious[i];
+    }
+    iou[0] += iou_sum / valid_count_c;
+  }
+}
+
+// CUDA kernel functor for mean_iou. T is the element type of Predictions and
+// Labels; the count outputs are int and the mean IoU output is float.
+template <typename T>
+class MeanIoUCUDAOpKernel : public framework::OpKernel<T> {
+ public:
+  // Zeroes the outputs, folds in the optional InMeanIou / InWrongs /
+  // InCorrects inputs, then launches CountCUDAKernel followed by
+  // ComputeIoUCUDAKernel on the context's CUDA stream.
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& place = *ctx.template device_context<platform::CUDADeviceContext>()
+                       .eigen_device();
+    // get input and output tensor
+    auto* predictions = ctx.Input<Tensor>("Predictions");
+    auto* labels = ctx.Input<Tensor>("Labels");
+    auto* out_mean_iou = ctx.Output<Tensor>("OutMeanIou");
+    auto* out_wrong = ctx.Output<Tensor>("OutWrong");
+    auto* out_correct = ctx.Output<Tensor>("OutCorrect");
+    int num_classes = static_cast<int>(ctx.Attr<int>("num_classes"));
+
+    // Get data ptr
+    const T* predictions_data = predictions->data<T>();
+    const T* labels_data = labels->data<T>();
+    int* out_wrong_data = out_wrong->mutable_data<int>(ctx.GetPlace());
+    int* out_correct_data = out_correct->mutable_data<int>(ctx.GetPlace());
+    float* out_mean_iou_data =
+        out_mean_iou->mutable_data<float>(ctx.GetPlace());
+
+    // Get Eigen tensor
+    auto out_mean_iou_t = EigenTensor<float, 1>::From(*out_mean_iou);
+    auto out_wrong_t = EigenTensor<int, 1>::From(*out_wrong);
+    auto out_correct_t = EigenTensor<int, 1>::From(*out_correct);
+
+    // Temporary tensor holding one IoU value per class
+    Tensor ious;
+    float* ious_data = ious.mutable_data<float>(
+        {static_cast<int64_t>(num_classes)}, ctx.GetPlace());
+    auto ious_t = EigenTensor<float, 1>::From(ious);
+
+    // Init out_wrong, out_correct and out_mean_iou
+    out_wrong_t.device(place) = out_wrong_t.constant(0);
+    out_correct_t.device(place) = out_correct_t.constant(0);
+    out_mean_iou_t.device(place) = out_mean_iou_t.constant(0.0f);
+
+    // collect pre wrong, correct and mean_iou
+    // (size_t indices: the bounds are container sizes, as in the CPU kernel)
+    auto in_mean_ious = ctx.MultiInput<Tensor>("InMeanIou");
+    for (size_t i = 0; i < in_mean_ious.size(); ++i) {
+      out_mean_iou_t.device(place) +=
+          EigenTensor<float, 1>::From(*in_mean_ious[i]);
+    }
+    auto in_wrongs = ctx.MultiInput<Tensor>("InWrongs");
+    for (size_t i = 0; i < in_wrongs.size(); ++i) {
+      out_wrong_t.device(place) += EigenTensor<int, 1>::From(*in_wrongs[i]);
+    }
+    auto in_corrects = ctx.MultiInput<Tensor>("InCorrects");
+    for (size_t i = 0; i < in_corrects.size(); ++i) {
+      out_correct_t.device(place) += EigenTensor<int, 1>::From(*in_corrects[i]);
+    }
+    // compute
+    auto stream = ctx.cuda_device_context().stream();
+    int block = PADDLE_CUDA_NUM_THREADS;
+    int grid = (predictions->numel() + block - 1) / block;
+    // Dynamic shared memory for the per-block wrong/correct tallies.
+    int cache_size = (num_classes * 2 + 1) * sizeof(int);
+    CountCUDAKernel<T><<<grid, block, cache_size, stream>>>(
+        num_classes, predictions->numel(), predictions_data, labels_data,
+        out_wrong_data, out_correct_data);
+    ctx.device_context().Wait();
+    // Single block: the IoU kernel keeps its valid-class counter in shared
+    // memory.
+    ComputeIoUCUDAKernel<<<1, block, 0, stream>>>(num_classes, out_wrong_data,
+                                                  out_correct_data, ious_data,
+                                                  out_mean_iou_data);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+// The template argument is the element type of Predictions / Labels.
+// int32_t is a typedef of int on the common ILP32/LP64/LLP64 data models, so
+// listing both would instantiate and register the same kernel twice.
+REGISTER_OP_CUDA_KERNEL(mean_iou, ops::MeanIoUCUDAOpKernel<int32_t>,
+                        ops::MeanIoUCUDAOpKernel<int64_t>);
--- a/paddle/fluid/operators/mean_iou_op.h
+++ b/paddle/fluid/operators/mean_iou_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <algorithm>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+// Short name for the framework tensor type used throughout this operator.
+using Tensor = framework::Tensor;
+
+// Alias for framework::EigenTensor that defaults to row-major layout and
+// Eigen::DenseIndex indices.
+template <typename T, int D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+
+// CPU kernel for mean_iou. T is the element type of Predictions and Labels;
+// the count outputs are int and the mean IoU output is float.
+template <typename T>
+class MeanIoUKernel : public framework::OpKernel<T> {
+ public:
+  // Zeroes the outputs, folds in the optional InMeanIou / InWrongs /
+  // InCorrects inputs, counts correct and wrong predictions per class, and
+  // adds the mean IoU over the classes that occurred to OutMeanIou.
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& place = *ctx.template device_context<platform::CPUDeviceContext>()
+                       .eigen_device();
+    // get input and output tensor
+    auto* predictions = ctx.Input<Tensor>("Predictions");
+    auto* labels = ctx.Input<Tensor>("Labels");
+    auto* out_mean_iou = ctx.Output<Tensor>("OutMeanIou");
+    auto* out_wrong = ctx.Output<Tensor>("OutWrong");
+    auto* out_correct = ctx.Output<Tensor>("OutCorrect");
+    int num_classes = static_cast<int>(ctx.Attr<int>("num_classes"));
+
+    // get data ptr
+    const T* predictions_data = predictions->data<T>();
+    const T* labels_data = labels->data<T>();
+    float* out_mean_iou_data =
+        out_mean_iou->mutable_data<float>(ctx.GetPlace());
+    int* out_wrong_data = out_wrong->mutable_data<int>(ctx.GetPlace());
+    int* out_correct_data = out_correct->mutable_data<int>(ctx.GetPlace());
+
+    // get eigen tensor
+    auto out_mean_iou_t = EigenTensor<float, 1>::From(*out_mean_iou);
+    auto out_wrong_t = EigenTensor<int, 1>::From(*out_wrong);
+    auto out_correct_t = EigenTensor<int, 1>::From(*out_correct);
+
+    // Tmp tensor
+    Tensor denominator;
+    Tensor valid_count;
+    Tensor iou_sum;
+
+    // get data ptr of tmp tensor
+    int* denominator_data = denominator.mutable_data<int>(
+        {static_cast<int64_t>(num_classes)}, ctx.GetPlace());
+    int* valid_count_data = valid_count.mutable_data<int>({1}, ctx.GetPlace());
+    float* iou_sum_data = iou_sum.mutable_data<float>({1}, ctx.GetPlace());
+
+    // get eigen tensor of tmp tensor
+    auto denominator_t = EigenTensor<int, 1>::From(denominator);
+    auto valid_count_t = EigenTensor<int, 1>::From(valid_count);
+    auto iou_sum_t = EigenTensor<float, 1>::From(iou_sum);
+
+    // init out_wrong, out_correct and out_mean_iou
+    out_wrong_t = out_wrong_t.constant(0);
+    out_correct_t = out_correct_t.constant(0);
+    out_mean_iou_t = out_mean_iou_t.constant(0);
+
+    // collect pre wrong, correct and mean_iou
+    auto in_mean_ious = ctx.MultiInput<Tensor>("InMeanIou");
+    for (size_t i = 0; i < in_mean_ious.size(); ++i) {
+      out_mean_iou_t.device(place) +=
+          EigenTensor<float, 1>::From(*in_mean_ious[i]);
+    }
+    auto in_wrongs = ctx.MultiInput<Tensor>("InWrongs");
+    for (size_t i = 0; i < in_wrongs.size(); ++i) {
+      out_wrong_t.device(place) += EigenTensor<int, 1>::From(*in_wrongs[i]);
+    }
+    auto in_corrects = ctx.MultiInput<Tensor>("InCorrects");
+    for (size_t i = 0; i < in_corrects.size(); ++i) {
+      out_correct_t.device(place) += EigenTensor<int, 1>::From(*in_corrects[i]);
+    }
+
+    // compute: a match counts as correct for its class, a mismatch counts as
+    // wrong for both the true and the predicted class
+    for (int64_t i = 0; i < predictions->numel(); ++i) {
+      if (predictions_data[i] == labels_data[i]) {
+        out_correct_data[predictions_data[i]] += 1;
+      } else {
+        out_wrong_data[labels_data[i]] += 1;
+        out_wrong_data[predictions_data[i]] += 1;
+      }
+    }
+
+    // A class is valid when it occurred at least once (denominator > 0).
+    denominator_t = out_wrong_t + out_correct_t;
+    valid_count_t =
+        (denominator_t > denominator_t.constant(0)).cast<int>().sum();
+
+    // Avoid 0 / 0 for classes that never occurred; their numerator is 0, so
+    // they add nothing to the sum.
+    for (int i = 0; i < num_classes; ++i) {
+      if (denominator_data[i] == 0) {
+        denominator_data[i] = 1;
+      }
+    }
+
+    iou_sum_t =
+        (out_correct_t.cast<float>() / denominator_t.cast<float>()).sum();
+    // With no valid class the mean is undefined; leave the accumulated value
+    // untouched instead of adding 0 / 0 (NaN) to it.
+    if (valid_count_data[0] > 0) {
+      out_mean_iou_data[0] += (iou_sum_data[0] / valid_count_data[0]);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/merge_ids_op.cc
+++ b/paddle/fluid/operators/merge_ids_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/merge_ids_op.h"
+
+namespace paddle {
+namespace operators {
+
+// Declares the inputs, output and user-facing documentation of merge_ids.
+class MergeIdsOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Ids", "(LoDTensor) the input ids with shape{batch_num, 1}");
+    // One X tensor per shard, hence duplicable.
+    AddInput(
+        "X",
+        "(LoDTensors) multi input tensor with shape{batch_num, N}, N is the "
+        "size of embedding table")
+        .AsDuplicable();
+    AddOutput("Out", "(LoDTensor) The merged outputs of the input tensors.");
+
+    AddComment(R"DOC(
+Merge multi LoDTensor's into one according to Ids's shard num.
+
+
+split_ids_op -> prefetch_op -> merge_ids_op
+
+
+merge_ids_op should be used after split_ids_op and prefetch_op, split_ids_op
+ will split input Ids into multiple tensors according to Id's shard number.
+prefetch_op will send them to parameter server to prefetch embedding value
+back. During split, the order of ids is disordered. In merge_ids_op we use
+the original Ids to restore the order of the fetched embedding value and
+ also pass the lod information to the merged output.
+
+
+Example:
+
+    Ids = [1,2,3,4,5,6] # 3 shards
+
+split_ids_op ->
+
+    Id0 = [3, 6] # id % 3 == 0
+    Id1 = [1, 4] # id % 3 == 1
+    Id2 = [2, 5] # id % 3 == 2
+
+prefetch_op ->
+
+    X0 = [[0.3 0.3]   # 3
+          [0.6 0.6]]  # 6
+    X1 = [[0.1 0.1]   # 1
+          [0.4 0.4]]  # 4
+    X2 = [[0.2 0.2]   # 2
+          [0.5 0.5]]  # 5
+
+merge_ids_op ->
+
+    Out = [[0.1 0.1]  # 1
+           [0.2 0.2]  # 2
+           [0.3 0.3]  # 3
+           [0.4 0.4]  # 4
+           [0.5 0.5]  # 5
+           [0.6 0.6]] # 6
+)DOC");
+  }
+};
+
+// Operator definition for merge_ids: checks that Ids, X and Out are present,
+// validates their variable types and forwards the LoD of Ids to Out.
+class MergeIdsOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Ids"), "MergeIdsOp must has input Ids.");
+    PADDLE_ENFORCE(ctx->HasInputs("X"), "MergeIdsOp must has input X.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "MergeIdsOp must has output Out.");
+
+    // A LoDTensor Ids input must have shape {batch, 1}.
+    const auto ids_type = ctx->GetInputsVarType("Ids").front();
+    const auto ids_dims = ctx->GetInputDim("Ids");
+    const bool ids_is_lod_tensor =
+        (ids_type == framework::proto::VarType::LOD_TENSOR);
+    if (ids_is_lod_tensor) {
+      PADDLE_ENFORCE_EQ(ids_dims.size(), 2);
+      PADDLE_ENFORCE_EQ(ids_dims[1], 1);
+    }
+
+    // Every shard passed through X has to be a LoDTensor.
+    const auto x_types = ctx->GetInputsVarType("X");
+    for (auto it = x_types.begin(); it != x_types.end(); ++it) {
+      const auto &var_type = *it;
+      PADDLE_ENFORCE_EQ(var_type, framework::proto::VarType::LOD_TENSOR,
+                        "input X only support lod tensors");
+    }
+
+    ctx->ShareLoD("Ids", "Out");
+  }
+
+ private:
+  // The kernel is selected by the element type of the first X tensor and the
+  // place the op runs on.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    const auto x_inputs = ctx.MultiInput<framework::Tensor>("X");
+    const auto data_type = framework::ToDataType(x_inputs.front()->type());
+    return framework::OpKernelType(data_type, ctx.GetPlace());
+  }
+};
+
+// Variable-type inference for merge_ids: every Out variable takes the
+// variable type of the first Ids input.
+class MergeIdsOpInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const override {
+    const std::string ids_name = op_desc.Input("Ids")[0];
+    auto *ids_desc = block->Var(ids_name);
+    for (const auto &out_name : op_desc.Output("Out")) {
+      auto *out_desc = block->Var(out_name);
+      out_desc->SetType(ids_desc->GetType());
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+// Registers merge_ids with its proto maker and variable-type inference.
+REGISTER_OPERATOR(merge_ids, ops::MergeIdsOp, ops::MergeIdsOpMaker,
+                  ops::MergeIdsOpInferVarType);
+// Only a CPU float kernel is registered for this operator.
+REGISTER_OP_CPU_KERNEL(
+    merge_ids, ops::MergeIdsOpKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/fluid/operators/merge_ids_op.h
+++ b/paddle/fluid/operators/merge_ids_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
+
+namespace paddle {
+namespace operators {
+
+// Kernel for merge_ids: rebuilds the rows fetched per shard (X) in the order
+// given by Ids. Row i of Out is the next unused row of shard `Ids[i] %
+// shard_num`, where shard_num is the number of X tensors.
+template <typename DeviceContext, typename T>
+class MergeIdsOpKernel : public framework::OpKernel<T> {
+ public:
+  // Throws when run on a non-CPU place, when Ids is not a LoDTensor, when the
+  // shards disagree on the embedding size, or when the row counts of the
+  // shards do not match the ids mapped to them.
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto place = ctx.GetPlace();
+    if (!platform::is_cpu_place(place)) {
+      PADDLE_THROW("MergeIds do not support GPU kernel");
+    }
+    VLOG(3) << "run in MergeIdsOpKernel";
+
+    const auto *ids_var = ctx.InputVar("Ids");
+    PADDLE_ENFORCE(ids_var->IsType<framework::LoDTensor>(),
+                   "only support to merge Ids of LoDTensor");
+
+    const auto &ids_tensor = ids_var->Get<framework::LoDTensor>();
+    const auto &ids_dims = ids_tensor.dims();
+    const int64_t *ids = ids_tensor.data<int64_t>();
+
+    auto x_tensors = ctx.MultiInput<framework::LoDTensor>("X");
+
+    auto *out = ctx.Output<framework::LoDTensor>("Out");
+
+    // Sum the rows of all non-empty shards and make sure they agree on the
+    // embedding width.
+    int batch_size = 0;
+    int embedding_size = 0;
+    for (auto &input : x_tensors) {
+      if (framework::product(input->dims()) != 0) {
+        if (embedding_size == 0) {
+          embedding_size = input->dims()[1];
+        }
+        PADDLE_ENFORCE_EQ(embedding_size, input->dims()[1],
+                          "embedding size of all input should be the same");
+        batch_size += input->dims()[0];
+      }
+    }
+    PADDLE_ENFORCE_EQ(
+        batch_size, ids_dims[0],
+        "the batch size of ids and merged embedding value should be the same");
+
+    const size_t shard_num = x_tensors.size();
+
+    if (shard_num == 1) {
+      VLOG(3) << "only one shard, we can copy the data directly";
+      TensorCopy(*x_tensors[0], place, out);
+    } else {
+      // in_indexs[s] is the next unread row of shard s.
+      std::vector<int> in_indexs(shard_num, 0);
+      auto *out_data = out->mutable_data<T>(
+          framework::make_ddim({batch_size, embedding_size}), place);
+      // copy data from ins[shard_num] to out.
+      for (int64_t i = 0; i < ids_dims[0]; ++i) {
+        int64_t id = ids[i];
+        size_t shard_id = static_cast<size_t>(id) % shard_num;
+        int index = in_indexs[shard_id];
+        // Check before copying: otherwise a shard with fewer rows than the
+        // ids mapped to it is read out of bounds, and the consistency check
+        // after the loop comes too late.
+        PADDLE_ENFORCE_LT(index, x_tensors[shard_id]->dims()[0],
+                          "shard has fewer rows than the ids mapped to it");
+        memcpy(out_data + embedding_size * i,
+               x_tensors[shard_id]->data<T>() + index * embedding_size,
+               sizeof(T) * embedding_size);
+        in_indexs[shard_id] += 1;
+      }
+
+      for (size_t i = 0; i < shard_num; ++i) {
+        PADDLE_ENFORCE_EQ(in_indexs[i], x_tensors[i]->dims()[0],
+                          "after merge, all data in x_tensor should be used");
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/reader/create_batch_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_batch_reader_op.cc
@@ -20,7 +20,7 @@ namespace reader {

 class BatchReader : public framework::DecoratedReader {
 public:
-  BatchReader(ReaderBase* reader, int batch_size)
+  BatchReader(const std::shared_ptr<ReaderBase>& reader, int batch_size)
      : DecoratedReader(reader), batch_size_(batch_size) {
    buffer_.reserve(batch_size_);
  }

--- a/paddle/fluid/operators/reader/create_custom_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_custom_reader_op.cc
@@ -22,7 +22,8 @@ namespace reader {

 class CustomReader : public framework::DecoratedReader {
 public:
-  CustomReader(ReaderBase* reader, const framework::BlockDesc& sub_block,
+  CustomReader(const std::shared_ptr<ReaderBase>& reader,
+               const framework::BlockDesc& sub_block,
               const std::vector<std::string>& source_var_names,
               const std::vector<std::string>& sink_var_names)
      : DecoratedReader(reader),

--- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
@@ -34,7 +34,8 @@ static constexpr size_t kChannelSize = 1;  // kCacheSize - 2
 class DoubleBufferReader : public framework::DecoratedReader {
 public:
  explicit DoubleBufferReader(
-      ReaderBase* reader, platform::Place target_place = platform::CPUPlace())
+      const std::shared_ptr<ReaderBase>& reader,
+      platform::Place target_place = platform::CPUPlace())
      : DecoratedReader(reader), place_(target_place) {
    cpu_tensor_cache_.resize(kCacheSize);
    gpu_tensor_cache_.resize(kCacheSize);

--- a/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc
@@ -21,7 +21,7 @@ namespace reader {

 class MultiPassReader : public framework::DecoratedReader {
 public:
-  MultiPassReader(ReaderBase* reader, int pass_num)
+  MultiPassReader(const std::shared_ptr<ReaderBase>& reader, int pass_num)
      : DecoratedReader(reader), pass_num_(pass_num), pass_count_(0) {}

  void ReadNext(std::vector<framework::LoDTensor>* out) override {

--- a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
@@ -23,7 +23,8 @@ namespace reader {

 class ShuffleReader : public framework::DecoratedReader {
 public:
-  ShuffleReader(ReaderBase* reader, size_t buffer_size, size_t seed = 0)
+  ShuffleReader(const std::shared_ptr<ReaderBase>& reader, size_t buffer_size,
+                size_t seed = 0)
      : DecoratedReader(reader), buffer_size_(buffer_size), seed_(seed) {
    VLOG(10) << "Create shuffle reader of " << reader_;
    if (seed_ == 0) {

--- a/paddle/fluid/operators/reader/create_threaded_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_threaded_reader_op.cc
@@ -21,7 +21,8 @@ namespace reader {

 class ThreadedReader : public framework::DecoratedReader {
 public:
-  explicit ThreadedReader(ReaderBase* reader) : DecoratedReader(reader) {}
+  explicit ThreadedReader(const std::shared_ptr<ReaderBase>& reader)
+      : DecoratedReader(reader) {}

  void ReadNext(std::vector<framework::LoDTensor>* out) override {
    std::lock_guard<std::mutex> lock(mutex_);

--- a/paddle/fluid/platform/cpu_info.cc
+++ b/paddle/fluid/platform/cpu_info.cc
@@ -21,12 +21,17 @@ limitations under the License. */
 #include <unistd.h>
 #endif

+#include <algorithm>
 #include "gflags/gflags.h"

 DEFINE_double(fraction_of_cpu_memory_to_use, 1,
              "Default use 100% of CPU memory for PaddlePaddle,"
              "reserve the rest for page tables, etc");

+DEFINE_uint64(
+    initial_cpu_memory_in_mb, 500,
+    "Default initial 500MB of CPU memory for PaddlePaddle, in MB unit.");
+
 DEFINE_double(
    fraction_of_cuda_pinned_memory_to_use, 0.5,
    "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle,"
@@ -54,7 +59,10 @@ inline size_t CpuTotalPhysicalMemory() {
 size_t CpuMaxAllocSize() {
  // For distributed systems, it requires configuring and limiting
  // the fraction of memory to use.
-  return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory();
+  return std::min(
+      static_cast<size_t>(FLAGS_fraction_of_cpu_memory_to_use *
+                          CpuTotalPhysicalMemory()),
+      static_cast<size_t>(FLAGS_initial_cpu_memory_in_mb * 1 << 20));
 }

 size_t CpuMinChunkSize() {

--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
@@ -322,7 +322,6 @@ class DeviceTracerImpl : public DeviceTracer {
    DisableActivity();
    dynload::cuptiUnsubscribe(subscriber_);
    CUPTI_CALL(dynload::cuptiGetTimestamp(&end_ns_));
-    PADDLE_ENFORCE(dynload::cuptiFinalize());
    enabled_ = false;
  }


--- a/paddle/fluid/platform/dynload/cupti.h
+++ b/paddle/fluid/platform/dynload/cupti.h
@@ -72,7 +72,6 @@ extern void *cupti_dso_handle;
  __macro(cuptiGetResultString);              \
  __macro(cuptiActivityGetNumDroppedRecords); \
  __macro(cuptiActivityFlushAll);             \
-  __macro(cuptiFinalize);                     \
  __macro(cuptiSubscribe);                    \
  __macro(cuptiUnsubscribe);                  \
  __macro(cuptiEnableCallback);               \

--- a/paddle/fluid/platform/nccl_helper.h
+++ b/paddle/fluid/platform/nccl_helper.h
@@ -41,6 +41,11 @@ inline ncclDataType_t ToNCCLDataType(std::type_index type) {
  }
 }

+// NOTE(minqiyang): according to the ncclGroupEnd documentation:
+// https://docs.nvidia.com/deeplearning/sdk/nccl-api/ncclapidoc.html,
+// ncclGroupEnd will wait for all communicators to be initialized, which will
+// cause a blocking problem when a runtime_error is thrown, so only guard the
+// actual NCCL calls when using it.
 class NCCLGroupGuard {
 public:
  static std::mutex &NCCLMutex() {

--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -413,6 +413,9 @@ All parameter, weight, gradient are variables in Paddle.

  py::class_<framework::Executor>(m, "Executor")
      .def(py::init<const platform::Place &>())
+#ifdef PADDLE_WITH_DISTRIBUTE
+      .def("complete", &Executor::Complete)
+#endif
      .def("run",
           (void (Executor::*)(const ProgramDesc &, Scope *, int, bool, bool)) &
               Executor::Run);

--- a/paddle/math/MathFunctions.h
+++ b/paddle/math/MathFunctions.h
@@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#ifndef MATHFUNCTIONS_H_
-#define MATHFUNCTIONS_H_
+#pragma once

 #ifdef PADDLE_WITH_MKLML
 #include <mkl_cblas.h>
@@ -21,7 +20,7 @@ limitations under the License. */
 #include <mkl_vml_functions.h>
 #endif

-#if defined(PADDLE_USE_VECLIB)
+#ifdef PADDLE_USE_VECLIB
 extern "C" {
 #include <cblas.h>
 #include <clapack.h>
@@ -30,8 +29,10 @@ extern "C" {

 #ifdef PADDLE_USE_OPENBLAS
 #include <cblas.h>
+#ifdef LAPACK_FOUND
 #include <lapacke.h>
 #endif
+#endif

 #ifndef LAPACK_FOUND
 extern "C" {
@@ -126,5 +127,3 @@ template <class T>
 void vTanh(const int n, const T* a, T* r);

 }  // namespace paddle
-
-#endif  // MATHFUNCTIONS_H_
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -132,7 +132,8 @@ EOF
        -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
        -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \
        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
-        -DWITH_CONTRIB=${WITH_CONTRIB:-ON}
+        -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \
+        -DWITH_ANAKIN=ON
 }

 function abort(){

--- a/python/paddle/batch.py
+++ b/python/paddle/batch.py
@@ -15,7 +15,7 @@
 __all__ = ['batch']


-def batch(reader, batch_size, drop_last=False):
+def batch(reader, batch_size, drop_last=True):
    """
    Create a batched reader.


--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -382,7 +382,7 @@ class Operator(object):
        'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv',
        'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine',
        'ncclInit', 'channel_create', 'channel_close', 'channel_send',
-        'channel_recv', 'select'
+        'channel_recv', 'select', 'gen_nccl_id'
    }

    def __init__(self,

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
@@ -96,10 +96,11 @@ def train(use_cuda, train_program, params_dirname):
    train_reader = paddle.batch(
        paddle.reader.shuffle(
            cifar10_small_test_set.train10(batch_size=10), buf_size=128 * 10),
-        batch_size=BATCH_SIZE)
+        batch_size=BATCH_SIZE,
+        drop_last=False)

    test_reader = paddle.batch(
-        paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE)
+        paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE, drop_last=False)

    def event_handler(event):
        if isinstance(event, fluid.EndStepEvent):

--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
@@ -73,10 +73,11 @@ def train(use_cuda, train_program, params_dirname):
    train_reader = paddle.batch(
        paddle.reader.shuffle(
            cifar10_small_test_set.train10(batch_size=10), buf_size=128 * 10),
-        batch_size=BATCH_SIZE)
+        batch_size=BATCH_SIZE,
+        drop_last=False)

    test_reader = paddle.batch(
-        paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE)
+        paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE, drop_last=False)

    def event_handler(event):
        if isinstance(event, fluid.EndStepEvent):

--- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py
+++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py
@@ -87,7 +87,9 @@ def train(use_cuda, train_program, params_dirname):
    def event_handler(event):
        if isinstance(event, fluid.EndEpochEvent):
            test_reader = paddle.batch(
-                paddle.dataset.imdb.test(word_dict), batch_size=BATCH_SIZE)
+                paddle.dataset.imdb.test(word_dict),
+                batch_size=BATCH_SIZE,
+                drop_last=False)
            avg_cost, acc = trainer.test(
                reader=test_reader, feed_order=['words', 'label'])

@@ -113,7 +115,8 @@ def train(use_cuda, train_program, params_dirname):
    train_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.imdb.train(word_dict), buf_size=25000),
-        batch_size=BATCH_SIZE)
+        batch_size=BATCH_SIZE,
+        drop_last=False)

    trainer.train(
        num_epochs=1,

--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
@@ -56,7 +56,7 @@ BATCH_SIZE = 200

 # fix the order of training data
 train_reader = paddle.batch(
-    paddle.dataset.uci_housing.train(), batch_size=BATCH_SIZE)
+    paddle.dataset.uci_housing.train(), batch_size=BATCH_SIZE, drop_last=False)

 # train_reader = paddle.batch(
 #     paddle.reader.shuffle(

--- a/python/paddle/fluid/tests/unittests/test_mean_iou.py
+++ b/python/paddle/fluid/tests/unittests/test_mean_iou.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import division
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+# NumPy reference implementation of the mean-IOU computation used to build the
+# expected outputs of the "mean_iou" op test below.
+#
+# Args:
+#   predictions, labels: arrays of class indices; they must have the same shape
+#       and are flattened before use.
+#   num_classes: length of the per-class counters.
+#   in_wrongs, in_corrects: lists of (name, array) pairs whose arrays are added
+#       into the per-class wrong / correct counters before counting.
+#   in_mean_ious: list of (name, value) pairs added to the computed mean IOU.
+#
+# Returns:
+#   (mean_iou, out_wrong, out_correct).
+def compute_mean_iou(predictions, labels, num_classes, in_wrongs, in_corrects,
+                     in_mean_ious):
+    assert predictions.shape == labels.shape
+    predictions = predictions.flatten()
+    labels = labels.flatten()
+
+    # Seed the per-class counters with the incoming partial counts; the name in
+    # each (name, array) pair is ignored here.
+    out_wrong = np.zeros([num_classes]).astype("int32")
+    for _, wrong in in_wrongs:
+        out_wrong += wrong
+    out_correct = np.zeros([num_classes]).astype("int32")
+    for _, correct in in_corrects:
+        out_correct += correct
+
+    # A match counts once as correct for that class; a mismatch counts as wrong
+    # for both the predicted class and the labelled class.
+    for pred, label in zip(predictions, labels):
+        if pred == label:
+            out_correct[pred] += 1
+        else:
+            out_wrong[pred] += 1
+            out_wrong[label] += 1
+
+    denominator = out_wrong + out_correct
+    # Only classes that have a non-zero count take part in the average.
+    valid_count = (denominator != 0).sum()
+    # Replace zero denominators by one so the element-wise division is defined;
+    # out_correct is zero for those classes, so they contribute nothing.
+    denominator = np.where(denominator > 0, denominator,
+                           np.ones(denominator.shape))
+    # `/` is true division here (`from __future__ import division` at file top).
+    mean_iou = (out_correct / denominator).sum() / valid_count
+
+    # Add every incoming mean-IOU value on top of the freshly computed one.
+    for _, in_mean_iou in in_mean_ious:
+        mean_iou += in_mean_iou
+    return mean_iou, out_wrong, out_correct
+
+
+# Output check for the "mean_iou" op: random predictions/labels plus optional
+# accumulated inputs are fed to the op, and the expected outputs come from
+# compute_mean_iou. Subclasses change the scenario by overriding config().
+class TestMeanIOUOp(OpTest):
+    def setUp(self):
+        # config() supplies num_classes, image_size and the number of each
+        # optional accumulated input.
+        self.config()
+        self.op_type = "mean_iou"
+        # Random class indices in [0, num_classes) with shape image_size.
+        predictions = np.random.randint(0, self.num_classes,
+                                        self.image_size).astype("int32")
+        labels = np.random.randint(0, self.num_classes,
+                                   self.image_size).astype("int32")
+
+        # Each optional input is a list of (name, array) pairs.
+        in_wrongs = []
+        for i in range(self.in_wrong_num):
+            in_wrongs.append(("in_wrong_%d" % i, np.random.randint(
+                0, 10, [self.num_classes]).astype("int32")))
+
+        in_corrects = []
+        for i in range(self.in_correct_num):
+            in_corrects.append(("in_correct_%d" % i, np.random.randint(
+                0, 10, [self.num_classes]).astype("int32")))
+
+        in_mean_ious = []
+        for i in range(self.in_mean_iou_num):
+            in_mean_ious.append(("in_mean_iou_%d" % i, np.random.uniform(
+                0, 1, [1]).astype("float32")))
+
+        self.inputs = {
+            'Predictions': predictions,
+            'Labels': labels,
+            'InWrongs': in_wrongs,
+            'InCorrects': in_corrects,
+            'InMeanIou': in_mean_ious
+        }
+        # NOTE(review): `long` exists only in Python 2 - confirm the interpreter
+        # this test is expected to run under.
+        self.attrs = {'num_classes': long(self.num_classes)}
+        # Expected outputs come from the NumPy reference implementation.
+        mean_iou, out_wrong, out_correct = compute_mean_iou(
+            predictions, labels, self.num_classes, in_wrongs, in_corrects,
+            in_mean_ious)
+        self.outputs = {
+            'OutMeanIou': mean_iou,
+            'OutWrong': out_wrong,
+            'OutCorrect': out_correct
+        }
+
+    def config(self):
+        # Default scenario: 10 classes, 128x128 input, no accumulated inputs.
+        self.num_classes = 10
+        self.image_size = [128, 128]
+        self.in_wrong_num = 0
+        self.in_correct_num = 0
+        self.in_mean_iou_num = 0
+
+    def test_check_output(self):
+        self.check_output()
+
+
+# Variant of TestMeanIOUOp with 5 classes, a 100x128 input and two entries for
+# each accumulated input (InWrongs, InCorrects, InMeanIou).
+class TestCase1(TestMeanIOUOp):
+    def config(self):
+        self.num_classes = 5
+        self.image_size = [100, 128]
+        self.in_wrong_num = 2
+        self.in_correct_num = 2
+        self.in_mean_iou_num = 2
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_merge_ids_op.py
+++ b/python/paddle/fluid/tests/unittests/test_merge_ids_op.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+# Output check for the "merge_ids" op using a fixed, hand-written fixture.
+class TestMergeIdsOp(OpTest):
+    def setUp(self):
+        self.op_type = "merge_ids"
+        # Seven ids, each a row of one int64 value.
+        ids = np.array([[0], [2], [2], [3], [5], [5], [6]]).astype('int64')
+        # Three inputs holding rows of two floats; x1 is deliberately empty.
+        # NOTE(review): the fixture is consistent with rows being sharded by
+        # id % 3 (ids 0/3/6 -> x0, ids 2/2/5/5 -> x2, nothing -> x1) - confirm
+        # against the merge_ids op implementation.
+        x0 = np.array([[0.1, 0.2], [0.2, 0.3], [0.3, 0.4]]).astype('float32')
+        x1 = np.array([]).astype('float32')
+        x2 = np.array([[0.4, 0.5], [0.4, 0.5], [0.5, 0.6],
+                       [0.5, 0.6]]).astype('float32')
+        # Expected result: one row per id, in the order of `ids`.
+        out = np.array([[0.1, 0.2], [0.4, 0.5], [0.4, 0.5], [0.2, 0.3],
+                        [0.5, 0.6], [0.5, 0.6], [0.3, 0.4]]).astype('float32')
+        self.inputs = {'Ids': ids, "X": [('x0', x0), ('x1', x1), ('x2', x2)]}
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -515,35 +515,38 @@ class DistributeTranspiler:
                                       grad_to_block_id, None)

        # process distributed lookup_table
-        prefetch_block = None
+        prefetch_var_name_to_block_id = []
        if self.has_distributed_lookup_table:
            pserver_index = self.pserver_endpoints.index(endpoint)
            table_opt_block = self._create_table_optimize_block(
                pserver_index, pserver_program, pre_block_idx, grad_to_block_id)
-            prefetch_block = self._create_prefetch_block(
+            prefetch_var_name_to_block_id = self._create_prefetch_block(
                pserver_index, pserver_program, table_opt_block)

        # NOTE: if has_distributed_lookup_table is False, then prefetch_block will
        # not be executed, so it's safe to use optimize_block to hold the place
        if self.has_distributed_lookup_table:
-            assert prefetch_block is not None
+            assert len(prefetch_var_name_to_block_id) > 0
        else:
-            assert prefetch_block is None
-            prefetch_block = pserver_program.global_block()
+            assert len(prefetch_var_name_to_block_id) == 0
+
+        attrs = {
+            "OptimizeBlock": pserver_program.block(1),
+            "endpoint": endpoint,
+            "Fanin": self.trainer_num,
+            "sync_mode": self.sync_mode,
+            "grad_to_block_id": grad_to_block_id
+        }
+        if len(prefetch_var_name_to_block_id) > 0:
+            attrs['prefetch_var_name_to_block_id'] \
+                = prefetch_var_name_to_block_id

        # step5 append the listen_and_serv op
        pserver_program.global_block().append_op(
            type="listen_and_serv",
            inputs={'X': recv_inputs},
            outputs={},
-            attrs={
-                "OptimizeBlock": pserver_program.block(1),
-                "endpoint": endpoint,
-                "Fanin": self.trainer_num,
-                "PrefetchBlock": prefetch_block,
-                "sync_mode": self.sync_mode,
-                "grad_to_block_id": grad_to_block_id
-            })
+            attrs=attrs)

        pserver_program.sync_with_cpp()
        return pserver_program
@@ -608,8 +611,15 @@ class DistributeTranspiler:
    def _replace_lookup_table_op_with_prefetch(self, program,
                                               pserver_endpoints):
        # 1. replace lookup_table_op with split_ids_op -> prefetch_op -> sum_op
-        self.prefetch_input_vars = None
-        self.prefetch_output_vars = None
+        # self.all_prefetch_input_vars =
+        #       [[var0_prefetch_in_pserver0, var0_prefetch_in_pserver1]
+        #        [var1_prefetch_in_pserver0, var1_prefetch_in_pserver1]]
+        self.all_prefetch_input_vars = []
+
+        # self.all_prefetch_output_vars =
+        #       [[var0_prefetch_out_pserver0, var0_prefetch_out_pserver1]
+        #        [var1_prefetch_out_pserver0, var1_prefetch_out_pserver1]]
+        self.all_prefetch_output_vars = []

        continue_search_lookup_table_op = True
        while continue_search_lookup_table_op:
@@ -619,26 +629,27 @@ class DistributeTranspiler:
                if op.type == LOOKUP_TABLE_TYPE:
                    continue_search_lookup_table_op = True

-                    op_index = list(all_ops).index(op)
+                    lookup_table_op_index = list(all_ops).index(op)
                    ids_name = op.input("Ids")
                    out_name = op.output("Out")

-                    if self.prefetch_input_vars is None:
-                        ids_var = program.global_block().vars[ids_name[0]]
-                        self.prefetch_input_vars = self.create_splited_vars(
-                            source_var=ids_var,
-                            block=program.global_block(),
-                            tag="_prefetch_in_")
-                    if self.prefetch_output_vars is None:
-                        out_var = program.global_block().vars[out_name[0]]
-                        self.prefetch_output_vars = self.create_splited_vars(
-                            source_var=out_var,
-                            block=program.global_block(),
-                            tag="_prefetch_out_")
+                    ids_var = program.global_block().vars[ids_name[0]]
+                    prefetch_input_vars = self.create_splited_vars(
+                        source_var=ids_var,
+                        block=program.global_block(),
+                        tag="_prefetch_in_")
+                    self.all_prefetch_input_vars.append(prefetch_input_vars)
+
+                    out_var = program.global_block().vars[out_name[0]]
+                    prefetch_output_vars = self.create_splited_vars(
+                        source_var=out_var,
+                        block=program.global_block(),
+                        tag="_prefetch_out_")
+                    self.all_prefetch_output_vars.append(prefetch_output_vars)

                    # insert split_ids_op
                    program.global_block().insert_op(
-                        index=op_index,
+                        index=lookup_table_op_index,
                        type="split_ids",
                        inputs={
                            'Ids': [
@@ -646,14 +657,14 @@ class DistributeTranspiler:
                                for varname in ids_name
                            ]
                        },
-                        outputs={"Out": self.prefetch_input_vars})
+                        outputs={"Out": prefetch_input_vars})

                    # insert prefetch_op
                    program.global_block().insert_op(
-                        index=op_index + 1,
+                        index=lookup_table_op_index + 1,
                        type="prefetch",
-                        inputs={'X': self.prefetch_input_vars},
-                        outputs={"Out": self.prefetch_output_vars},
+                        inputs={'X': prefetch_input_vars},
+                        outputs={"Out": prefetch_output_vars},
                        attrs={
                            "epmap": pserver_endpoints,
                            RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
@@ -661,16 +672,21 @@ class DistributeTranspiler:

                    # insert concat_op
                    program.global_block().insert_op(
-                        index=op_index + 2,
-                        type="concat",
-                        inputs={'X': self.prefetch_output_vars},
+                        index=lookup_table_op_index + 2,
+                        type="merge_ids",
+                        inputs={
+                            'Ids': [
+                                program.global_block().vars[varname]
+                                for varname in ids_name
+                            ],
+                            'X': prefetch_output_vars
+                        },
                        outputs={
                            "Out": [
                                program.global_block().vars[varname]
                                for varname in out_name
                            ]
-                        },
-                        attrs={"axis": 0})
+                        })

                    # delete lookup_table_op
                    delete_ops(program.global_block(), [op])
@@ -709,30 +725,34 @@ class DistributeTranspiler:
                               optimize_block):
        # STEP: create prefetch block
        table_var = pserver_program.global_block().vars[self.table_name]
-        prefetch_block = pserver_program.create_block(optimize_block.idx)
-        trainer_ids = self.prefetch_input_vars[pserver_index]
-        pserver_ids = pserver_program.global_block().create_var(
-            name=trainer_ids.name,
-            type=trainer_ids.type,
-            shape=trainer_ids.shape,
-            dtype=trainer_ids.dtype)
-        trainer_out = self.prefetch_output_vars[pserver_index]
-        pserver_out = pserver_program.global_block().create_var(
-            name=trainer_out.name,
-            type=trainer_out.type,
-            shape=trainer_out.shape,
-            dtype=trainer_out.dtype)
-        prefetch_block.append_op(
-            type="lookup_sparse_table",
-            inputs={'Ids': pserver_ids,
-                    "W": table_var},
-            outputs={"Out": pserver_out},
-            attrs={
-                "is_sparse": True,  # has no effect on lookup_table op
-                "is_distributed": True,
-                "padding_idx": -1
-            })
-        return prefetch_block
+        prefetch_var_name_to_block_id = []
+        for index in range(len(self.all_prefetch_input_vars)):
+            prefetch_block = pserver_program.create_block(optimize_block.idx)
+            trainer_ids = self.all_prefetch_input_vars[index][pserver_index]
+            pserver_ids = pserver_program.global_block().create_var(
+                name=trainer_ids.name,
+                type=trainer_ids.type,
+                shape=trainer_ids.shape,
+                dtype=trainer_ids.dtype)
+            trainer_out = self.all_prefetch_output_vars[index][pserver_index]
+            pserver_out = pserver_program.global_block().create_var(
+                name=trainer_out.name,
+                type=trainer_out.type,
+                shape=trainer_out.shape,
+                dtype=trainer_out.dtype)
+            prefetch_block.append_op(
+                type="lookup_sparse_table",
+                inputs={'Ids': pserver_ids,
+                        "W": table_var},
+                outputs={"Out": pserver_out},
+                attrs={
+                    "is_sparse": True,  # has no effect on lookup_table op
+                    "is_distributed": True,
+                    "padding_idx": -1
+                })
+            prefetch_var_name_to_block_id.append(trainer_ids.name + ":" + str(
+                prefetch_block.idx))
+        return prefetch_var_name_to_block_id

    def _create_table_optimize_block(self, pserver_index, pserver_program,
                                     pre_block_idx, grad_to_block_id):

--- a/python/paddle/trainer_config_helpers/attrs.py
+++ b/python/paddle/trainer_config_helpers/attrs.py
@@ -240,14 +240,15 @@ class ExtraLayerAttribute(object):
    :type error_clipping_threshold: float
    :param drop_rate: Dropout rate. Dropout will create a mask on layer output.
                      The dropout rate is the zero rate of this mask. The
-                      details of what dropout is please refer to `here
-                      <https://www.cs.toronto.edu/~hinton/absps/
-                      JMLRdropout.pdf>`_.
+                      details of what dropout is please refer to `JMLRdropout
+                      <https://www.cs.toronto.edu/~hinton/absps/JMLRdropout.pdf
+                      >`_.
    :type drop_rate: float
    :param device: device ID of layer. device=-1, use CPU. device>=0, use GPU.
-                   The details allocation in parallel_nn please refer to `here
-                   <http://www.paddlepaddle.org/doc/ui/cmd_argument/
-                   use_case.html#case-2-specify-layers-in-different-devices>`_.
+                   The details allocation in parallel_nn please refer to `use_case
+                   <https://github.com/PaddlePaddle/Paddle/blob/develop/doc/v2
+                   /howto/cmd_parameter/use_case_en.md#case-2-specify-layers-in
+                   -different-devices>`_.
    :type device: int
    """


--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -2556,7 +2556,7 @@ def img_conv_layer(input,
    the output will be obtained by concatenating the two results.

    The details of grouped convolution, please refer to:
-    `ImageNet Classification with Deep Convolutional Neural Networks
+    `ImageNet Classification With Deep Convolutional Neural Networks
    <http://www.cs.toronto.edu/~kriz/imagenet_classification_with_deep_convolutional.pdf>`_
    
    The example usage is:
@@ -5678,8 +5678,8 @@ def warp_ctc_layer(input,
    <https://github.com/baidu-research/warp-ctc>`_ library, which is used in
    `Deep Speech 2: End-toEnd Speech Recognition in English and Mandarin
    <https://arxiv.org/pdf/1512.02595v1.pdf>`_, to compute Connectionist Temporal
-    Classification (CTC) loss. Besides, another `warp-ctc
-    <https://github.com/gangliao/warp-ctc>`_ repository, which is forked from
+    Classification (CTC) loss. Besides, another `warp-ctc repository
+    <https://github.com/gangliao/warp-ctc>`_ , which is forked from
    the official one, is maintained to enable more compiling options. During the
    building process, PaddlePaddle will clone the source codes, build and
    install it to :code:`third_party/install/warpctc` directory.

--- a/python/paddle/v2/minibatch.py
+++ b/python/paddle/v2/minibatch.py
@@ -15,7 +15,7 @@
 __all__ = ['batch']


-def batch(reader, batch_size, drop_last=False):
+def batch(reader, batch_size, drop_last=True):
    """
    Create a batched reader.


--- a/tools/codestyle/docstring_checker.py
+++ b/tools/codestyle/docstring_checker.py
--- a/tools/codestyle/pylint_pre_commit.hook
+++ b/tools/codestyle/pylint_pre_commit.hook
@@ -7,13 +7,13 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 export PYTHONPATH=$DIR:$PYTHONPATH

 # The trick to remove deleted files: https://stackoverflow.com/a/2413151
-for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}'); do
+for file in $(git diff --name-status | awk '$1 != "D" {print $2}'); do
    pylint --disable=all --load-plugins=docstring_checker \
    --enable=doc-string-one-line,doc-string-end-with,doc-string-with-all-args,doc-string-triple-quotes,doc-string-missing,doc-string-indent-error,doc-string-with-returns,doc-string-with-raises $file;
    TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?);
 done

-#exit $TOTAL_ERRORS
+exit $TOTAL_ERRORS
 #For now, just warning:
-exit 0
+#exit 0