Merge branch 'develop' of github.com:PaddlePaddle/Paddle into overlap_memcpy_with_dist

3d875b69 · Yancey1989 · f52d78d1 · d07d9535 · 3d875b69 · 3d875b69
87 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -61,6 +61,7 @@ option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen"            OFF)
 option(WITH_ARM_FP16    "Use half precision support on armv8.2-a cpu"   OFF)
 option(WITH_FAST_BUNDLE_TEST    "Bundle tests that can be run in a single process together to reduce launch overhead"   OFF)
 option(WITH_CONTRIB     "Compile the third-party contributation"        OFF)
+option(WITH_ANAKIN      "Compile with Anakin library"                   OFF)
 option(WITH_GRPC     "Use grpc as the default rpc framework"            ${WITH_DISTRIBUTE})
 # CMAKE_BUILD_TYPE
@@ -193,7 +194,10 @@ set(EXTERNAL_LIBS
 if(WITH_GPU)
    include(cuda)
    include(tensorrt)
-endif(WITH_GPU)
+    include(external/anakin)
+else()
+  set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when GPU is set." FORCE)
+endif()
 if(WITH_AMD_GPU)
    find_package(HIP)

--- a/benchmark/fluid/fluid_benchmark.py
+++ b/benchmark/fluid/fluid_benchmark.py
@@ -180,7 +180,7 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
        print_train_time(start_time, time.time(), num_samples)
        print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses))),
        # evaluation
-        if not args.no_test and batch_acc:
+        if not args.no_test and batch_acc and not args.use_reader_op:
            pass_test_acc = test(exe, infer_prog, test_reader, feeder,
                                 batch_acc)
            print(", Test Accuracy: %f" % pass_test_acc)
@@ -277,11 +277,12 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
            batch_id += 1
        print_train_time(start_time, time.time(), num_samples)
-        if not args.no_test and batch_acc:
+        if not args.no_test and batch_acc and not args.use_reader_op:
+            # RecordIO is not implemented for the test reader yet,
+            # so skip the test when args.use_reader_op is set.
            test_acc = test(startup_exe, infer_prog, test_reader, feeder,
                            batch_acc)
            print("Pass: %d, Test Accuracy: %f\n" % (pass_id, test_acc))
-        exit(0)
 def print_arguments(args):

--- a/benchmark/fluid/models/resnet.py
+++ b/benchmark/fluid/models/resnet.py
@@ -199,7 +199,10 @@ def get_model(args):
    batched_train_reader = paddle.batch(
        paddle.reader.shuffle(
            train_reader, buf_size=5120),
-        batch_size=args.batch_size * args.gpus)
+        batch_size=args.batch_size * args.gpus,
-    batched_test_reader = paddle.batch(train_reader, batch_size=args.batch_size)
+        drop_last=True)
+    batched_test_reader = paddle.batch(
+        train_reader, batch_size=args.batch_size, drop_last=True)
-    return avg_cost, inference_program, optimizer, batched_train_reader, batched_test_reader, batch_acc
+    return avg_cost, inference_program, optimizer, batched_train_reader,\
+                   batched_test_reader, batch_acc
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -118,6 +118,10 @@ endif()
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}")
+if(WITH_DISTRIBUTE)
+  add_definitions(-DPADDLE_WITH_DISTRIBUTE)
+endif()
 if(WITH_GOLANG)
  # we need to symlink Paddle directory into GOPATH. If we
  # don't do it and we have code that depends on Paddle, go

--- a/cmake/external/anakin.cmake
+++ b/cmake/external/anakin.cmake
+if (NOT WITH_ANAKIN)
+  return()
+endif()
+set(ANAKIN_INSTALL_DIR "${THIRD_PARTY_PATH}/install/anakin" CACHE PATH
+  "Anakin install path." FORCE)
+set(ANAKIN_INCLUDE "${ANAKIN_INSTALL_DIR}" CACHE STRING "root of Anakin header files")
+set(ANAKIN_LIBRARY "${ANAKIN_INSTALL_DIR}" CACHE STRING "path of Anakin library")
+set(ANAKIN_COMPILE_EXTRA_FLAGS -Wno-error=unused-variable -Wno-error=format-extra-args -Wno-error=comment -Wno-error=format -Wno-error=switch -Wno-error=return-type -Wno-error=non-virtual-dtor -Wno-reorder -Wno-error=cpp)
+set(ANAKIN_LIBRARY_URL "https://github.com/pangge/Anakin/releases/download/3.0/anakin_release_simple.tar.gz")
+# A helper function for Anakin: currently, to use Anakin one needs to recursively add
+# nearly all of its header directories to the include path.
+function(fetch_include_recursively root_dir)
+    if (IS_DIRECTORY ${root_dir})
+        include_directories(${root_dir})
+    endif()
+    file(GLOB ALL_SUB RELATIVE ${root_dir} ${root_dir}/*)
+    foreach(sub ${ALL_SUB})
+        if (IS_DIRECTORY ${root_dir}/${sub})
+            fetch_include_recursively(${root_dir}/${sub})
+        endif()
+    endforeach()
+endfunction()
+# download library
+message(STATUS "Download Anakin library from ${ANAKIN_LIBRARY_URL}")
+execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
+execute_process(COMMAND bash -c "rm -rf ${ANAKIN_INSTALL_DIR}/*")
+execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget -q ${ANAKIN_LIBRARY_URL}")
+execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
+execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin_release_simple.tar.gz")
+if (WITH_ANAKIN)
+    message(STATUS "Anakin for inference is enabled")
+    message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}")
+    fetch_include_recursively(${ANAKIN_INCLUDE})
+    link_directories(${ANAKIN_LIBRARY})
+endif()
--- a/doc/fluid/api/detection.rst
+++ b/doc/fluid/api/detection.rst
--- a/doc/fluid/api/gen_doc.sh
+++ b/doc/fluid/api/gen_doc.sh
 #!/bin/bash
-python gen_doc.py layers --submodules control_flow device io nn ops tensor > layers.rst
+python gen_doc.py layers --submodules control_flow device io nn ops tensor detection learning_rate_scheduler > layers.rst
 for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer
 do

--- a/doc/fluid/api/io.rst
+++ b/doc/fluid/api/io.rst
@@ -59,21 +59,3 @@ get_inference_program
 ..  autofunction:: paddle.fluid.io.get_inference_program
    :noindex:
-save_checkpoint
---------------
-..  autofunction:: paddle.fluid.io.save_checkpoint
-    :noindex:
-load_checkpoint
---------------
-..  autofunction:: paddle.fluid.io.load_checkpoint
-    :noindex:
-clean_checkpoint
----------------
-..  autofunction:: paddle.fluid.io.clean_checkpoint
-    :noindex:
--- a/doc/fluid/api/layers.rst
+++ b/doc/fluid/api/layers.rst
@@ -181,12 +181,6 @@ Print
 ..  autofunction:: paddle.fluid.layers.Print
    :noindex:
-is_empty
--------
-..  autofunction:: paddle.fluid.layers.is_empty
-    :noindex:
 device
 ======
@@ -261,19 +255,6 @@ double_buffer
 ..  autofunction:: paddle.fluid.layers.double_buffer
    :noindex:
-random_data_generator
---------------------
-..  autofunction:: paddle.fluid.layers.random_data_generator
-    :noindex:
-Preprocessor
------------
-..  autoclass:: paddle.fluid.layers.Preprocessor
-    :members:
-    :noindex:
 nn
 ==
@@ -613,30 +594,6 @@ roi_pool
 ..  autofunction:: paddle.fluid.layers.roi_pool
    :noindex:
-dice_loss
---------
-..  autofunction:: paddle.fluid.layers.dice_loss
-    :noindex:
-resize_bilinear
---------------
-..  autofunction:: paddle.fluid.layers.resize_bilinear
-    :noindex:
-gather
------
-..  autofunction:: paddle.fluid.layers.gather
-    :noindex:
-random_crop
-----------
-..  autofunction:: paddle.fluid.layers.random_crop
-    :noindex:
 ops
 ===
@@ -784,12 +741,6 @@ sum
 ..  autofunction:: paddle.fluid.layers.sum
    :noindex:
-shape
-----
-..  autofunction:: paddle.fluid.layers.shape
-    :noindex:
 sigmoid
 -------
@@ -1039,3 +990,93 @@ zeros
 ..  autofunction:: paddle.fluid.layers.zeros
    :noindex:
+detection
+=========
+multi_box_head
+--------------
+..  autofunction:: paddle.fluid.layers.multi_box_head
+    :noindex:
+bipartite_match
+---------------
+..  autofunction:: paddle.fluid.layers.bipartite_match
+    :noindex:
+target_assign
+-------------
+..  autofunction:: paddle.fluid.layers.target_assign
+    :noindex:
+detection_output
+----------------
+..  autofunction:: paddle.fluid.layers.detection_output
+    :noindex:
+ssd_loss
+--------
+..  autofunction:: paddle.fluid.layers.ssd_loss
+    :noindex:
+detection_map
+-------------
+..  autofunction:: paddle.fluid.layers.detection_map
+    :noindex:
+iou_similarity
+--------------
+..  autofunction:: paddle.fluid.layers.iou_similarity
+    :noindex:
+box_coder
+---------
+..  autofunction:: paddle.fluid.layers.box_coder
+    :noindex:
+learning_rate_scheduler
+=======================
+exponential_decay
+-----------------
+..  autofunction:: paddle.fluid.layers.exponential_decay
+    :noindex:
+natural_exp_decay
+-----------------
+..  autofunction:: paddle.fluid.layers.natural_exp_decay
+    :noindex:
+inverse_time_decay
+------------------
+..  autofunction:: paddle.fluid.layers.inverse_time_decay
+    :noindex:
+polynomial_decay
+----------------
+..  autofunction:: paddle.fluid.layers.polynomial_decay
+    :noindex:
+piecewise_decay
+---------------
+..  autofunction:: paddle.fluid.layers.piecewise_decay
+    :noindex:
+noam_decay
+----------
+..  autofunction:: paddle.fluid.layers.noam_decay
+    :noindex:
--- a/doc/fluid/api/optimizer.rst
+++ b/doc/fluid/api/optimizer.rst
@@ -89,13 +89,6 @@ DecayedAdagradOptimizer
    :members:
    :noindex:
-RMSPropOptimizer
----------------
-..  autoclass:: paddle.fluid.optimizer.RMSPropOptimizer
-    :members:
-    :noindex:
 Adadelta
 --------

--- a/doc/fluid/api/profiler.rst
+++ b/doc/fluid/api/profiler.rst
@@ -23,15 +23,3 @@ profiler
 ..  autofunction:: paddle.fluid.profiler.profiler
    :noindex:
-start_profiler
--------------
-..  autofunction:: paddle.fluid.profiler.start_profiler
-    :noindex:
-stop_profiler
-------------
-..  autofunction:: paddle.fluid.profiler.stop_profiler
-    :noindex:
--- a/doc/survey/dynamic_graph.md
+++ b/doc/survey/dynamic_graph.md
@@ -171,7 +171,7 @@ Pytorch chooses immediate evaluation. It avoids ever materializing a "forward gr
 ## What can fluid learn from them?
-TBD
+Please refer to `paddle/contrib/dynamic/`.
 # Appendix

--- a/doc/v2/api/config/evaluators.rst
+++ b/doc/v2/api/config/evaluators.rst
@@ -101,7 +101,7 @@ value_printer
    :noindex:
 Detection
-=====
+==========
 detection_map
 -------------

--- a/doc/v2/api/config/layer.rst
+++ b/doc/v2/api/config/layer.rst
@@ -11,7 +11,7 @@ Data layer
 data
 ----
-..  autoclass:: paddle.v2.layer.data
+..  autofunction:: paddle.v2.layer.data
    :noindex:
 Fully Connected Layers
@@ -21,12 +21,12 @@ Fully Connected Layers
 fc
 --
-..  autoclass:: paddle.v2.layer.fc
+..  autofunction:: paddle.v2.layer.fc
    :noindex:
 selective_fc
 ------------
-..  autoclass:: paddle.v2.layer.selective_fc
+..  autofunction:: paddle.v2.layer.selective_fc
    :noindex:
 Conv Layers
@@ -34,34 +34,34 @@ Conv Layers
 conv_operator
 -------------
-..  autoclass:: paddle.v2.layer.conv_operator
+..  autofunction:: paddle.v2.layer.conv_operator
    :noindex:
 conv_projection
 ---------------
-..  autoclass:: paddle.v2.layer.conv_projection
+..  autofunction:: paddle.v2.layer.conv_projection
    :noindex:
 conv_shift
 ----------
-..  autoclass:: paddle.v2.layer.conv_shift
+..  autofunction:: paddle.v2.layer.conv_shift
    :noindex:
 img_conv
 --------
-..  autoclass:: paddle.v2.layer.img_conv
+..  autofunction:: paddle.v2.layer.img_conv
    :noindex:
 ..  _api_v2.layer_context_projection:
 context_projection
 ------------------
-..  autoclass:: paddle.v2.layer.context_projection
+..  autofunction:: paddle.v2.layer.context_projection
    :noindex:
 row_conv
 --------
-..  autoclass:: paddle.v2.layer.row_conv
+..  autofunction:: paddle.v2.layer.row_conv
    :noindex:
 Image Pooling Layer
@@ -69,27 +69,27 @@ Image Pooling Layer
 img_pool
 --------
-..  autoclass:: paddle.v2.layer.img_pool
+..  autofunction:: paddle.v2.layer.img_pool
    :noindex:
 spp
 ---
-..  autoclass:: paddle.v2.layer.spp
+..  autofunction:: paddle.v2.layer.spp
    :noindex:
 maxout
 ------
-..  autoclass:: paddle.v2.layer.maxout
+..  autofunction:: paddle.v2.layer.maxout
    :noindex:
 roi_pool
 --------
-..  autoclass:: paddle.v2.layer.roi_pool
+..  autofunction:: paddle.v2.layer.roi_pool
    :noindex:
 pad
 ----
-..  autoclass:: paddle.v2.layer.pad
+..  autofunction:: paddle.v2.layer.pad
    :noindex:
 Norm Layer
@@ -97,27 +97,27 @@ Norm Layer
 img_cmrnorm
 -----------
-..  autoclass:: paddle.v2.layer.img_cmrnorm
+..  autofunction:: paddle.v2.layer.img_cmrnorm
    :noindex:
 batch_norm
 ----------
-..  autoclass:: paddle.v2.layer.batch_norm
+..  autofunction:: paddle.v2.layer.batch_norm
    :noindex:
 sum_to_one_norm
 ---------------
-..  autoclass:: paddle.v2.layer.sum_to_one_norm
+..  autofunction:: paddle.v2.layer.sum_to_one_norm
    :noindex:
 cross_channel_norm
 ------------------
-..  autoclass:: paddle.v2.layer.cross_channel_norm
+..  autofunction:: paddle.v2.layer.cross_channel_norm
    :noindex:
 row_l2_norm
 -----------
-..  autoclass:: paddle.v2.layer.row_l2_norm
+..  autofunction:: paddle.v2.layer.row_l2_norm
    :noindex:
 Recurrent Layers
@@ -125,22 +125,22 @@ Recurrent Layers
 recurrent
 ---------
-..  autoclass:: paddle.v2.layer.recurrent
+..  autofunction:: paddle.v2.layer.recurrent
    :noindex:
 lstmemory
 ---------
-..  autoclass:: paddle.v2.layer.lstmemory
+..  autofunction:: paddle.v2.layer.lstmemory
    :noindex:
 grumemory
 ---------
-..  autoclass:: paddle.v2.layer.grumemory
+..  autofunction:: paddle.v2.layer.grumemory
    :noindex:
 gated_unit
 -----------
-..  autoclass:: paddle.v2.layer.gated_unit
+..  autofunction:: paddle.v2.layer.gated_unit
    :noindex:
 Recurrent Layer Group
@@ -148,32 +148,32 @@ Recurrent Layer Group
 memory
 ------
-..  autoclass:: paddle.v2.layer.memory
+..  autofunction:: paddle.v2.layer.memory
    :noindex:
 recurrent_group
 ---------------
-..  autoclass:: paddle.v2.layer.recurrent_group
+..  autofunction:: paddle.v2.layer.recurrent_group
    :noindex:
 lstm_step
 ---------
-..  autoclass:: paddle.v2.layer.lstm_step
+..  autofunction:: paddle.v2.layer.lstm_step
    :noindex:
 gru_step
 --------
-..  autoclass:: paddle.v2.layer.gru_step
+..  autofunction:: paddle.v2.layer.gru_step
    :noindex:
 beam_search
 ------------
-..  autoclass:: paddle.v2.layer.beam_search
+..  autofunction:: paddle.v2.layer.beam_search
    :noindex:
 get_output
 ----------
-..  autoclass:: paddle.v2.layer.get_output
+..  autofunction:: paddle.v2.layer.get_output
    :noindex:
 Mixed Layer
@@ -183,54 +183,54 @@ Mixed Layer
 mixed
 -----
-..  autoclass:: paddle.v2.layer.mixed
+..  autofunction:: paddle.v2.layer.mixed
    :noindex:
 ..  _api_v2.layer_embedding:
 embedding
 ---------
-..  autoclass:: paddle.v2.layer.embedding
+..  autofunction:: paddle.v2.layer.embedding
    :noindex:
 scaling_projection
 ------------------
-..  autoclass:: paddle.v2.layer.scaling_projection
+..  autofunction:: paddle.v2.layer.scaling_projection
    :noindex:
 dotmul_projection
 -----------------
-..  autoclass:: paddle.v2.layer.dotmul_projection
+..  autofunction:: paddle.v2.layer.dotmul_projection
    :noindex:
 dotmul_operator
 ---------------
-..  autoclass:: paddle.v2.layer.dotmul_operator
+..  autofunction:: paddle.v2.layer.dotmul_operator
    :noindex:
 full_matrix_projection
 ----------------------
-..  autoclass:: paddle.v2.layer.full_matrix_projection
+..  autofunction:: paddle.v2.layer.full_matrix_projection
    :noindex:
 identity_projection
 -------------------
-..  autoclass:: paddle.v2.layer.identity_projection
+..  autofunction:: paddle.v2.layer.identity_projection
    :noindex:
 slice_projection
 -------------------
-..  autoclass:: paddle.v2.layer.slice_projection
+..  autofunction:: paddle.v2.layer.slice_projection
    :noindex:
 table_projection
 ----------------
-..  autoclass:: paddle.v2.layer.table_projection
+..  autofunction:: paddle.v2.layer.table_projection
    :noindex:
 trans_full_matrix_projection
 ----------------------------
-..  autoclass:: paddle.v2.layer.trans_full_matrix_projection
+..  autofunction:: paddle.v2.layer.trans_full_matrix_projection
    :noindex:
 Aggregate Layers
@@ -245,51 +245,46 @@ AggregateLevel
 pooling
 -------
-..  autoclass:: paddle.v2.layer.pooling
+..  autofunction:: paddle.v2.layer.pooling
    :noindex:
 ..  _api_v2.layer_last_seq:
 last_seq
 --------
-..  autoclass:: paddle.v2.layer.last_seq
+..  autofunction:: paddle.v2.layer.last_seq
    :noindex:
 ..  _api_v2.layer_first_seq:
 first_seq
 ---------
-..  autoclass:: paddle.v2.layer.first_seq
+..  autofunction:: paddle.v2.layer.first_seq
    :noindex:
 sub_seq
 ---------
-..  autoclass:: paddle.v2.layer.sub_seq
+..  autofunction:: paddle.v2.layer.sub_seq
    :noindex:
 concat
 ------
-..  autoclass:: paddle.v2.layer.concat
+..  autofunction:: paddle.v2.layer.concat
    :noindex:
 seq_concat
 ----------
-..  autoclass:: paddle.v2.layer.seq_concat
+..  autofunction:: paddle.v2.layer.seq_concat
    :noindex:
 seq_slice
 ---------
-..  autoclass:: paddle.v2.layer.seq_slice
+..  autofunction:: paddle.v2.layer.seq_slice
-    :noindex:
-kmax_sequence_score
-------------------
-..  autoclass:: paddle.v2.layer.kmax_sequence_score
    :noindex:
 sub_nested_seq
 --------------
-..  autoclass:: paddle.v2.layer.sub_nested_seq
+..  autofunction:: paddle.v2.layer.sub_nested_seq
    :noindex:
 Reshaping Layers
@@ -297,7 +292,7 @@ Reshaping Layers
 block_expand
 ------------
-..  autoclass:: paddle.v2.layer.block_expand
+..  autofunction:: paddle.v2.layer.block_expand
    :noindex:
 ..  _api_v2.layer_expand:
@@ -309,22 +304,22 @@ ExpandLevel
 expand
 ------
-..  autoclass:: paddle.v2.layer.expand
+..  autofunction:: paddle.v2.layer.expand
    :noindex:
 repeat
 ------
-..  autoclass:: paddle.v2.layer.repeat
+..  autofunction:: paddle.v2.layer.repeat
    :noindex:
 rotate
 ------
-..  autoclass:: paddle.v2.layer.rotate
+..  autofunction:: paddle.v2.layer.rotate
    :noindex:
 seq_reshape
 -----------
-..  autoclass:: paddle.v2.layer.seq_reshape
+..  autofunction:: paddle.v2.layer.seq_reshape
    :noindex:
 Math Layers
@@ -332,94 +327,94 @@ Math Layers
 addto
 -----
-..  autoclass:: paddle.v2.layer.addto
+..  autofunction:: paddle.v2.layer.addto
    :noindex:
 linear_comb
 -----------
-..  autoclass:: paddle.v2.layer.linear_comb
+..  autofunction:: paddle.v2.layer.linear_comb
    :noindex:
 interpolation
 -------------
-..  autoclass:: paddle.v2.layer.interpolation
+..  autofunction:: paddle.v2.layer.interpolation
    :noindex:
 bilinear_interp
 ---------------
-..  autoclass:: paddle.v2.layer.bilinear_interp
+..  autofunction:: paddle.v2.layer.bilinear_interp
    :noindex:
 dropout
 --------
-..  autoclass:: paddle.v2.layer.dropout
+..  autofunction:: paddle.v2.layer.dropout
    :noindex:
 dot_prod
 ---------
-.. autoclass:: paddle.v2.layer.dot_prod
+.. autofunction:: paddle.v2.layer.dot_prod
    :noindex:
 out_prod
 --------
-.. autoclass:: paddle.v2.layer.out_prod
+.. autofunction:: paddle.v2.layer.out_prod
    :noindex:
 power
 -----
-..  autoclass:: paddle.v2.layer.power
+..  autofunction:: paddle.v2.layer.power
    :noindex:
 scaling
 -------
-..  autoclass:: paddle.v2.layer.scaling
+..  autofunction:: paddle.v2.layer.scaling
    :noindex:
 clip
 ----
-..  autoclass:: paddle.v2.layer.clip
+..  autofunction:: paddle.v2.layer.clip
    :noindex:
 resize
 ------
-..  autoclass:: paddle.v2.layer.resize
+..  autofunction:: paddle.v2.layer.resize
    :noindex:
 slope_intercept
 ---------------
-..  autoclass:: paddle.v2.layer.slope_intercept
+..  autofunction:: paddle.v2.layer.slope_intercept
    :noindex:
 tensor
 ------
-..  autoclass:: paddle.v2.layer.tensor
+..  autofunction:: paddle.v2.layer.tensor
    :noindex:
 ..  _api_v2.layer_cos_sim:
 cos_sim
 -------
-..  autoclass:: paddle.v2.layer.cos_sim
+..  autofunction:: paddle.v2.layer.cos_sim
    :noindex:
 l2_distance
 -----------
-..  autoclass:: paddle.v2.layer.l2_distance
+..  autofunction:: paddle.v2.layer.l2_distance
    :noindex:
 trans
 -----
-..  autoclass:: paddle.v2.layer.trans
+..  autofunction:: paddle.v2.layer.trans
    :noindex:
 scale_shift
 -----------
-..  autoclass:: paddle.v2.layer.scale_shift
+..  autofunction:: paddle.v2.layer.scale_shift
    :noindex:
 factorization_machine
 ---------------------
-..  autoclass:: paddle.v2.layer.factorization_machine
+..  autofunction:: paddle.v2.layer.factorization_machine
    :noindex:
 Sampling Layers
@@ -427,17 +422,17 @@ Sampling Layers
 maxid
 -----
-..  autoclass:: paddle.v2.layer.max_id
+..  autofunction:: paddle.v2.layer.max_id
    :noindex:
 sampling_id
 -----------
-..  autoclass:: paddle.v2.layer.sampling_id
+..  autofunction:: paddle.v2.layer.sampling_id
    :noindex:
 multiplex
 ---------
-..  autoclass:: paddle.v2.layer.multiplex
+..  autofunction:: paddle.v2.layer.multiplex
    :noindex:
 ..  _api_v2.layer_costs:
@@ -447,97 +442,97 @@ Cost Layers
 cross_entropy_cost
 ------------------
-..  autoclass:: paddle.v2.layer.cross_entropy_cost
+..  autofunction:: paddle.v2.layer.cross_entropy_cost
    :noindex:
 cross_entropy_with_selfnorm_cost
 --------------------------------
-..  autoclass:: paddle.v2.layer.cross_entropy_with_selfnorm_cost
+..  autofunction:: paddle.v2.layer.cross_entropy_with_selfnorm_cost
    :noindex:
 multi_binary_label_cross_entropy_cost
 -------------------------------------
-..  autoclass:: paddle.v2.layer.multi_binary_label_cross_entropy_cost
+..  autofunction:: paddle.v2.layer.multi_binary_label_cross_entropy_cost
    :noindex:
 classification_cost
 -------------------
-.. autoclass:: paddle.v2.layer.classification_cost
+.. autofunction:: paddle.v2.layer.classification_cost
   :noindex:
 huber_regression_cost
 -------------------------
-..  autoclass:: paddle.v2.layer.huber_regression_cost
+..  autofunction:: paddle.v2.layer.huber_regression_cost
    :noindex:
 huber_classification_cost
 -------------------------
-..  autoclass:: paddle.v2.layer.huber_classification_cost
+..  autofunction:: paddle.v2.layer.huber_classification_cost
    :noindex:
 lambda_cost
 -----------
-..  autoclass:: paddle.v2.layer.lambda_cost
+..  autofunction:: paddle.v2.layer.lambda_cost
    :noindex:
 square_error_cost
 -----------------
-..  autoclass:: paddle.v2.layer.square_error_cost
+..  autofunction:: paddle.v2.layer.square_error_cost
    :noindex:
 rank_cost
 ---------
-..  autoclass:: paddle.v2.layer.rank_cost
+..  autofunction:: paddle.v2.layer.rank_cost
    :noindex:
 sum_cost
 ---------
-..  autoclass:: paddle.v2.layer.sum_cost
+..  autofunction:: paddle.v2.layer.sum_cost
    :noindex:
 crf
 ---
-..  autoclass:: paddle.v2.layer.crf
+..  autofunction:: paddle.v2.layer.crf
    :noindex:
 crf_decoding
 ------------
-..  autoclass:: paddle.v2.layer.crf_decoding
+..  autofunction:: paddle.v2.layer.crf_decoding
    :noindex:
 ctc
 ---
-..  autoclass:: paddle.v2.layer.ctc
+..  autofunction:: paddle.v2.layer.ctc
    :noindex:
 warp_ctc
 --------
-..  autoclass:: paddle.v2.layer.warp_ctc
+..  autofunction:: paddle.v2.layer.warp_ctc
    :noindex:
 nce
 ---
-..  autoclass:: paddle.v2.layer.nce
+..  autofunction:: paddle.v2.layer.nce
    :noindex:
 hsigmoid
 ---------
-..  autoclass:: paddle.v2.layer.hsigmoid
+..  autofunction:: paddle.v2.layer.hsigmoid
    :noindex:
 smooth_l1_cost
 --------------
-..  autoclass:: paddle.v2.layer.smooth_l1_cost
+..  autofunction:: paddle.v2.layer.smooth_l1_cost
    :noindex:
 multibox_loss
 --------------
-..  autoclass:: paddle.v2.layer.multibox_loss
+..  autofunction:: paddle.v2.layer.multibox_loss
    :noindex:
 detection_output
 ----------------
-..  autoclass:: paddle.v2.layer.detection_output
+..  autofunction:: paddle.v2.layer.detection_output
    :noindex:
 Check Layer
@@ -545,7 +540,7 @@ Check Layer
 eos
 ---
-..  autoclass:: paddle.v2.layer.eos
+..  autofunction:: paddle.v2.layer.eos
    :noindex:
 Activation
@@ -553,5 +548,5 @@ Activation
 prelu
 --------
-..  autoclass:: paddle.v2.layer.prelu
+..  autofunction:: paddle.v2.layer.prelu
    :noindex:
--- a/doc/v2/api/index_en.rst
+++ b/doc/v2/api/index_en.rst
@@ -8,4 +8,3 @@ API
    model_configs.rst
    data.rst
    run_logic.rst
-    fluid/index.rst
--- a/doc/v2/build_and_install/pip_install_cn.rst
+++ b/doc/v2/build_and_install/pip_install_cn.rst
@@ -60,6 +60,7 @@ paddlepaddle-gpu==0.11.0            使用CUDA 7.5和cuDNN 5编译的0.11.0版
    "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`_"
    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cuda9.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
 .. _pip_dependency:

--- a/doc/v2/build_and_install/pip_install_en.rst
+++ b/doc/v2/build_and_install/pip_install_en.rst
@@ -63,6 +63,7 @@ If the links below shows up the login form, just click "Log in as guest" to star
    "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cuda9.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
 .. _pip_dependency:

--- a/paddle/contrib/CMakeLists.txt
+++ b/paddle/contrib/CMakeLists.txt
@@ -14,3 +14,4 @@
 #
 add_subdirectory(inference)
+add_subdirectory(tape)
--- a/paddle/contrib/inference/CMakeLists.txt
+++ b/paddle/contrib/inference/CMakeLists.txt
@@ -17,48 +17,9 @@ if(APPLE)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move")
 endif(APPLE)
-set(ANAKIN_INCLUDE "" CACHE STRING "root of Anakin header files")
-set(ANAKIN_LIBRARY "" CACHE STRING "path of Anakin library")
 set(inference_deps paddle_inference_api paddle_fluid_api)
-# if anakin is set enable anakin api implementation
-if(ANAKIN_INCLUDE AND ANAKIN_LIBRARY)
-    set(ANAKIN_FOUND ON)
-else()
-    set(ANAKIN_FOUND OFF)
-endif()
-function(fetch_include_recursively root_dir) 
-    if (IS_DIRECTORY ${root_dir}) 
-        include_directories(${root_dir})
-    endif()
-    file(GLOB ALL_SUB RELATIVE ${root_dir} ${root_dir}/*)
-    foreach(sub ${ALL_SUB})
-        if (IS_DIRECTORY ${root_dir}/${sub})
-            fetch_include_recursively(${root_dir}/${sub})
-        endif()
-    endforeach()
-endfunction()
-if (ANAKIN_FOUND)
-    # Anakin's code style doesn't follow google c style.
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=unused-variable -Wno-error=format-extra-args -Wno-error=comment -Wno-error=format -Wno-error=switch -Wno-error=return-type -Wno-error=non-virtual-dtor -Wno-reorder -Wno-error=cpp")
-    message(STATUS "Anakin for inference is enabled")
-    message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}")
-    fetch_include_recursively(${ANAKIN_INCLUDE})
-    link_directories(${ANAKIN_LIBRARY})
-    nv_library(inference_anakin_api SHARED SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc)
-    target_link_libraries(inference_anakin_api anakin anakin_saber_common)
-    list(APPEND inference_deps inference_anakin_api)
-endif()
 function(inference_api_test TARGET_NAME)
    if (WITH_TESTING)
        set(options "")
@@ -79,7 +40,7 @@ function(inference_api_test TARGET_NAME)
 endfunction(inference_api_test)
 cc_library(paddle_inference_api
-    SRCS paddle_inference_api.cc paddle_inference_api_impl.cc 
+    SRCS paddle_inference_api.cc paddle_inference_api_impl.cc
    DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
 cc_test(test_paddle_inference_api
@@ -89,9 +50,17 @@ cc_test(test_paddle_inference_api
 inference_api_test(test_paddle_inference_api_impl
                    ARGS test_word2vec test_image_classification)
-if (ANAKIN_FOUND)
+if (WITH_ANAKIN)
+    # Anakin does not have official library releases, and its protobuf and CUDA versions do not match Paddle's,
+    # so the Anakin library will not be merged into our official inference library. To use the Anakin prediction
+    # API, one needs to compile libinference_anakin_api.a and build with anakin.so.
+    nv_library(inference_anakin_api SHARED SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc)
+    target_compile_options(inference_anakin_api BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
+    target_link_libraries(inference_anakin_api anakin anakin_saber_common)
    cc_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc
-    DEPS ${inference_deps})
+                                  ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin
+                                  DEPS inference_anakin_api)
+    target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
 endif()
 if(WITH_TESTING)

--- a/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc
+++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc
@@ -12,9 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include <cuda.h>
 #include "paddle/contrib/inference/paddle_inference_api_anakin_engine.h"
+#include <cuda.h>
 namespace paddle {

--- a/paddle/contrib/inference/paddle_inference_api_anakin_engine.h
+++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine.h
@@ -19,10 +19,9 @@ limitations under the License. */
 #pragma once
-// NOTE This header file do not have namespace.
-//#include <test/framework/net/paddle_api.h>
 #include "paddle/contrib/inference/paddle_inference_api.h"
+// from anakin
 #include "framework/core/net/net.h"
 #include "saber/saber_types.h"

--- a/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc
+++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc
@@ -12,17 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include <gflags/gflags.h>
 #include <glog/logging.h>
 #include <gtest/gtest.h>
-#include "gflags/gflags.h"
 #include "paddle/contrib/inference/paddle_inference_api.h"
+DEFINE_string(model, "", "Directory of the inference model.");
 namespace paddle {
 AnakinConfig GetConfig() {
  AnakinConfig config;
-  config.model_file = "./mobilenet_v2.anakin.bin";
+  config.model_file = FLAGS_model;
  config.device = 0;
  config.max_batch_size = 1;
  return config;

--- a/paddle/contrib/tape/CMakeLists.txt
+++ b/paddle/contrib/tape/CMakeLists.txt
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# On Apple platforms, keep pessimizing-move diagnostics from being promoted to errors.
+if(APPLE)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move")
+endif()
+# tape::Variable wrapper library.
+cc_library(tape_variable
+           SRCS variable.cc
+           DEPS ${FLUID_CORE_MODULES})
+# The tape itself; it depends on the operator library and on tape_variable.
+cc_library(tape
+           SRCS tape.cc
+           DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB} tape_variable)
+# Unit test for the tape.
+cc_test(test_tape
+        SRCS test_tape.cc
+        DEPS tape tape_variable)
--- a/paddle/contrib/tape/README.md
+++ b/paddle/contrib/tape/README.md
+# Dynamic Graph on Fluid
+PaddlePaddle Fluid is targeting the autodiff without tape, which, however, is very
+challenging, and we are still a long way from there. DyNet and PyTorch provide a good design
+idea, the *tape*, that significantly eases the challenge.  Also, DyNet provides
+a C++ API that is as convenient as Python but with higher efficiency and could
+conveniently integrate with industrial/production systems. This package, `tape`,
+combines the good of
+1. tape from PyTorch and DyNet
+2. C++ API and core from DyNet
+3. rich set of operators from PaddlePaddle
+## Overview
+We can implement a DyNet-like Tape (see this [survey](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/survey/dynamic_graph.md))
+by wrapping Paddle Fluid's `Operator` and `Variable`.
+The user API is straightforward since
+1. it is imperative. And it uses host language's control flow logic.
+1. it avoids extra concepts such as `Scope` and `Executor`.
+All of these benefits come at the cost of just adding one line `reset_global_tape`
+at every iteration.
+## Code Structure
+In short, the `Tape` contains a vector of `OpHandle`s. And an `OpHandle` contains its
+`type`, the pointers to the `Variable`s, and necessary attributes.
+```c++
+class Variable {
+public:
+  VariableHandle Grad(); // returns its gradient variable
+private:
+  framework::VarDesc desc_; // compile time infershape, necessary for lazy execution
+  framework::Variable var_; // run time variable, holds data memory
+};
+using VariableHandle = shared_ptr<Variable>;
+struct OpHandle {
+  string type_;
+  map<string, vector<VariableHandle>> inputs_;
+  map<string, vector<VariableHandle>> outputs_;
+  AttributeMap attrs_;
+};
+class Tape {
+public:
+  void AddOp(OpHandle); // add op
+  void Forward();       // execute the tape_
+  void Backward();      // execute the backward of the tape_
+private:
+  vector<OpHandle> tape_;
+};
+```
+We use `Function` to represent layers. It takes care of parameter
+initialization and calls `AddOp` on the Tape when it is invoked.
+```c++
+class Linear {
+ public:
+  Linear(int in_dim, int out_dim, const std::string &act)
+      : w_(new Variable("LinearWeight")),
+        b_(new Variable("LinearBias")),
+        act_(act) {
+    Tape init_tape;
+    std::string initializer = "fill_constant";
+    framework::AttributeMap attrs;
+    attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
+    attrs["shape"] = std::vector<int>{in_dim, out_dim};
+    attrs["value"] = 1.0f;
+    init_tape.AddOp(initializer, {}, {{"Out", {w_}}}, attrs);
+    attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
+    attrs["shape"] = std::vector<int>{out_dim};
+    attrs["value"] = 1.0f;
+    init_tape.AddOp(initializer, {}, {{"Out", {b_}}}, attrs);
+    init_tape.Forward();
+  }
+  VariableHandle operator()(VariableHandle input) {
+    VariableHandle pre_bias(new Variable("linear"));
+    get_global_tape().AddOp("mul",
+                            {{"X", {input}}, {"Y", {w_}}},
+                            {{"Out", {pre_bias}}},
+                            {{"x_num_col_dims", 1}, {"y_num_col_dims", 1}});
+    VariableHandle pre_act(new Variable("linear"));
+    get_global_tape().AddOp("elementwise_add",
+                            {{"X", {pre_bias}}, {"Y", {b_}}},
+                            {{"Out", {pre_act}}},
+                            {{"axis", 1}});
+    VariableHandle post_act(new Variable("linear"));
+    get_global_tape().AddOp(act_,
+                            {{"X", {pre_act}}},
+                            {{"Out", {post_act}}},
+                            {});
+    return post_act;
+  }
+  std::vector<VariableHandle> Params() { return {w_, b_}; }
+ private:
+  VariableHandle w_;
+  VariableHandle b_;
+  std::string act_;
+};
+```
+## User API
+```c++
+// Model function
+paddle::tape::Linear linear1(3, 3, "relu"); // init weight and bias
+paddle::tape::Linear linear2(3, 3, "relu"); // init weight and bias
+paddle::tape::Mean mean;
+// Optimizer
+paddle::tape::SGD sgd(0.001);
+// Data Feeder
+paddle::tape::Fill data_feeder(...);
+VariableHandle input(new paddle::tape::Variable("input"));
+VariableHandle label(new paddle::tape::Variable("label"));
+for (int i = 0; i < 2; ++i) {
+  reset_global_tape();
+  data_feeder(input, label);
+  auto loss = softmax(linear2(linear1(input)), label); // compile time InferShape & InferVarType
+  LOG(INFO) << loss.value(); // Run forward up to loss
+  // Run backward, store gradient of w at w->Grad()
+  get_global_tape().Backward(loss);
+  // Update w
+  sgd(linear1.Params());
+  sgd(linear2.Params());
+}
+```
+<details>
+  <summary></summary>
+digraph G {
+	subgraph cluster_0 {
+                node [shape=record,style=filled];
+		style=filled;
+		color=lightgrey;
+                linear1 [label="{type: mul | {input | {<before_mul1>X: before_mul1 |<weight1> Y: weight1}} |  {output |<before_bias1> Out: before_bias1}}"];
+                elementwise_add1 [label="{type: elementwise_add | {input | {<before_bias1>X: before_bias1 |<bias1> Y: bias1}} |  {output |<before_act1> Out: before_act1}}"];
+                relu1 [label="{type: relu | {input | {<before_act1>X: before_act1 }} |  {output |<after_act1> Out: after_act1}}"];
+		linear1 -> elementwise_add1->relu1;
+		label = "forward tape";
+	}
+        linear1:before_mul1->before_mul1
+        linear1:weight1->weight1
+        linear1:before_bias1->before_bias1
+        elementwise_add1:bias1->bias1
+        elementwise_add1:before_bias1->before_bias1
+        elementwise_add1:before_act1->before_act1
+        relu1:before_act1->before_act1
+        relu1:after_act1->after_act1
+	subgraph cluster_1 {
+                node [shape=record,style=filled];
+		style=filled;
+		color=lightgrey;
+                linear1_grad [label="{type: mul_grad | {input | {<before_mul1>X: before_mul1 |<weight1> Y: weight1|<before_bias1_grad> Out_grad: before_bias1_grad}} |  {output |{<before_mul1_grad>X_grad: before_mul1_grad |<weight1_grad> Y_grad: weight1_grad}}}"];
+                elementwise_add1_grad [label="{type: elementwise_add_grad | {input | <before_act1_grad> Out_grad: before_act1_grad} |  {output |{<before_bias1_grad>X_grad: before_bias1_grad |<bias1_grad> Y_grad: bias1_grad}}}"];
+                relu1_grad [label="{type: relu_grad |  {input |<after_act1_grad> Out_grad: after_act1_grad} | {output | {<before_act1_grad>X_grad: before_act1_grad }}}"];
+		linear1_grad -> elementwise_add1_grad ->relu1_grad [dir=back];
+                label = "backward tape";
+	}
+        relu1_grad:after_act1_grad->after_act1_grad
+        relu1_grad:before_act1_grad->before_act1_grad
+        elementwise_add1_grad:before_act1_grad->before_act1_grad
+        elementwise_add1_grad:before_bias1_grad->before_bias1_grad
+        elementwise_add1_grad:bias1_grad->bias1_grad
+        linear1_grad:before_mul1->before_mul1
+        linear1_grad:weight1->weight1
+        linear1_grad:before_bias1_grad->before_bias1_grad
+        linear1_grad:before_mul1_grad->before_mul1_grad
+        linear1_grad:weight1_grad->weight1_grad
+	subgraph cluster_2 {
+                node [shape=record];
+                label = "Linear1";
+                weight1
+                bias1
+	}
+        weight1 -> weight1_grad [ label="Grad()", style="dashed" ];
+        bias1 -> bias1_grad [ label="Grad()", style="dashed"];
+}
+</details>
+![Image](https://github.com/tonyyang-svail/Paddle/blob/cpp_tap/paddle/contrib/tape/computation_graph.png)
+## Code Reuse
+We want to stay close to Paddle Fluid as much as possible.
+### Reuse All Operators
+As all Ops are registered at `OpInfoMap`, the effort of adding a new `Function`
+is about 10 lines of code, similar to exposing an operator to Python.
+### Reuse Compile Time InferShape and InferVarType
+Since all the symbolic information is stored at `tape::Variable::desc_` instead
+of `ProgramDesc.block.vars`, we create a temporary `BlockDesc` to do `InferShape` and
+`InferVarType` every time we `AddOp` to the tape.
+### Reuse Operator::Run
+We use smart pointer, instead of `Scope`, to manage memory. So we create a temporary
+`Scope` for every `Operator::Run()`.
+## Possible Feature
+### Release Memory on Backward
+We can release memory aggressively. During backward, we can delete the OpHandle once
+we have finished its backward. Since all variables are managed by smart pointers, the
+memory is automatically released when its `ref_count` goes to 0.
+### Kernel Fusion
+As a symbolic representation of the Tape is constructed first before the actual
+execution, it would be possible to perform graph optimization. One use case is kernel
+fusion.
--- a/paddle/contrib/tape/computation_graph.png
+++ b/paddle/contrib/tape/computation_graph.png
--- a/paddle/contrib/tape/function.h
+++ b/paddle/contrib/tape/function.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <string>
+#include "paddle/contrib/tape/tape.h"
+#include "paddle/contrib/tape/variable.h"
+#include "paddle/fluid/framework/type_defs.h"
+namespace paddle {
+namespace tape {
+class Function {};  // NOTE(review): empty placeholder; none of the functors below derive from it
+// Functor that records an initializer op whose "Out" is the given variable.
+class Fill {
+ public:
+  // `initializer` is the op type to record; `attrs` are passed to it unchanged.
+  Fill(const std::string &initializer, const framework::AttributeMap &attrs)
+      : initializer_(initializer), attrs_(attrs) {}
+  // Appends the initializer op to the global tape; nothing runs until the
+  // tape is executed.
+  void operator()(VariableHandle var) {
+    VariableHandleMap outputs{{"Out", {var}}};
+    get_global_tape().AddOp(initializer_, {}, outputs, attrs_);
+  }
+ private:
+  const std::string initializer_;
+  const framework::AttributeMap attrs_;
+};
+// Functor that records a "mean" op on the global tape.
+class Mean {
+ public:
+  // Returns a newly created variable that is wired as the "Out" of the op
+  // whose "X" is `var`.
+  VariableHandle operator()(VariableHandle var) {
+    VariableHandle result(new Variable("mean"));
+    VariableHandleMap inputs{{"X", {var}}};
+    VariableHandleMap outputs{{"Out", {result}}};
+    get_global_tape().AddOp("mean", inputs, outputs, {});
+    return result;
+  }
+};
+// Layer functor that records mul -> elementwise_add -> activation.
+class Linear {
+ public:
+  // Creates the weight (in_dim x out_dim) and the bias (out_dim) and fills
+  // both with 1.0f by running a private initialization tape.
+  Linear(int in_dim, int out_dim, const std::string &act)
+      : w_(new Variable("LinearWeight")),
+        b_(new Variable("LinearBias")),
+        act_(act) {
+    const std::string fill_op = "fill_constant";
+    Tape init_tape;
+    // dtype and value are shared by weight and bias; only the shape differs.
+    framework::AttributeMap fill_attrs;
+    fill_attrs["dtype"] =
+        paddle::framework::proto::VarType::Type::VarType_Type_FP32;
+    fill_attrs["value"] = 1.0f;
+    fill_attrs["shape"] = std::vector<int>{in_dim, out_dim};
+    init_tape.AddOp(fill_op, {}, {{"Out", {w_}}}, fill_attrs);
+    fill_attrs["shape"] = std::vector<int>{out_dim};
+    init_tape.AddOp(fill_op, {}, {{"Out", {b_}}}, fill_attrs);
+    init_tape.Forward();
+  }
+  // Records the three ops of the layer on the global tape and returns the
+  // variable holding the activation output.
+  VariableHandle operator()(VariableHandle input) {
+    Tape &tape = get_global_tape();
+    VariableHandle pre_bias(new Variable("linear"));
+    tape.AddOp("mul",
+               {{"X", {input}}, {"Y", {w_}}},
+               {{"Out", {pre_bias}}},
+               {{"x_num_col_dims", 1}, {"y_num_col_dims", 1}});
+    VariableHandle pre_act(new Variable("linear"));
+    tape.AddOp("elementwise_add",
+               {{"X", {pre_bias}}, {"Y", {b_}}},
+               {{"Out", {pre_act}}},
+               {{"axis", 1}});
+    VariableHandle post_act(new Variable("linear"));
+    tape.AddOp(act_, {{"X", {pre_act}}}, {{"Out", {post_act}}}, {});
+    return post_act;
+  }
+  // Trainable parameters, weight first and bias second.
+  std::vector<VariableHandle> Params() { return {w_, b_}; }
+ private:
+  VariableHandle w_;
+  VariableHandle b_;
+  std::string act_;
+};
+// Optimizer functor that applies the "sgd" op to one parameter at a time.
+class SGD {
+ public:
+  // Stores the learning rate in a 1-element FP32 variable by running a
+  // private initialization tape.
+  SGD(float learning_rate) : learning_rate_(new Variable("sgd")) {
+    framework::AttributeMap lr_attrs;
+    lr_attrs["dtype"] =
+        paddle::framework::proto::VarType::Type::VarType_Type_FP32;
+    lr_attrs["shape"] = std::vector<int>{1};
+    lr_attrs["value"] = learning_rate;
+    const std::string fill_op = "fill_constant";
+    Tape init_tape;
+    init_tape.AddOp(fill_op, {}, {{"Out", {learning_rate_}}}, lr_attrs);
+    init_tape.Forward();
+  }
+  // Updates `input` in place from input->Grad(); the global tape must already
+  // have been run backward.
+  void operator()(VariableHandle input) {
+    PADDLE_ENFORCE(get_global_tape().HasBeenBackwarded(),
+                   "optimization must happen after the backward");
+    VariableHandleMap sgd_inputs{{"Param", {input}},
+                                 {"LearningRate", {learning_rate_}},
+                                 {"Grad", {input->Grad()}}};
+    VariableHandleMap sgd_outputs{{"ParamOut", {input}}};
+    // The update runs on its own throw-away tape, not on the global one.
+    Tape update_tape;
+    update_tape.AddOp("sgd", sgd_inputs, sgd_outputs, {});
+    update_tape.Forward();
+  }
+ private:
+  VariableHandle learning_rate_;
+};
+}
+}
--- a/paddle/contrib/tape/tape.cc
+++ b/paddle/contrib/tape/tape.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/contrib/tape/tape.h"
+#include <list>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/dim.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/pybind/pybind.h"
+namespace paddle {
+namespace tape {
+// borrowed from
+// https://stackoverflow.com/questions/874134/find-if-string-ends-with-another-string-in-c
+inline bool ends_with(std::string const &value, std::string const &ending) {
+  // A suffix can never be longer than the string it is supposed to terminate.
+  const bool fits = ending.size() <= value.size();
+  return fits &&
+         value.compare(
+             value.size() - ending.size(), ending.size(), ending) == 0;
+}
+// Streams a VarDesc as name[type][data type]{d0,d1,...,}; every dimension,
+// including the last one, is followed by a comma.
+std::ostream &operator<<(std::ostream &os, const framework::VarDesc &var_desc) {
+  os << var_desc.Name();
+  os << "[" << var_desc.GetType() << "]";
+  os << "[" << var_desc.GetDataType() << "]";
+  os << "{";
+  const auto shape = var_desc.GetShape();
+  for (size_t k = 0; k < shape.size(); ++k) {
+    os << shape[k] << ",";
+  }
+  os << "}";
+  return os;
+}
+// Renders "<type> <slot>:(<desc>) ..." for logging. Inputs are listed before
+// outputs; `attrs` is accepted but not printed.
+std::string to_string(const std::string &type,
+                      const VariableHandleMap &in_vars,
+                      const VariableHandleMap &out_vars,
+                      const framework::AttributeMap &attrs) {
+  std::stringstream buf;
+  buf << type << " ";
+  for (const VariableHandleMap *group : {&in_vars, &out_vars}) {
+    for (auto &slot : *group) {
+      for (auto &handle : slot.second) {
+        buf << slot.first << ":(" << handle->Desc() << ") ";
+      }
+    }
+  }
+  return buf.str();
+}
+// Builds an OpDesc whose input/output slots hold the names of the given
+// variable handles.
+framework::OpDesc CreateOpDesc(const std::string &type,
+                               const VariableHandleMap &in_vars,
+                               const VariableHandleMap &out_vars,
+                               const framework::AttributeMap &attrs) {
+  // Maps every slot to the list of names of the variables bound to it.
+  auto collect_names =
+      [](const VariableHandleMap &handles) -> framework::VariableNameMap {
+    framework::VariableNameMap names;
+    for (auto &slot : handles) {
+      for (auto &handle : slot.second) {
+        names[slot.first].emplace_back(handle->Name());
+      }
+    }
+    return names;
+  };
+  framework::VariableNameMap inputs = collect_names(in_vars);
+  framework::VariableNameMap outputs = collect_names(out_vars);
+  return framework::OpDesc(type, inputs, outputs, attrs);
+}
+// Runs compile-time InferShape and InferVarType for one op inside a temporary
+// block and writes the inferred descriptions back into the output variables.
+void InferShapeAndVarType(const std::string &type,
+                          const VariableHandleMap &in_vars,
+                          VariableHandleMap *out_vars,
+                          const framework::AttributeMap &attrs) {
+  framework::OpDesc op_desc = CreateOpDesc(type, in_vars, *out_vars, attrs);
+  // Create a temporary block for compile-time
+  framework::ProgramDesc program_desc;
+  framework::BlockDesc *block_desc = program_desc.MutableBlock(0);
+  PADDLE_ENFORCE(block_desc);
+  // Copies each variable's description into the temporary block.
+  auto export_to_block = [block_desc](const VariableHandleMap &handles) {
+    for (auto &slot : handles) {
+      for (auto &handle : slot.second) {
+        *block_desc->Var(handle->Name())->Proto() =
+            *handle->MutableDesc()->Proto();
+      }
+    }
+  };
+  export_to_block(in_vars);
+  export_to_block(*out_vars);
+  LOG(INFO) << "- " << to_string(type, in_vars, *out_vars, attrs);
+  op_desc.InferShape(*block_desc);
+  op_desc.InferVarType(block_desc);
+  // Only the outputs are copied back from the block.
+  for (auto &slot : *out_vars) {
+    for (auto &handle : slot.second) {
+      *handle->MutableDesc()->Proto() =
+          *block_desc->Var(handle->Name())->Proto();
+    }
+  }
+  LOG(INFO) << "+ " << to_string(type, in_vars, *out_vars, attrs);
+}
+// Infers the shape and variable type of the outputs, then records the op at
+// the end of the tape. Nothing is executed here.
+void Tape::AddOp(const std::string &type,
+                 const VariableHandleMap &in_vars,
+                 VariableHandleMap out_vars,
+                 const framework::AttributeMap &attrs) {
+  InferShapeAndVarType(type, in_vars, &out_vars, attrs);
+  tape_.push_back(OpHandle(type, in_vars, out_vars, attrs));
+}
+// Temporary Scope for Operator::Run()
+class ScopeWrapper : public framework::Scope {
+ public:
+  // Exposes the variables of one op to Operator::Run() under their tape names.
+  ScopeWrapper(const VariableHandleMap &in_vars,
+               const VariableHandleMap &out_vars) {
+    Borrow(in_vars);
+    Borrow(out_vars);
+  }
+  // The framework::Variable objects are members of tape::Variable, so every
+  // entry is release()d instead of being deleted by the holder in vars_.
+  ~ScopeWrapper() {
+    for (auto &entry : vars_) {
+      entry.second.release();
+    }
+  }
+ private:
+  // Registers each handle's framework::Variable once, keyed by its name.
+  void Borrow(const VariableHandleMap &handles) {
+    for (auto &slot : handles) {
+      for (auto &handle : slot.second) {
+        if (vars_.count(handle->Name()) == 0) {
+          vars_[handle->Name()].reset(handle->Var());
+        }
+      }
+    }
+  }
+};
+// Executes, on CPUPlace, every recorded op that has not run yet. The position
+// is kept in current_position_, so a second call only runs ops added since
+// the previous call.
+void Tape::Forward() {
+  LOG(INFO) << "Starting forward -------------------------";
+  PADDLE_ENFORCE(!has_been_backwarded_);
+  for (; current_position_ < tape_.size(); ++current_position_) {
+    OpHandle &current = tape_[current_position_];
+    // Create Output Tensor, this is only necessary for OpWithKernel
+    for (auto &slot : current.outputs_) {
+      for (auto &output : slot.second) {
+        output->InitializeVariable();
+      }
+    }
+    framework::OpDesc op_desc = CreateOpDesc(
+        current.type_, current.inputs_, current.outputs_, current.attrs_);
+    ScopeWrapper scope(current.inputs_, current.outputs_);
+    framework::OpRegistry::CreateOp(op_desc)->Run(scope, platform::CPUPlace());
+  }
+  LOG(INFO) << "Finishing forward -------------------------";
+}
+// Runs the pending forward ops, then builds a backward tape by walking the
+// forward tape in reverse and asking every op's GradOpMaker for its gradient
+// ops, and finally executes that backward tape.
+//
+// `target` is the variable whose gradient is seeded with a 1-element FP32
+// tensor filled with 1.0f. Backward() may only be called once per tape.
+void Tape::Backward(VariableHandle target) {
+  PADDLE_ENFORCE(!has_been_backwarded_);
+  Forward();
+  // TODO(tonyyang-svail): check output of last op is target
+  backward_tape_.reset(new Tape());
+  framework::AttributeMap attrs;
+  // FIXME(tonyyang-svail): Need to infer_data_type
+  attrs["dtype"] = framework::proto::VarType::Type::VarType_Type_FP32;
+  attrs["shape"] = std::vector<int>{1};
+  attrs["value"] = 1.0f;
+  backward_tape_->AddOp(
+      "fill_constant", {}, {{"Out", {target->Grad()}}}, attrs);
+  for (auto it = tape_.rbegin(); it != tape_.rend(); ++it) {
+    framework::OpDesc op_desc =
+        CreateOpDesc(it->type_, it->inputs_, it->outputs_, it->attrs_);
+    std::unordered_map<std::string, std::string> grad_to_var;
+    std::vector<std::unique_ptr<framework::OpDesc>> grad_op_descs =
+        framework::OpInfoMap::Instance()
+            .Get(op_desc.Type())
+            .GradOpMaker()(op_desc, {}, &grad_to_var, {});
+    // Name -> handle lookup over the forward op's inputs and outputs. It only
+    // depends on the forward op, so it is built once here rather than once
+    // per generated gradient op.
+    std::unordered_map<std::string, VariableHandle> name2var;
+    for (auto &param2vars : it->inputs_) {
+      for (auto &a : param2vars.second) {
+        name2var[a->Name()] = a;
+      }
+    }
+    for (auto &param2vars : it->outputs_) {
+      for (auto &a : param2vars.second) {
+        name2var[a->Name()] = a;
+      }
+    }
+    // Named grad_op_desc so that it does not shadow the forward op_desc above.
+    for (auto &grad_op_desc : grad_op_descs) {
+      VariableHandleMap in_vars;
+      VariableHandleMap out_vars;
+      std::map<const framework::VariableNameMap *, VariableHandleMap *>
+          loop_over{{&grad_op_desc->Inputs(), &in_vars},
+                    {&grad_op_desc->Outputs(), &out_vars}};
+      for (auto &each : loop_over) {
+        auto &vmp = *each.first;
+        auto &vhm = *each.second;
+        for (auto &p2a : vmp) {
+          for (auto &argu : p2a.second) {
+            if (name2var.count(argu)) {
+              // The argument is a forward variable: reuse its handle.
+              vhm[p2a.first].push_back(name2var[argu]);
+            } else {
+              // Otherwise it must be "<forward name><grad suffix>"; strip the
+              // suffix and bind the forward variable's gradient handle.
+              PADDLE_ENFORCE(ends_with(argu, framework::kGradVarSuffix),
+                             argu.c_str());
+              std::string name = argu.substr(
+                  0, argu.size() - std::strlen(framework::kGradVarSuffix));
+              PADDLE_ENFORCE(name2var.count(name), name.c_str());
+              vhm[p2a.first].push_back(name2var[name]->Grad());
+            }
+          }
+        }
+      }
+      backward_tape_->AddOp(grad_op_desc->Type(),
+                            in_vars,
+                            out_vars,
+                            grad_op_desc->GetAttrMap());
+    }
+    // TODO(tonyyang-svail): how to fill empty grad?
+    // TODO(tonyyang-svail): Sum var grad is necessary
+  }
+  backward_tape_->Forward();
+  has_been_backwarded_ = true;
+}
+// Returns the process-wide tape, a function-local static.
+Tape &get_global_tape() {
+  static Tape global_tape;
+  return global_tape;
+}
+// Replaces the contents of the global tape with a default-constructed Tape.
+void reset_global_tape() {
+  Tape &global_tape = get_global_tape();
+  global_tape = Tape();
+}
+}
+}
--- a/paddle/contrib/tape/tape.h
+++ b/paddle/contrib/tape/tape.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+#include "paddle/contrib/tape/variable.h"
+namespace paddle {
+namespace tape {
+using VariableHandleMap = std::map<std::string, std::vector<VariableHandle>>;
+struct OpHandle {  // one recorded operator call: type, bound variables, attributes
+  OpHandle(const std::string &type,
+           const VariableHandleMap &in_vars,
+           const VariableHandleMap &out_vars,
+           const framework::AttributeMap &attrs)
+      : type_(type), inputs_(in_vars), outputs_(out_vars), attrs_(attrs) {}
+  std::string type_;  // operator type name
+  VariableHandleMap inputs_;  // slot name -> input variable handles
+  VariableHandleMap outputs_;  // slot name -> output variable handles
+  framework::AttributeMap attrs_;  // operator attributes
+};
+// An ordered list of recorded operators that is executed lazily by Forward()
+// and can be differentiated once by Backward().
+class Tape {
+ public:
+  // Infers output shapes/types for the op and appends it to the tape.
+  void AddOp(const std::string &type,
+             const VariableHandleMap &in_vars,
+             VariableHandleMap out_vars,
+             const framework::AttributeMap &attrs);
+  // Executes every recorded op that has not been run yet.
+  void Forward();
+  // Runs Forward(), then builds and executes the gradient tape seeded at
+  // target->Grad().
+  void Backward(VariableHandle target);
+  // True once Backward() has completed on this tape. Declared const so it can
+  // be queried through a const reference as well.
+  bool HasBeenBackwarded() const { return has_been_backwarded_; }
+ private:
+  bool has_been_backwarded_ = false;
+  // Index of the next op in tape_ that Forward() will execute.
+  size_t current_position_ = 0;
+  std::vector<OpHandle> tape_;
+  // Gradient ops; created by Backward().
+  std::shared_ptr<Tape> backward_tape_;
+};
+Tape &get_global_tape();
+void reset_global_tape();
+}
+}
--- a/paddle/contrib/tape/test_tape.cc
+++ b/paddle/contrib/tape/test_tape.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "gtest/gtest.h"
+#include "paddle/contrib/tape/function.h"
+using namespace paddle::tape;
+// Two training iterations of a two-layer network: forward, backward, then
+// one SGD step per parameter.
+TEST(Tape, TestMLP) {
+  LOG(INFO) << "TestMLP";
+  Linear linear1(3, 3, "relu");
+  Linear linear2(3, 3, "relu");
+  Mean mean;
+  SGD sgd(0.001);
+  // The input is a 3x3 FP32 tensor filled with 1.0f.
+  paddle::framework::AttributeMap input_attrs;
+  input_attrs["dtype"] =
+      paddle::framework::proto::VarType::Type::VarType_Type_FP32;
+  input_attrs["shape"] = std::vector<int>{3, 3};
+  input_attrs["value"] = 1.0f;
+  const std::string fill_op = "fill_constant";
+  Fill filler(fill_op, input_attrs);
+  for (int iter = 0; iter < 2; ++iter) {
+    // Every iteration records onto a fresh global tape.
+    reset_global_tape();
+    VariableHandle input(new Variable("input"));
+    filler(input);
+    VariableHandle loss = mean(linear2(linear1(input)));
+    get_global_tape().Backward(loss);
+    for (auto param : linear1.Params()) {
+      sgd(param);
+    }
+    for (auto param : linear2.Params()) {
+      sgd(param);
+    }
+  }
+}
+// Initializes the device context pool with a single CPU place, then runs all
+// registered tests.
+int main(int argc, char** argv) {
+  std::vector<paddle::platform::Place> cpu_only;
+  cpu_only.push_back(paddle::platform::CPUPlace());
+  paddle::platform::DeviceContextPool::Init(cpu_only);
+  testing::InitGoogleTest(&argc, argv);
+  const int exit_code = RUN_ALL_TESTS();
+  return exit_code;
+}
--- a/paddle/contrib/tape/variable.cc
+++ b/paddle/contrib/tape/variable.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/contrib/tape/variable.h"
+namespace paddle {
+namespace tape {
+// Instantiates the runtime holder matching the type recorded in desc_: a
+// LoDTensor or a SelectedRows. Any other type throws.
+void Variable::InitializeVariable() {
+  LOG(INFO) << "Initialzing " << desc_.Name() << " as " << desc_.GetType();
+  const framework::proto::VarType::Type var_type = desc_.GetType();
+  switch (var_type) {
+    case framework::proto::VarType::LOD_TENSOR:
+      var_.GetMutable<framework::LoDTensor>();
+      break;
+    case framework::proto::VarType::SELECTED_ROWS:
+      var_.GetMutable<framework::SelectedRows>();
+      break;
+    default:
+      PADDLE_THROW("Variable type %d is not in [LOD_TENSOR, SELECTED_ROWS]",
+                   var_type);
+  }
+}
+}
+}
--- a/paddle/contrib/tape/variable.h
+++ b/paddle/contrib/tape/variable.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <memory>
+#include "paddle/fluid/framework/operator.h"  // framework::kGradVarSuffix
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/variable.h"
+namespace paddle {
+namespace tape {
+class Variable;
+using VariableHandle = std::shared_ptr<Variable>;
+/*
+ * Combination of
+ *     framework::VarDesc desc_;
+ *     framework::Variable var_;
+ */
+// A tape variable: the compile-time description (desc_) plus the run-time
+// holder (var_) and a weak link to its gradient variable.
+class Variable {
+ public:
+  // Names the variable "<pre_fix><n>", where n comes from a shared counter.
+  Variable(const std::string &pre_fix)
+      : desc_(pre_fix + std::to_string(count())) {}
+  // With is_grad == true the name is "<pre_fix><grad suffix>" and the counter
+  // is not advanced; otherwise it behaves like the one-argument constructor.
+  Variable(const std::string &pre_fix, bool is_grad)
+      : desc_(pre_fix + (is_grad ? framework::kGradVarSuffix
+                                 : std::to_string(count()))) {}
+  ~Variable() { LOG(INFO) << "Deleting " << Name(); }
+  // Instantiate LoDTensor/SelectedRow
+  void InitializeVariable();
+  // Returns the gradient variable, creating it on demand. Only a weak
+  // reference is kept here, so the gradient lives as long as some caller
+  // holds the returned handle; after that a new one is created.
+  VariableHandle Grad() {
+    // lock() yields the live gradient or an empty handle in a single step.
+    VariableHandle grad = grad_.lock();
+    if (!grad) {
+      grad.reset(new Variable(desc_.Name(), true));
+      grad_ = grad;
+    }
+    return grad;
+  }
+  // Stochastic Gradient Descent with Momentum
+  //  VariableHandle Momentum ();
+  //  void init(const std::string& initializer,
+  //            const framework::AttributeMap& attrs);
+  // void value() {};
+  const framework::VarDesc& Desc() const { return desc_; }
+  framework::VarDesc* MutableDesc() { return &desc_; }
+  // TODO(tonyyang-svail): No need to expose name
+  std::string Name() const { return desc_.Name(); }
+  // Non-owning pointer to the run-time holder stored inside this object.
+  framework::Variable* Var() { return &var_; }
+ private:
+  // Source of the numeric name suffix. Static because it touches no instance
+  // state and is called from the member-initializer list.
+  static int count() {
+    static int counter = 0;
+    return counter++;
+  }
+  framework::VarDesc desc_;
+  framework::Variable var_;
+  std::weak_ptr<Variable> grad_;
+};
+}
+}
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -83,8 +83,13 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
 cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
-cc_library(executor SRCS executor.cc DEPS op_registry device_context scope
-framework_proto glog lod_rank_table feed_fetch_method)
+# Dependencies common to every executor build. The distributed build appends
+# the gRPC client stack used by Executor::Complete() and relaxes the
+# non-virtual-dtor diagnostics for executor.cc only.
+set(EXECUTOR_DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method)
+if(WITH_DISTRIBUTE)
+  list(APPEND EXECUTOR_DEPS sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr)
+  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+  # Quoted so the flag string is always passed as one property value.
+  set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS}")
+endif()
+cc_library(executor SRCS executor.cc DEPS ${EXECUTOR_DEPS})
 cc_library(parallel_executor SRCS parallel_executor.cc DEPS ssa_graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor)

--- a/paddle/fluid/framework/details/ssa_graph_checker.h
+++ b/paddle/fluid/framework/details/ssa_graph_checker.h
@@ -19,7 +19,7 @@
 namespace paddle {
 namespace framework {
 namespace details {
-class SSAGraph;
+struct SSAGraph;
 class SSAGraghBuilderWithChecker : public SSAGraphBuilder {
 public:

--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -20,6 +20,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/reader.h"
+#ifdef PADDLE_WITH_DISTRIBUTE
+#include "paddle/fluid/operators/detail/grpc_client.h"
+#endif
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -44,6 +47,14 @@ ExecutorPrepareContext::~ExecutorPrepareContext() {
 Executor::Executor(const platform::Place& place) : place_(place) {}
+#ifdef PADDLE_WITH_DISTRIBUTE
+// Calls SendComplete() on the process-wide GRPCClient instance.
+void Executor::Complete() {
+  namespace rpc = ::paddle::operators::detail;
+  rpc::RPCClient::GetInstance<rpc::GRPCClient>()->SendComplete();
+}
+#endif
 void InitializeVariable(Variable* var, proto::VarType::Type var_type) {
  if (var_type == proto::VarType::LOD_TENSOR) {
    var->GetMutable<LoDTensor>();
@@ -319,8 +330,12 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
  }
  for (auto& op : ctx->ops_) {
-    VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);
+    VLOG(4) << place_ << " " << op->DebugStringEx(local_scope);
    op->Run(*local_scope, place_);
+    // NOTE! Please do not delete this line. It is useful because the debug
+    // strings before and after op.run differ: after the run the output
+    // will have the right shape, which is useful for debugging.
+    VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);
    if (FLAGS_benchmark) {
      VLOG(2) << "Memory used after operator " + op->Type() + " running: "

--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@@ -44,6 +44,13 @@ class Executor {
  explicit Executor(const platform::Place& place);
+#ifdef PADDLE_WITH_DISTRIBUTE
+  /*
+   * Send a signal to the pserver to mark the current trainer as stopped.
+   */
+  void Complete();
+#endif
  /* @Brief
   * Runtime evaluation of the given ProgramDesc under certain Scope
   *

--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -69,6 +69,19 @@ static DDim GetDims(const Scope& scope, const std::string& name,
  }
 }
+// Returns the row count of the SelectedRows variable `name` in `scope`, or
+// -1 when no such variable is found or it holds a different type.
+static int GetRowSize(const Scope& scope, const std::string& name) {
+  Variable* var = scope.FindVar(name);
+  const bool has_rows = var != nullptr && var->IsType<SelectedRows>();
+  return has_rows ? static_cast<int>(var->Get<SelectedRows>().rows().size())
+                  : -1;
+}
 static LoD GetLoD(const Scope& scope, const std::string& name) {
  Variable* var = scope.FindVar(name);
  auto default_lod = LoD({{}});
@@ -85,6 +98,7 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
 }
 void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
+  VLOG(10) << "- " << DebugStringEx(&scope);
  if (platform::is_gpu_place(place)) {
 #ifndef PADDLE_WITH_CUDA
    PADDLE_THROW("Cannot run operator on place %s", place);
@@ -94,6 +108,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
 #endif
  }
  RunImpl(scope, place);
+  VLOG(10) << "+ " << DebugStringEx(&scope);
 }
 bool OperatorBase::HasInputs(const std::string& name) const {
@@ -153,6 +168,10 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
    for (size_t i = 0; i < input.second.size(); ++i) {
      ss << input.second[i];
      if (scope) {
+        int row_size = GetRowSize(*scope, input.second[i]);
+        if (row_size >= 0) {
+          ss << "[row_size=" << row_size << "]";
+        }
        ss << "[" << GetDims(*scope, input.second[i], true) << "]";
        ss << "(" << GetLoD(*scope, input.second[i]) << ")";
      }
@@ -173,6 +192,10 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
    for (size_t i = 0; i < output.second.size(); ++i) {
      ss << output.second[i];
      if (scope) {
+        int row_size = GetRowSize(*scope, output.second[i]);
+        if (row_size >= 0) {
+          ss << "[row_size=" << row_size << "]";
+        }
        ss << "[" << GetDims(*scope, output.second[i], true) << "]";
        ss << "(" << GetLoD(*scope, output.second[i]) << ")";
      }

--- a/paddle/fluid/framework/reader.h
+++ b/paddle/fluid/framework/reader.h
@@ -35,14 +35,15 @@ class ReaderBase {
 class DecoratedReader : public ReaderBase {
 public:
-  explicit DecoratedReader(ReaderBase* reader) : ReaderBase(), reader_(reader) {
+  explicit DecoratedReader(const std::shared_ptr<ReaderBase>& reader)
+      : ReaderBase(), reader_(reader) {
    PADDLE_ENFORCE_NOT_NULL(reader_);
  }
  void ReInit() override { reader_->ReInit(); }
 protected:
-  ReaderBase* reader_;
+  std::shared_ptr<ReaderBase> reader_;
 };
 class FileReader : public ReaderBase {
@@ -64,7 +65,7 @@ class ReaderHolder {
 public:
  void Reset(ReaderBase* reader) { reader_.reset(reader); }
-  ReaderBase* Get() const { return reader_.get(); }
+  std::shared_ptr<ReaderBase> Get() const { return reader_; }
  void ReadNext(std::vector<LoDTensor>* out) {
    PADDLE_ENFORCE_NOT_NULL(reader_);
@@ -76,7 +77,7 @@ class ReaderHolder {
  }
 private:
-  std::unique_ptr<ReaderBase> reader_;
+  std::shared_ptr<ReaderBase> reader_;
 };
 }  // namespace framework

--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -81,6 +81,9 @@ class Scope {
  // Rename variable to a new name and return the new name
  std::string Rename(const std::string& origin_name) const;
+ protected:
+  mutable std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
 private:
  // Call Scope::NewScope for a sub-scope.
  explicit Scope(Scope const* parent) : parent_(parent) {}
@@ -93,8 +96,6 @@ class Scope {
  // Caller doesn't own the returned Variable.
  Variable* FindVarLocally(const std::string& name) const;
-  mutable std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
  // Scope in `kids_` are owned by this class.
  mutable std::list<Scope*> kids_;
  Scope const* parent_{nullptr};

--- a/paddle/fluid/operators/batch_norm_mkldnn_op.cc
+++ b/paddle/fluid/operators/batch_norm_mkldnn_op.cc
@@ -19,10 +19,17 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-using Tensor = framework::Tensor;
+using batch_norm_bwd = mkldnn::batch_normalization_backward;
+using batch_norm_fwd = mkldnn::batch_normalization_forward;
+using framework::DataLayout;
+using framework::Tensor;
+using mkldnn::memory;
+using mkldnn::primitive;
+using mkldnn::reorder;
+using mkldnn::stream;
 using paddle::platform::MKLDNNDeviceContext;
 using paddle::platform::MKLDNNMemDesc;
-using mkldnn::memory;
+using platform::to_void_cast;
 template <typename T>
 using EigenArrayMap =
@@ -64,21 +71,12 @@ void run_batch_norm_op(Args &&... args) {
  mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
 }
-template <typename T>
-inline void *cast_const_to_void(const T *t) {
-  return static_cast<void *>(const_cast<T *>(t));
-}
 }  // namespace
 template <typename T>
 class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto data_layout_str = ctx.Attr<std::string>("data_layout");
-    auto data_layout = framework::StringToDataLayout(data_layout_str);
-    PADDLE_ENFORCE(data_layout == framework::DataLayout::kNCHW,
-                   "MKLDNN batch normalization handles only NCHW data layout");
    const float epsilon = ctx.Attr<float>("epsilon");
    const float momentum = ctx.Attr<float>("momentum");
    const bool is_test = ctx.Attr<bool>("is_test");
@@ -99,41 +97,53 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    const auto *scale = ctx.Input<Tensor>("Scale");
    const auto *shift = ctx.Input<Tensor>("Bias");
-    y->mutable_data<T>(ctx.GetPlace());
+    PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN &&
-    mean_out->mutable_data<T>(ctx.GetPlace());
+                       x->format() != memory::format::format_undef,
-    variance_out->mutable_data<T>(ctx.GetPlace());
+                   "Wrong layout/format set for Input x tensor");
+    const T *x_data = x->data<T>();
+    const T *mean_data = mean->data<T>();
+    const T *variance_data = variance->data<T>();
+    T *y_data = y->mutable_data<T>(ctx.GetPlace());
+    T *mean_out_data = mean_out->mutable_data<T>(ctx.GetPlace());
+    T *variance_out_data = variance_out->mutable_data<T>(ctx.GetPlace());
+    T *batch_mean_data = nullptr;
+    T *batch_variance_data = nullptr;
    if (!is_test) {
-      batch_mean->mutable_data<T>(ctx.GetPlace());
+      batch_mean_data = batch_mean->mutable_data<T>(ctx.GetPlace());
-      batch_variance->mutable_data<T>(ctx.GetPlace());
+      batch_variance_data = batch_variance->mutable_data<T>(ctx.GetPlace());
    }
    auto propagation = is_test == true ? mkldnn::prop_kind::forward_scoring
                                       : mkldnn::prop_kind::forward_training;
-    auto dims = paddle::framework::vectorize2int(x->dims());
+    auto src_tz = paddle::framework::vectorize2int(x->dims());
+    auto scale_tz = paddle::framework::vectorize2int(scale->dims());
-    auto src_md =
+    PADDLE_ENFORCE(scale_tz.size() == 1, "Dims of scale tensor is NOT 1");
-        MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
+    const unsigned int ic = scale_tz[0];
-    auto dst_md =
-        MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
-    auto src_pd = mkldnn::memory::primitive_desc{src_md, mkldnn_engine};
-    auto dst_pd = mkldnn::memory::primitive_desc{dst_md, mkldnn_engine};
-    auto src = mkldnn::memory{src_pd, cast_const_to_void(x->data<T>())};
-    auto dst = mkldnn::memory{dst_pd, y->data<T>()};
    unsigned flags = mkldnn::use_scale_shift;
    if (is_test) flags |= mkldnn::use_global_stats;
+    // create mkldnn memory from input x tensor
+    auto src_memory =
+        memory({{{src_tz}, memory::data_type::f32, x->format()}, mkldnn_engine},
+               to_void_cast(x_data));
+    // create primitive descriptor for batch norm forward
    using bn_fwd_types = bn_type_traits<mkldnn::batch_normalization_forward>;
-    auto batch_norm_fwd_desc =
+    auto batch_norm_fwd_desc = bn_fwd_types::op_desc{
-        bn_fwd_types::op_desc{propagation, src_md, epsilon, flags};
+        propagation, src_memory.get_primitive_desc().desc(), epsilon, flags};
-    auto batch_norm_fwd_pd =
+    std::shared_ptr<batch_norm_fwd::primitive_desc> batch_norm_fwd_pd =
-        bn_fwd_types::op_prim{batch_norm_fwd_desc, mkldnn_engine};
+        std::shared_ptr<batch_norm_fwd::primitive_desc>(
+            new batch_norm_fwd::primitive_desc(batch_norm_fwd_desc,
+                                               mkldnn_engine));
-    const unsigned int ic = dims[1];
+    // Save the pd to be used in backward pass
+    const std::string key = ctx.op().Output("SavedMean");
+    const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd";
+    dev_ctx.SetBlob(key_batch_norm_fwd_pd, batch_norm_fwd_pd);
    // MKLDNN requires a single piece of memory for scale and shift/bias data
    const size_t scaleshift_size = 2 * ic;
@@ -143,73 +153,58 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    copy_to_weights(scale->data<T>(), scale->data<T>() + ic, shift->data<T>(),
                    shift->data<T>() + ic, &scaleshift_data);
-    auto scaleshift_memory = mkldnn::memory{
+    // create mkldnn memory for weights(scale/shift)
-        batch_norm_fwd_pd.weights_primitive_desc(), scaleshift_data.data()};
+    auto scaleshift_memory = memory(batch_norm_fwd_pd->weights_primitive_desc(),
+                                    scaleshift_data.data());
-    if (is_test) {
+    // create mkldnn memory for output y tensor
-      auto mean_memory = mkldnn::memory{batch_norm_fwd_pd.mean_primitive_desc(),
+    auto dst_memory = memory(batch_norm_fwd_pd->dst_primitive_desc(), y_data);
-                                        cast_const_to_void(mean->data<T>())};
+    if (is_test) {
+      // create mkldnn memory for stats (as input)
+      auto mean_memory = memory(batch_norm_fwd_pd->mean_primitive_desc(),
+                                to_void_cast(mean_data));
      auto variance_memory =
-          mkldnn::memory{batch_norm_fwd_pd.variance_primitive_desc(),
+          memory(batch_norm_fwd_pd->variance_primitive_desc(),
-                         cast_const_to_void(variance->data<T>())};
+                 to_void_cast(variance_data));
      run_batch_norm_op<typename bn_fwd_types::op_type>(
-          batch_norm_fwd_pd, src, (const mkldnn::primitive::at &)mean_memory,
+          *batch_norm_fwd_pd, src_memory,
+          (const mkldnn::primitive::at &)mean_memory,
          (const mkldnn::primitive::at &)variance_memory, scaleshift_memory,
-          dst);
+          dst_memory);
    } else {
+      // create mkldnn memory for stats (as output)
      auto mean_memory =
-          mkldnn::memory{batch_norm_fwd_pd.mean_primitive_desc(),
+          memory(batch_norm_fwd_pd->mean_primitive_desc(), batch_mean_data);
-                         cast_const_to_void(batch_mean->data<T>())};
+      auto variance_memory = memory(
+          batch_norm_fwd_pd->variance_primitive_desc(), batch_variance_data);
-      auto variance_memory =
-          mkldnn::memory{batch_norm_fwd_pd.variance_primitive_desc(),
-                         cast_const_to_void(batch_variance->data<T>())};
-      run_batch_norm_op<bn_fwd_types::op_type>(batch_norm_fwd_pd, src,
+      run_batch_norm_op<bn_fwd_types::op_type>(*batch_norm_fwd_pd, src_memory,
-                                               scaleshift_memory, dst,
+                                               scaleshift_memory, dst_memory,
                                               mean_memory, variance_memory);
    }
    if (!is_test) {
-      const unsigned int in = dims[0];
+      // mkldnn only computes stats for the current batch,
-      const unsigned int sample_size = x->numel() / in / ic;
+      // so we need to compute momentum stats via Eigen lib
+      EigenVectorArrayMap<T> batch_mean_e(batch_mean_data, ic);
-      // saved_xx is use just in this batch of data
+      EigenVectorArrayMap<T> batch_variance_e(batch_variance_data, ic);
-      EigenVectorArrayMap<T> saved_mean_e(
+      ConstEigenVectorArrayMap<T> mean_e(mean_data, ic);
-          batch_mean->mutable_data<T>(ctx.GetPlace()), ic);
+      ConstEigenVectorArrayMap<T> variance_e{variance_data, ic};
-      EigenVectorArrayMap<T> saved_variance_e(
-          batch_variance->mutable_data<T>(ctx.GetPlace()), ic);
+      EigenVectorArrayMap<T> running_mean_e(mean_out_data, ic);
-      saved_mean_e.setZero();
+      EigenVectorArrayMap<T> running_variance_e(variance_out_data, ic);
-      saved_variance_e.setZero();
-      const unsigned int x_arr_size = in * ic;
-      ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, x_arr_size);
-      for (unsigned int nc = 0; nc < x_arr_size; ++nc) {
-        saved_mean_e(nc % ic) += x_arr.col(nc).sum();
-      }
-      saved_mean_e /= in * sample_size;
-      for (unsigned int nc = 0; nc < x_arr_size; ++nc) {
-        saved_variance_e(nc % ic) +=
-            (x_arr.col(nc) - saved_mean_e(nc % ic)).matrix().squaredNorm();
-      }
-      saved_variance_e /= in * sample_size;
-      ConstEigenVectorArrayMap<T> mean_arr{mean->data<T>(), ic};
-      ConstEigenVectorArrayMap<T> variance_arr{variance->data<T>(), ic};
-      EigenVectorArrayMap<T> running_mean_arr(
-          mean_out->mutable_data<T>(ctx.GetPlace()), ic);
-      EigenVectorArrayMap<T> running_var_arr(
-          variance_out->mutable_data<T>(ctx.GetPlace()), ic);
      auto one_minus_momentum = 1. - momentum;
-      running_mean_arr =
+      running_mean_e = mean_e * momentum + batch_mean_e * one_minus_momentum;
-          mean_arr * momentum + saved_mean_e * one_minus_momentum;
+      running_variance_e =
-      running_var_arr =
+          variance_e * momentum + batch_variance_e * one_minus_momentum;
-          variance_arr * momentum + saved_variance_e * one_minus_momentum;
    }
+    y->set_layout(DataLayout::kMKLDNN);
+    y->set_format(
+        (memory::format)dst_memory.get_primitive_desc().desc().data.format);
  }
 };
@@ -217,11 +212,6 @@ template <typename T>
 class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
 public:
  void Compute(const paddle::framework::ExecutionContext &ctx) const override {
-    auto data_layout_str = ctx.Attr<std::string>("data_layout");
-    auto data_layout = framework::StringToDataLayout(data_layout_str);
-    PADDLE_ENFORCE(data_layout == framework::DataLayout::kNCHW,
-                   "MKLDNN batch normalization handles only NCHW data layout");
    auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
    auto mkldnn_engine = dev_ctx.GetEngine();
@@ -238,88 +228,132 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
    auto *diff_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
    auto *diff_shift = ctx.Output<Tensor>(framework::GradVarName("Bias"));
-    diff_x->mutable_data<T>(ctx.GetPlace());
+    PADDLE_ENFORCE(diff_y->layout() == DataLayout::kMKLDNN &&
-    diff_scale->mutable_data<T>(ctx.GetPlace());
+                       diff_y->format() != memory::format::format_undef,
-    diff_shift->mutable_data<T>(ctx.GetPlace());
+                   "Wrong layout/format set for Input diff_y tensor");
+    const T *x_data = x->data<T>();
+    const T *diff_y_data = diff_y->data<T>();
+    const T *batch_mean_data = batch_mean->data<T>();
+    const T *batch_variance_data = batch_variance->data<T>();
+    const T *scale_data = scale->data<T>();
+    const T *shift_data = shift->data<T>();
+    T *diff_x_data = diff_x->mutable_data<T>(ctx.GetPlace());
+    T *diff_scale_data = diff_scale->mutable_data<T>(ctx.GetPlace());
+    T *diff_shift_data = diff_shift->mutable_data<T>(ctx.GetPlace());
+    auto src_tz = paddle::framework::vectorize2int(x->dims());
+    auto diff_src_tz = src_tz;
+    auto dst_tz = src_tz;
+    auto diff_dst_tz = dst_tz;
+    auto scale_tz = paddle::framework::vectorize2int(scale->dims());
+    PADDLE_ENFORCE(scale_tz.size() == 1, "Dims of scale tensor is NOT 1");
+    const unsigned int ic = scale_tz[0];
+    // Retrieve bn_fwd_pd from device context
+    const std::string key = ctx.op().Input("SavedMean");
+    const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd";
+    auto batch_norm_fwd_pd =
+        std::static_pointer_cast<batch_norm_fwd::primitive_desc>(
+            dev_ctx.GetBlob(key_batch_norm_fwd_pd));
+    PADDLE_ENFORCE(batch_norm_fwd_pd != nullptr,
+                   "Fail to find batch_norm_fwd_pd in device context");
-    auto dims = paddle::framework::vectorize2int(x->dims());
+    using bn_bwd_types = bn_type_traits<mkldnn::batch_normalization_backward>;
-    unsigned flags = mkldnn::use_scale_shift | !mkldnn::use_global_stats;
-    auto src_md =
+    // create mkldnn memory from input diff_y tensor
-        MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
+    auto user_diff_dst_memory =
-    auto dst_md =
+        memory({{{diff_dst_tz}, memory::data_type::f32, diff_y->format()},
-        MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
+                mkldnn_engine},
-    auto diff_src_md =
+               to_void_cast(diff_y_data));
-        MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
-    auto diff_dst_md =
-        MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
-    using bn_bwd_types = bn_type_traits<mkldnn::batch_normalization_backward>;
+    // create mkldnn memory from input x tensor
-    using bn_fwd_types = bn_type_traits<mkldnn::batch_normalization_forward>;
+    auto src_memory =
+        memory({{{src_tz}, memory::data_type::f32, x->format()}, mkldnn_engine},
+               to_void_cast(x_data));
-    auto batch_norm_fwd_desc = bn_fwd_types::op_desc{
+    // for diff_dst, try to use same format as dst in forward pass
-        mkldnn::prop_kind::forward_training, src_md, epsilon, flags};
+    auto diff_dst_pd = batch_norm_fwd_pd.get()->dst_primitive_desc();
-    auto batch_norm_fwd_pd =
+    auto diff_dst_md = diff_dst_pd.desc();
-        bn_fwd_types::op_prim{batch_norm_fwd_desc, mkldnn_engine};
+    // create primitive descriptor for batch norm backward
+    unsigned flags = mkldnn::use_scale_shift;
    auto batch_norm_bwd_desc = bn_bwd_types::op_desc{
-        mkldnn::prop_kind::backward, diff_dst_md, dst_md, epsilon, flags};
+        mkldnn::prop_kind::backward, diff_dst_md,
+        src_memory.get_primitive_desc().desc(), epsilon, flags};
    auto batch_norm_bwd_pd = bn_bwd_types::op_prim{
-        batch_norm_bwd_desc, mkldnn_engine, batch_norm_fwd_pd};
+        batch_norm_bwd_desc, mkldnn_engine, *batch_norm_fwd_pd};
-    auto src = mkldnn::memory{{src_md, mkldnn_engine},
+    // reorder user_diff_dst if it's not in preferred format
-                              cast_const_to_void(x->data<T>())};
+    auto diff_dst_memory = user_diff_dst_memory;
+    primitive reorder_diff_dst;
-    auto mean = mkldnn::memory{batch_norm_bwd_pd.mean_primitive_desc(),
+    bool is_diff_dst_reordered = false;
-                               cast_const_to_void(batch_mean->data<T>())};
+    if (diff_dst_pd != user_diff_dst_memory.get_primitive_desc()) {
+      diff_dst_memory = memory(diff_dst_pd);
-    auto variance =
+      reorder_diff_dst = reorder(user_diff_dst_memory, diff_dst_memory);
-        mkldnn::memory{batch_norm_bwd_pd.variance_primitive_desc(),
+      is_diff_dst_reordered = true;
-                       cast_const_to_void(batch_variance->data<T>())};
+    }
-    auto diff_dst = mkldnn::memory{{diff_dst_md, mkldnn_engine},
-                                   cast_const_to_void(diff_y->data<T>())};
-    const unsigned int ic = dims[1];
+    // create mkldnn memory for input tensors (src/mean/variance)
+    auto mean_memory = memory(batch_norm_bwd_pd.mean_primitive_desc(),
+                              to_void_cast(batch_mean_data));
+    auto variance_memory = memory(batch_norm_bwd_pd.variance_primitive_desc(),
+                                  to_void_cast(batch_variance_data));
+    // MKLDNN requires a single piece of memory for scale and shift/bias data
    const size_t scaleshift_size = 2 * ic;
    std::vector<T> scaleshift_data;
    scaleshift_data.reserve(scaleshift_size);
-    copy_to_weights(scale->data<T>(), scale->data<T>() + ic, shift->data<T>(),
+    copy_to_weights(scale_data, scale_data + ic, shift_data, shift_data + ic,
-                    shift->data<T>() + ic, &scaleshift_data);
+                    &scaleshift_data);
-    auto scaleshift_memory = mkldnn::memory{
+    // create mkldnn memory for input tensors (scale/shift)
-        batch_norm_bwd_pd.weights_primitive_desc(), scaleshift_data.data()};
+    auto scaleshift_memory = memory(batch_norm_bwd_pd.weights_primitive_desc(),
+                                    scaleshift_data.data());
+    // create mkldnn memory for output diff weights (combined scale/shift).
+    // The buffer is constructed at full size because mkldnn writes through
+    // data(): reserve() alone leaves size() at 0, and resizing afterwards
+    // would value-initialize (zero) the gradients that were just written.
-    std::vector<T> diff_scaleshift_data;
-    diff_scaleshift_data.reserve(scaleshift_size);
+    std::vector<T> diff_scaleshift_data(scaleshift_size);
-    copy_to_weights(diff_scale->data<T>(), diff_scale->data<T>() + ic,
-                    diff_shift->data<T>(), diff_shift->data<T>() + ic,
-                    &diff_scaleshift_data);
    auto diff_scaleshift_memory =
-        mkldnn::memory{batch_norm_bwd_pd.diff_weights_primitive_desc(),
+        memory(batch_norm_bwd_pd.diff_weights_primitive_desc(),
-                       diff_scaleshift_data.data()};
+               diff_scaleshift_data.data());
-    auto diff_src = mkldnn::memory{{diff_src_md, mkldnn_engine},
+    // here assume diff_src is in the same format as src
-                                   static_cast<void *>(diff_x->data<T>())};
+    auto diff_src_memory = memory(src_memory.get_primitive_desc(), diff_x_data);
-    run_batch_norm_op<bn_bwd_types::op_type>(
+    // finally create batch_norm backward primitive
-        batch_norm_bwd_pd, src, mean, variance, diff_dst, scaleshift_memory,
+    auto batch_norm_bwd_prim =
-        diff_src, diff_scaleshift_memory);
+        batch_norm_bwd(batch_norm_bwd_pd, src_memory, mean_memory,
+                       variance_memory, diff_dst_memory, scaleshift_memory,
+                       diff_src_memory, diff_scaleshift_memory);
+    // execute optional reorder and batch_norm backward primitive
+    std::vector<primitive> pipeline;
+    if (is_diff_dst_reordered) pipeline.push_back(reorder_diff_dst);
+    pipeline.push_back(batch_norm_bwd_prim);
+    stream(stream::kind::eager).submit(pipeline).wait();
+    // copy back diff scale/shift to output tensors (diff scale/shift)
+    diff_scaleshift_data.resize(scaleshift_size);
    auto it = std::begin(diff_scaleshift_data);
-    std::copy(it, std::next(it, ic), diff_scale->data<T>());
+    std::copy(it, std::next(it, ic), diff_scale_data);
    std::copy(std::next(it, ic), std::end(diff_scaleshift_data),
-              diff_shift->data<T>());
+              diff_shift_data);
+    // set layout/format of output tensors
+    diff_x->set_layout(DataLayout::kMKLDNN);
+    diff_x->set_format((memory::format)diff_src_memory.get_primitive_desc()
+                           .desc()
+                           .data.format);
  }
 };
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;
-REGISTER_OP_KERNEL(batch_norm, MKLDNN, paddle::platform::CPUPlace,
+REGISTER_OP_KERNEL(batch_norm, MKLDNN, ::paddle::platform::CPUPlace,
                   ops::BatchNormMKLDNNOpKernel<float>);
-REGISTER_OP_KERNEL(batch_norm_grad, MKLDNN, paddle::platform::CPUPlace,
+REGISTER_OP_KERNEL(batch_norm_grad, MKLDNN, ::paddle::platform::CPUPlace,
                   ops::BatchNormMKLDNNGradOpKernel<float>);
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
@@ -110,19 +110,19 @@ class BatchNormOp : public framework::OperatorWithKernel {
                                         ctx.Input<Tensor>("Variance")->type()),
                      "Variance input should be of float type");
-    framework::LibraryType library_{framework::LibraryType::kPlain};
    // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+    framework::LibraryType library = framework::LibraryType::kPlain;
    framework::DataLayout layout = framework::DataLayout::kAnyLayout;
 #ifdef PADDLE_WITH_MKLDNN
-    if (library_ == framework::LibraryType::kPlain &&
+    if (library == framework::LibraryType::kPlain &&
        platform::CanMKLDNNBeUsed(ctx)) {
-      library_ = framework::LibraryType::kMKLDNN;
+      library = framework::LibraryType::kMKLDNN;
      layout = framework::DataLayout::kMKLDNN;
    }
 #endif
    return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
-                                   library_);
+                                   library);
  }
 };
@@ -370,19 +370,21 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
      PADDLE_THROW("can't find Y@GRAD");
    }
-    framework::LibraryType library_{framework::LibraryType::kPlain};
    // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
-    framework::DataLayout layout_ = framework::DataLayout::kAnyLayout;
+    framework::LibraryType library = framework::LibraryType::kPlain;
+    framework::DataLayout layout = framework::DataLayout::kAnyLayout;
 #ifdef PADDLE_WITH_MKLDNN
-    if (library_ == framework::LibraryType::kPlain &&
+    if (library == framework::LibraryType::kPlain &&
        platform::CanMKLDNNBeUsed(ctx)) {
-      library_ = framework::LibraryType::kMKLDNN;
+      library = framework::LibraryType::kMKLDNN;
-      layout_ = framework::DataLayout::kMKLDNN;
+      layout = framework::DataLayout::kMKLDNN;
    }
 #endif
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
-        layout_, library_);
+        layout, library);
  }
 };

--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
@@ -18,6 +18,17 @@
 namespace paddle {
 namespace operators {
+using conv_bwd_data = mkldnn::convolution_backward_data;
+using conv_bwd_weights = mkldnn::convolution_backward_weights;
+using conv_fwd = mkldnn::convolution_forward;
+using framework::DataLayout;
+using mkldnn::memory;
+using mkldnn::primitive;
+using mkldnn::reorder;
+using mkldnn::stream;
+using platform::to_void_cast;
+using platform::GetMKLDNNFormat;
 template <typename T>
 class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
 public:
@@ -25,6 +36,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
                   "It must use CPUPlace.");
+    // Get unique name for index
+    const std::string key = ctx.op().Output("Output");
+    const std::string key_conv_pd = key + "@conv_pd";
    auto& dev_ctx =
        ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
    const auto& mkldnn_engine = dev_ctx.GetEngine();
@@ -33,10 +48,12 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    auto* filter = ctx.Input<Tensor>("Filter");
    auto* output = ctx.Output<Tensor>("Output");
-    // Get an unique name from "argument" name of "Output" variable
+    PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN &&
-    // This name will be used as key when saving info into device context
+                       input->format() != memory::format::format_undef,
-    const std::string key = ctx.op().Output("Output");
+                   "Wrong layout/format set for Input tensor");
-    const std::string key_conv_pd = key + "@conv_pd";
+    PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN &&
+                       filter->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Filter tensor");
    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
@@ -63,60 +80,86 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
        paddle::framework::vectorize2int(filter->dims());
    std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
-    // TODO(pzelazko-intel): support more formats
+    // create mkldnn memory from input tensors (data/weights)
-    auto src_md = platform::MKLDNNMemDesc(
+    auto user_src_memory = memory(
-        src_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
+        {{{src_tz}, memory::data_type::f32, input->format()}, mkldnn_engine},
-    auto weights_md =
+        to_void_cast(input_data));
-        platform::MKLDNNMemDesc(weights_tz, mkldnn::memory::data_type::f32,
+    auto user_weights_memory =
-                                mkldnn::memory::format::oihw);
+        memory({{{weights_tz}, memory::data_type::f32, filter->format()},
-    auto dst_md = platform::MKLDNNMemDesc(
+                mkldnn_engine},
-        dst_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
+               to_void_cast(filter_data));
-    auto src_memory =
+    /* create memory descriptor for convolution without specified format
-        mkldnn::memory({src_md, mkldnn_engine},
+     * ('any') which lets a primitive (convolution in this case) choose
-                       reinterpret_cast<void*>(const_cast<T*>(input_data)));
+     * the memory format preferred for best performance
-    auto weights_memory =
+     */
-        mkldnn::memory({weights_md, mkldnn_engine},
+    auto src_md = platform::MKLDNNMemDesc(src_tz, memory::data_type::f32,
-                       reinterpret_cast<void*>(const_cast<T*>(filter_data)));
+                                          memory::format::any);
-    auto dst_memory = mkldnn::memory({dst_md, mkldnn_engine}, output_data);
+    auto weights_md = platform::MKLDNNMemDesc(
+        weights_tz, memory::data_type::f32, memory::format::any);
-    std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd =
+    auto dst_md = platform::MKLDNNMemDesc(dst_tz, memory::data_type::f32,
-        ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings,
+                                          memory::format::any);
-                             mkldnn_engine);
+    // create a conv primitive descriptor and save it for usage in backward
-    // save conv_pd into global device context to be referred in backward path
+    std::shared_ptr<conv_fwd::primitive_desc> conv_pd = ConvFwdPrimitiveDesc(
-    dev_ctx.SetBlob(key_conv_pd, conv_pd);
+        src_md, weights_md, dst_md, strides, paddings, mkldnn_engine);
+    // create reorder primitive if the input format is not the preferred one
+    auto src_memory = user_src_memory;
+    primitive reorder_src;
+    bool is_src_reordered = false;
+    if (memory::primitive_desc(conv_pd->src_primitive_desc()) !=
+        user_src_memory.get_primitive_desc()) {
+      src_memory = memory(conv_pd->src_primitive_desc());
+      reorder_src = reorder(user_src_memory, src_memory);
+      is_src_reordered = true;
+    }
+    auto weights_memory = user_weights_memory;
+    primitive reorder_weights;
+    bool is_weights_reordered = false;
+    if (memory::primitive_desc(conv_pd->weights_primitive_desc()) !=
+        user_weights_memory.get_primitive_desc()) {
+      weights_memory = memory(conv_pd->weights_primitive_desc());
+      reorder_weights = reorder(user_weights_memory, weights_memory);
+      is_weights_reordered = true;
+    }
+    // create memory primitive for conv dst
+    auto dst_memory = memory(conv_pd->dst_primitive_desc(), output_data);
    // create convolution op primitive
-    auto conv_prim = mkldnn::convolution_forward(*conv_pd, src_memory,
+    auto conv_prim = conv_fwd(*conv_pd, src_memory, weights_memory, dst_memory);
-                                                 weights_memory, dst_memory);
    // push primitive to stream and wait until it's executed
-    std::vector<mkldnn::primitive> pipeline{conv_prim};
+    std::vector<primitive> pipeline;
-    mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+    if (is_src_reordered) pipeline.push_back(reorder_src);
+    if (is_weights_reordered) pipeline.push_back(reorder_weights);
+    pipeline.push_back(conv_prim);
+    stream(stream::kind::eager).submit(pipeline).wait();
+    // Save conv_pd in the device context for the backward pass
+    dev_ctx.SetBlob(key_conv_pd, conv_pd);
+    output->set_layout(DataLayout::kMKLDNN);
+    output->set_format(GetMKLDNNFormat(dst_memory));
  }
 private:
-  std::unique_ptr<mkldnn::convolution_forward::primitive_desc>
+  std::unique_ptr<conv_fwd::primitive_desc> ConvFwdPrimitiveDesc(
-  ConvFwdPrimitiveDesc(const mkldnn::memory::desc& src,
+      const memory::desc& src, const memory::desc& weights,
-                       const mkldnn::memory::desc& weights,
+      const memory::desc& dst, const std::vector<int>& strides,
-                       const mkldnn::memory::desc& dst,
+      const std::vector<int>& paddings, const mkldnn::engine& engine) const {
-                       const std::vector<int>& strides,
+    memory::dims stride_dims = {strides[0], strides[1]};
-                       const std::vector<int>& paddings,
+    memory::dims padding_dims = {paddings[0], paddings[1]};
-                       const mkldnn::engine& engine) const {
-    mkldnn::memory::dims stride_dims = {strides[0], strides[1]};
+    auto conv_desc =
-    mkldnn::memory::dims padding_dims = {paddings[0], paddings[1]};
+        conv_fwd::desc(mkldnn::prop_kind::forward, mkldnn::convolution_direct,
+                       src, weights, dst, stride_dims, padding_dims,
-    auto conv_desc = mkldnn::convolution_forward::desc(
+                       padding_dims, mkldnn::padding_kind::zero);
-        mkldnn::prop_kind::forward, mkldnn::convolution_direct, src, weights,
-        dst, stride_dims, padding_dims, padding_dims,
+    auto p_conv_pd = new conv_fwd::primitive_desc(conv_desc, engine);
-        mkldnn::padding_kind::zero);
+    return std::unique_ptr<conv_fwd::primitive_desc>(p_conv_pd);
-    auto p_conv_pd =
-        new mkldnn::convolution_forward::primitive_desc(conv_desc, engine);
-    return std::unique_ptr<mkldnn::convolution_forward::primitive_desc>(
-        p_conv_pd);
  }
 };
@@ -139,6 +182,19 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
    Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
    Tensor* filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
+    PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN &&
+                       input->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Input tensor");
+    PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN &&
+                       filter->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Filter tensor");
+    PADDLE_ENFORCE(output->layout() == DataLayout::kMKLDNN &&
+                       output->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Output tensor");
+    PADDLE_ENFORCE(output_grad->layout() == DataLayout::kMKLDNN &&
+                       output_grad->format() != memory::format::format_undef,
+                   "Wrong layout/format set for output_grad tensor");
    if (!input_grad && !filter_grad) return;
    // Get an unique name from "argument" name of "Output" variable
@@ -167,108 +223,147 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
        paddle::framework::vectorize2int(filter->dims());
    std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
-    // TODO(pzelazko-intel): support more formats
+    // create mkldnn memory from input tensors (input/weights/output_grad)
-    auto src_md = platform::MKLDNNMemDesc(
+    auto user_src_memory = memory(
-        src_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
+        {{{src_tz}, memory::data_type::f32, input->format()}, mkldnn_engine},
-    auto diff_src_md = platform::MKLDNNMemDesc(
+        to_void_cast(input_data));
-        src_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
+    auto user_weights_memory =
-    auto weights_md =
+        memory({{{weights_tz}, memory::data_type::f32, filter->format()},
-        platform::MKLDNNMemDesc(weights_tz, mkldnn::memory::data_type::f32,
+                mkldnn_engine},
-                                mkldnn::memory::format::oihw);
+               to_void_cast(filter_data));
-    auto diff_weights_md =
+    auto user_diff_dst_memory =
-        platform::MKLDNNMemDesc(weights_tz, mkldnn::memory::data_type::f32,
+        memory({{{dst_tz}, memory::data_type::f32, output_grad->format()},
-                                mkldnn::memory::format::oihw);
+                mkldnn_engine},
-    auto diff_dst_md = platform::MKLDNNMemDesc(
+               to_void_cast(output_grad_data));
-        dst_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
+    /* create memory descriptor for conv backward without specified format
-    // create memory
+     * ('any') which lets a primitive (conv backward in this case) choose
-    auto diff_dst_memory = mkldnn::memory(
+     * the memory format preferred for best performance
-        {diff_weights_md, mkldnn_engine},
+     */
-        reinterpret_cast<void*>(const_cast<T*>(output_grad_data)));
+    auto src_md = platform::MKLDNNMemDesc(src_tz, memory::data_type::f32,
+                                          memory::format::any);
+    auto diff_src_md = platform::MKLDNNMemDesc(src_tz, memory::data_type::f32,
+                                               memory::format::any);
+    auto weights_md = platform::MKLDNNMemDesc(
+        weights_tz, memory::data_type::f32, memory::format::any);
+    auto diff_weights_md = platform::MKLDNNMemDesc(
+        weights_tz, memory::data_type::f32, memory::format::any);
+    auto diff_dst_md = platform::MKLDNNMemDesc(dst_tz, memory::data_type::f32,
+                                               memory::format::any);
    // Retrieve conv_pd from device context
-    auto conv_pd =
+    auto conv_pd = std::static_pointer_cast<conv_fwd::primitive_desc>(
-        std::static_pointer_cast<mkldnn::convolution_forward::primitive_desc>(
+        dev_ctx.GetBlob(key_conv_pd));
-            dev_ctx.GetBlob(key_conv_pd));
    PADDLE_ENFORCE(conv_pd != nullptr,
                   "Fail to find conv_pd in device context");
    // create backward conv primitive for weights
    if (filter_grad) {
-      // create primitive descriptor
+      // create backward convolution primitive descriptor
-      mkldnn::convolution_backward_weights::primitive_desc conv_bwd_weights_pd =
+      auto conv_bwd_weights_desc = conv_bwd_weights::desc(
-          ConvBwdWeightsPrimitiveDesc(src_md, diff_weights_md, diff_dst_md,
+          mkldnn::convolution_direct, src_md, diff_weights_md, diff_dst_md,
-                                      strides, paddings, *conv_pd,
+          strides, paddings, paddings, mkldnn::padding_kind::zero);
-                                      mkldnn_engine);
+      auto conv_bwd_weights_pd = conv_bwd_weights::primitive_desc(
+          conv_bwd_weights_desc, mkldnn_engine, *conv_pd);
-      // create memory
+      // create reorder primitive if the input format is not the preferred one
+      auto src_memory = user_src_memory;
+      primitive reorder_src;
+      bool is_src_reordered = false;
+      if (memory::primitive_desc(conv_bwd_weights_pd.src_primitive_desc()) !=
+          user_src_memory.get_primitive_desc()) {
+        src_memory = memory(conv_bwd_weights_pd.src_primitive_desc());
+        reorder_src = reorder(user_src_memory, src_memory);
+        is_src_reordered = true;
+      }
+      auto diff_dst_memory_4filter = user_diff_dst_memory;
+      primitive reorder_diff_dst_4filter;
+      bool is_diff_dst_reordered_4filter = false;
+      if (memory::primitive_desc(
+              conv_bwd_weights_pd.diff_dst_primitive_desc()) !=
+          user_diff_dst_memory.get_primitive_desc()) {
+        diff_dst_memory_4filter =
+            memory(conv_bwd_weights_pd.diff_dst_primitive_desc());
+        reorder_diff_dst_4filter =
+            reorder(user_diff_dst_memory, diff_dst_memory_4filter);
+        is_diff_dst_reordered_4filter = true;
+      }
+      // create mkldnn memory for output (i.e. diff weights)
      auto diff_weights_memory =
-          mkldnn::memory({diff_weights_md, mkldnn_engine},
+          memory(conv_bwd_weights_pd.diff_weights_primitive_desc(),
-                         reinterpret_cast<void*>(filter_grad_data));
+                 reinterpret_cast<void*>(filter_grad_data));
-      auto src_memory =
-          mkldnn::memory({src_md, mkldnn_engine},
-                         reinterpret_cast<void*>(const_cast<T*>(input_data)));
      // create backward conv primitive for weights
-      auto conv_bwd_weights_prim = mkldnn::convolution_backward_weights(
+      auto conv_bwd_weights_prim =
-          conv_bwd_weights_pd, src_memory, diff_dst_memory,
+          conv_bwd_weights(conv_bwd_weights_pd, src_memory,
-          diff_weights_memory);
+                           diff_dst_memory_4filter, diff_weights_memory);
      // push primitive and execute it
-      std::vector<mkldnn::primitive> pipeline{conv_bwd_weights_prim};
+      std::vector<primitive> pipeline;
-      mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+      if (is_src_reordered) pipeline.push_back(reorder_src);
+      if (is_diff_dst_reordered_4filter)
+        pipeline.push_back(reorder_diff_dst_4filter);
+      pipeline.push_back(conv_bwd_weights_prim);
+      stream(stream::kind::eager).submit(pipeline).wait();
+      filter_grad->set_layout(DataLayout::kMKLDNN);
+      filter_grad->set_format(GetMKLDNNFormat(diff_weights_memory));
    }
    if (input_grad) {
-      // create primitive descriptor
+      // create backward convolution primitive descriptor
-      mkldnn::convolution_backward_data::primitive_desc conv_bwd_data_pd =
+      auto conv_bwd_data_desc = conv_bwd_data::desc(
-          ConvBwdDataPrimitiveDesc(diff_src_md, weights_md, diff_dst_md,
+          mkldnn::convolution_direct, diff_src_md, weights_md, diff_dst_md,
-                                   strides, paddings, *conv_pd, mkldnn_engine);
+          strides, paddings, paddings, mkldnn::padding_kind::zero);
+      auto conv_bwd_data_pd = conv_bwd_data::primitive_desc(
-      // create memory
+          conv_bwd_data_desc, mkldnn_engine, *conv_pd);
-      auto diff_src_memory = mkldnn::memory(
-          {diff_src_md, mkldnn_engine},
+      // create reorder primitive if the input format is not the preferred one
-          reinterpret_cast<void*>(const_cast<T*>(input_grad_data)));
+      auto weights_memory = user_weights_memory;
-      auto weights_memory =
+      primitive reorder_weights;
-          mkldnn::memory({weights_md, mkldnn_engine},
+      bool is_weights_reordered = false;
-                         reinterpret_cast<void*>(const_cast<T*>(filter_data)));
+      if (memory::primitive_desc(conv_bwd_data_pd.weights_primitive_desc()) !=
+          user_weights_memory.get_primitive_desc()) {
+        weights_memory = memory(conv_bwd_data_pd.weights_primitive_desc());
+        reorder_weights = reorder(user_weights_memory, weights_memory);
+        is_weights_reordered = true;
+      }
+      auto diff_dst_memory_4data = user_diff_dst_memory;
+      primitive reorder_diff_dst_4data;
+      bool is_diff_dst_reordered_4data = false;
+      if (memory::primitive_desc(conv_bwd_data_pd.diff_dst_primitive_desc()) !=
+          user_diff_dst_memory.get_primitive_desc()) {
+        diff_dst_memory_4data =
+            memory(conv_bwd_data_pd.diff_dst_primitive_desc());
+        reorder_diff_dst_4data =
+            reorder(user_diff_dst_memory, diff_dst_memory_4data);
+        is_diff_dst_reordered_4data = true;
+      }
+      // create mkldnn memory for output (i.e. diff src)
+      auto diff_src_memory = memory(conv_bwd_data_pd.diff_src_primitive_desc(),
+                                    reinterpret_cast<void*>(input_grad_data));
      // create backward conv primitive for data
-      auto conv_bwd_data_prim = mkldnn::convolution_backward_data(
+      auto conv_bwd_data_prim =
-          conv_bwd_data_pd, diff_dst_memory, weights_memory, diff_src_memory);
+          conv_bwd_data(conv_bwd_data_pd, diff_dst_memory_4data, weights_memory,
+                        diff_src_memory);
-      // push primitive to stream and wait until it's executed
+      // push primitive and execute it
-      std::vector<mkldnn::primitive> pipeline{conv_bwd_data_prim};
+      std::vector<primitive> pipeline;
-      mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+      if (is_weights_reordered) pipeline.push_back(reorder_weights);
+      if (is_diff_dst_reordered_4data)
+        pipeline.push_back(reorder_diff_dst_4data);
+      pipeline.push_back(conv_bwd_data_prim);
+      stream(stream::kind::eager).submit(pipeline).wait();
+      input_grad->set_layout(DataLayout::kMKLDNN);
+      input_grad->set_format(GetMKLDNNFormat(diff_src_memory));
    }
  }  // Compute()
- private:
-  mkldnn::convolution_backward_weights::primitive_desc
-  ConvBwdWeightsPrimitiveDesc(
-      const mkldnn::memory::desc& src, const mkldnn::memory::desc& diff_weights,
-      const mkldnn::memory::desc& diff_dst, const std::vector<int>& strides,
-      const std::vector<int>& paddings,
-      const mkldnn::convolution_forward::primitive_desc& conv_pd,
-      const mkldnn::engine& engine) const {
-    auto conv_bwd_weights_desc = mkldnn::convolution_backward_weights::desc(
-        mkldnn::convolution_direct, src, diff_weights, diff_dst, strides,
-        paddings, paddings, mkldnn::padding_kind::zero);
-    return mkldnn::convolution_backward_weights::primitive_desc(
-        conv_bwd_weights_desc, engine, conv_pd);
-  }
-  mkldnn::convolution_backward_data::primitive_desc ConvBwdDataPrimitiveDesc(
-      const mkldnn::memory::desc& diff_src, const mkldnn::memory::desc& weights,
-      const mkldnn::memory::desc& diff_dst, const std::vector<int>& strides,
-      const std::vector<int>& paddings,
-      const mkldnn::convolution_forward::primitive_desc& conv_pd,
-      const mkldnn::engine& engine) const {
-    auto conv_bwd_data_desc = mkldnn::convolution_backward_data::desc(
-        mkldnn::convolution_direct, diff_src, weights, diff_dst, strides,
-        paddings, paddings, mkldnn::padding_kind::zero);
-    return mkldnn::convolution_backward_data::primitive_desc(conv_bwd_data_desc,
-                                                             engine, conv_pd);
-  }
 };
 }  // namespace operators

--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -75,9 +75,8 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
 framework::OpKernelType ConvOp::GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const {
  framework::LibraryType library{framework::LibraryType::kPlain};
-  std::string data_format = ctx.Attr<std::string>("data_format");
  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+  std::string data_format = ctx.Attr<std::string>("data_format");
  framework::DataLayout layout = framework::StringToDataLayout(data_format);
 #ifdef PADDLE_WITH_CUDA

--- a/paddle/fluid/operators/detail/grpc_client.cc
+++ b/paddle/fluid/operators/detail/grpc_client.cc
@@ -34,6 +34,12 @@ void GRPCClient::InitEventLoop() {
  client_thread_.reset(new std::thread(std::bind(&GRPCClient::Proceed, this)));
 }
+void GRPCClient::SendComplete() {
+  for (auto& it : channels_) {
+    this->AsyncSendComplete(it.first);
+  }
+}
 GRPCClient::~GRPCClient() {
  Wait();
  cq_.Shutdown();
@@ -210,6 +216,19 @@ void GRPCClient::AsyncSendFetchBarrier(const std::string& ep,
  req_count_++;
 }
+void GRPCClient::AsyncSendComplete(const std::string& ep, int64_t time_out) {
+  const auto ch = GetChannel(ep);
+  BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
+  s->Prepare(time_out);
+  sendrecv::VariableMessage req;
+  req.set_varname(COMPLETE_MESSAGE);
+  auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
+  rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
+  req_count_++;
+}
 void GRPCClient::Wait() {
  std::unique_lock<std::mutex> lk(sync_mutex_);
  sync_cond_.wait(lk, [this] { return req_count_ == 0; });

--- a/paddle/fluid/operators/detail/grpc_client.h
+++ b/paddle/fluid/operators/detail/grpc_client.h
@@ -195,6 +195,8 @@ class GRPCClient : public RPCClient {
  void Wait() override;
+  void SendComplete() override;
 protected:
  void InitImpl() override;
@@ -204,6 +206,9 @@ class GRPCClient : public RPCClient {
  void Proceed();
+  void AsyncSendComplete(const std::string& ep,
+                         int64_t time_out = RPCClient::rpc_time_out);
  std::shared_ptr<grpc::Channel> GetChannel(const std::string& ep);
 private:

--- a/paddle/fluid/operators/detail/grpc_server.cc
+++ b/paddle/fluid/operators/detail/grpc_server.cc
@@ -162,16 +162,18 @@ class RequestPrefetch final : public RequestBase {
  void Process() override {
    // prefetch process...
-    std::string varname = request_->OutVarname();
+    std::string in_var_name = request_->Varname();
-    VLOG(3) << "RequestPrefetch " << varname;
+    std::string out_var_name = request_->OutVarname();
+    VLOG(3) << "RequestPrefetch, in_var_name: " << in_var_name
+            << " out_var_name: " << out_var_name;
    auto scope = request_->GetMutableLocalScope();
-    auto invar = scope->FindVar(varname);
+    auto invar = scope->FindVar(in_var_name);
-    framework::Variable* outvar = nullptr;
+    framework::Variable* outvar = scope->FindVar(out_var_name);
-    request_handler_->Handle(varname, scope, invar, &outvar);
+    request_handler_->Handle(in_var_name, scope, invar, &outvar, out_var_name);
-    SerializeToByteBuffer(varname, outvar, *request_handler_->dev_ctx(),
+    SerializeToByteBuffer(out_var_name, outvar, *request_handler_->dev_ctx(),
                          &reply_);
    Finish(reply_, &responder_);
  }
@@ -287,7 +289,7 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name,
  } else if (rpc_name == kRequestPrefetch) {
    b = new RequestPrefetch(&service_, cq.get(), handler, req_id);
  } else {
-    PADDLE_ENFORCE(false, "not surpported rpc");
+    PADDLE_ENFORCE(false, "not supported rpc");
  }
  reqs[req_id] = b;

--- a/paddle/fluid/operators/detail/request_handler.h
+++ b/paddle/fluid/operators/detail/request_handler.h
@@ -40,6 +40,7 @@ constexpr char kRequestPrefetch[] = "RequestPrefetch";
 #define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV"
 #define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV"
 #define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV"
+#define COMPLETE_MESSAGE "COMPLETE@RECV"
 class RPCServer;
@@ -60,9 +61,12 @@ class RequestHandler {
  void SetDevCtx(const platform::DeviceContext* dev_ctx) { dev_ctx_ = dev_ctx; }
  void SetProgram(framework::ProgramDesc* program) { program_ = program; }
  void SetExecutor(framework::Executor* executor) { executor_ = executor; }
+  // Used for dist lookup table prefetch
  void SetPrefetchPreparedCtx(
-      std::unique_ptr<framework::ExecutorPrepareContext> prepared) {
+      std::unordered_map<
-    prefetch_ctx_.reset(prepared.release());
+          std::string, std::shared_ptr<framework::ExecutorPrepareContext>>* g) {
+    prefetch_var_name_to_prepared_ctx_ = g;
  }
  // Used for async.
@@ -78,9 +82,6 @@ class RequestHandler {
  bool sync_mode() { return sync_mode_; }
  framework::Scope* scope() { return scope_; }
  const platform::DeviceContext* dev_ctx() { return dev_ctx_; }
-  framework::ExecutorPrepareContext* prefetch_ctx() {
-    return prefetch_ctx_.get();
-  }
  framework::ProgramDesc* program() { return program_; }
  framework::Executor* executor() { return executor_; }
@@ -99,8 +100,8 @@ class RequestHandler {
  //           *request_handler_->dev_ctx(), &reply_);
  //    }
  virtual bool Handle(const std::string& varname, framework::Scope* scope,
-                      framework::Variable* var,
+                      framework::Variable* var, framework::Variable** outvar,
-                      framework::Variable** outvar) = 0;
+                      const std::string& out_var_name = "") = 0;
 protected:
  const bool sync_mode_;
@@ -109,12 +110,17 @@ class RequestHandler {
  framework::Executor* executor_;
  framework::Scope* scope_;
  framework::ProgramDesc* program_;
-  std::unique_ptr<framework::ExecutorPrepareContext> prefetch_ctx_;
+  // Used for distributed lookup table prefetch
+  std::unordered_map<std::string,
+                     std::shared_ptr<framework::ExecutorPrepareContext>>*
+      prefetch_var_name_to_prepared_ctx_;
  // Used for async.
  std::unordered_map<std::string,
                     std::shared_ptr<framework::ExecutorPrepareContext>>*
      grad_to_prepared_ctx_;
  RPCServer* rpc_server_;
 };

--- a/paddle/fluid/operators/detail/request_handler_impl.cc
+++ b/paddle/fluid/operators/detail/request_handler_impl.cc
@@ -30,7 +30,8 @@ namespace detail {
 bool RequestSendHandler::Handle(const std::string& varname,
                                framework::Scope* scope,
                                framework::Variable* invar,
-                                framework::Variable** outvar) {
+                                framework::Variable** outvar,
+                                const std::string& out_var_name) {
  VLOG(4) << "RequestSendHandler:" << varname;
  // Async
@@ -49,6 +50,9 @@ bool RequestSendHandler::Handle(const std::string& varname,
  if (varname == BATCH_BARRIER_MESSAGE) {
    VLOG(3) << "sync: recv batch barrier message";
    rpc_server_->IncreaseBatchBarrier(kRequestSend);
+  } else if (varname == COMPLETE_MESSAGE) {
+    VLOG(3) << "sync: recv complete message";
+    rpc_server_->DecreaseClientNum();
  } else {
    VLOG(3) << "sync: received var_name: " << varname;
    if (sync_mode_) {
@@ -79,7 +83,8 @@ void RequestSendHandler::ResetSparseVarRecorder() {
 bool RequestGetHandler::Handle(const std::string& varname,
                               framework::Scope* scope,
                               framework::Variable* invar,
-                               framework::Variable** outvar) {
+                               framework::Variable** outvar,
+                               const std::string& out_var_name) {
  VLOG(4) << "RequestGetHandler:" << varname;
  if (varname != FETCH_BARRIER_MESSAGE) {
@@ -102,13 +107,14 @@ bool RequestGetHandler::Handle(const std::string& varname,
 bool RequestPrefetchHandler::Handle(const std::string& varname,
                                    framework::Scope* scope,
                                    framework::Variable* invar,
-                                    framework::Variable** outvar) {
+                                    framework::Variable** outvar,
+                                    const std::string& out_var_name) {
  VLOG(4) << "RequestPrefetchHandler " << varname;
-  auto var_desc = program_->Block(0).FindVar(varname);
+  auto var_desc = program_->Block(0).FindVar(out_var_name);
-  *outvar = scope->FindVar(varname);
  InitializeVariable(*outvar, var_desc->GetType());
-  executor_->RunPreparedContext(prefetch_ctx_.get(), scope);
+  executor_->RunPreparedContext(
+      (*prefetch_var_name_to_prepared_ctx_)[varname].get(), scope);
  return true;
 }

--- a/paddle/fluid/operators/detail/request_handler_impl.h
+++ b/paddle/fluid/operators/detail/request_handler_impl.h
@@ -39,7 +39,8 @@ class RequestSendHandler final : public RequestHandler {
  explicit RequestSendHandler(bool sync_mode) : RequestHandler(sync_mode) {}
  virtual ~RequestSendHandler() {}
  bool Handle(const std::string& varname, framework::Scope* scope,
-              framework::Variable* var, framework::Variable** outvar) override;
+              framework::Variable* var, framework::Variable** outvar,
+              const std::string& out_var_name = "") override;
  void ResetSparseVarRecorder();
 private:
@@ -52,7 +53,8 @@ class RequestGetHandler final : public RequestHandler {
  explicit RequestGetHandler(bool sync_mode) : RequestHandler(sync_mode) {}
  virtual ~RequestGetHandler() {}
  bool Handle(const std::string& varname, framework::Scope* scope,
-              framework::Variable* var, framework::Variable** outvar) override;
+              framework::Variable* var, framework::Variable** outvar,
+              const std::string& out_var_name = "") override;
 };
 class RequestPrefetchHandler final : public RequestHandler {
@@ -60,7 +62,8 @@ class RequestPrefetchHandler final : public RequestHandler {
  explicit RequestPrefetchHandler(bool sync_mode) : RequestHandler(sync_mode) {}
  virtual ~RequestPrefetchHandler() {}
  bool Handle(const std::string& varname, framework::Scope* scope,
-              framework::Variable* var, framework::Variable** outvar) override;
+              framework::Variable* var, framework::Variable** outvar,
+              const std::string& out_var_name = "") override;
 };
 }  // namespace detail

--- a/paddle/fluid/operators/detail/rpc_client.h
+++ b/paddle/fluid/operators/detail/rpc_client.h
@@ -53,6 +53,11 @@ class RPCClient {
  virtual void AsyncSendFetchBarrier(const std::string& ep,
                                     int64_t time_out = rpc_time_out) = 0;
+  // SendComplete tells all the pservers that the current trainer has no more
+  // data to train, so that each pserver can reduce its barrier count and
+  // continue to train with the other trainers.
+  virtual void SendComplete() = 0;
  virtual void Wait() = 0;
  static constexpr int64_t rpc_time_out = 120 * 1000;

--- a/paddle/fluid/operators/detail/rpc_server.cc
+++ b/paddle/fluid/operators/detail/rpc_server.cc
@@ -43,7 +43,7 @@ void RPCServer::SavePort() const {
 void RPCServer::WaitBarrier(const std::string& rpc_name) {
  std::unique_lock<std::mutex> lock(this->mutex_);
-  barrier_cond_.wait(lock, [=] {
+  barrier_cond_.wait(lock, [this, &rpc_name] {
    return (barrier_counter_[rpc_name] >= client_num_ || exit_flag_.load());
  });
@@ -53,19 +53,23 @@ void RPCServer::WaitBarrier(const std::string& rpc_name) {
 void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) {
  VLOG(3) << "RPCServer begin IncreaseBatchBarrier " << rpc_name;
  int b = 0;
-  {
+  std::unique_lock<std::mutex> lock(mutex_);
-    std::unique_lock<std::mutex> lock(mutex_);
+  b = ++barrier_counter_[rpc_name];
-    b = ++barrier_counter_[rpc_name];
-  }
-  VLOG(3) << "RPCServer IncreaseBatchBarrier " << rpc_name
-          << ", barrier_count:" << b << ", fan_in" << client_num_;
  if (b >= client_num_) {
+    lock.unlock();
    barrier_cond_.notify_all();
+    lock.lock();
  }
 }
+void RPCServer::DecreaseClientNum() {
+  {
+    std::unique_lock<std::mutex> lock(mutex_);
+    client_num_--;
+  }
+  barrier_cond_.notify_all();
+}
 void RPCServer::ResetBarrierCounter() {
  VLOG(3) << "RPCServer ResetBarrierCounter ";
  std::unique_lock<std::mutex> lock(mutex_);

--- a/paddle/fluid/operators/detail/rpc_server.h
+++ b/paddle/fluid/operators/detail/rpc_server.h
@@ -60,7 +60,7 @@ class RPCServer {
  void SetCond(const std::string& rpc_name);
  void WaitCond(const std::string& rpc_name);
  void IncreaseBatchBarrier(const std::string rpc_name);
+  void DecreaseClientNum();
  void ResetBarrierCounter();
 protected:
@@ -79,8 +79,7 @@ class RPCServer {
  std::string bind_address_;
  std::atomic<int> exit_flag_;
  int selected_port_;
+  int client_num_;
-  const int client_num_;
  std::unordered_map<std::string, RequestHandler*> rpc_call_map_;
  std::unordered_map<std::string, int> rpc_thread_num_;

--- a/paddle/fluid/operators/detail/rpc_server_test.cc
+++ b/paddle/fluid/operators/detail/rpc_server_test.cc
@@ -98,11 +98,17 @@ void StartServer() {
  framework::Executor exe(place);
  platform::CPUDeviceContext ctx(place);
  auto* block = AppendPrefetchBlcok(&program);
-  auto prepared = exe.Prepare(program, block->ID());
+  std::string in_var_name("ids");
+  std::vector<int> prefetch_block_ids{block->ID()};
+  auto prepared = exe.Prepare(program, prefetch_block_ids);
  InitTensorsOnServer(&scope, &place, 10);
+  std::unordered_map<std::string,
+                     std::shared_ptr<framework::ExecutorPrepareContext>>
+      prefetch_var_name_to_prepared;
+  prefetch_var_name_to_prepared[in_var_name] = prepared[0];
  g_req_handler->SetProgram(&program);
-  g_req_handler->SetPrefetchPreparedCtx(std::move(prepared));
+  g_req_handler->SetPrefetchPreparedCtx(&prefetch_var_name_to_prepared);
  g_req_handler->SetDevCtx(&ctx);
  g_req_handler->SetScope(&scope);
  g_req_handler->SetExecutor(&exe);

--- a/paddle/fluid/operators/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise_op.h
@@ -66,40 +66,41 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
        .SetDefault(-1)
        .EqualGreaterThan(-1);
    AddComment(string::Sprintf(R"DOC(
-Limited Elementwise %s Operator.
+Limited Elementwise %s Operator
 The equation is:
 $$%s$$
-$X$ is a tensor of any dimension and the dimensions of tensor $Y$ must be
+- $X$: a tensor of any dimension. 
-smaller than or equal to the dimensions of $X$.
+- $Y$: a tensor whose dimensions must be less than or equal to the dimensions of $X$.
 There are two cases for this operator:
-1. The shape of $Y$ is same with $X$;
-2. The shape of $Y$ is a congiguous subsequencet of $X$. The trailing dimensions
-   of size 1 for $Y$ will be ignored for the consideration of subsequence.
+1. The shape of $Y$ is the same with $X$.
+2. The shape of $Y$ is a continuous subsequence of $X$.
 For case 2:
-$Y$ will be broadcasted to match the shape of $X$ and axis should be
+1. Broadcast $Y$ to match the shape of $X$, where $axis$ is the start dimension index 
-set to index of the start dimension to broadcast $Y$ onto $X$.
+   for broadcasting $Y$ onto $X$. 
+2. If $axis$ is -1 (default), $axis = rank(X) - rank(Y)$.
+3. The trailing dimensions of size 1 for $Y$ will be ignored for the consideration of 
+   subsequence, such as shape(Y) = (2, 1) => (2).
-If axis is -1, it is treated as axis=rank(X)-rank(Y).
+For example:
-For example
  .. code-block:: python
    shape(X) = (2, 3, 4, 5), shape(Y) = (,)
    shape(X) = (2, 3, 4, 5), shape(Y) = (5,)
-    shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5)
+    shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5), with axis=-1(default) or axis=2
    shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
    shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0
    shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1), with axis=0
-Either of the inputs $X$ and $Y$ or none can carry the LoD (Level of Details)
+The inputs $X$ and $Y$ can carry the different LoD information. 
-information. However, the output only shares the LoD information with input $X$.
+But the output only shares the LoD information with the input $X$.
 )DOC",
                               GetName(), GetEquation()));

--- a/paddle/fluid/operators/gen_nccl_id_op.cc
+++ b/paddle/fluid/operators/gen_nccl_id_op.cc
@@ -67,6 +67,10 @@ class GenNCCLIdOp : public framework::OperatorBase {
      client->AsyncSendVar(ep, dev_ctx, *scope, NCCL_ID_VARNAME);
    }
    client->Wait();
+    for (auto& ep : endpoint_list) {
+      client->AsyncSendBatchBarrier(ep);
+    }
+    client->Wait();
    VLOG(3) << "sending completed...";
  }

--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -96,19 +96,22 @@ static int64_t GetTimestamp() {
  return tp.tv_sec * 1000 + tp.tv_usec / 1000;
 }
-void ListenAndServOp::RunSyncLoop(framework::Executor *executor,
+void ListenAndServOp::RunSyncLoop(
-                                  framework::ProgramDesc *program,
+    framework::Executor *executor, framework::ProgramDesc *program,
-                                  framework::Scope *recv_scope,
+    framework::Scope *recv_scope,
-                                  framework::BlockDesc *prefetch_block) const {
+    const std::vector<int> &prefetch_block_id_list) const {
  size_t num_blocks = program->Size();
  PADDLE_ENFORCE_GE(num_blocks, 2,
                    "server program should have at least 2 blocks");
-  std::vector<int> block_list;
+  std::vector<int> optimize_block_id_list;
-  for (size_t blkid = 1; blkid < num_blocks; ++blkid) {
+  for (int blkid = 1; blkid < num_blocks; ++blkid) {
-    block_list.push_back(blkid);
+    if (std::find(prefetch_block_id_list.begin(), prefetch_block_id_list.end(),
+                  blkid) == prefetch_block_id_list.end()) {
+      optimize_block_id_list.push_back(blkid);
+    }
  }
-  auto optimize_prepared = executor->Prepare(*program, block_list);
+  auto optimize_prepared = executor->Prepare(*program, optimize_block_id_list);
  // Insert placeholder for block0 which holds current op itself.
  optimize_prepared.insert(
      optimize_prepared.begin(),
@@ -135,16 +138,17 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor,
    std::vector<size_t> parallel_blkids;
    parallel_blkids.push_back(1);
    double ts = GetTimestamp();
-    for (size_t blkid = 2; blkid < num_blocks; ++blkid) {
+    for (size_t i = 1; i < optimize_block_id_list.size(); ++i) {
-      if (blkid != static_cast<size_t>(prefetch_block->ID())) {
+      // skip the first optimize block because it is already in the
-        if (program->Block(blkid).Parent() != last_parent_blkid) {
+      // parallel_blkids.
-          ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared,
+      int blkid = optimize_block_id_list[i];
-                                program, recv_scope);
+      if (program->Block(blkid).Parent() != last_parent_blkid) {
-          parallel_blkids.clear();
+        ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared,
-          last_parent_blkid = program->Block(blkid).Parent();
+                              program, recv_scope);
-        }
+        parallel_blkids.clear();
-        parallel_blkids.push_back(blkid);
+        last_parent_blkid = program->Block(blkid).Parent();
      }
+      parallel_blkids.push_back(blkid);
    }
    ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared, program,
                          recv_scope);
@@ -210,18 +214,19 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
  }  // while(true)
 }
+// Hands the shared server-side context to one request handler: scope, device
+// context, executor, program, the map of prepared prefetch contexts and the
+// RPC server. prefetch_ctx is passed through as a raw pointer.
-static void FillRequestCtx(detail::RequestHandler *h, framework::Scope *scope,
+static void FillRequestCtx(
-                           platform::DeviceContext *dev_ctx,
+    detail::RequestHandler *h, framework::Scope *scope,
-                           framework::Executor *executor,
+    platform::DeviceContext *dev_ctx, framework::Executor *executor,
-                           framework::ProgramDesc *program,
+    framework::ProgramDesc *program,
-                           framework::ExecutorPrepareContext *prefetch_ctx,
+    std::unordered_map<std::string,
-                           detail::RPCServer *rpc_server) {
+                       std::shared_ptr<framework::ExecutorPrepareContext>>
+        *prefetch_ctx,
+    detail::RPCServer *rpc_server) {
  h->SetScope(scope);
  h->SetDevCtx(dev_ctx);
  h->SetExecutor(executor);
  h->SetProgram(program);
-  h->SetPrefetchPreparedCtx(
+  h->SetPrefetchPreparedCtx(prefetch_ctx);
-      std::unique_ptr<framework::ExecutorPrepareContext>(prefetch_ctx));
  h->SetRPCServer(rpc_server);
 }
@@ -255,17 +260,42 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
                            request_prefetch_handler_.get());
  auto *optimize_block = Attr<framework::BlockDesc *>(kOptimizeBlock);
-  auto *prefetch_block = Attr<framework::BlockDesc *>(kPrefetchBlock);
  auto *program = optimize_block->Program();
  framework::Executor executor(dev_place);
  // prepare for prefetch
-  VLOG(3) << "prefetch block id is " << prefetch_block->ID();
+  std::vector<int> prefetch_block_id_list;
-  auto prefetch_prepared = executor.Prepare(*program, prefetch_block->ID());
+  std::unordered_map<int, std::string> block_id_to_prefetch_var_name;
+  auto prefetch_var_name_to_block_id_str =
+      Attr<std::vector<std::string>>(kPrefetchVarNameToBlockId);
+  for (const auto &prefetch_var_name_and_id :
+       prefetch_var_name_to_block_id_str) {
+    // Each attribute entry is split on ':' into a variable name and a block id.
+    std::vector<std::string> pieces;
+    split(prefetch_var_name_and_id, ':', &pieces);
+    // Validate the piece count before indexing into pieces below.
+    PADDLE_ENFORCE_EQ(pieces.size(), 2);
+    VLOG(3) << "after split, prefetch_var = " << pieces[0]
+            << ", id=" << pieces[1];
+    int block_id = std::stoi(pieces[1]);
+    prefetch_block_id_list.push_back(block_id);
+    block_id_to_prefetch_var_name[block_id] = pieces[0];
+  }
+  auto prefetch_prepared = executor.Prepare(*program, prefetch_block_id_list);
+  std::unordered_map<std::string,
+                     std::shared_ptr<framework::ExecutorPrepareContext>>
+      prefetch_var_name_to_prepared_ctx;
+  for (size_t i = 0; i < prefetch_block_id_list.size(); ++i) {
+    auto block_id = prefetch_block_id_list[i];
+    auto prefetch_var_name = block_id_to_prefetch_var_name[block_id];
+    prefetch_var_name_to_prepared_ctx[prefetch_var_name] = prefetch_prepared[i];
+  }
  auto f = std::bind(FillRequestCtx, std::placeholders::_1, &recv_scope,
-                     &dev_ctx, &executor, program, prefetch_prepared.release(),
+                     &dev_ctx, &executor, program,
-                     rpc_service_.get());
+                     &prefetch_var_name_to_prepared_ctx, rpc_service_.get());
  f(request_send_handler_.get());
  f(request_get_handler_.get());
@@ -283,7 +313,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
  // Write to a file of server selected port for python use.
  SavePort();
  if (sync_mode) {
-    RunSyncLoop(&executor, program, &recv_scope, prefetch_block);
+    RunSyncLoop(&executor, program, &recv_scope, prefetch_block_id_list);
  } else {
    RunAsyncLoop(&executor, program);
  }
@@ -309,8 +339,9 @@ class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<bool>("sync_mode", "if works at sync_mode or not").SetDefault(true);
    AddAttr<framework::BlockDesc *>(kOptimizeBlock,
                                    "BlockID to run on server side.");
-    AddAttr<framework::BlockDesc *>(kPrefetchBlock,
+    AddAttr<std::vector<std::string>>(kPrefetchVarNameToBlockId,
-                                    "prefetch block to run on server side.");
+                                      "prefetch blocks to run on server side.")
+        .SetDefault({});
    AddAttr<int>("Fanin", "How many clients send to this server.")
        .SetDefault(1);
  }

--- a/paddle/fluid/operators/listen_and_serv_op.h
+++ b/paddle/fluid/operators/listen_and_serv_op.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <atomic>
 #include <set>
 #include <string>
+#include <vector>
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/lod_tensor.h"
@@ -30,7 +31,7 @@ namespace paddle {
 namespace operators {
 constexpr char kOptimizeBlock[] = "OptimizeBlock";
-constexpr char kPrefetchBlock[] = "PrefetchBlock";
+constexpr char kPrefetchVarNameToBlockId[] = "prefetch_var_name_to_block_id";
 void RunServer(std::shared_ptr<detail::RPCServer> service);
@@ -46,7 +47,7 @@ class ListenAndServOp : public framework::OperatorBase {
  void RunSyncLoop(framework::Executor* executor,
                   framework::ProgramDesc* program,
                   framework::Scope* recv_scope,
-                   framework::BlockDesc* prefetch_block) const;
+                   const std::vector<int>& prefetch_block_id_list) const;
  void RunAsyncLoop(framework::Executor* executor,
                    framework::ProgramDesc* program) const;

--- a/paddle/fluid/operators/mean_iou_op.cc
+++ b/paddle/fluid/operators/mean_iou_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/mean_iou_op.h"
+namespace paddle {
+namespace operators {
+// Operator definition for mean_iou: verifies that the required inputs and
+// outputs exist and sets the output shapes.
+class MeanIoUOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Requires Predictions, Labels and all three outputs. OutMeanIou gets shape
+  // {1}; OutWrong and OutCorrect get shape {num_classes}.
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Predictions"),
+                   "Input (Predictions) of MeanIoU op should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Labels"),
+                   "Input (Labels) of MeanIoU op should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("OutMeanIou"),
+                   "Output (OutMeanIou) of MeanIoU op should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("OutWrong"),
+                   "Output (OutWrong) of MeanIoU op should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("OutCorrect"),
+                   "Output (OutCorrect) of MeanIoU op should not be null.");
+    int64_t num_classes =
+        static_cast<int64_t>(ctx->Attrs().Get<int>("num_classes"));
+    ctx->SetOutputDim("OutMeanIou", {1});
+    ctx->SetOutputDim("OutWrong", {num_classes});
+    ctx->SetOutputDim("OutCorrect", {num_classes});
+  }
+ protected:
+  // The kernel data type follows the element type of Predictions.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("Predictions")->type()),
+        ctx.GetPlace());
+  }
+};
+// Declares the inputs, outputs, attributes and user-facing documentation of
+// the mean_iou operator.
+class MeanIoUOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Predictions",
+             "(Tensor), A Tensor of prediction results for semantic labels"
+             " with type int32 or int64. The rank should be greater than 1.");
+    AddInput(
+        "Labels",
+        "(Tensor), A Tensor of ground truth labels with type int32 or int64."
+        " Its shape should be the same as Input(Predictions).");
+    // The three In* inputs are optional lists carrying state from earlier
+    // batches.
+    AddInput("InWrongs",
+             "(vector<Tensor>), A list of Tensor with shape "
+             "[num_classes]. They are used to collect wrong number among "
+             "batches. Empty list is also valid here.")
+        .AsDuplicable()
+        .AsDispensable();
+    AddInput(
+        "InCorrects",
+        "(vector<Tensor>), A list of Tensor with shape "
+        "[num_classes]. They are used to collect correct number among batches. "
+        "Empty list is also valid here.")
+        .AsDuplicable()
+        .AsDispensable();
+    AddInput("InMeanIou",
+             "(vector<Tensor>), A list of Tensor that Output(mean_iou) should "
+             "be added to. Empty list is also valid here.")
+        .AsDuplicable()
+        .AsDispensable();
+    AddOutput("OutMeanIou",
+              "(Tensor), A Tensor representing the"
+              " mean intersection-over-union with shape [1].");
+    AddOutput("OutWrong", "(Tensor), A Tensor with shape [num_classes]. ");
+    AddOutput("OutCorrect", "(Tensor), A Tensor with shape [num_classes]. ");
+    AddAttr<int>("num_classes", "(int), The possible number of labels.");
+    AddComment(R"DOC(
+mean-IOU Operator.
+Mean Intersection-Over-Union is a common evaluation metric for
+semantic image segmentation, which first computes the IOU for each
+semantic class and then computes the average over classes. 
+IOU is defined as follows: 
+    IOU = true_positive / (true_positive + false_positive + false_negative).
+It is based on pixel level area while "IOU Similarity Operator" 
+is based on area of rectangle.
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+// Registers the mean_iou operator with EmptyGradOpMaker as its grad maker.
+REGISTER_OPERATOR(mean_iou, ops::MeanIoUOp, ops::MeanIoUOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+// CPU kernels for int, int32_t and int64_t element types.
+// NOTE(review): int and int32_t are presumably the same type on the supported
+// platforms, which would make one of these entries redundant -- confirm.
+REGISTER_OP_CPU_KERNEL(mean_iou, ops::MeanIoUKernel<int>,
+                       ops::MeanIoUKernel<int32_t>,
+                       ops::MeanIoUKernel<int64_t>);
--- a/paddle/fluid/operators/mean_iou_op.cu
+++ b/paddle/fluid/operators/mean_iou_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/mean_iou_op.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/gpu_info.h"
+namespace paddle {
+namespace operators {
+using platform::PADDLE_CUDA_NUM_THREADS;
+// Loop header that walks i over [0, n): each thread starts at its global
+// thread index and advances by the total number of launched threads
+// (blockDim.x * gridDim.x).
+#define CUDA_1D_KERNEL_LOOP(i, n)                              \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+       i += blockDim.x * gridDim.x)
+// Counts per-class hits and misses for one batch. Every block accumulates into
+// a dynamically sized shared-memory scratch area laid out as
+// [wrong counts | correct counts] (num_classes ints each) and then adds its
+// partial counts to the `wrong` / `correct` arrays with atomicAdd.
+template <typename T>
+__global__ void CountCUDAKernel(const int num_classes, const int count,
+                                const T* predictions, const T* labels,
+                                int* wrong, int* correct) {
+  extern __shared__ int block_cache[];
+  int* block_wrong = block_cache;
+  int* block_correct = block_cache + num_classes;
+  // Zero both halves of the per-block scratch area.
+  for (int k = threadIdx.x; k < num_classes * 2; k += blockDim.x) {
+    block_cache[k] = 0;
+  }
+  __syncthreads();
+  CUDA_1D_KERNEL_LOOP(i, count) {
+    const T pred = predictions[i];
+    const T label = labels[i];
+    if (pred != label) {
+      // A mismatch counts as wrong for both the predicted and the true class.
+      atomicAdd(block_wrong + pred, 1);
+      atomicAdd(block_wrong + label, 1);
+    } else {
+      atomicAdd(block_correct + pred, 1);
+    }
+  }
+  __syncthreads();
+  // Merge this block's partial counts into the output arrays.
+  for (int c = threadIdx.x; c < num_classes; c += blockDim.x) {
+    atomicAdd(wrong + c, block_wrong[c]);
+    atomicAdd(correct + c, block_correct[c]);
+  }
+}
+// Turns the per-class wrong/correct counts into per-class IoU values (written
+// to `ious`) and adds their mean over the classes that have at least one
+// sample to iou[0]. The caller in this file launches it with a single block.
+__global__ void ComputeIoUCUDAKernel(const int num_classes, int* wrong,
+                                     int* correct, float* ious, float* iou) {
+  __shared__ int valid_count_c;
+  if (threadIdx.x == 0) {
+    valid_count_c = 0;
+  }
+  __syncthreads();
+  CUDA_1D_KERNEL_LOOP(i, num_classes) {
+    int wrong_n = wrong[i];
+    int correct_n = correct[i];
+    int denominator = wrong_n + correct_n;
+    if (denominator > 0) {
+      atomicAdd(&valid_count_c, 1);
+      ious[i] = static_cast<float>(correct_n) / denominator;
+    } else {
+      ious[i] = 0;
+    }
+  }
+  __syncthreads();
+  if (threadIdx.x == 0) {
+    float iou_sum = 0;
+    for (int i = 0; i < num_classes; ++i) {
+      iou_sum += ious[i];
+    }
+    // With no valid class the sum is 0 as well; skip the division so the
+    // accumulated result is not turned into NaN by 0 / 0.
+    if (valid_count_c > 0) {
+      iou[0] += iou_sum / valid_count_c;
+    }
+  }
+}
+// CUDA kernel class for mean_iou. It seeds the outputs from the optional
+// In* inputs, counts per-class hits/misses with CountCUDAKernel and then
+// derives the mean IoU with ComputeIoUCUDAKernel.
+template <typename T>
+class MeanIoUCUDAOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& place = *ctx.template device_context<platform::CUDADeviceContext>()
+                       .eigen_device();
+    // get input and output tensor
+    auto* predictions = ctx.Input<Tensor>("Predictions");
+    auto* labels = ctx.Input<Tensor>("Labels");
+    auto* out_mean_iou = ctx.Output<Tensor>("OutMeanIou");
+    auto* out_wrong = ctx.Output<Tensor>("OutWrong");
+    auto* out_correct = ctx.Output<Tensor>("OutCorrect");
+    int num_classes = static_cast<int>(ctx.Attr<int>("num_classes"));
+    // Get data ptr
+    const T* predictions_data = predictions->data<T>();
+    const T* labels_data = labels->data<T>();
+    int* out_wrong_data = out_wrong->mutable_data<int>(ctx.GetPlace());
+    int* out_correct_data = out_correct->mutable_data<int>(ctx.GetPlace());
+    float* out_mean_iou_data =
+        out_mean_iou->mutable_data<float>(ctx.GetPlace());
+    // Get Eigen tensor
+    auto out_mean_iou_t = EigenTensor<float, 1>::From(*out_mean_iou);
+    auto out_wrong_t = EigenTensor<int, 1>::From(*out_wrong);
+    auto out_correct_t = EigenTensor<int, 1>::From(*out_correct);
+    // Temporary tensor holding one IoU value per class
+    Tensor ious;
+    float* ious_data = ious.mutable_data<float>(
+        {static_cast<int64_t>(num_classes)}, ctx.GetPlace());
+    auto ious_t = EigenTensor<float, 1>::From(ious);
+    // Init out_wrong, out_correct and out_mean_iou
+    out_wrong_t.device(place) = out_wrong_t.constant(0);
+    out_correct_t.device(place) = out_correct_t.constant(0);
+    out_mean_iou_t.device(place) = out_mean_iou_t.constant(0.0f);
+    // collect pre wrong, correct and mean_iou
+    // (size_t indices match the CPU kernel and avoid signed/unsigned compares)
+    auto in_mean_ious = ctx.MultiInput<Tensor>("InMeanIou");
+    for (size_t i = 0; i < in_mean_ious.size(); ++i) {
+      out_mean_iou_t.device(place) +=
+          EigenTensor<float, 1>::From(*in_mean_ious[i]);
+    }
+    auto in_wrongs = ctx.MultiInput<Tensor>("InWrongs");
+    for (size_t i = 0; i < in_wrongs.size(); ++i) {
+      out_wrong_t.device(place) += EigenTensor<int, 1>::From(*in_wrongs[i]);
+    }
+    auto in_corrects = ctx.MultiInput<Tensor>("InCorrects");
+    for (size_t i = 0; i < in_corrects.size(); ++i) {
+      out_correct_t.device(place) += EigenTensor<int, 1>::From(*in_corrects[i]);
+    }
+    // compute
+    auto stream = ctx.cuda_device_context().stream();
+    int block = PADDLE_CUDA_NUM_THREADS;
+    int grid = (predictions->numel() + block - 1) / block;
+    // Dynamic shared memory request: (2 * num_classes + 1) ints per block.
+    int cache_size = (num_classes * 2 + 1) * sizeof(int);
+    CountCUDAKernel<T><<<grid, block, cache_size, stream>>>(
+        num_classes, predictions->numel(), predictions_data, labels_data,
+        out_wrong_data, out_correct_data);
+    // Wait on the device context before launching the second kernel.
+    ctx.device_context().Wait();
+    ComputeIoUCUDAKernel<<<1, block, 0, stream>>>(num_classes, out_wrong_data,
+                                                  out_correct_data, ious_data,
+                                                  out_mean_iou_data);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+// CUDA kernels for int, int64_t and int32_t element types.
+// NOTE(review): int and int32_t are presumably the same type on the supported
+// platforms, which would make one of these entries redundant -- confirm.
+REGISTER_OP_CUDA_KERNEL(mean_iou, ops::MeanIoUCUDAOpKernel<int>,
+                        ops::MeanIoUCUDAOpKernel<int64_t>,
+                        ops::MeanIoUCUDAOpKernel<int32_t>);
--- a/paddle/fluid/operators/mean_iou_op.h
+++ b/paddle/fluid/operators/mean_iou_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <algorithm>
+#include "paddle/fluid/framework/op_registry.h"
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+template <typename T, int D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+// CPU kernel for mean_iou. It seeds the outputs from the optional In* inputs,
+// counts per-class correct/wrong predictions and adds the mean of the
+// per-class IoU (over classes that have at least one sample) to OutMeanIou.
+template <typename T>
+class MeanIoUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& place = *ctx.template device_context<platform::CPUDeviceContext>()
+                       .eigen_device();
+    // get input and output tensor
+    auto* predictions = ctx.Input<Tensor>("Predictions");
+    auto* labels = ctx.Input<Tensor>("Labels");
+    auto* out_mean_iou = ctx.Output<Tensor>("OutMeanIou");
+    auto* out_wrong = ctx.Output<Tensor>("OutWrong");
+    auto* out_correct = ctx.Output<Tensor>("OutCorrect");
+    int num_classes = static_cast<int>(ctx.Attr<int>("num_classes"));
+    // get data ptr
+    const T* predictions_data = predictions->data<T>();
+    const T* labels_data = labels->data<T>();
+    float* out_mean_iou_data =
+        out_mean_iou->mutable_data<float>(ctx.GetPlace());
+    int* out_wrong_data = out_wrong->mutable_data<int>(ctx.GetPlace());
+    int* out_correct_data = out_correct->mutable_data<int>(ctx.GetPlace());
+    // get eigen tensor
+    auto out_mean_iou_t = EigenTensor<float, 1>::From(*out_mean_iou);
+    auto out_wrong_t = EigenTensor<int, 1>::From(*out_wrong);
+    auto out_correct_t = EigenTensor<int, 1>::From(*out_correct);
+    // Tmp tensor
+    Tensor denominator;
+    Tensor valid_count;
+    Tensor iou_sum;
+    // get data ptr of tmp tensor
+    int* denominator_data = denominator.mutable_data<int>(
+        {static_cast<int64_t>(num_classes)}, ctx.GetPlace());
+    int* valid_count_data = valid_count.mutable_data<int>({1}, ctx.GetPlace());
+    float* iou_sum_data = iou_sum.mutable_data<float>({1}, ctx.GetPlace());
+    // get eigen tensor of tmp tensor
+    auto denominator_t = EigenTensor<int, 1>::From(denominator);
+    auto valid_count_t = EigenTensor<int, 1>::From(valid_count);
+    auto iou_sum_t = EigenTensor<float, 1>::From(iou_sum);
+    // init out_wrong, out_correct and out_mean_iou
+    out_wrong_t = out_wrong_t.constant(0);
+    out_correct_t = out_correct_t.constant(0);
+    out_mean_iou_t = out_mean_iou_t.constant(0);
+    // collect pre wrong, correct and mean_iou
+    auto in_mean_ious = ctx.MultiInput<Tensor>("InMeanIou");
+    for (size_t i = 0; i < in_mean_ious.size(); ++i) {
+      out_mean_iou_t.device(place) +=
+          EigenTensor<float, 1>::From(*in_mean_ious[i]);
+    }
+    auto in_wrongs = ctx.MultiInput<Tensor>("InWrongs");
+    for (size_t i = 0; i < in_wrongs.size(); ++i) {
+      out_wrong_t.device(place) += EigenTensor<int, 1>::From(*in_wrongs[i]);
+    }
+    auto in_corrects = ctx.MultiInput<Tensor>("InCorrects");
+    for (size_t i = 0; i < in_corrects.size(); ++i) {
+      out_correct_t.device(place) += EigenTensor<int, 1>::From(*in_corrects[i]);
+    }
+    // compute
+    // NOTE(review): class ids are used as array indices without a range
+    // check; presumably callers guarantee 0 <= id < num_classes -- confirm.
+    for (int64_t i = 0; i < predictions->numel(); ++i) {
+      if (predictions_data[i] == labels_data[i]) {
+        out_correct_data[predictions_data[i]] += 1;
+      } else {
+        // a mismatch counts as wrong for both the true and predicted class
+        out_wrong_data[labels_data[i]] += 1;
+        out_wrong_data[predictions_data[i]] += 1;
+      }
+    }
+    denominator_t = out_wrong_t + out_correct_t;
+    valid_count_t =
+        (denominator_t > denominator_t.constant(0.0f)).cast<int>().sum();
+    // replace zero denominators by 1 so the element-wise division is defined
+    for (int i = 0; i < num_classes; ++i) {
+      if (denominator_data[i] == 0) {
+        denominator_data[i] = 1;
+      }
+    }
+    iou_sum_t =
+        (out_correct_t.cast<float>() / denominator_t.cast<float>()).sum();
+    // With no valid class the sum is 0 as well; skip the division so the
+    // accumulated result is not turned into NaN by 0 / 0.
+    if (valid_count_data[0] > 0) {
+      out_mean_iou_data[0] += (iou_sum_data[0] / valid_count_data[0]);
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/merge_ids_op.cc
+++ b/paddle/fluid/operators/merge_ids_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/merge_ids_op.h"
+namespace paddle {
+namespace operators {
+// Declares the inputs, output and user-facing documentation of merge_ids.
+class MergeIdsOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Ids", "(LoDTensor) the input ids with shape{batch_num, 1}");
+    AddInput(
+        "X",
+        "(LoDTensors) multi input tensor with shape{batch_num, N}, N is the "
+        "size of embedding table")
+        .AsDuplicable();
+    AddOutput("Out", "(LoDTensor) The merged outputs of the input tensors.");
+    AddComment(R"DOC(
+Merge multi LoDTensor's into one according to Ids's shard num.
+split_ids_op -> prefetch_op -> merge_ids_op
+merge_ids_op should be used after split_ids_op and prefetch_op, split_ids_op
+ will split input Ids into multiple tensors according to Id's shard number.
+prefetch_op will send them to parameter server to prefetch embedding value
+back. During split, the order of ids is disordered. In merge_ids_op we use
+the original Ids to restore the order of the fetched embedding value and
+ also pass the lod information to the merged output.
+Example:
+    Ids = [1,2,3,4,5,6] # 3 shards
+split_ids_op ->
+    Id0 = [3, 6] # id % 3 == 0
+    Id1 = [1, 4] # id % 3 == 1
+    Id2 = [2, 5] # id % 3 == 2
+prefetch_op ->
+    X0 = [[0.3 0.3]   # 3
+          [0.6 0.6]]  # 6
+    X1 = [[0.1 0.1]   # 1
+          [0.4 0.4]]  # 4
+    X2 = [[0.2 0.2]   # 2
+          [0.5 0.5]]  # 5
+merge_ids_op ->
+    Out = [[0.1 0.1]  # 1
+           [0.2 0.2]  # 2
+           [0.3 0.3]  # 3
+           [0.4 0.4]  # 4
+           [0.5 0.5]  # 5
+           [0.6 0.6]] # 6
+)DOC");
+  }
+};
+// Operator definition for merge_ids: validates inputs/outputs during shape
+// inference and derives the kernel data type from the first X tensor.
+class MergeIdsOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Ids"), "MergeIdsOp must has input Ids.");
+    PADDLE_ENFORCE(ctx->HasInputs("X"), "MergeIdsOp must has input X.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "MergeIdsOp must has output Out.");
+    // A LoDTensor Ids input must be two-dimensional with a second dim of 1.
+    auto ids_type = ctx->GetInputsVarType("Ids").front();
+    auto ids_shape = ctx->GetInputDim("Ids");
+    if (ids_type == framework::proto::VarType::LOD_TENSOR) {
+      PADDLE_ENFORCE_EQ(ids_shape.size(), 2);
+      PADDLE_ENFORCE_EQ(ids_shape[1], 1);
+    }
+    // Every X input has to be a LoDTensor.
+    auto x_types = ctx->GetInputsVarType("X");
+    for (auto &x_type : x_types) {
+      PADDLE_ENFORCE_EQ(x_type, framework::proto::VarType::LOD_TENSOR,
+                        "input X only support lod tensors");
+    }
+    // Out takes its LoD from Ids.
+    ctx->ShareLoD("Ids", "Out");
+  }
+ private:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    auto data_type = framework::ToDataType(
+        ctx.MultiInput<framework::Tensor>("X").front()->type());
+    return framework::OpKernelType(data_type, ctx.GetPlace());
+  }
+};
+// Variable type inference for merge_ids: every Out variable receives the
+// variable type of the first Ids input.
+class MergeIdsOpInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const override {
+    auto *ids_var_desc = block->Var(op_desc.Input("Ids")[0]);
+    for (auto &out_name : op_desc.Output("Out")) {
+      auto *out_var_desc = block->Var(out_name);
+      out_var_desc->SetType(ids_var_desc->GetType());
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+// Registers merge_ids together with its maker and var-type inference.
+REGISTER_OPERATOR(merge_ids, ops::MergeIdsOp, ops::MergeIdsOpMaker,
+                  ops::MergeIdsOpInferVarType);
+// Only a float CPU kernel is registered here.
+REGISTER_OP_CPU_KERNEL(
+    merge_ids, ops::MergeIdsOpKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/fluid/operators/merge_ids_op.h
+++ b/paddle/fluid/operators/merge_ids_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
+namespace paddle {
+namespace operators {
+// CPU kernel for merge_ids. It rebuilds one output tensor whose row i is the
+// embedding row that belongs to ids[i]: ids are mapped to a shard by
+// id % shard_num and rows are consumed from each shard in order.
+template <typename DeviceContext, typename T>
+class MergeIdsOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto place = ctx.GetPlace();
+    if (!platform::is_cpu_place(place)) {
+      PADDLE_THROW("MergeIds do not support GPU kernel");
+    }
+    VLOG(3) << "run in MergeIdsOpKernel";
+    const auto *ids_var = ctx.InputVar("Ids");
+    PADDLE_ENFORCE(ids_var->IsType<framework::LoDTensor>(),
+                   "only support to merge Ids of LoDTensor");
+    const auto &ids_tensor = ids_var->Get<framework::LoDTensor>();
+    const auto &ids_dims = ids_tensor.dims();
+    const int64_t *ids = ids_tensor.data<int64_t>();
+    auto x_tensors = ctx.MultiInput<framework::LoDTensor>("X");
+    auto *out = ctx.Output<framework::LoDTensor>("Out");
+    // Sum the rows of all non-empty shards and make sure they agree on the
+    // embedding width.
+    int batch_size = 0;
+    int embedding_size = 0;
+    for (auto &input : x_tensors) {
+      if (framework::product(input->dims()) != 0) {
+        if (embedding_size == 0) {
+          embedding_size = input->dims()[1];
+        }
+        PADDLE_ENFORCE_EQ(embedding_size, input->dims()[1],
+                          "embedding size of all input should be the same");
+        batch_size += input->dims()[0];
+      }
+    }
+    PADDLE_ENFORCE_EQ(
+        batch_size, ids_dims[0],
+        "the batch size of ids and merged embedding value should be the same");
+    const size_t shard_num = x_tensors.size();
+    if (shard_num == 1) {
+      VLOG(3) << "only one shard, we can copy the data directly";
+      TensorCopy(*x_tensors[0], place, out);
+    } else {
+      // in_indexs[s] is the next unread row of shard s.
+      std::vector<int> in_indexs(shard_num, 0);
+      auto *out_data = out->mutable_data<T>(
+          framework::make_ddim({batch_size, embedding_size}), place);
+      // copy data from ins[shard_num] to out.
+      for (int i = 0; i < ids_dims[0]; ++i) {
+        int64_t id = ids[i];
+        size_t shard_id = static_cast<size_t>(id) % shard_num;
+        int index = in_indexs[shard_id];
+        // Check the row exists before reading it; otherwise a mismatch between
+        // ids and shard contents would read past the end of the shard tensor.
+        PADDLE_ENFORCE_LT(index, x_tensors[shard_id]->dims()[0],
+                          "ids mapped to a shard exceed the rows of that shard");
+        memcpy(out_data + embedding_size * i,
+               x_tensors[shard_id]->data<T>() + index * embedding_size,
+               sizeof(T) * embedding_size);
+        in_indexs[shard_id] += 1;
+      }
+      for (size_t i = 0; i < shard_num; ++i) {
+        PADDLE_ENFORCE_EQ(in_indexs[i], x_tensors[i]->dims()[0],
+                          "after merge, all data in x_tensor should be used");
+      }
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/reader/create_batch_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_batch_reader_op.cc
@@ -20,7 +20,7 @@ namespace reader {
 class BatchReader : public framework::DecoratedReader {
 public:
-  BatchReader(ReaderBase* reader, int batch_size)
+  BatchReader(const std::shared_ptr<ReaderBase>& reader, int batch_size)
      : DecoratedReader(reader), batch_size_(batch_size) {
    buffer_.reserve(batch_size_);
  }

--- a/paddle/fluid/operators/reader/create_custom_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_custom_reader_op.cc
@@ -22,7 +22,8 @@ namespace reader {
 class CustomReader : public framework::DecoratedReader {
 public:
-  CustomReader(ReaderBase* reader, const framework::BlockDesc& sub_block,
+  CustomReader(const std::shared_ptr<ReaderBase>& reader,
+               const framework::BlockDesc& sub_block,
               const std::vector<std::string>& source_var_names,
               const std::vector<std::string>& sink_var_names)
      : DecoratedReader(reader),

--- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
@@ -34,7 +34,8 @@ static constexpr size_t kChannelSize = 1;  // kCacheSize - 2
 class DoubleBufferReader : public framework::DecoratedReader {
 public:
  explicit DoubleBufferReader(
-      ReaderBase* reader, platform::Place target_place = platform::CPUPlace())
+      const std::shared_ptr<ReaderBase>& reader,
+      platform::Place target_place = platform::CPUPlace())
      : DecoratedReader(reader), place_(target_place) {
    cpu_tensor_cache_.resize(kCacheSize);
    gpu_tensor_cache_.resize(kCacheSize);

--- a/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc
@@ -21,7 +21,7 @@ namespace reader {
 class MultiPassReader : public framework::DecoratedReader {
 public:
-  MultiPassReader(ReaderBase* reader, int pass_num)
+  MultiPassReader(const std::shared_ptr<ReaderBase>& reader, int pass_num)
      : DecoratedReader(reader), pass_num_(pass_num), pass_count_(0) {}
  void ReadNext(std::vector<framework::LoDTensor>* out) override {

--- a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
@@ -23,7 +23,8 @@ namespace reader {
 class ShuffleReader : public framework::DecoratedReader {
 public:
-  ShuffleReader(ReaderBase* reader, size_t buffer_size, size_t seed = 0)
+  ShuffleReader(const std::shared_ptr<ReaderBase>& reader, size_t buffer_size,
+                size_t seed = 0)
      : DecoratedReader(reader), buffer_size_(buffer_size), seed_(seed) {
    VLOG(10) << "Create shuffle reader of " << reader_;
    if (seed_ == 0) {

--- a/paddle/fluid/operators/reader/create_threaded_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_threaded_reader_op.cc
@@ -21,7 +21,8 @@ namespace reader {
 class ThreadedReader : public framework::DecoratedReader {
 public:
-  explicit ThreadedReader(ReaderBase* reader) : DecoratedReader(reader) {}
+  explicit ThreadedReader(const std::shared_ptr<ReaderBase>& reader)
+      : DecoratedReader(reader) {}
  void ReadNext(std::vector<framework::LoDTensor>* out) override {
    std::lock_guard<std::mutex> lock(mutex_);

--- a/paddle/fluid/platform/cpu_info.cc
+++ b/paddle/fluid/platform/cpu_info.cc
@@ -21,12 +21,17 @@ limitations under the License. */
 #include <unistd.h>
 #endif
+#include <algorithm>
 #include "gflags/gflags.h"
 DEFINE_double(fraction_of_cpu_memory_to_use, 1,
              "Default use 100% of CPU memory for PaddlePaddle,"
              "reserve the rest for page tables, etc");
+// Size in MB. CpuMaxAllocSize() returns the smaller of this value (converted
+// to bytes) and the fraction_of_cpu_memory_to_use share of physical memory.
+DEFINE_uint64(
+    initial_cpu_memory_in_mb, 500,
+    "Default initial 500MB of CPU memory for PaddlePaddle, in MB unit.");
 DEFINE_double(
    fraction_of_cuda_pinned_memory_to_use, 0.5,
    "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle,"
@@ -54,7 +59,10 @@ inline size_t CpuTotalPhysicalMemory() {
 size_t CpuMaxAllocSize() {
  // For distributed systems, it requires configuring and limiting
  // the fraction of memory to use.
-  return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory();
+  return std::min(
+      static_cast<size_t>(FLAGS_fraction_of_cpu_memory_to_use *
+                          CpuTotalPhysicalMemory()),
+      static_cast<size_t>(FLAGS_initial_cpu_memory_in_mb * 1 << 20));
 }
 size_t CpuMinChunkSize() {

--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
@@ -322,7 +322,6 @@ class DeviceTracerImpl : public DeviceTracer {
    DisableActivity();
    dynload::cuptiUnsubscribe(subscriber_);
    CUPTI_CALL(dynload::cuptiGetTimestamp(&end_ns_));
-    PADDLE_ENFORCE(dynload::cuptiFinalize());
    enabled_ = false;
  }

--- a/paddle/fluid/platform/dynload/cupti.h
+++ b/paddle/fluid/platform/dynload/cupti.h
@@ -72,7 +72,6 @@ extern void *cupti_dso_handle;
  __macro(cuptiGetResultString);              \
  __macro(cuptiActivityGetNumDroppedRecords); \
  __macro(cuptiActivityFlushAll);             \
-  __macro(cuptiFinalize);                     \
  __macro(cuptiSubscribe);                    \
  __macro(cuptiUnsubscribe);                  \
  __macro(cuptiEnableCallback);               \

--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -413,6 +413,9 @@ All parameter, weight, gradient are variables in Paddle.
  py::class_<framework::Executor>(m, "Executor")
      .def(py::init<const platform::Place &>())
+#ifdef PADDLE_WITH_DISTRIBUTE
+      .def("complete", &Executor::Complete)
+#endif
      .def("run",
           (void (Executor::*)(const ProgramDesc &, Scope *, int, bool, bool)) &
               Executor::Run);

--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -132,7 +132,8 @@ EOF
        -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
        -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \
        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
-        -DWITH_CONTRIB=${WITH_CONTRIB:-ON}
+        -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \
+        -DWITH_ANAKIN=ON
 }
 function abort(){

--- a/python/paddle/batch.py
+++ b/python/paddle/batch.py
@@ -15,7 +15,7 @@
 __all__ = ['batch']
-def batch(reader, batch_size, drop_last=False):
+def batch(reader, batch_size, drop_last=True):
    """
    Create a batched reader.

--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -382,7 +382,7 @@ class Operator(object):
        'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv',
        'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine',
        'ncclInit', 'channel_create', 'channel_close', 'channel_send',
-        'channel_recv', 'select'
+        'channel_recv', 'select', 'gen_nccl_id'
    }
    def __init__(self,

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-All layers just related to the neural network.
+All layers just related to the neural network. 
 """
 from ..layer_helper import LayerHelper
@@ -25,68 +25,20 @@ import utils
 import random
 __all__ = [
-    'fc',
+    'fc', 'embedding', 'dynamic_lstm', 'dynamic_lstmp', 'dynamic_gru',
-    'embedding',
+    'gru_unit', 'linear_chain_crf', 'crf_decoding', 'cos_sim', 'cross_entropy',
-    'dynamic_lstm',
+    'square_error_cost', 'chunk_eval', 'sequence_conv', 'conv2d',
-    'dynamic_lstmp',
+    'sequence_pool', 'sequence_softmax', 'softmax', 'pool2d', 'batch_norm',
-    'dynamic_gru',
+    'beam_search_decode', 'conv2d_transpose', 'sequence_expand', 'lstm_unit',
-    'gru_unit',
+    'reduce_sum', 'reduce_mean', 'reduce_max', 'reduce_min', 'reduce_prod',
-    'linear_chain_crf',
+    'sequence_first_step', 'sequence_last_step', 'dropout', 'split',
-    'crf_decoding',
+    'ctc_greedy_decoder', 'edit_distance', 'l2_normalize', 'matmul', 'topk',
-    'cos_sim',
+    'warpctc', 'sequence_reshape', 'transpose', 'im2sequence', 'nce',
-    'cross_entropy',
+    'beam_search', 'row_conv', 'multiplex', 'layer_norm',
-    'square_error_cost',
+    'softmax_with_cross_entropy', 'smooth_l1', 'one_hot',
-    'chunk_eval',
+    'autoincreased_step_counter', 'reshape', 'lod_reset', 'lrn', 'pad',
-    'sequence_conv',
+    'label_smooth', 'roi_pool', 'dice_loss', 'image_resize',
-    'conv2d',
+    'image_resize_short', 'resize_bilinear', 'gather', 'random_crop', 'mean_iou'
-    'sequence_pool',
-    'sequence_softmax',
-    'softmax',
-    'pool2d',
-    'batch_norm',
-    'beam_search_decode',
-    'conv2d_transpose',
-    'sequence_expand',
-    'lstm_unit',
-    'reduce_sum',
-    'reduce_mean',
-    'reduce_max',
-    'reduce_min',
-    'reduce_prod',
-    'sequence_first_step',
-    'sequence_last_step',
-    'dropout',
-    'split',
-    'ctc_greedy_decoder',
-    'edit_distance',
-    'l2_normalize',
-    'matmul',
-    'topk',
-    'warpctc',
-    'sequence_reshape',
-    'transpose',
-    'im2sequence',
-    'nce',
-    'beam_search',
-    'row_conv',
-    'multiplex',
-    'layer_norm',
-    'softmax_with_cross_entropy',
-    'smooth_l1',
-    'one_hot',
-    'autoincreased_step_counter',
-    'reshape',
-    'lod_reset',
-    'lrn',
-    'pad',
-    'label_smooth',
-    'roi_pool',
-    'dice_loss',
-    'image_resize',
-    'image_resize_short',
-    'resize_bilinear',
-    'gather',
-    'random_crop',
 ]
@@ -95,7 +47,6 @@ def fc(input,
       num_flatten_dims=1,
       param_attr=None,
       bias_attr=None,
-       use_cudnn=False,
       use_mkldnn=False,
       act=None,
       is_test=False,
@@ -222,6 +173,7 @@ def embedding(input,
            have two elements which indicate the size of the dictionary of
            embeddings and the size of each embedding vector respectively.
        is_sparse(bool): The flag indicating whether to use sparse update.
+        is_distributed (bool): Whether to run lookup table from remote parameter server.
        padding_idx(int|long|None): If :attr:`None`, it makes no effect to lookup.
            Otherwise the given :attr:`padding_idx` indicates padding the output
            with zeros whenever lookup encounters it in :attr:`input`. If
@@ -261,9 +213,10 @@ def embedding(input,
    return tmp
-# TODO(qijun): expose H0 and C0
 def dynamic_lstm(input,
                 size,
+                 h_0=None,
+                 c_0=None,
                 param_attr=None,
                 bias_attr=None,
                 use_peepholes=True,
@@ -324,6 +277,13 @@ def dynamic_lstm(input,
                         (T X 4D), where T is the total time steps in this
                         mini-batch, D is the hidden size.
        size(int): 4 * hidden size.
+        h_0(Variable): The initial hidden state is an optional input, default is zero.
+                       This is a tensor with shape (N x D), where N is the
+                       batch size and D is the hidden size.
+        c_0(Variable): The initial cell state is an optional input, default is zero.
+                       This is a tensor with shape (N x D), where N is the
+                       batch size. `h_0` and `c_0` can be NULL but only at the same time.
        param_attr(ParamAttr|None): The parameter attribute for the learnable
                               hidden-hidden weights.
@@ -387,12 +347,20 @@ def dynamic_lstm(input,
    cell = helper.create_tmp_variable(dtype)
    batch_gate = helper.create_tmp_variable(dtype)
    batch_cell_pre_act = helper.create_tmp_variable(dtype)
+    inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
+    batch_size = input.shape[0]
+    if h_0:
+        assert h_0.shape == (batch_size, size), \
+            'The shape of h0 should be (batch_size, %d)' % size
+        inputs['H0'] = h_0
+    if c_0:
+        assert c_0.shape == (batch_size, size), \
+            'The shape of c0 should be (batch_size, %d)' % size
+        inputs['C0'] = c_0
    helper.append_op(
        type='lstm',
-        inputs={'Input': input,
+        inputs=inputs,
-                'Weight': weight,
-                'Bias': bias},
        outputs={
            'Hidden': hidden,
            'Cell': cell,
@@ -654,8 +622,9 @@ def dynamic_gru(input,
            :attr:`False`.
        gate_activation(str): The activation for update gate and reset gate.
            Choices = ["sigmoid", "tanh", "relu", "identity"], default "sigmoid".
-        activation(str): The activation for candidate hidden state.
+        candidate_activation(str): The activation for candidate hidden state.
            Choices = ["sigmoid", "tanh", "relu", "identity"], default "tanh".
+        h_0 (Variable): The hidden output of the first time step.
    Returns:
        Variable: The hidden state of GRU. The shape is :math:`(T \\times D)`, \
@@ -676,11 +645,13 @@ def dynamic_gru(input,
        attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype)
    bias = helper.create_parameter(
        attr=helper.bias_attr, shape=[1, 3 * size], dtype=dtype, is_bias=True)
+    batch_size = input.shape[0]
    inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
    if h_0 != None:
        assert h_0.shape == (
-            size, size), 'The shape of h0 should be(%d, %d)' % (size, size)
+            batch_size, size
-        inputs['h0'] = h_0
+        ), 'The shape of h0 should be(batch_size, %d)' % size
+        inputs['H0'] = h_0
    hidden = helper.create_tmp_variable(dtype)
    batch_gate = helper.create_tmp_variable(dtype)
@@ -873,6 +844,13 @@ def cos_sim(X, Y):
    """
    This function performs the cosine similarity between two tensors
    X and Y and returns that as the output.
+    Args:
+        X (Variable): The input X.
+        Y (Variable): The input Y.
+    Returns:
+        Variable: the output of cosine(X, Y).
    """
    helper = LayerHelper('cos_sim', **locals())
    out = helper.create_tmp_variable(dtype=X.dtype)
@@ -899,15 +877,15 @@ def dropout(x, dropout_prob, is_test=False, seed=None, name=None):
    unchanged.
    Args:
-       x(variable): The input tensor.
+        x (Variable): The input tensor.
-       dropout_prob(float): Probability of setting units to zero.
+        dropout_prob (float): Probability of setting units to zero.
-       is_test(bool): A flag indicating whether it is in test phrase or not.
+        is_test (bool): A flag indicating whether it is in test phase or not.
-       seed(int): A Python integer used to create random seeds. If this
+        seed (int): A Python integer used to create random seeds. If this
-                  parameter is set to None, a random seed is used.
+                    parameter is set to None, a random seed is used.
-                  NOTE: If an integer seed is given, always the same output
+                    NOTE: If an integer seed is given, always the same output
-                  units will be dropped. DO NOT use a fixed seed in training.
+                    units will be dropped. DO NOT use a fixed seed in training.
-       name(str|None): A name for this layer(optional). If set None, the layer
+        name (str|None): A name for this layer(optional). If set None, the layer
-                    will be named automatically.
+                         will be named automatically.
    Returns:
        Variable: A tensor variable.
@@ -1029,8 +1007,8 @@ def square_error_cost(input, label):
        * :math:`Out`: Output value, same shape with :math:`X`.
    Args:
-       input(Variable): Input tensor, has predictions.
+        input (Variable): Input tensor, has predictions.
-       label(Variable): Label tensor, has target labels.
+        label (Variable): Label tensor, has target labels.
    Returns:
        Variable: The tensor variable storing the element-wise squared error \
@@ -1059,6 +1037,7 @@ def square_error_cost(input, label):
    return square_out
+@templatedoc()
 def chunk_eval(input,
               label,
               chunk_scheme,
@@ -1067,6 +1046,18 @@ def chunk_eval(input,
    """
    This function computes and outputs the precision, recall and
    F1-score of chunk detection.
+    Args:
+        input (Variable): prediction output of the network.
+        label (Variable): label of the test data set.
+        chunk_scheme (str): ${chunk_scheme_comment}
+        num_chunk_types (int): ${num_chunk_types_comment}
+        excluded_chunk_types (list): ${excluded_chunk_types_comment}
+    Returns:
+        tuple: tuple containing: (precision, recall, f1_score,
+               num_infer_chunks, num_label_chunks,
+               num_correct_chunks)
    """
    helper = LayerHelper("chunk_eval", **locals())
@@ -1099,6 +1090,7 @@ def chunk_eval(input,
            num_correct_chunks)
+@templatedoc()
 def sequence_conv(input,
                  num_filters,
                  filter_size=3,
@@ -1111,6 +1103,19 @@ def sequence_conv(input,
    This function creates the op for sequence_conv, using the inputs and
    other convolutional configurations for the filters and stride as given
    in the input parameters to the function.
+    Args:
+        input (Variable): ${x_comment}
+        num_filters (int): number of filters.
+        filter_size (int): the filter size (H and W).
+        filter_stride (int): stride of the filter.
+        padding (bool): if True, add paddings.
+        bias_attr (ParamAttr|None): attributes for bias
+        param_attr (ParamAttr|None): attributes for parameter
+        act (str): the activation type
+    Returns:
+        Variable: output of sequence_conv
    """
    # FIXME(dzh) : want to unify the argument of python layer
@@ -1225,33 +1230,34 @@ def conv2d(input,
            W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
    Args:
-       input(Variable): The input image with [N, C, H, W] format.
+        input (Variable): The input image with [N, C, H, W] format.
-       num_filters(int): The number of filter. It is as same as the output
+        num_filters (int): The number of filters. It is the same as the output
-           image channel.
+            image channel.
-       filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
+        filter_size (int|tuple|None): The filter size. If filter_size is a tuple,
-           it must contain two integers, (filter_size_H, filter_size_W).
+            it must contain two integers, (filter_size_H, filter_size_W).
-           Otherwise, the filter will be a square.
+            Otherwise, the filter will be a square.
-       stride(int|tuple): The stride size. If stride is a tuple, it must
+        stride (int|tuple): The stride size. If stride is a tuple, it must
-           contain two integers, (stride_H, stride_W). Otherwise, the
+            contain two integers, (stride_H, stride_W). Otherwise, the
-           stride_H = stride_W = stride. Default: stride = 1.
+            stride_H = stride_W = stride. Default: stride = 1.
-       padding(int|tuple): The padding size. If padding is a tuple, it must
+        padding (int|tuple): The padding size. If padding is a tuple, it must
-           contain two integers, (padding_H, padding_W). Otherwise, the
+            contain two integers, (padding_H, padding_W). Otherwise, the
-           padding_H = padding_W = padding. Default: padding = 0.
+            padding_H = padding_W = padding. Default: padding = 0.
-       dilation(int|tuple): The dilation size. If dilation is a tuple, it must
+        dilation (int|tuple): The dilation size. If dilation is a tuple, it must
-           contain two integers, (dilation_H, dilation_W). Otherwise, the
+            contain two integers, (dilation_H, dilation_W). Otherwise, the
-           dilation_H = dilation_W = dilation. Default: dilation = 1.
+            dilation_H = dilation_W = dilation. Default: dilation = 1.
-       groups(int): The groups number of the Conv2d Layer. According to grouped
+        groups (int): The groups number of the Conv2d Layer. According to grouped
-           convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
+            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
-           the first half of the filters is only connected to the first half
+            the first half of the filters is only connected to the first half
-           of the input channels, while the second half of the filters is only
+            of the input channels, while the second half of the filters is only
-           connected to the second half of the input channels. Default: groups=1
+            connected to the second half of the input channels. Default: groups=1
-       param_attr(ParamAttr): The parameters to the Conv2d Layer. Default: None
+        param_attr (ParamAttr): The parameters to the Conv2d Layer. Default: None
-       bias_attr(ParamAttr): Bias parameter for the Conv2d layer. Default: None
+        bias_attr (ParamAttr): Bias parameter for the Conv2d layer. Default: None
-       use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
+        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
-           library is installed. Default: True
+            library is installed. Default: True
-       act(str): Activation type. Default: None
+        use_mkldnn (bool): Use mkldnn kernels or not.
-       name(str|None): A name for this layer(optional). If set None, the layer
+        act (str): Activation type. Default: None
-           will be named automatically.
+        name (str|None): A name for this layer(optional). If set None, the layer
+            will be named automatically.
    Returns:
        Variable: The tensor variable storing the convolution and \
@@ -1409,7 +1415,7 @@ def sequence_pool(input, pool_type):
 def sequence_first_step(input):
    """
-    This funciton get the first step of sequence.
+    This function gets the first step of sequence.
    .. code-block:: text
@@ -1442,7 +1448,7 @@ def sequence_first_step(input):
 def sequence_last_step(input):
    """
-    This funciton get the last step of sequence.
+    This function gets the last step of sequence.
    .. code-block:: text
@@ -1486,6 +1492,22 @@ def pool2d(input,
    """
    This function adds the operator for pooling in 2 dimensions, using the
    pooling configurations mentioned in input parameters.
+    Args:
+        input (Variable): ${input_comment}
+        pool_size (int): ${ksize_comment}
+        pool_type (str): ${pooling_type_comment}
+        pool_stride (int): stride of the pooling layer.
+        pool_padding (int): padding size.
+        global_pooling (bool): ${global_pooling_comment}
+        use_cudnn (bool): ${use_cudnn_comment}
+        ceil_mode (bool): ${ceil_mode_comment}
+        use_mkldnn (bool): ${use_mkldnn_comment}
+        name (str): A name for this layer(optional). If set None, the layer
+            will be named automatically.
+    Returns:
+        Variable: output of pool2d layer.
    """
    if pool_type not in ["max", "avg"]:
        raise ValueError(
@@ -1543,6 +1565,25 @@ def batch_norm(input,
    """
    This function helps create an operator to implement
    the BatchNorm layer using the configurations from the input parameters.
+    Args:
+        input (Variable): the input variable.
+        act (str): activation type
+        is_test (bool): whether to run batch_norm as test mode.
+        momentum (float): momentum
+        epsilon (float): epsilon, default 1e-05
+        param_attr (ParamAttr|None): attributes for parameter
+        bias_attr (ParamAttr|None): attributes for bias
+        data_layout (str): data layout, default NCHW
+        in_place (bool): if True, do not create tmp variable
+        use_mkldnn (bool): ${use_mkldnn_comment}
+        name (str): The name of this layer. It is optional.
+        moving_mean_name (str): The name of moving mean variable name, optional.
+        moving_variance_name (str): The name of moving variance name, optional.
+        do_model_average_for_mean_and_var (bool): Whether to do model average for mean and variance.
+    Returns:
+        Variable: output of batch_norm layer.
    """
    helper = LayerHelper('batch_norm', **locals())
    dtype = helper.input_dtype()
@@ -1670,6 +1711,7 @@ def layer_norm(input,
        bias_attr(ParamAttr|None): The parameter attribute for the learnable
            bias :math:`b`.
        act(str): Activation to be applied to the output of layer normalizaiton.
+        name (str): The name of this layer. It is optional.
    Returns:
        Variable: A tensor variable with the same shape as the input.
@@ -1721,6 +1763,17 @@ def layer_norm(input,
 def beam_search_decode(ids, scores, name=None):
+    """
+    ${beam_search_decode}
+    Args:
+        ids (Variable): ${ids_comment}
+        scores (Variable): ${scores_comment}
+        name (str): The name of this layer. It is optional.
+    Returns:
+        tuple: a tuple of two output variable: sentence_ids, sentence_scores
+    """
    helper = LayerHelper('beam_search_decode', **locals())
    sentence_ids = helper.create_tmp_variable(dtype=ids.dtype)
    sentence_scores = helper.create_tmp_variable(dtype=ids.dtype)
@@ -1796,46 +1849,46 @@ def conv2d_transpose(input,
           W_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1
    Args:
-       input(Variable): The input image with [N, C, H, W] format.
+        input(Variable): The input image with [N, C, H, W] format.
-       num_filters(int): The number of the filter. It is as same as the output
+        num_filters(int): The number of the filter. It is as same as the output
-           image channel.
+            image channel.
-       output_size(int|tuple|None): The output image size. If output size is a
+        output_size(int|tuple|None): The output image size. If output size is a
-           tuple, it must contain two integers, (image_H, image_W). This
+            tuple, it must contain two integers, (image_H, image_W). This
-           parameter only works when filter_size is None.
+            parameter only works when filter_size is None.
-       filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
+        filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
-           it must contain two integers, (filter_size_H, filter_size_W).
+            it must contain two integers, (filter_size_H, filter_size_W).
-           Otherwise, the filter will be a square. None if use output size to
+            Otherwise, the filter will be a square. None if use output size to
-           calculate filter_size.
+            calculate filter_size.
-       padding(int|tuple): The padding size. If padding is a tuple, it must
+        padding(int|tuple): The padding size. If padding is a tuple, it must
-           contain two integers, (padding_H, padding_W). Otherwise, the
+            contain two integers, (padding_H, padding_W). Otherwise, the
-           padding_H = padding_W = padding. Default: padding = 0.
+            padding_H = padding_W = padding. Default: padding = 0.
-       stride(int|tuple): The stride size. If stride is a tuple, it must
+        stride(int|tuple): The stride size. If stride is a tuple, it must
-           contain two integers, (stride_H, stride_W). Otherwise, the
+            contain two integers, (stride_H, stride_W). Otherwise, the
-           stride_H = stride_W = stride. Default: stride = 1.
+            stride_H = stride_W = stride. Default: stride = 1.
-       dilation(int|tuple): The dilation size. If dilation is a tuple, it must
+        dilation(int|tuple): The dilation size. If dilation is a tuple, it must
-           contain two integers, (dilation_H, dilation_W). Otherwise, the
+            contain two integers, (dilation_H, dilation_W). Otherwise, the
-           dilation_H = dilation_W = dilation. Default: dilation = 1.
+            dilation_H = dilation_W = dilation. Default: dilation = 1.
-       groups(int): The groups number of the Conv2d transpose layer. Inspired by
+        groups(int): The groups number of the Conv2d transpose layer. Inspired by
-           grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
+            grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
-           when group=2, the first half of the filters is only connected to the
+            when group=2, the first half of the filters is only connected to the
-           first half of the input channels, while the second half of the
+            first half of the input channels, while the second half of the
-           filters is only connected to the second half of the input channels.
+            filters is only connected to the second half of the input channels.
-           Default: groups=1
+            Default: groups=1
-       param_attr(ParamAttr): The parameters to the Conv2d_transpose Layer.
+        param_attr(ParamAttr): The parameters to the Conv2d_transpose Layer.
-                              Default: None
+                               Default: None
-       bias_attr(ParamAttr): Bias parameter for the Conv2d layer. Default: None
+        bias_attr(ParamAttr): Bias parameter for the Conv2d layer. Default: None
-       use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
+        use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
-           library is installed. Default: True
+            library is installed. Default: True
-       act(str): Activation type. Default: None
+        act(str): Activation type. Default: None
-       name(str|None): A name for this layer(optional). If set None, the layer
+        name(str|None): A name for this layer(optional). If set None, the layer
-           will be named automatically.
+            will be named automatically.
    Returns:
-       Variable: The tensor variable storing the convolution transpose result.
+        Variable: The tensor variable storing the convolution transpose result.
    Raises:
-       ValueError: If the shapes of input, filter_size, stride, padding and
+        ValueError: If the shapes of input, filter_size, stride, padding and
-                   groups mismatch.
+                    groups mismatch.
    Examples:
       .. code-block:: python
@@ -1972,6 +2025,17 @@ def sequence_expand(x, y, ref_level=-1, name=None):
 def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0):
    '''
    This function implements the beam search algorithm.
+    Args:
+        pre_ids (Variable): ${pre_ids_comment}
+        ids (Variable): ${ids_comment}
+        scores (Variable): ${scores_comment}
+        beam_size (int): ${beam_size_comment}
+        end_id (int): ${end_id_comment}
+        level (int): ${level_comment}
+    Returns:
+        tuple: a tuple of beam_search output variables: selected_ids, selected_scores
    '''
    helper = LayerHelper('beam_search', **locals())
    score_type = scores.dtype
@@ -2474,14 +2538,14 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None):
    slice along dimension `axis`.
    Args:
-       x(Variable|list): The input tensor to l2_normalize layer.
+        x(Variable|list): The input tensor to l2_normalize layer.
-       axis(int): The axis on which to apply normalization. If `axis < 0`,
+        axis(int): The axis on which to apply normalization. If `axis < 0`,
-           the dimension to normalization is rank(X) + axis. -1 is the
+            the dimension to normalization is rank(X) + axis. -1 is the
-           last dimension.
+            last dimension.
-       epsilon(float): The epsilon value is used to avoid division by zero,
+        epsilon(float): The epsilon value is used to avoid division by zero,
-           the defalut value is 1e-10.
+            the default value is 1e-12.
-       name(str|None): A name for this layer(optional). If set None, the layer
+        name(str|None): A name for this layer(optional). If set None, the layer
-           will be named automatically.
+            will be named automatically.
    Returns:
@@ -2694,16 +2758,13 @@ def edit_distance(input, label, normalized=True, ignored_tokens=None,
    the edit distance will be divided by the length of reference string.
    Args:
        input(Variable): The indices for hypothesis strings.
        label(Variable): The indices for reference strings.
        normalized(bool): Indicated whether to normalize the edit distance by
                          the length of reference string.
        ignored_tokens(list of int): Tokens that should be removed before
                                     calculating edit distance.
+        name (str): The name of this layer. It is optional.
    Returns:
        Variable: sequence-to-sequence edit distance in shape [batch_size, 1].
@@ -2793,10 +2854,10 @@ def ctc_greedy_decoder(input, blank, name=None):
                         where Lp is the sum of all input sequences' length and
                         num_classes is the true number of classes. (not
                         including the blank label).
        blank(int): the blank label index of Connectionist Temporal
                    Classification (CTC) loss, which is in thehalf-opened
                    interval [0, num_classes + 1).
+        name (str): The name of this layer. It is optional.
    Returns:
        Variable: CTC greedy decode result. If all the sequences in result were
@@ -2833,23 +2894,23 @@ def warpctc(input, label, blank=0, norm_by_times=False):
    input tensor.
    Args:
-       input(Variable): (LodTensor, default: LoDTensor<float>),
+        input(Variable): (LodTensor, default: LoDTensor<float>),
-         the unscaled probabilities of variable-length sequences,
+            the unscaled probabilities of variable-length sequences,
-         which is a 2-D Tensor with LoD information.
+            which is a 2-D Tensor with LoD information.
-         It's shape is [Lp, num_classes + 1], where Lp is the sum of all input
+            Its shape is [Lp, num_classes + 1], where Lp is the sum of all input
-         sequences' length and num_classes is the true number of classes.
+            sequences' length and num_classes is the true number of classes.
-         (not including the blank label).
+            (not including the blank label).
-       label(Variable): (LodTensor, default: LoDTensor<int>), the ground truth
+        label(Variable): (LodTensor, default: LoDTensor<int>), the ground truth
-         of variable-length sequence, which is a 2-D Tensor with LoD
+            of variable-length sequence, which is a 2-D Tensor with LoD
-         information. It is of the shape [Lg, 1], where Lg is th sum of
+            information. It is of the shape [Lg, 1], where Lg is the sum of
-         all labels' length.
+            all labels' length.
-       blank: (int, default: 0), the blank label index of Connectionist
+        blank (int): default 0, the blank label index of Connectionist
-         Temporal Classification (CTC) loss, which is in the
+            Temporal Classification (CTC) loss, which is in the
-         half-opened interval [0, num_classes + 1).
+            half-opened interval [0, num_classes + 1).
-       norm_by_times: (bool, default: false), whether to normalize
+        norm_by_times (bool): default false, whether to normalize
-       the gradients by the number of time-step, which is also the
+            the gradients by the number of time-step, which is also the
-       sequence's length. There is no need to normalize the gradients
+            sequence's length. There is no need to normalize the gradients
-       if warpctc layer was follewed by a mean_op.
+            if warpctc layer was followed by a mean_op.
    Returns:
        Variable: The Connectionist Temporal Classification (CTC) loss,
@@ -2908,9 +2969,9 @@ def sequence_reshape(input, new_dim):
    no remainder for each sequence.
    Args:
-       input (Variable): (LodTensor, default: LoDTensor<float>), a 2-D LoDTensor
+        input (Variable): (LodTensor, default: LoDTensor<float>), a 2-D LoDTensor
-                with shape being [N, M] where M for dimension.
+            with shape being [N, M] where M for dimension.
-       new_dim (int): New dimension which the input LoDTensor is reshaped to.
+        new_dim (int): New dimension which the input LoDTensor is reshaped to.
    Returns:
        Variable: Reshaped LoDTensor according to new dimension.
@@ -2932,7 +2993,10 @@ def sequence_reshape(input, new_dim):
    return out
-@autodoc()
+# FIXME(wuyi): let docstring_checker.py understand @autodoc.
+# For now, the comments in c++ use types like Tensor, but in python side
+# the type is often "Variable", and arguments may vary.
+@templatedoc(op_type="nce")
 def nce(input,
        label,
        num_total_classes,
@@ -2940,6 +3004,21 @@ def nce(input,
        param_attr=None,
        bias_attr=None,
        num_neg_samples=None):
+    """
+    ${comment}
+    Args:
+        input (Variable): input variable.
+        label (Variable): label.
+        num_total_classes (int): ${num_total_classes_comment}
+        sample_weight (int): ${sample_weight_comment}
+        param_attr (ParamAttr|None): attributes for parameter
+        bias_attr (ParamAttr|None): attributes for bias
+        num_neg_samples (int): ${num_neg_samples_comment}
+    Returns:
+        Variable: output of nce layer.
+    """
    helper = LayerHelper('nce', **locals())
    assert isinstance(input, Variable)
    dim = input.shape[1]
@@ -2997,8 +3076,9 @@ def transpose(x, perm, name=None):
    perm[i]-th dimension of `input`.
    Args:
-       input (Variable): (Tensor), A Tensor.
+        x (Variable): The input Tensor.
-       perm (list): A permutation of the dimensions of `input`.
+        perm (list): A permutation of the dimensions of `x`.
+        name (str): The name of this layer. It is optional.
    Returns:
        Variable: A transposed Tensor.
@@ -3231,9 +3311,9 @@ def multiplex(inputs, index):
    row of the matrix, then `O[i]` is equal to :math:`I_{ID[i]}[i]`.
    Args:
-       inputs (list): A list of variables to gather from. All variables have the
+        inputs (list): A list of variables to gather from. All variables have the
                same shape and the rank is at least 2.
-       index (Variable): Tensor<int32>, index variable which is a 2-D tensor
+        index (Variable): Tensor<int32>, index variable which is a 2-D tensor
                with shape [M, 1] where M is the batch size.
    Returns:
@@ -3432,7 +3512,8 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1):
        begin(int): The first value of this counter.
        step(int): The increment step between each execution.
-    Returns(Variable): The global run counter.
+    Returns:
+        Variable: The global run counter.
    """
    helper = LayerHelper('global_step_counter')
    if counter_name is None:
@@ -3493,7 +3574,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
    the corresponding dimension of x.
    Args:
-        input(variable): The input tensor.
+        x(variable): The input tensor.
        shape(list): The new shape. At most one dimension of the new shape can
                     be -1.
        actual_shape(variable): An optional input. If provided, reshape
@@ -3505,8 +3586,10 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
        inplace(bool): If this flag is set true, a new output tensor is created
                       whose data is copied from input x, otherwise the output
                       shares data with input without copying.
+        name (str): The name of this layer. It is optional.
-    Returns(variable): The output tensor.
+    Returns:
+        Variable: The output tensor.
    Examples:
        .. code-block:: python
@@ -4027,7 +4110,6 @@ def resize_bilinear(input, out_shape=None, scale=None, name=None):
        name(str|None): The output variable name.
    Returns:
        ${out_comment}.
    """
@@ -4046,6 +4128,7 @@ def image_resize_short(input, out_short_len, resample='BILINEAR'):
                          This is a 4-D tensor of the shape
                          (num_batches, channels, in_h, in_w).
        out_short_len(int): The length of output images' short edge.
+        resample (str): resample method, default: BILINEAR.
    Returns:
        out (Variable): The output is a 4-D tensor of the shape
@@ -4100,6 +4183,7 @@ def gather(input, index):
        output (Variable): The output is a tensor with the same rank as input.
    Examples:
        .. code-block:: python
            output = fluid.layers.gather(x, index)
@@ -4164,3 +4248,53 @@ def random_crop(x, shape, seed=None):
                 "SeedOut": seed_out},
        attrs={"shape": shape})
    return out
+def mean_iou(input, label, num_classes):
+    """
+    Mean Intersection-Over-Union is a common evaluation metric for
+    semantic image segmentation, which first computes the IOU for each
+    semantic class and then computes the average over classes.
+    IOU is defined as follows:
+    .. math::
+        IOU = true_positive / (true_positive + false_positive + false_negative).
+    The predictions are accumulated in a confusion matrix and mean-IOU
+    is then calculated from it.
+    Args:
+        input (Variable): A Tensor of prediction results for semantic labels with type int32 or int64.
+        label (Variable): A Tensor of ground truth labels with type int32 or int64.
+                          Its shape should be the same as input.
+        num_classes (int): The number of classes. It is forwarded to the op as
+                           the `num_classes` attribute and determines the
+                           length of out_wrong and out_correct.
+    Returns:
+        mean_iou (Variable): A Tensor representing the mean intersection-over-union with shape [1].
+        out_wrong (Variable): A Tensor with shape [num_classes]. The wrong numbers of each class.
+        out_correct (Variable): A Tensor with shape [num_classes]. The correct numbers of each class.
+    Examples:
+        .. code-block:: python
+            iou, wrongs, corrects = fluid.layers.mean_iou(predict, label, num_classes)
+    """
+    helper = LayerHelper('mean_iou', **locals())
+    # NOTE(review): the returned dtype is not used below; the call is kept
+    # as-is in case input_dtype() validates the inputs -- confirm.
+    dtype = helper.input_dtype()
+    out_mean_iou = helper.create_tmp_variable(dtype='float32')
+    out_wrong = helper.create_tmp_variable(dtype='int32')
+    out_correct = helper.create_tmp_variable(dtype='int32')
+    # Slot names use the capitalized spelling that the mean_iou operator
+    # test feeds to the op ('Predictions', 'Labels', 'OutMeanIou',
+    # 'OutWrong', 'OutCorrect'); the lower-case names would not match
+    # the operator's inputs/outputs.
+    helper.append_op(
+        type="mean_iou",
+        inputs={"Predictions": input,
+                "Labels": label},
+        outputs={
+            "OutMeanIou": out_mean_iou,
+            "OutWrong": out_wrong,
+            "OutCorrect": out_correct
+        },
+        attrs={"num_classes": num_classes})
+    return out_mean_iou, out_wrong, out_correct
--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
@@ -96,10 +96,11 @@ def train(use_cuda, train_program, params_dirname):
    train_reader = paddle.batch(
        paddle.reader.shuffle(
            cifar10_small_test_set.train10(batch_size=10), buf_size=128 * 10),
-        batch_size=BATCH_SIZE)
+        batch_size=BATCH_SIZE,
+        drop_last=False)
    test_reader = paddle.batch(
-        paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE)
+        paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE, drop_last=False)
    def event_handler(event):
        if isinstance(event, fluid.EndStepEvent):

--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
@@ -73,10 +73,11 @@ def train(use_cuda, train_program, params_dirname):
    train_reader = paddle.batch(
        paddle.reader.shuffle(
            cifar10_small_test_set.train10(batch_size=10), buf_size=128 * 10),
-        batch_size=BATCH_SIZE)
+        batch_size=BATCH_SIZE,
+        drop_last=False)
    test_reader = paddle.batch(
-        paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE)
+        paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE, drop_last=False)
    def event_handler(event):
        if isinstance(event, fluid.EndStepEvent):

--- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py
+++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py
@@ -87,7 +87,9 @@ def train(use_cuda, train_program, params_dirname):
    def event_handler(event):
        if isinstance(event, fluid.EndEpochEvent):
            test_reader = paddle.batch(
-                paddle.dataset.imdb.test(word_dict), batch_size=BATCH_SIZE)
+                paddle.dataset.imdb.test(word_dict),
+                batch_size=BATCH_SIZE,
+                drop_last=False)
            avg_cost, acc = trainer.test(
                reader=test_reader, feed_order=['words', 'label'])
@@ -113,7 +115,8 @@ def train(use_cuda, train_program, params_dirname):
    train_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.imdb.train(word_dict), buf_size=25000),
-        batch_size=BATCH_SIZE)
+        batch_size=BATCH_SIZE,
+        drop_last=False)
    trainer.train(
        num_epochs=1,

--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
@@ -56,7 +56,7 @@ BATCH_SIZE = 200
 # fix the order of training data
 train_reader = paddle.batch(
-    paddle.dataset.uci_housing.train(), batch_size=BATCH_SIZE)
+    paddle.dataset.uci_housing.train(), batch_size=BATCH_SIZE, drop_last=False)
 # train_reader = paddle.batch(
 #     paddle.reader.shuffle(

--- a/python/paddle/fluid/tests/unittests/test_mean_iou.py
+++ b/python/paddle/fluid/tests/unittests/test_mean_iou.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import division
+import unittest
+import numpy as np
+from op_test import OpTest
+def compute_mean_iou(predictions, labels, num_classes, in_wrongs, in_corrects,
+                     in_mean_ious):
+    # NumPy reference implementation used to build the expected outputs of
+    # the mean_iou operator test.
+    #
+    # predictions / labels: arrays of class indices with identical shapes.
+    # in_wrongs / in_corrects / in_mean_ious: lists of (name, array) pairs;
+    # only the array part is used and it is accumulated into the result.
+    # Returns the tuple (mean_iou, out_wrong, out_correct).
+    assert predictions.shape == labels.shape
+    predictions = predictions.flatten()
+    labels = labels.flatten()
+    # Start the per-class counters from the sum of the carried-in counts.
+    out_wrong = np.zeros([num_classes]).astype("int32")
+    for _, wrong in in_wrongs:
+        out_wrong += wrong
+    out_correct = np.zeros([num_classes]).astype("int32")
+    for _, correct in in_corrects:
+        out_correct += correct
+    # A match counts as correct for that class; a mismatch counts as wrong
+    # for both the predicted class and the labelled class.
+    for pred, label in zip(predictions, labels):
+        if pred == label:
+            out_correct[pred] += 1
+        else:
+            out_wrong[pred] += 1
+            out_wrong[label] += 1
+    denominator = out_wrong + out_correct
+    # Classes that never occurred are excluded from the average ...
+    valid_count = (denominator != 0).sum()
+    # ... and their zero denominator is replaced by 1 to avoid dividing by 0.
+    denominator = np.where(denominator > 0, denominator,
+                           np.ones(denominator.shape))
+    mean_iou = (out_correct / denominator).sum() / valid_count
+    # Carried-in mean IOU values are simply added on top.
+    for _, in_mean_iou in in_mean_ious:
+        mean_iou += in_mean_iou
+    return mean_iou, out_wrong, out_correct
+class TestMeanIOUOp(OpTest):
+    # Output check for the "mean_iou" operator: random predictions/labels
+    # (plus optional carried-in counters) are fed to the op and compared
+    # against compute_mean_iou(). Subclasses override config() to vary sizes.
+    def setUp(self):
+        self.config()
+        self.op_type = "mean_iou"
+        # Random class indices in [0, num_classes) for every pixel.
+        predictions = np.random.randint(0, self.num_classes,
+                                        self.image_size).astype("int32")
+        labels = np.random.randint(0, self.num_classes,
+                                   self.image_size).astype("int32")
+        # Optional extra inputs, each given as a (name, array) pair.
+        in_wrongs = []
+        for i in range(self.in_wrong_num):
+            in_wrongs.append(("in_wrong_%d" % i, np.random.randint(
+                0, 10, [self.num_classes]).astype("int32")))
+        in_corrects = []
+        for i in range(self.in_correct_num):
+            in_corrects.append(("in_correct_%d" % i, np.random.randint(
+                0, 10, [self.num_classes]).astype("int32")))
+        in_mean_ious = []
+        for i in range(self.in_mean_iou_num):
+            in_mean_ious.append(("in_mean_iou_%d" % i, np.random.uniform(
+                0, 1, [1]).astype("float32")))
+        self.inputs = {
+            'Predictions': predictions,
+            'Labels': labels,
+            'InWrongs': in_wrongs,
+            'InCorrects': in_corrects,
+            'InMeanIou': in_mean_ious
+        }
+        # NOTE(review): `long` exists only in Python 2; this line would raise
+        # NameError under Python 3 -- confirm the supported interpreter.
+        self.attrs = {'num_classes': long(self.num_classes)}
+        # Expected values come from the NumPy reference implementation.
+        mean_iou, out_wrong, out_correct = compute_mean_iou(
+            predictions, labels, self.num_classes, in_wrongs, in_corrects,
+            in_mean_ious)
+        self.outputs = {
+            'OutMeanIou': mean_iou,
+            'OutWrong': out_wrong,
+            'OutCorrect': out_correct
+        }
+    def config(self):
+        # Base case: 10 classes, 128x128 image, no carried-in inputs.
+        self.num_classes = 10
+        self.image_size = [128, 128]
+        self.in_wrong_num = 0
+        self.in_correct_num = 0
+        self.in_mean_iou_num = 0
+    def test_check_output(self):
+        self.check_output()
+class TestCase1(TestMeanIOUOp):
+    # Variant with a non-square image and two carried-in tensors for each of
+    # InWrongs, InCorrects and InMeanIou, so the accumulation path is covered.
+    def config(self):
+        self.num_classes = 5
+        self.image_size = [100, 128]
+        self.in_wrong_num = 2
+        self.in_correct_num = 2
+        self.in_mean_iou_num = 2
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_merge_ids_op.py
+++ b/python/paddle/fluid/tests/unittests/test_merge_ids_op.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import numpy as np
+from op_test import OpTest
+class TestMergeIdsOp(OpTest):
+    # Output check for the "merge_ids" operator with one fixed example:
+    # three row blocks (x0, x1, x2) are merged back into a single tensor
+    # whose rows follow the order of `ids`.
+    def setUp(self):
+        self.op_type = "merge_ids"
+        ids = np.array([[0], [2], [2], [3], [5], [5], [6]]).astype('int64')
+        # The expected output below is consistent with ids being sharded by
+        # id % 3: x0 holds rows for ids 0, 3, 6; x1 is empty (no id has
+        # remainder 1); x2 holds rows for ids 2, 2, 5, 5.
+        # NOTE(review): sharding rule inferred from the data -- confirm
+        # against the operator implementation.
+        x0 = np.array([[0.1, 0.2], [0.2, 0.3], [0.3, 0.4]]).astype('float32')
+        x1 = np.array([]).astype('float32')
+        x2 = np.array([[0.4, 0.5], [0.4, 0.5], [0.5, 0.6],
+                       [0.5, 0.6]]).astype('float32')
+        out = np.array([[0.1, 0.2], [0.4, 0.5], [0.4, 0.5], [0.2, 0.3],
+                        [0.5, 0.6], [0.5, 0.6], [0.3, 0.4]]).astype('float32')
+        self.inputs = {'Ids': ids, "X": [('x0', x0), ('x1', x1), ('x2', x2)]}
+        self.outputs = {'Out': out}
+    def test_check_output(self):
+        self.check_output()
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -515,35 +515,38 @@ class DistributeTranspiler:
                                       grad_to_block_id, None)
        # process distributed lookup_table
-        prefetch_block = None
+        prefetch_var_name_to_block_id = []
        if self.has_distributed_lookup_table:
            pserver_index = self.pserver_endpoints.index(endpoint)
            table_opt_block = self._create_table_optimize_block(
                pserver_index, pserver_program, pre_block_idx, grad_to_block_id)
-            prefetch_block = self._create_prefetch_block(
+            prefetch_var_name_to_block_id = self._create_prefetch_block(
                pserver_index, pserver_program, table_opt_block)
        # NOTE: if has_distributed_lookup_table is False, then prefetch_block will
        # not be executed, so it's safe to use optimize_block to hold the place
        if self.has_distributed_lookup_table:
-            assert prefetch_block is not None
+            assert len(prefetch_var_name_to_block_id) > 0
        else:
-            assert prefetch_block is None
+            assert len(prefetch_var_name_to_block_id) == 0
-            prefetch_block = pserver_program.global_block()
+        attrs = {
+            "OptimizeBlock": pserver_program.block(1),
+            "endpoint": endpoint,
+            "Fanin": self.trainer_num,
+            "sync_mode": self.sync_mode,
+            "grad_to_block_id": grad_to_block_id
+        }
+        if len(prefetch_var_name_to_block_id) > 0:
+            attrs['prefetch_var_name_to_block_id'] \
+                = prefetch_var_name_to_block_id
        # step5 append the listen_and_serv op
        pserver_program.global_block().append_op(
            type="listen_and_serv",
            inputs={'X': recv_inputs},
            outputs={},
-            attrs={
+            attrs=attrs)
-                "OptimizeBlock": pserver_program.block(1),
-                "endpoint": endpoint,
-                "Fanin": self.trainer_num,
-                "PrefetchBlock": prefetch_block,
-                "sync_mode": self.sync_mode,
-                "grad_to_block_id": grad_to_block_id
-            })
        pserver_program.sync_with_cpp()
        return pserver_program
@@ -608,8 +611,15 @@ class DistributeTranspiler:
    def _replace_lookup_table_op_with_prefetch(self, program,
                                               pserver_endpoints):
        # 1. replace lookup_table_op with split_ids_op -> prefetch_op -> sum_op
-        self.prefetch_input_vars = None
+        # self.all_prefetch_input_vars =
-        self.prefetch_output_vars = None
+        #       [[var0_prefetch_in_pserver0, var0_prefetch_in_pserver1]
+        #        [var1_prefetch_in_pserver0, var1_prefetch_in_pserver1]]
+        self.all_prefetch_input_vars = []
+        # self.all_prefetch_output_vars =
+        #       [[var0_prefetch_out_pserver0, var0_prefetch_out_pserver1]
+        #        [var1_prefetch_out_pserver0, var1_prefetch_out_pserver1]]
+        self.all_prefetch_output_vars = []
        continue_search_lookup_table_op = True
        while continue_search_lookup_table_op:
@@ -619,26 +629,27 @@ class DistributeTranspiler:
                if op.type == LOOKUP_TABLE_TYPE:
                    continue_search_lookup_table_op = True
-                    op_index = list(all_ops).index(op)
+                    lookup_table_op_index = list(all_ops).index(op)
                    ids_name = op.input("Ids")
                    out_name = op.output("Out")
-                    if self.prefetch_input_vars is None:
+                    ids_var = program.global_block().vars[ids_name[0]]
-                        ids_var = program.global_block().vars[ids_name[0]]
+                    prefetch_input_vars = self.create_splited_vars(
-                        self.prefetch_input_vars = self.create_splited_vars(
+                        source_var=ids_var,
-                            source_var=ids_var,
+                        block=program.global_block(),
-                            block=program.global_block(),
+                        tag="_prefetch_in_")
-                            tag="_prefetch_in_")
+                    self.all_prefetch_input_vars.append(prefetch_input_vars)
-                    if self.prefetch_output_vars is None:
-                        out_var = program.global_block().vars[out_name[0]]
+                    out_var = program.global_block().vars[out_name[0]]
-                        self.prefetch_output_vars = self.create_splited_vars(
+                    prefetch_output_vars = self.create_splited_vars(
-                            source_var=out_var,
+                        source_var=out_var,
-                            block=program.global_block(),
+                        block=program.global_block(),
-                            tag="_prefetch_out_")
+                        tag="_prefetch_out_")
+                    self.all_prefetch_output_vars.append(prefetch_output_vars)
                    # insert split_ids_op
                    program.global_block().insert_op(
-                        index=op_index,
+                        index=lookup_table_op_index,
                        type="split_ids",
                        inputs={
                            'Ids': [
@@ -646,14 +657,14 @@ class DistributeTranspiler:
                                for varname in ids_name
                            ]
                        },
-                        outputs={"Out": self.prefetch_input_vars})
+                        outputs={"Out": prefetch_input_vars})
                    # insert prefetch_op
                    program.global_block().insert_op(
-                        index=op_index + 1,
+                        index=lookup_table_op_index + 1,
                        type="prefetch",
-                        inputs={'X': self.prefetch_input_vars},
+                        inputs={'X': prefetch_input_vars},
-                        outputs={"Out": self.prefetch_output_vars},
+                        outputs={"Out": prefetch_output_vars},
                        attrs={
                            "epmap": pserver_endpoints,
                            RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
@@ -661,16 +672,21 @@ class DistributeTranspiler:
                    # insert concat_op
                    program.global_block().insert_op(
-                        index=op_index + 2,
+                        index=lookup_table_op_index + 2,
-                        type="concat",
+                        type="merge_ids",
-                        inputs={'X': self.prefetch_output_vars},
+                        inputs={
+                            'Ids': [
+                                program.global_block().vars[varname]
+                                for varname in ids_name
+                            ],
+                            'X': prefetch_output_vars
+                        },
                        outputs={
                            "Out": [
                                program.global_block().vars[varname]
                                for varname in out_name
                            ]
-                        },
+                        })
-                        attrs={"axis": 0})
                    # delete lookup_table_op
                    delete_ops(program.global_block(), [op])
@@ -709,30 +725,34 @@ class DistributeTranspiler:
                               optimize_block):
        # STEP: create prefetch block
        table_var = pserver_program.global_block().vars[self.table_name]
-        prefetch_block = pserver_program.create_block(optimize_block.idx)
+        prefetch_var_name_to_block_id = []
-        trainer_ids = self.prefetch_input_vars[pserver_index]
+        for index in range(len(self.all_prefetch_input_vars)):
-        pserver_ids = pserver_program.global_block().create_var(
+            prefetch_block = pserver_program.create_block(optimize_block.idx)
-            name=trainer_ids.name,
+            trainer_ids = self.all_prefetch_input_vars[index][pserver_index]
-            type=trainer_ids.type,
+            pserver_ids = pserver_program.global_block().create_var(
-            shape=trainer_ids.shape,
+                name=trainer_ids.name,
-            dtype=trainer_ids.dtype)
+                type=trainer_ids.type,
-        trainer_out = self.prefetch_output_vars[pserver_index]
+                shape=trainer_ids.shape,
-        pserver_out = pserver_program.global_block().create_var(
+                dtype=trainer_ids.dtype)
-            name=trainer_out.name,
+            trainer_out = self.all_prefetch_output_vars[index][pserver_index]
-            type=trainer_out.type,
+            pserver_out = pserver_program.global_block().create_var(
-            shape=trainer_out.shape,
+                name=trainer_out.name,
-            dtype=trainer_out.dtype)
+                type=trainer_out.type,
-        prefetch_block.append_op(
+                shape=trainer_out.shape,
-            type="lookup_sparse_table",
+                dtype=trainer_out.dtype)
-            inputs={'Ids': pserver_ids,
+            prefetch_block.append_op(
-                    "W": table_var},
+                type="lookup_sparse_table",
-            outputs={"Out": pserver_out},
+                inputs={'Ids': pserver_ids,
-            attrs={
+                        "W": table_var},
-                "is_sparse": True,  # has no effect on lookup_table op
+                outputs={"Out": pserver_out},
-                "is_distributed": True,
+                attrs={
-                "padding_idx": -1
+                    "is_sparse": True,  # has no effect on lookup_table op
-            })
+                    "is_distributed": True,
-        return prefetch_block
+                    "padding_idx": -1
+                })
+            prefetch_var_name_to_block_id.append(trainer_ids.name + ":" + str(
+                prefetch_block.idx))
+        return prefetch_var_name_to_block_id
    def _create_table_optimize_block(self, pserver_index, pserver_program,
                                     pre_block_idx, grad_to_block_id):

--- a/python/paddle/trainer_config_helpers/attrs.py
+++ b/python/paddle/trainer_config_helpers/attrs.py
@@ -240,14 +240,15 @@ class ExtraLayerAttribute(object):
    :type error_clipping_threshold: float
    :param drop_rate: Dropout rate. Dropout will create a mask on layer output.
                      The dropout rate is the zero rate of this mask. The
-                      details of what dropout is please refer to `here
+                      details of what dropout is please refer to `JMLRdropout
-                      <https://www.cs.toronto.edu/~hinton/absps/
+                      <https://www.cs.toronto.edu/~hinton/absps/JMLRdropout.pdf
-                      JMLRdropout.pdf>`_.
+                      >`_.
    :type drop_rate: float
    :param device: device ID of layer. device=-1, use CPU. device>=0, use GPU.
-                   The details allocation in parallel_nn please refer to `here
+                   The details allocation in parallel_nn please refer to `use_case
-                   <http://www.paddlepaddle.org/doc/ui/cmd_argument/
+                   <https://github.com/PaddlePaddle/Paddle/blob/develop/doc/v2
-                   use_case.html#case-2-specify-layers-in-different-devices>`_.
+                   /howto/cmd_parameter/use_case_en.md#case-2-specify-layers-in
+                   -different-devices>`_.
    :type device: int
    """

--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -2556,7 +2556,7 @@ def img_conv_layer(input,
    the output will be obtained by concatenating the two results.
    The details of grouped convolution, please refer to:
-    `ImageNet Classification with Deep Convolutional Neural Networks
+    `ImageNet Classification With Deep Convolutional Neural Networks
    <http://www.cs.toronto.edu/~kriz/imagenet_classification_with_deep_convolutional.pdf>`_
    The example usage is:
@@ -5678,8 +5678,8 @@ def warp_ctc_layer(input,
    <https://github.com/baidu-research/warp-ctc>`_ library, which is used in
    `Deep Speech 2: End-toEnd Speech Recognition in English and Mandarin
    <https://arxiv.org/pdf/1512.02595v1.pdf>`_, to compute Connectionist Temporal
-    Classification (CTC) loss. Besides, another `warp-ctc
+    Classification (CTC) loss. Besides, another `warp-ctc repository
-    <https://github.com/gangliao/warp-ctc>`_ repository, which is forked from
+    <https://github.com/gangliao/warp-ctc>`_, which is forked from
    the official one, is maintained to enable more compiling options. During the
    building process, PaddlePaddle will clone the source codes, build and
    install it to :code:`third_party/install/warpctc` directory.

--- a/python/paddle/v2/minibatch.py
+++ b/python/paddle/v2/minibatch.py
@@ -15,7 +15,7 @@
 __all__ = ['batch']
-def batch(reader, batch_size, drop_last=False):
+def batch(reader, batch_size, drop_last=True):
    """
    Create a batched reader.

--- a/tools/codestyle/docstring_checker.py
+++ b/tools/codestyle/docstring_checker.py
@@ -126,9 +126,10 @@ class DocstringChecker(BaseChecker):
        'W9002':
        ('Doc string does not end with "." period', symbol + "-end-with",
         'Used when a doc string does not end with a period'),
-        'W9003': ('All args with their types must be mentioned in doc string',
+        'W9003':
-                  symbol + "-with-all-args",
+        ('All args with their types must be mentioned in doc string %s',
-                  'Used when not all arguments are in the doc string '),
+         symbol + "-with-all-args",
+         'Used when not all arguments are in the doc string '),
        'W9005': ('Missing docstring or docstring is too short',
                  symbol + "-missing", 'Add docstring longer >=10'),
        'W9006': ('Docstring indent error, use 4 space for indent',
@@ -178,6 +179,8 @@ class DocstringChecker(BaseChecker):
        self.indent_style(node)
    def missing_doc_string(self, node):
+        if node.name.startswith("_"):  # one test covers both "_x" and "__x" names
+            return True
        if node.tolineno - node.fromlineno <= 10:
            return True
@@ -199,12 +202,16 @@ class DocstringChecker(BaseChecker):
        doc = node.doc
        lines = doc.splitlines()
+        line_num = 0  # 1-based index of the docstring line being examined
        for l in lines:
+            line_num += 1  # advance before the guard so only line 1 is skipped
+            if line_num == 1:  # the first docstring line is exempt from the check
+                continue
            cur_indent = len(l) - len(l.lstrip())
            if cur_indent % indent != 0:
                self.add_message('W9006', node=node, line=node.fromlineno)
                return False
        return True
@@ -320,15 +327,19 @@ class DocstringChecker(BaseChecker):
            return True
        parsed_args = doc.args
+        args_not_documented = set(args) - set(parsed_args)
        if len(args) > 0 and len(parsed_args) <= 0:
-            print "debug:parsed args: ", parsed_args
+            self.add_message(
-            self.add_message('W9003', node=node, line=node.fromlineno)
+                'W9003',
+                node=node,
+                line=node.fromlineno,
+                args=list(args_not_documented))
            return False
        for t in args:
            if t not in parsed_args:
-                print t, " with (type) not in ", parsed_args
+                self.add_message(
-                self.add_message('W9003', node=node, line=node.fromlineno)
+                    'W9003', node=node, line=node.fromlineno, args=[t, ])
                return False
        return True
--- a/tools/codestyle/pylint_pre_commit.hook
+++ b/tools/codestyle/pylint_pre_commit.hook
@@ -7,13 +7,13 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 export PYTHONPATH=$DIR:$PYTHONPATH
 # The trick to remove deleted files: https://stackoverflow.com/a/2413151
-for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}'); do
+for file in $(git diff --name-status | awk '$1 != "D" {print $2}'); do
    pylint --disable=all --load-plugins=docstring_checker \
    --enable=doc-string-one-line,doc-string-end-with,doc-string-with-all-args,doc-string-triple-quotes,doc-string-missing,doc-string-indent-error,doc-string-with-returns,doc-string-with-raises $file;
    TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?);
 done
-#exit $TOTAL_ERRORS
+exit $TOTAL_ERRORS
 #For now, just warning:
-exit 0
+#exit 0