Merge branch 'develop' of upstream into argsort_dev

7ca511e0 · Yibing Liu · 92cfa2be · 16a0f746 · 7ca511e0 · 7ca511e0
192 changed file
--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -22,6 +22,7 @@
 | jczaja | Jacek Czaja |
 | JiayiFeng | Jia-Yi Feng |
 | kbinias | Krzysztof Binias |
+| kexinzhao | Ke-Xin Zhao |
 | kuke | Yi-Bing Liu |
 | lcy-seso | Ying Cao |
 | lipeng-unisound | Peng Li |

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -61,6 +61,7 @@ option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen"            OFF)
 option(WITH_ARM_FP16    "Use half precision support on armv8.2-a cpu"   OFF)
 option(WITH_FAST_BUNDLE_TEST    "Bundle tests that can be run in a single process together to reduce launch overhead"   OFF)
 option(WITH_CONTRIB     "Compile the third-party contributation"        OFF)
+option(WITH_ANAKIN      "Compile with Anakin library"                   OFF)
 option(WITH_GRPC     "Use grpc as the default rpc framework"            ${WITH_DISTRIBUTE})

 # CMAKE_BUILD_TYPE
@@ -193,7 +194,10 @@ set(EXTERNAL_LIBS
 if(WITH_GPU)
    include(cuda)
    include(tensorrt)
-endif(WITH_GPU)
+    include(external/anakin)
+else()
+  set(WITH_ANAKIN OFF CACHE BOOL "Anakin is valid only when GPU is set." FORCE)  # BOOL: keep the type declared by option(WITH_ANAKIN)
+endif()

 if(WITH_AMD_GPU)
    find_package(HIP)

--- a/benchmark/fluid/models/machine_translation.py
+++ b/benchmark/fluid/models/machine_translation.py
@@ -173,21 +173,6 @@ def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim,
        return avg_cost, feeding_list


-def to_lodtensor(data, place):
-    seq_lens = [len(seq) for seq in data]
-    cur_len = 0
-    lod = [cur_len]
-    for l in seq_lens:
-        cur_len += l
-        lod.append(cur_len)
-    flattened_data = np.concatenate(data, axis=0).astype("int64")
-    flattened_data = flattened_data.reshape([len(flattened_data), 1])
-    lod_t = core.LoDTensor()
-    lod_t.set(flattened_data, place)
-    lod_t.set_lod([lod])
-    return lod_t, lod[-1]
-
-
 def lodtensor_to_ndarray(lod_tensor):
    dims = lod_tensor.get_dims()
    ndarray = np.zeros(shape=dims).astype('float32')

--- a/benchmark/fluid/models/stacked_dynamic_lstm.py
+++ b/benchmark/fluid/models/stacked_dynamic_lstm.py
@@ -125,18 +125,3 @@ def get_model(args):
        batch_size=args.batch_size)

    return loss, inference_program, adam, train_reader, test_reader, batch_acc
-
-
-def to_lodtensor(data, place):
-    seq_lens = [len(seq) for seq in data]
-    cur_len = 0
-    lod = [cur_len]
-    for l in seq_lens:
-        cur_len += l
-        lod.append(cur_len)
-    flattened_data = numpy.concatenate(data, axis=0).astype("int64")
-    flattened_data = flattened_data.reshape([len(flattened_data), 1])
-    res = fluid.LoDTensor()
-    res.set(flattened_data, place)
-    res.set_lod([lod])
-    return res
--- a/cmake/external/anakin.cmake
+++ b/cmake/external/anakin.cmake
+# Anakin is optional: leave this file immediately unless WITH_ANAKIN is enabled.
+if (NOT WITH_ANAKIN)
+  return()
+endif()
+
+# Release archive that is fetched further below.
+set(ANAKIN_LIBRARY_URL "https://github.com/pangge/Anakin/releases/download/3.0/anakin_release_simple.tar.gz")
+
+# Install root (always forced), plus header/library locations that default to
+# that root but are left as plain cache entries.
+set(ANAKIN_INSTALL_DIR "${THIRD_PARTY_PATH}/install/anakin"
+  CACHE PATH "Anakin install path." FORCE)
+set(ANAKIN_INCLUDE "${ANAKIN_INSTALL_DIR}"
+  CACHE STRING "root of Anakin header files")
+set(ANAKIN_LIBRARY "${ANAKIN_INSTALL_DIR}"
+  CACHE STRING "path of Anakin library")
+
+# Extra compiler flags collected for Anakin (mostly -Wno-error=... relaxations).
+# They are only stored here; nothing in this file applies them.
+set(ANAKIN_COMPILE_EXTRA_FLAGS
+  -Wno-error=unused-variable
+  -Wno-error=format-extra-args
+  -Wno-error=comment
+  -Wno-error=format
+  -Wno-error=switch
+  -Wno-error=return-type
+  -Wno-error=non-virtual-dtor
+  -Wno-reorder
+  -Wno-error=cpp)
+
+# fetch_include_recursively(<root_dir>)
+#
+# Adds <root_dir> and every directory nested beneath it to the directory-scoped
+# include path through include_directories(). Anakin currently needs this: to
+# use it, nearly all of its header directories have to be on the include path.
+#
+#   root_dir - directory to start from. If it is not an existing directory the
+#              function does nothing.
+function(fetch_include_recursively root_dir)
+    # Guard first so an empty or non-directory argument is never globbed.
+    if (NOT IS_DIRECTORY "${root_dir}")
+        return()
+    endif()
+    include_directories("${root_dir}")
+
+    # Every path expansion is quoted so that directories containing spaces or
+    # semicolons are passed on as a single argument.
+    file(GLOB ALL_SUB RELATIVE "${root_dir}" "${root_dir}/*")
+    foreach(sub IN LISTS ALL_SUB)
+        if (IS_DIRECTORY "${root_dir}/${sub}")
+            fetch_include_recursively("${root_dir}/${sub}")
+        endif()
+    endforeach()
+endfunction()
+
+# Download and unpack the Anakin release archive into ANAKIN_INSTALL_DIR.
+# This runs at configure time: the install directory is created if needed,
+# emptied, and then the archive is fetched and extracted in place.
+message(STATUS "Download Anakin library from ${ANAKIN_LIBRARY_URL}")
+
+# wget stores the file under the last path component of the URL, so derive the
+# archive name from the URL instead of repeating it by hand.
+get_filename_component(anakin_archive_name "${ANAKIN_LIBRARY_URL}" NAME)
+
+# One '&&' chain: a failing step (for example 'cd') stops the steps after it
+# from running in the wrong directory, and the chain's exit status reports the
+# first failure.
+execute_process(
+  COMMAND bash -c "mkdir -p \"${ANAKIN_INSTALL_DIR}\" && rm -rf \"${ANAKIN_INSTALL_DIR}\"/* && cd \"${ANAKIN_INSTALL_DIR}\" && wget -q \"${ANAKIN_LIBRARY_URL}\" && tar xzf \"${anakin_archive_name}\""
+  RESULT_VARIABLE anakin_fetch_status)
+
+# Fail at configure time rather than later with missing headers or libraries.
+if (NOT "${anakin_fetch_status}" STREQUAL "0")
+  message(FATAL_ERROR "Failed to download or extract Anakin from ${ANAKIN_LIBRARY_URL} into ${ANAKIN_INSTALL_DIR} (status: ${anakin_fetch_status})")
+endif()
+
+# Report the Anakin locations, then register the library directory for linking
+# and put the header tree on the include path.
+if (WITH_ANAKIN)
+    message(STATUS "Anakin for inference is enabled")
+    message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}")
+    link_directories(${ANAKIN_LIBRARY})
+    fetch_include_recursively(${ANAKIN_INCLUDE})
+endif()
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -29,6 +29,8 @@ IF(NOT ${CBLAS_FOUND})
        "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
        CACHE FILEPATH "openblas library." FORCE)

+    ADD_DEFINITIONS(-DPADDLE_USE_OPENBLAS)
+
    SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
    SET(OPENBLAS_COMMIT "v0.2.20")


--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -39,7 +39,7 @@ function(copy TARGET)
        message(FATAL_ERROR "${TARGET} source numbers are not equal to destination numbers")
    endif()
    math(EXPR len "${copy_lib_SRCS_len} - 1")
-    
+
    add_custom_target(${TARGET} DEPENDS ${copy_lib_DEPS})
    foreach(index RANGE ${len})
        list(GET copy_lib_SRCS ${index} src)
@@ -155,6 +155,15 @@ copy(inference_lib DEPS paddle_fluid_shared paddle_fluid
  DSTS ${dst_dir}/${module} ${dst_dir}/${module}
 )

+if(WITH_CONTRIB)
+   # Copy the contrib inference API header and the libpaddle_inference_api.* files.
+   set(contrib_dst_dir "${FLUID_INSTALL_DIR}/contrib/inference")
+   copy(contrib_inference_lib
+        DEPS paddle_inference_api
+        SRCS ${PADDLE_SOURCE_DIR}/paddle/contrib/inference/paddle_inference_api.h ${PADDLE_BINARY_DIR}/paddle/contrib/inference/libpaddle_inference_api.*
+        DSTS ${contrib_dst_dir} ${contrib_dst_dir})
+endif()
+
 set(module "platform")
 copy(platform_lib DEPS profiler_py_proto
  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h

--- a/doc/fluid/api/detection.rst
+++ b/doc/fluid/api/detection.rst
--- a/doc/fluid/api/gen_doc.sh
+++ b/doc/fluid/api/gen_doc.sh
 #!/bin/bash
-python gen_doc.py layers --submodules control_flow device io nn ops tensor > layers.rst
+python gen_doc.py layers --submodules control_flow device io nn ops tensor detection learning_rate_scheduler metric > layers.rst

 for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer
 do

--- a/doc/fluid/api/initializer.rst
+++ b/doc/fluid/api/initializer.rst
@@ -33,6 +33,13 @@ Xavier
    :members:
    :noindex:

+Bilinear
+--------
+
+..  autoclass:: paddle.fluid.initializer.Bilinear
+    :members:
+    :noindex:
+
 force_init_on_cpu
 -----------------

@@ -73,3 +80,10 @@ XavierInitializer
    :members:
    :noindex:

+BilinearInitializer
+-------------------
+
+..  autoclass:: paddle.fluid.initializer.BilinearInitializer
+    :members:
+    :noindex:
+
--- a/doc/fluid/api/io.rst
+++ b/doc/fluid/api/io.rst
@@ -77,3 +77,21 @@ clean_checkpoint
 ..  autofunction:: paddle.fluid.io.clean_checkpoint
    :noindex:

+load_persist_vars_without_grad
+------------------------------
+
+..  autofunction:: paddle.fluid.io.load_persist_vars_without_grad
+    :noindex:
+
+save_persist_vars_without_grad
+------------------------------
+
+..  autofunction:: paddle.fluid.io.save_persist_vars_without_grad
+    :noindex:
+
+get_latest_checkpoint_serial
+----------------------------
+
+..  autofunction:: paddle.fluid.io.get_latest_checkpoint_serial
+    :noindex:
+
--- a/doc/fluid/api/layers.rst
+++ b/doc/fluid/api/layers.rst
@@ -225,6 +225,12 @@ Send
 ..  autofunction:: paddle.fluid.layers.Send
    :noindex:

+Recv
+----
+
+..  autofunction:: paddle.fluid.layers.Recv
+    :noindex:
+
 open_recordio_file
 ------------------

@@ -274,6 +280,12 @@ Preprocessor
    :members:
    :noindex:

+load
+----
+
+..  autofunction:: paddle.fluid.layers.load
+    :noindex:
+
 nn
 ==

@@ -361,6 +373,12 @@ conv2d
 ..  autofunction:: paddle.fluid.layers.conv2d
    :noindex:

+conv3d
+------
+
+..  autofunction:: paddle.fluid.layers.conv3d
+    :noindex:
+
 sequence_pool
 -------------

@@ -385,6 +403,12 @@ pool2d
 ..  autofunction:: paddle.fluid.layers.pool2d
    :noindex:

+pool3d
+------
+
+..  autofunction:: paddle.fluid.layers.pool3d
+    :noindex:
+
 batch_norm
 ----------

@@ -403,6 +427,12 @@ conv2d_transpose
 ..  autofunction:: paddle.fluid.layers.conv2d_transpose
    :noindex:

+conv3d_transpose
+----------------
+
+..  autofunction:: paddle.fluid.layers.conv3d_transpose
+    :noindex:
+
 sequence_expand
 ---------------

@@ -619,6 +649,18 @@ dice_loss
 ..  autofunction:: paddle.fluid.layers.dice_loss
    :noindex:

+image_resize
+------------
+
+..  autofunction:: paddle.fluid.layers.image_resize
+    :noindex:
+
+image_resize_short
+------------------
+
+..  autofunction:: paddle.fluid.layers.image_resize_short
+    :noindex:
+
 resize_bilinear
 ---------------

@@ -637,6 +679,12 @@ random_crop
 ..  autofunction:: paddle.fluid.layers.random_crop
    :noindex:

+mean_iou
+--------
+
+..  autofunction:: paddle.fluid.layers.mean_iou
+    :noindex:
+
 ops
 ===

@@ -742,12 +790,6 @@ logical_not
 ..  autofunction:: paddle.fluid.layers.logical_not
    :noindex:

-uniform_random
--------------
-
-..  autofunction:: paddle.fluid.layers.uniform_random
-    :noindex:
-
 uniform_random_batch_size_like
 ------------------------------

@@ -766,12 +808,6 @@ gaussian_random_batch_size_like
 ..  autofunction:: paddle.fluid.layers.gaussian_random_batch_size_like
    :noindex:

-cumsum
------
-
-..  autofunction:: paddle.fluid.layers.cumsum
-    :noindex:
-
 scatter
 -------

@@ -784,12 +820,30 @@ sum
 ..  autofunction:: paddle.fluid.layers.sum
    :noindex:

+slice
+-----
+
+..  autofunction:: paddle.fluid.layers.slice
+    :noindex:
+
+polygon_box_transform
+---------------------
+
+..  autofunction:: paddle.fluid.layers.polygon_box_transform
+    :noindex:
+
 shape
 -----

 ..  autofunction:: paddle.fluid.layers.shape
    :noindex:

+maxout
+------
+
+..  autofunction:: paddle.fluid.layers.maxout
+    :noindex:
+
 sigmoid
 -------

@@ -946,18 +1000,6 @@ stanh
 ..  autofunction:: paddle.fluid.layers.stanh
    :noindex:

-hard_shrink
-----------
-
-..  autofunction:: paddle.fluid.layers.hard_shrink
-    :noindex:
-
-thresholded_relu
----------------
-
-..  autofunction:: paddle.fluid.layers.thresholded_relu
-    :noindex:
-
 hard_sigmoid
 ------------

@@ -970,6 +1012,30 @@ swish
 ..  autofunction:: paddle.fluid.layers.swish
    :noindex:

+uniform_random
+--------------
+
+..  autofunction:: paddle.fluid.layers.uniform_random
+    :noindex:
+
+hard_shrink
+-----------
+
+..  autofunction:: paddle.fluid.layers.hard_shrink
+    :noindex:
+
+cumsum
+------
+
+..  autofunction:: paddle.fluid.layers.cumsum
+    :noindex:
+
+thresholded_relu
+----------------
+
+..  autofunction:: paddle.fluid.layers.thresholded_relu
+    :noindex:
+
 tensor
 ======

@@ -1027,6 +1093,18 @@ fill_constant
 ..  autofunction:: paddle.fluid.layers.fill_constant
    :noindex:

+argmin
+------
+
+..  autofunction:: paddle.fluid.layers.argmin
+    :noindex:
+
+argmax
+------
+
+..  autofunction:: paddle.fluid.layers.argmax
+    :noindex:
+
 ones
 ----

@@ -1039,3 +1117,114 @@ zeros
 ..  autofunction:: paddle.fluid.layers.zeros
    :noindex:

+detection
+=========
+
+prior_box
+---------
+
+..  autofunction:: paddle.fluid.layers.prior_box
+    :noindex:
+
+multi_box_head
+--------------
+
+..  autofunction:: paddle.fluid.layers.multi_box_head
+    :noindex:
+
+bipartite_match
+---------------
+
+..  autofunction:: paddle.fluid.layers.bipartite_match
+    :noindex:
+
+target_assign
+-------------
+
+..  autofunction:: paddle.fluid.layers.target_assign
+    :noindex:
+
+detection_output
+----------------
+
+..  autofunction:: paddle.fluid.layers.detection_output
+    :noindex:
+
+ssd_loss
+--------
+
+..  autofunction:: paddle.fluid.layers.ssd_loss
+    :noindex:
+
+detection_map
+-------------
+
+..  autofunction:: paddle.fluid.layers.detection_map
+    :noindex:
+
+iou_similarity
+--------------
+
+..  autofunction:: paddle.fluid.layers.iou_similarity
+    :noindex:
+
+box_coder
+---------
+
+..  autofunction:: paddle.fluid.layers.box_coder
+    :noindex:
+
+learning_rate_scheduler
+=======================
+
+exponential_decay
+-----------------
+
+..  autofunction:: paddle.fluid.layers.exponential_decay
+    :noindex:
+
+natural_exp_decay
+-----------------
+
+..  autofunction:: paddle.fluid.layers.natural_exp_decay
+    :noindex:
+
+inverse_time_decay
+------------------
+
+..  autofunction:: paddle.fluid.layers.inverse_time_decay
+    :noindex:
+
+polynomial_decay
+----------------
+
+..  autofunction:: paddle.fluid.layers.polynomial_decay
+    :noindex:
+
+piecewise_decay
+---------------
+
+..  autofunction:: paddle.fluid.layers.piecewise_decay
+    :noindex:
+
+noam_decay
+----------
+
+..  autofunction:: paddle.fluid.layers.noam_decay
+    :noindex:
+
+metric
+======
+
+accuracy
+--------
+
+..  autofunction:: paddle.fluid.layers.accuracy
+    :noindex:
+
+auc
+---
+
+..  autofunction:: paddle.fluid.layers.auc
+    :noindex:
+
--- a/doc/survey/dynamic_graph.md
+++ b/doc/survey/dynamic_graph.md
@@ -171,7 +171,7 @@ Pytorch chooses immediate evaluation. It avoids ever materializing a "forward gr

 ## What can fluid learn from them?

-TBD
+Please refer to `paddle/contrib/dynamic/`.

 # Appendix


--- a/doc/v2/api/config/evaluators.rst
+++ b/doc/v2/api/config/evaluators.rst
@@ -101,7 +101,7 @@ value_printer
    :noindex:

 Detection
-=====
+==========

 detection_map
 -------------

--- a/doc/v2/api/config/layer.rst
+++ b/doc/v2/api/config/layer.rst
@@ -11,7 +11,7 @@ Data layer

 data
 ----
-..  autoclass:: paddle.v2.layer.data
+..  autofunction:: paddle.v2.layer.data
    :noindex:

 Fully Connected Layers
@@ -21,12 +21,12 @@ Fully Connected Layers

 fc
 --
-..  autoclass:: paddle.v2.layer.fc
+..  autofunction:: paddle.v2.layer.fc
    :noindex:

 selective_fc
 ------------
-..  autoclass:: paddle.v2.layer.selective_fc
+..  autofunction:: paddle.v2.layer.selective_fc
    :noindex:

 Conv Layers
@@ -34,34 +34,34 @@ Conv Layers

 conv_operator
 -------------
-..  autoclass:: paddle.v2.layer.conv_operator
+..  autofunction:: paddle.v2.layer.conv_operator
    :noindex:

 conv_projection
 ---------------
-..  autoclass:: paddle.v2.layer.conv_projection
+..  autofunction:: paddle.v2.layer.conv_projection
    :noindex:

 conv_shift
 ----------
-..  autoclass:: paddle.v2.layer.conv_shift
+..  autofunction:: paddle.v2.layer.conv_shift
    :noindex:

 img_conv
 --------
-..  autoclass:: paddle.v2.layer.img_conv
+..  autofunction:: paddle.v2.layer.img_conv
    :noindex:

 ..  _api_v2.layer_context_projection:

 context_projection
 ------------------
-..  autoclass:: paddle.v2.layer.context_projection
+..  autofunction:: paddle.v2.layer.context_projection
    :noindex:

 row_conv
 --------
-..  autoclass:: paddle.v2.layer.row_conv
+..  autofunction:: paddle.v2.layer.row_conv
    :noindex:

 Image Pooling Layer
@@ -69,27 +69,27 @@ Image Pooling Layer

 img_pool
 --------
-..  autoclass:: paddle.v2.layer.img_pool
+..  autofunction:: paddle.v2.layer.img_pool
    :noindex:

 spp
 ---
-..  autoclass:: paddle.v2.layer.spp
+..  autofunction:: paddle.v2.layer.spp
    :noindex:

 maxout
 ------
-..  autoclass:: paddle.v2.layer.maxout
+..  autofunction:: paddle.v2.layer.maxout
    :noindex:

 roi_pool
 --------
-..  autoclass:: paddle.v2.layer.roi_pool
+..  autofunction:: paddle.v2.layer.roi_pool
    :noindex:

 pad
 ----
-..  autoclass:: paddle.v2.layer.pad
+..  autofunction:: paddle.v2.layer.pad
    :noindex:

 Norm Layer
@@ -97,27 +97,27 @@ Norm Layer

 img_cmrnorm
 -----------
-..  autoclass:: paddle.v2.layer.img_cmrnorm
+..  autofunction:: paddle.v2.layer.img_cmrnorm
    :noindex:

 batch_norm
 ----------
-..  autoclass:: paddle.v2.layer.batch_norm
+..  autofunction:: paddle.v2.layer.batch_norm
    :noindex:

 sum_to_one_norm
 ---------------
-..  autoclass:: paddle.v2.layer.sum_to_one_norm
+..  autofunction:: paddle.v2.layer.sum_to_one_norm
    :noindex:

 cross_channel_norm
 ------------------
-..  autoclass:: paddle.v2.layer.cross_channel_norm
+..  autofunction:: paddle.v2.layer.cross_channel_norm
    :noindex:

 row_l2_norm
 -----------
-..  autoclass:: paddle.v2.layer.row_l2_norm
+..  autofunction:: paddle.v2.layer.row_l2_norm
    :noindex:

 Recurrent Layers
@@ -125,22 +125,22 @@ Recurrent Layers

 recurrent
 ---------
-..  autoclass:: paddle.v2.layer.recurrent
+..  autofunction:: paddle.v2.layer.recurrent
    :noindex:

 lstmemory
 ---------
-..  autoclass:: paddle.v2.layer.lstmemory
+..  autofunction:: paddle.v2.layer.lstmemory
    :noindex:

 grumemory
 ---------
-..  autoclass:: paddle.v2.layer.grumemory
+..  autofunction:: paddle.v2.layer.grumemory
    :noindex:

 gated_unit
 -----------
-..  autoclass:: paddle.v2.layer.gated_unit
+..  autofunction:: paddle.v2.layer.gated_unit
    :noindex:

 Recurrent Layer Group
@@ -148,32 +148,32 @@ Recurrent Layer Group

 memory
 ------
-..  autoclass:: paddle.v2.layer.memory
+..  autofunction:: paddle.v2.layer.memory
    :noindex:

 recurrent_group
 ---------------
-..  autoclass:: paddle.v2.layer.recurrent_group
+..  autofunction:: paddle.v2.layer.recurrent_group
    :noindex:

 lstm_step
 ---------
-..  autoclass:: paddle.v2.layer.lstm_step
+..  autofunction:: paddle.v2.layer.lstm_step
    :noindex:

 gru_step
 --------
-..  autoclass:: paddle.v2.layer.gru_step
+..  autofunction:: paddle.v2.layer.gru_step
    :noindex:

 beam_search
 ------------
-..  autoclass:: paddle.v2.layer.beam_search
+..  autofunction:: paddle.v2.layer.beam_search
    :noindex:

 get_output
 ----------
-..  autoclass:: paddle.v2.layer.get_output
+..  autofunction:: paddle.v2.layer.get_output
    :noindex:

 Mixed Layer
@@ -183,54 +183,54 @@ Mixed Layer

 mixed
 -----
-..  autoclass:: paddle.v2.layer.mixed
+..  autofunction:: paddle.v2.layer.mixed
    :noindex:

 ..  _api_v2.layer_embedding:

 embedding
 ---------
-..  autoclass:: paddle.v2.layer.embedding
+..  autofunction:: paddle.v2.layer.embedding
    :noindex:

 scaling_projection
 ------------------
-..  autoclass:: paddle.v2.layer.scaling_projection
+..  autofunction:: paddle.v2.layer.scaling_projection
    :noindex:

 dotmul_projection
 -----------------
-..  autoclass:: paddle.v2.layer.dotmul_projection
+..  autofunction:: paddle.v2.layer.dotmul_projection
    :noindex:

 dotmul_operator
 ---------------
-..  autoclass:: paddle.v2.layer.dotmul_operator
+..  autofunction:: paddle.v2.layer.dotmul_operator
    :noindex:

 full_matrix_projection
 ----------------------
-..  autoclass:: paddle.v2.layer.full_matrix_projection
+..  autofunction:: paddle.v2.layer.full_matrix_projection
    :noindex:

 identity_projection
 -------------------
-..  autoclass:: paddle.v2.layer.identity_projection
+..  autofunction:: paddle.v2.layer.identity_projection
    :noindex:

 slice_projection
 -------------------
-..  autoclass:: paddle.v2.layer.slice_projection
+..  autofunction:: paddle.v2.layer.slice_projection
    :noindex:

 table_projection
 ----------------
-..  autoclass:: paddle.v2.layer.table_projection
+..  autofunction:: paddle.v2.layer.table_projection
    :noindex:

 trans_full_matrix_projection
 ----------------------------
-..  autoclass:: paddle.v2.layer.trans_full_matrix_projection
+..  autofunction:: paddle.v2.layer.trans_full_matrix_projection
    :noindex:

 Aggregate Layers
@@ -245,51 +245,46 @@ AggregateLevel

 pooling
 -------
-..  autoclass:: paddle.v2.layer.pooling
+..  autofunction:: paddle.v2.layer.pooling
    :noindex:

 ..  _api_v2.layer_last_seq:

 last_seq
 --------
-..  autoclass:: paddle.v2.layer.last_seq
+..  autofunction:: paddle.v2.layer.last_seq
    :noindex:

 ..  _api_v2.layer_first_seq:

 first_seq
 ---------
-..  autoclass:: paddle.v2.layer.first_seq
+..  autofunction:: paddle.v2.layer.first_seq
    :noindex:

 sub_seq
 ---------
-..  autoclass:: paddle.v2.layer.sub_seq
+..  autofunction:: paddle.v2.layer.sub_seq
    :noindex:

 concat
 ------
-..  autoclass:: paddle.v2.layer.concat
+..  autofunction:: paddle.v2.layer.concat
    :noindex:

 seq_concat
 ----------
-..  autoclass:: paddle.v2.layer.seq_concat
+..  autofunction:: paddle.v2.layer.seq_concat
    :noindex:

 seq_slice
 ---------
-..  autoclass:: paddle.v2.layer.seq_slice
-    :noindex:
-
-kmax_sequence_score
-------------------
-..  autoclass:: paddle.v2.layer.kmax_sequence_score
+..  autofunction:: paddle.v2.layer.seq_slice
    :noindex:

 sub_nested_seq
 --------------
-..  autoclass:: paddle.v2.layer.sub_nested_seq
+..  autofunction:: paddle.v2.layer.sub_nested_seq
    :noindex:

 Reshaping Layers
@@ -297,7 +292,7 @@ Reshaping Layers

 block_expand
 ------------
-..  autoclass:: paddle.v2.layer.block_expand
+..  autofunction:: paddle.v2.layer.block_expand
    :noindex:

 ..  _api_v2.layer_expand:
@@ -309,22 +304,22 @@ ExpandLevel

 expand
 ------
-..  autoclass:: paddle.v2.layer.expand
+..  autofunction:: paddle.v2.layer.expand
    :noindex:

 repeat
 ------
-..  autoclass:: paddle.v2.layer.repeat
+..  autofunction:: paddle.v2.layer.repeat
    :noindex:

 rotate
 ------
-..  autoclass:: paddle.v2.layer.rotate
+..  autofunction:: paddle.v2.layer.rotate
    :noindex:

 seq_reshape
 -----------
-..  autoclass:: paddle.v2.layer.seq_reshape
+..  autofunction:: paddle.v2.layer.seq_reshape
    :noindex:

 Math Layers
@@ -332,94 +327,94 @@ Math Layers

 addto
 -----
-..  autoclass:: paddle.v2.layer.addto
+..  autofunction:: paddle.v2.layer.addto
    :noindex:

 linear_comb
 -----------
-..  autoclass:: paddle.v2.layer.linear_comb
+..  autofunction:: paddle.v2.layer.linear_comb
    :noindex:

 interpolation
 -------------
-..  autoclass:: paddle.v2.layer.interpolation
+..  autofunction:: paddle.v2.layer.interpolation
    :noindex:

 bilinear_interp
 ---------------
-..  autoclass:: paddle.v2.layer.bilinear_interp
+..  autofunction:: paddle.v2.layer.bilinear_interp
    :noindex:

 dropout
 --------
-..  autoclass:: paddle.v2.layer.dropout
+..  autofunction:: paddle.v2.layer.dropout
    :noindex:

 dot_prod
 ---------
-.. autoclass:: paddle.v2.layer.dot_prod
+.. autofunction:: paddle.v2.layer.dot_prod
    :noindex:

 out_prod
 --------
-.. autoclass:: paddle.v2.layer.out_prod
+.. autofunction:: paddle.v2.layer.out_prod
    :noindex:

 power
 -----
-..  autoclass:: paddle.v2.layer.power
+..  autofunction:: paddle.v2.layer.power
    :noindex:

 scaling
 -------
-..  autoclass:: paddle.v2.layer.scaling
+..  autofunction:: paddle.v2.layer.scaling
    :noindex:

 clip
 ----
-..  autoclass:: paddle.v2.layer.clip
+..  autofunction:: paddle.v2.layer.clip
    :noindex:

 resize
 ------
-..  autoclass:: paddle.v2.layer.resize
+..  autofunction:: paddle.v2.layer.resize
    :noindex:

 slope_intercept
 ---------------
-..  autoclass:: paddle.v2.layer.slope_intercept
+..  autofunction:: paddle.v2.layer.slope_intercept
    :noindex:

 tensor
 ------
-..  autoclass:: paddle.v2.layer.tensor
+..  autofunction:: paddle.v2.layer.tensor
    :noindex:

 ..  _api_v2.layer_cos_sim:

 cos_sim
 -------
-..  autoclass:: paddle.v2.layer.cos_sim
+..  autofunction:: paddle.v2.layer.cos_sim
    :noindex:

 l2_distance
 -----------
-..  autoclass:: paddle.v2.layer.l2_distance
+..  autofunction:: paddle.v2.layer.l2_distance
    :noindex:

 trans
 -----
-..  autoclass:: paddle.v2.layer.trans
+..  autofunction:: paddle.v2.layer.trans
    :noindex:

 scale_shift
 -----------
-..  autoclass:: paddle.v2.layer.scale_shift
+..  autofunction:: paddle.v2.layer.scale_shift
    :noindex:

 factorization_machine
 ---------------------
-..  autoclass:: paddle.v2.layer.factorization_machine
+..  autofunction:: paddle.v2.layer.factorization_machine
    :noindex:

 Sampling Layers
@@ -427,17 +422,17 @@ Sampling Layers

 maxid
 -----
-..  autoclass:: paddle.v2.layer.max_id
+..  autofunction:: paddle.v2.layer.max_id
    :noindex:

 sampling_id
 -----------
-..  autoclass:: paddle.v2.layer.sampling_id
+..  autofunction:: paddle.v2.layer.sampling_id
    :noindex:

 multiplex
 ---------
-..  autoclass:: paddle.v2.layer.multiplex
+..  autofunction:: paddle.v2.layer.multiplex
    :noindex:

 ..  _api_v2.layer_costs:
@@ -447,97 +442,97 @@ Cost Layers

 cross_entropy_cost
 ------------------
-..  autoclass:: paddle.v2.layer.cross_entropy_cost
+..  autofunction:: paddle.v2.layer.cross_entropy_cost
    :noindex:

 cross_entropy_with_selfnorm_cost
 --------------------------------
-..  autoclass:: paddle.v2.layer.cross_entropy_with_selfnorm_cost
+..  autofunction:: paddle.v2.layer.cross_entropy_with_selfnorm_cost
    :noindex:

 multi_binary_label_cross_entropy_cost
 -------------------------------------
-..  autoclass:: paddle.v2.layer.multi_binary_label_cross_entropy_cost
+..  autofunction:: paddle.v2.layer.multi_binary_label_cross_entropy_cost
    :noindex:

 classification_cost
 -------------------
-.. autoclass:: paddle.v2.layer.classification_cost
+.. autofunction:: paddle.v2.layer.classification_cost
   :noindex:

 huber_regression_cost
 -------------------------
-..  autoclass:: paddle.v2.layer.huber_regression_cost
+..  autofunction:: paddle.v2.layer.huber_regression_cost
    :noindex:

 huber_classification_cost
 -------------------------
-..  autoclass:: paddle.v2.layer.huber_classification_cost
+..  autofunction:: paddle.v2.layer.huber_classification_cost
    :noindex:

 lambda_cost
 -----------
-..  autoclass:: paddle.v2.layer.lambda_cost
+..  autofunction:: paddle.v2.layer.lambda_cost
    :noindex:

 square_error_cost
 -----------------
-..  autoclass:: paddle.v2.layer.square_error_cost
+..  autofunction:: paddle.v2.layer.square_error_cost
    :noindex:

 rank_cost
 ---------
-..  autoclass:: paddle.v2.layer.rank_cost
+..  autofunction:: paddle.v2.layer.rank_cost
    :noindex:

 sum_cost
 ---------
-..  autoclass:: paddle.v2.layer.sum_cost
+..  autofunction:: paddle.v2.layer.sum_cost
    :noindex:

 crf
 ---
-..  autoclass:: paddle.v2.layer.crf
+..  autofunction:: paddle.v2.layer.crf
    :noindex:

 crf_decoding
 ------------
-..  autoclass:: paddle.v2.layer.crf_decoding
+..  autofunction:: paddle.v2.layer.crf_decoding
    :noindex:

 ctc
 ---
-..  autoclass:: paddle.v2.layer.ctc
+..  autofunction:: paddle.v2.layer.ctc
    :noindex:

 warp_ctc
 --------
-..  autoclass:: paddle.v2.layer.warp_ctc
+..  autofunction:: paddle.v2.layer.warp_ctc
    :noindex:

 nce
 ---
-..  autoclass:: paddle.v2.layer.nce
+..  autofunction:: paddle.v2.layer.nce
    :noindex:

 hsigmoid
 ---------
-..  autoclass:: paddle.v2.layer.hsigmoid
+..  autofunction:: paddle.v2.layer.hsigmoid
    :noindex:

 smooth_l1_cost
 --------------
-..  autoclass:: paddle.v2.layer.smooth_l1_cost
+..  autofunction:: paddle.v2.layer.smooth_l1_cost
    :noindex:

 multibox_loss
 --------------
-..  autoclass:: paddle.v2.layer.multibox_loss
+..  autofunction:: paddle.v2.layer.multibox_loss
    :noindex:

 detection_output
 ----------------
-..  autoclass:: paddle.v2.layer.detection_output
+..  autofunction:: paddle.v2.layer.detection_output
    :noindex:

 Check Layer
@@ -545,7 +540,7 @@ Check Layer

 eos
 ---
-..  autoclass:: paddle.v2.layer.eos
+..  autofunction:: paddle.v2.layer.eos
    :noindex:

 Activation
@@ -553,5 +548,5 @@ Activation

 prelu
 --------
-..  autoclass:: paddle.v2.layer.prelu
+..  autofunction:: paddle.v2.layer.prelu
    :noindex:
--- a/doc/v2/api/index_en.rst
+++ b/doc/v2/api/index_en.rst
@@ -8,4 +8,3 @@ API
    model_configs.rst
    data.rst
    run_logic.rst
-    fluid/index.rst
--- a/doc/v2/build_and_install/pip_install_cn.rst
+++ b/doc/v2/build_and_install/pip_install_cn.rst
@@ -60,6 +60,7 @@ paddlepaddle-gpu==0.11.0            使用CUDA 7.5和cuDNN 5编译的0.11.0版
    "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`_"
    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cuda9.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"

 .. _pip_dependency:


--- a/doc/v2/build_and_install/pip_install_en.rst
+++ b/doc/v2/build_and_install/pip_install_en.rst
@@ -63,6 +63,7 @@ If the links below shows up the login form, just click "Log in as guest" to star
    "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cuda9.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"

 .. _pip_dependency:


--- a/doc/v2/dev/contribute_to_paddle_cn.md
+++ b/doc/v2/dev/contribute_to_paddle_cn.md
@@ -104,7 +104,7 @@ no changes added to commit (use "git add" and/or "git commit -a")
 ➜  docker run -it -v $(pwd):/paddle paddle:latest-dev bash -c "cd /paddle/build && ctest"
 ```

-关于构建和测试的更多信息，请参见[这篇文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_cn.rst)。
+关于构建和测试的更多信息，请参见[使用Docker安装运行](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/v2/build_and_install/docker_install_cn.rst)。

 ## 提交（commit）


--- a/paddle/contrib/CMakeLists.txt
+++ b/paddle/contrib/CMakeLists.txt
@@ -14,3 +14,4 @@
 #

 add_subdirectory(inference)
+add_subdirectory(tape)
--- a/paddle/contrib/inference/CMakeLists.txt
+++ b/paddle/contrib/inference/CMakeLists.txt
@@ -17,48 +17,9 @@ if(APPLE)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move")
 endif(APPLE)

-set(ANAKIN_INCLUDE "" CACHE STRING "root of Anakin header files")
-set(ANAKIN_LIBRARY "" CACHE STRING "path of Anakin library")
-

 set(inference_deps paddle_inference_api paddle_fluid_api)

-# if anakin is set enable anakin api implementation
-if(ANAKIN_INCLUDE AND ANAKIN_LIBRARY)
-    set(ANAKIN_FOUND ON)
-else()
-    set(ANAKIN_FOUND OFF)
-endif()
-
-function(fetch_include_recursively root_dir) 
-    if (IS_DIRECTORY ${root_dir}) 
-        include_directories(${root_dir})
-    endif()
-
-    file(GLOB ALL_SUB RELATIVE ${root_dir} ${root_dir}/*)
-    foreach(sub ${ALL_SUB})
-        if (IS_DIRECTORY ${root_dir}/${sub})
-            fetch_include_recursively(${root_dir}/${sub})
-        endif()
-    endforeach()
-endfunction()
-
-if (ANAKIN_FOUND)
-    # Anakin's code style doesn't follow google c style.
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=unused-variable -Wno-error=format-extra-args -Wno-error=comment -Wno-error=format -Wno-error=switch -Wno-error=return-type -Wno-error=non-virtual-dtor -Wno-reorder -Wno-error=cpp")
-
-    message(STATUS "Anakin for inference is enabled")
-    message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}")
-    fetch_include_recursively(${ANAKIN_INCLUDE})
-
-    link_directories(${ANAKIN_LIBRARY})
-
-    nv_library(inference_anakin_api SHARED SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc)
-    target_link_libraries(inference_anakin_api anakin anakin_saber_common)
-    list(APPEND inference_deps inference_anakin_api)
-endif()
-
-
 function(inference_api_test TARGET_NAME)
    if (WITH_TESTING)
        set(options "")
@@ -79,7 +40,7 @@ function(inference_api_test TARGET_NAME)
 endfunction(inference_api_test)

 cc_library(paddle_inference_api
-    SRCS paddle_inference_api.cc paddle_inference_api_impl.cc 
+    SRCS paddle_inference_api.cc paddle_inference_api_impl.cc
    DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})

 cc_test(test_paddle_inference_api
@@ -89,9 +50,17 @@ cc_test(test_paddle_inference_api
 inference_api_test(test_paddle_inference_api_impl
                    ARGS test_word2vec test_image_classification)

-if (ANAKIN_FOUND)
+if (WITH_ANAKIN AND WITH_TESTING) # only needed in CI
+    # Anakin has no official library releases, and its protobuf and CUDA versions do not match Paddle's,
+    # so the Anakin library is not merged into our official inference library. To use the Anakin prediction API,
+    # one needs to build the inference_anakin_api library and link it with anakin.so.
+    nv_library(inference_anakin_api SHARED SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc)
+    target_compile_options(inference_anakin_api BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
+    target_link_libraries(inference_anakin_api anakin anakin_saber_common)
    cc_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc
-    DEPS ${inference_deps})
+                                  ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin
+                                  DEPS inference_anakin_api)
+    target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
 endif()

 if(WITH_TESTING)

--- a/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc
+++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc
@@ -12,9 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include <cuda.h>
-
 #include "paddle/contrib/inference/paddle_inference_api_anakin_engine.h"
+#include <cuda.h>

 namespace paddle {


--- a/paddle/contrib/inference/paddle_inference_api_anakin_engine.h
+++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine.h
@@ -19,10 +19,9 @@ limitations under the License. */

 #pragma once

-// NOTE This header file do not have namespace.
-//#include <test/framework/net/paddle_api.h>
 #include "paddle/contrib/inference/paddle_inference_api.h"

+// from anakin
 #include "framework/core/net/net.h"
 #include "saber/saber_types.h"


--- a/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc
+++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc
@@ -12,17 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include <gflags/gflags.h>
 #include <glog/logging.h>
 #include <gtest/gtest.h>

-#include "gflags/gflags.h"
 #include "paddle/contrib/inference/paddle_inference_api.h"

+DEFINE_string(model, "", "Directory of the inference model.");
+
 namespace paddle {

 AnakinConfig GetConfig() {
  AnakinConfig config;
-  config.model_file = "./mobilenet_v2.anakin.bin";
+  config.model_file = FLAGS_model;
  config.device = 0;
  config.max_batch_size = 1;
  return config;

--- a/paddle/contrib/tape/CMakeLists.txt
+++ b/paddle/contrib/tape/CMakeLists.txt
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# On Apple platforms, do not treat pessimizing-move warnings as errors.
+if(APPLE)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move")
+endif()
+
+# Variable wrapper used by the tape.
+cc_library(tape_variable
+           SRCS variable.cc
+           DEPS ${FLUID_CORE_MODULES} device_context framework_proto proto_desc operator)
+
+# Tape library, built on top of tape_variable.
+cc_library(tape
+           SRCS tape.cc
+           DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB} tape_variable)
+
+# Unit test for the tape.
+cc_test(test_tape SRCS test_tape.cc DEPS tape tape_variable)
--- a/paddle/contrib/tape/README.md
+++ b/paddle/contrib/tape/README.md
+# Dynamic Graph on Fluid
+
+PaddlePaddle Fluid is targeting the autodiff without tape, which, however, is very
+challenging and we are still a long way from there. DyNet and PyTorch provide a good design
+idea, the *tape*, that significantly eases the challenge.  Also, DyNet provides
+a C++ API that is as convenient as Python but with higher efficiency and could
+conveniently integrate with industrial/production systems. This package, `tape`,
+combines the good of
+
+1. tape from PyTorch and DyNet
+2. C++ API and core from DyNet
+3. rich set of operators from PaddlePaddle
+
+## Overview
+
+We can implement a DyNet-like Tape (see this [survey](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/survey/dynamic_graph.md))
+by wrapping Paddle Fluid's `Operator` and `Variable`.
+
+The user API is straightforward since
+
+1. it is imperative. And it uses host language's control flow logic.
+1. it avoids extra concepts such as `Scope` and `Executor`.
+
+All of these benefits come at the cost of just adding one line `reset_global_tape`
+at every iteration.
+
+## Code Structure
+
+In short, the `Tape` contains a vector of `OpHandle`s. And an `OpHandle` contains its
+`type`, the pointers to the `Variable`s, and necessary attributes.
+
+```c++
+class Variable {
+public:
+  VariableHandle Grad(); // returns its gradient variable
+private:
+  framework::VarDesc desc_; // compile time infershape, necessary for lazy execution
+  framework::Variable var_; // run time variable, holds data memory
+};
+
+using VariableHandle = shared_ptr<Variable>;
+
+struct OpHandle {
+  string type_;
+  map<string, vector<VariableHandle>> inputs_;
+  map<string, vector<VariableHandle>> outputs_;
+  AttributeMap attrs_;
+};
+
+class Tape {
+public:
+  void AddOp(OpHandle); // add op
+  void Forward();       // execute the tape_
+  void Backward();      // execute the backward of the tape_
+private:
+  vector<OpHandle> tape_;
+};
+```
+
+We use `Function` to indicate layers. It takes care of parameter
+initialization and `AddOp` to the Tape when it is called.
+
+```c++
+class Linear {
+ public:
+  Linear(int in_dim, int out_dim, const std::string &act)
+      : w_(new Variable("LinearWeight")),
+        b_(new Variable("LinearBias")),
+        act_(act) {
+    Tape init_tape;
+
+    std::string initializer = "fill_constant";
+    framework::AttributeMap attrs;
+    attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
+    attrs["shape"] = std::vector<int>{in_dim, out_dim};
+    attrs["value"] = 1.0f;
+    init_tape.AddOp(initializer, {}, {{"Out", {w_}}}, attrs);
+
+    attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
+    attrs["shape"] = std::vector<int>{out_dim};
+    attrs["value"] = 1.0f;
+    init_tape.AddOp(initializer, {}, {{"Out", {b_}}}, attrs);
+
+    init_tape.Forward();
+  }
+
+  VariableHandle operator()(VariableHandle input) {
+    VariableHandle pre_bias(new Variable("linear"));
+    get_global_tape().AddOp("mul",
+                            {{"X", {input}}, {"Y", {w_}}},
+                            {{"Out", {pre_bias}}},
+                            {{"x_num_col_dims", 1}, {"y_num_col_dims", 1}});
+    VariableHandle pre_act(new Variable("linear"));
+    get_global_tape().AddOp("elementwise_add",
+                            {{"X", {pre_bias}}, {"Y", {b_}}},
+                            {{"Out", {pre_act}}},
+                            {{"axis", 1}});
+    VariableHandle post_act(new Variable("linear"));
+    get_global_tape().AddOp(act_,
+                            {{"X", {pre_act}}},
+                            {{"Out", {post_act}}},
+                            {});
+    return post_act;
+  }
+
+  std::vector<VariableHandle> Params() { return {w_, b_}; }
+
+ private:
+  VariableHandle w_;
+  VariableHandle b_;
+  std::string act_;
+};
+```
+
+## User API
+
+```c++
+// Model function
+paddle::tape::Linear linear1(3, 3, "relu"); // init weight and bias
+paddle::tape::Linear linear2(3, 3, "relu"); // init weight and bias
+paddle::tape::Mean mean;
+
+// Optimizer
+paddle::tape::SGD sgd(0.001);
+
+// Data Feeder
+paddle::tape::Fill data_feeder(...);
+VariableHandle input(new paddle::tape::Variable("input"));
+VariableHandle label(new paddle::tape::Variable("label"));
+
+for (int i = 0; i < 2; ++i) {
+  reset_global_tape();
+
+  data_feeder(input, label);
+
+  auto loss = softmax(linear2(linear1(input)), label); // compile time InferShape & InferVarType
+  LOG(INFO) << loss.value(); // Run forward up to loss
+
+  // Run backward, store gradient of w at w->Grad()
+  get_global_tape().Backward(loss);
+
+  // Update w
+  sgd(linear1.Params());
+  sgd(linear2.Params());
+}
+```
+
+<details>
+  <summary></summary>
+digraph G {
+
+	subgraph cluster_0 {
+                node [shape=record,style=filled];
+		style=filled;
+		color=lightgrey;
+                linear1 [label="{type: mul | {input | {<before_mul1>X: before_mul1 |<weight1> Y: weight1}} |  {output |<before_bias1> Out: before_bias1}}"];
+                elementwise_add1 [label="{type: elementwise_add | {input | {<before_bias1>X: before_bias1 |<bias1> Y: bias1}} |  {output |<before_act1> Out: before_act1}}"];
+                relu1 [label="{type: relu | {input | {<before_act1>X: before_act1 }} |  {output |<after_act1> Out: after_act1}}"];
+
+		linear1 -> elementwise_add1->relu1;
+		label = "forward tape";
+	}
+
+        linear1:before_mul1->before_mul1
+        linear1:weight1->weight1
+        linear1:before_bias1->before_bias1
+
+        elementwise_add1:bias1->bias1
+        elementwise_add1:before_bias1->before_bias1
+        elementwise_add1:before_act1->before_act1
+
+        relu1:before_act1->before_act1
+        relu1:after_act1->after_act1
+
+	subgraph cluster_1 {
+                node [shape=record,style=filled];
+		style=filled;
+		color=lightgrey;
+                linear1_grad [label="{type: mul_grad | {input | {<before_mul1>X: before_mul1 |<weight1> Y: weight1|<before_bias1_grad> Out_grad: before_bias1_grad}} |  {output |{<before_mul1_grad>X_grad: before_mul1_grad |<weight1_grad> Y_grad: weight1_grad}}}"];
+
+                elementwise_add1_grad [label="{type: elementwise_add_grad | {input | <before_act1_grad> Out_grad: before_act1_grad} |  {output |{<before_bias1_grad>X_grad: before_bias1_grad |<bias1_grad> Y_grad: bias1_grad}}}"];
+
+                relu1_grad [label="{type: relu_grad |  {input |<after_act1_grad> Out_grad: after_act1_grad} | {output | {<before_act1_grad>X_grad: before_act1_grad }}}"];
+
+		linear1_grad -> elementwise_add1_grad ->relu1_grad [dir=back];
+                label = "backward tape";
+	}
+
+        relu1_grad:after_act1_grad->after_act1_grad
+        relu1_grad:before_act1_grad->before_act1_grad
+
+        elementwise_add1_grad:before_act1_grad->before_act1_grad
+        elementwise_add1_grad:before_bias1_grad->before_bias1_grad
+        elementwise_add1_grad:bias1_grad->bias1_grad
+
+        linear1_grad:before_mul1->before_mul1
+        linear1_grad:weight1->weight1
+        linear1_grad:before_bias1_grad->before_bias1_grad
+        linear1_grad:before_mul1_grad->before_mul1_grad
+        linear1_grad:weight1_grad->weight1_grad
+
+
+	subgraph cluster_2 {
+                node [shape=record];
+                label = "Linear1";
+                weight1
+                bias1
+	}
+
+        weight1 -> weight1_grad [ label="Grad()", style="dashed" ];
+        bias1 -> bias1_grad [ label="Grad()", style="dashed"];
+
+	
+
+}
+</details>
+
+![Image](https://github.com/tonyyang-svail/Paddle/blob/cpp_tap/paddle/contrib/tape/computation_graph.png)
+
+## Code Reuse
+
+We want to stay close to Paddle Fluid as much as possible.
+
+### Reuse All Operators
+
+As all Ops are registered at `OpInfoMap`, the effort of adding a new `Function`
+is about 10 lines of code, similar to exposing an operator to Python.
+
+### Reuse Compile Time InferShape and InferVarType
+
+Note that all the symbolic information is stored at `tape::Variable::desc_`, instead
+of `ProgramDesc.block.vars`, we create a temporary `BlockDesc` to do `InferShape` and
+`InferVarType` every time we `AddOp` to the tape.
+
+### Reuse Operator::Run
+
+We use smart pointer, instead of `Scope`, to manage memory. So we create a temporary
+`Scope` for every `Operator::Run()`.
+
+## Possible Feature
+
+### Release Memory on Backward
+
+We can release memory aggressively. During backward, we can delete the OpHandle once
+we have finished its backward. Since all the variables are managed by smart pointers, the
+memory is automatically released when a variable's `ref_count` goes to 0.
+
+### Kernel Fusion
+
+As a symbolic representation of the Tape is constructed first before the actual
+execution, it would be possible to perform graph optimization. One use case is kernel
+fusion.
--- a/paddle/contrib/tape/computation_graph.png
+++ b/paddle/contrib/tape/computation_graph.png
--- a/paddle/contrib/tape/function.h
+++ b/paddle/contrib/tape/function.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+
+#include "paddle/contrib/tape/tape.h"
+#include "paddle/contrib/tape/variable.h"
+#include "paddle/fluid/framework/type_defs.h"
+
+namespace paddle {
+namespace tape {
+
+// Placeholder type; it is empty and none of the classes below derive from it.
+class Function {};
+
+// Callable that records one initializer op on the global tape.
+//
+// The op type and its attributes are fixed at construction time; every call
+// appends a single op with no inputs whose "Out" slot is the given variable.
+class Fill {
+ public:
+  Fill(const std::string &initializer, const framework::AttributeMap &attrs)
+      : initializer_(initializer), attrs_(attrs) {}
+
+  // Records the initializer op for `var`; nothing is executed here.
+  void operator()(VariableHandle var) {
+    VariableHandleMap inputs;  // the initializer op takes no inputs
+    VariableHandleMap outputs;
+    outputs["Out"].push_back(var);
+    get_global_tape().AddOp(initializer_, inputs, outputs, attrs_);
+  }
+
+ private:
+  const std::string initializer_;
+  const framework::AttributeMap attrs_;
+};
+
+// Callable that records a "mean" op on the global tape.
+class Mean {
+ public:
+  // Appends mean(X = var) to the global tape and returns the handle of the
+  // freshly created output variable.
+  VariableHandle operator()(VariableHandle var) {
+    VariableHandle result(new Variable("mean"));
+    VariableHandleMap inputs;
+    inputs["X"].push_back(var);
+    VariableHandleMap outputs;
+    outputs["Out"].push_back(result);
+    get_global_tape().AddOp("mean", inputs, outputs, {});
+    return result;
+  }
+};
+
+// Fully connected layer: act(mul(input, w) + b).
+//
+// The constructor creates the weight and bias variables and fills both with the
+// constant 1.0 by running a private initialization tape. Calling the object
+// records the mul, elementwise_add and activation ops on the global tape.
+class Linear {
+ public:
+  Linear(int in_dim, int out_dim, const std::string &act)
+      : w_(new Variable("LinearWeight")),
+        b_(new Variable("LinearBias")),
+        act_(act) {
+    Tape init_tape;
+    const std::string initializer = "fill_constant";
+
+    // Records an op on init_tape that fills `param` with FP32 ones of `shape`.
+    auto fill_with_ones = [&init_tape, &initializer](
+        VariableHandle param, const std::vector<int> &shape) {
+      framework::AttributeMap attrs;
+      attrs["dtype"] =
+          paddle::framework::proto::VarType::Type::VarType_Type_FP32;
+      attrs["shape"] = shape;
+      attrs["value"] = 1.0f;
+      init_tape.AddOp(initializer, {}, {{"Out", {param}}}, attrs);
+    };
+
+    fill_with_ones(w_, std::vector<int>{in_dim, out_dim});
+    fill_with_ones(b_, std::vector<int>{out_dim});
+
+    init_tape.Forward();
+  }
+
+  // Records the three ops of the layer on the global tape and returns the
+  // handle of the activation output.
+  VariableHandle operator()(VariableHandle input) {
+    Tape &tape = get_global_tape();
+
+    VariableHandle product(new Variable("linear"));
+    tape.AddOp("mul",
+               {{"X", {input}}, {"Y", {w_}}},
+               {{"Out", {product}}},
+               {{"x_num_col_dims", 1}, {"y_num_col_dims", 1}});
+
+    VariableHandle biased(new Variable("linear"));
+    tape.AddOp("elementwise_add",
+               {{"X", {product}}, {"Y", {b_}}},
+               {{"Out", {biased}}},
+               {{"axis", 1}});
+
+    VariableHandle activated(new Variable("linear"));
+    tape.AddOp(act_, {{"X", {biased}}}, {{"Out", {activated}}}, {});
+    return activated;
+  }
+
+  // Trainable variables of the layer: weight first, then bias.
+  std::vector<VariableHandle> Params() { return {w_, b_}; }
+
+ private:
+  VariableHandle w_;
+  VariableHandle b_;
+  std::string act_;
+};
+
+// Plain SGD update rule, applied to one parameter per call.
+//
+// The learning rate is materialized once, in the constructor, as an FP32
+// variable of shape {1} by running a private fill_constant tape.
+class SGD {
+ public:
+  // `explicit` prevents a bare float from silently converting into an
+  // optimizer object.
+  explicit SGD(float learning_rate) : learning_rate_(new Variable("sgd")) {
+    Tape init_tape;
+
+    std::string initializer = "fill_constant";
+    framework::AttributeMap attrs;
+    attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
+    attrs["shape"] = std::vector<int>{1};
+    attrs["value"] = learning_rate;
+    init_tape.AddOp(initializer, {}, {{"Out", {learning_rate_}}}, attrs);
+
+    init_tape.Forward();
+  }
+
+  // Applies one "sgd" op to `input` in place (Param and ParamOut are the same
+  // variable), using input->Grad() as the gradient. The op runs immediately on
+  // a temporary tape. The global tape must already have been run backward.
+  void operator()(VariableHandle input) {
+    PADDLE_ENFORCE(get_global_tape().HasBeenBackwarded(),
+                   "optimization must happen after the backward");
+    Tape temp_tape;
+    temp_tape.AddOp("sgd",
+                    {{"Param", {input}},
+                     {"LearningRate", {learning_rate_}},
+                     {"Grad", {input->Grad()}}},
+                    {{"ParamOut", {input}}},
+                    {});
+    temp_tape.Forward();
+  }
+
+ private:
+  VariableHandle learning_rate_;
+};
+}
+}
--- a/paddle/contrib/tape/tape.cc
+++ b/paddle/contrib/tape/tape.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/contrib/tape/tape.h"
+
+#include <algorithm>
+#include <cstring>
+#include <list>
+#include <map>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/dim.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/pybind/pybind.h"
+
+namespace paddle {
+namespace tape {
+
+// Returns true when `value` ends with `ending`. An empty `ending` always
+// matches; an `ending` longer than `value` never does.
+inline bool ends_with(std::string const &value, std::string const &ending) {
+  const auto suffix_len = ending.size();
+  if (value.size() < suffix_len) return false;
+  return value.compare(value.size() - suffix_len, suffix_len, ending) == 0;
+}
+
+// Streams a VarDesc as `name[type][data type]{d0,d1,...,}`; every dimension,
+// including the last one, is followed by a comma.
+std::ostream &operator<<(std::ostream &os, const framework::VarDesc &var_desc) {
+  os << var_desc.Name() << "[" << var_desc.GetType() << "]"
+     << "[" << var_desc.GetDataType() << "]"
+     << "{";
+  for (const auto &dim : var_desc.GetShape()) {
+    os << dim << ",";
+  }
+  return os << "}";
+}
+
+// Builds a one-line description of an op for logging: the op type followed by
+// `slot:(var desc) ` for every input variable and then every output variable.
+// `attrs` is accepted for signature symmetry but is not part of the output.
+std::string to_string(const std::string &type,
+                      const VariableHandleMap &in_vars,
+                      const VariableHandleMap &out_vars,
+                      const framework::AttributeMap &attrs) {
+  std::stringstream ss;
+  ss << type << " ";
+
+  // Appends every variable of `vars`, tagged with the name of its slot.
+  auto append_vars = [&ss](const VariableHandleMap &vars) {
+    for (const auto &slot : vars) {
+      for (const auto &handle : slot.second) {
+        ss << slot.first << ":(" << handle->Desc() << ") ";
+      }
+    }
+  };
+  append_vars(in_vars);
+  append_vars(out_vars);
+
+  return ss.str();
+}
+
+// Builds an OpDesc for `type` whose inputs and outputs refer to the tape
+// variables by name. Slots that hold no variables produce no entry.
+framework::OpDesc CreateOpDesc(const std::string &type,
+                               const VariableHandleMap &in_vars,
+                               const VariableHandleMap &out_vars,
+                               const framework::AttributeMap &attrs) {
+  // Converts slot -> handles into slot -> variable names.
+  auto names_of =
+      [](const VariableHandleMap &vars) -> framework::VariableNameMap {
+    framework::VariableNameMap names;
+    for (const auto &slot : vars) {
+      for (const auto &handle : slot.second) {
+        names[slot.first].emplace_back(handle->Name());
+      }
+    }
+    return names;
+  };
+
+  framework::VariableNameMap inputs = names_of(in_vars);
+  framework::VariableNameMap outputs = names_of(out_vars);
+  return framework::OpDesc(type, inputs, outputs, attrs);
+}
+
+// Runs compile-time shape and var-type inference for a single op and writes the
+// inferred descriptions back into the output variables.
+//
+// A throwaway ProgramDesc supplies block 0. The descs of all inputs, then all
+// outputs, are copied into that block, inference runs against it, and the
+// resulting output descs are copied back. The op is logged before ("- ") and
+// after ("+ ") inference.
+void InferShapeAndVarType(const std::string &type,
+                          const VariableHandleMap &in_vars,
+                          VariableHandleMap *out_vars,
+                          const framework::AttributeMap &attrs) {
+  framework::OpDesc op_desc = CreateOpDesc(type, in_vars, *out_vars, attrs);
+
+  // Scratch block that exists only for the duration of this call.
+  framework::ProgramDesc program_desc;
+  framework::BlockDesc *block_desc = program_desc.MutableBlock(0);
+  PADDLE_ENFORCE(block_desc);
+
+  // Copies the desc of every variable in `vars` into the scratch block.
+  auto export_descs = [block_desc](const VariableHandleMap &vars) {
+    for (const auto &slot : vars) {
+      for (const auto &handle : slot.second) {
+        *block_desc->Var(handle->Name())->Proto() =
+            *handle->MutableDesc()->Proto();
+      }
+    }
+  };
+  export_descs(in_vars);
+  export_descs(*out_vars);
+
+  LOG(INFO) << "- " << to_string(type, in_vars, *out_vars, attrs);
+  op_desc.InferShape(*block_desc);
+  op_desc.InferVarType(block_desc);
+
+  // Pull the inferred descriptions back into the output variables.
+  for (const auto &slot : *out_vars) {
+    for (const auto &handle : slot.second) {
+      *handle->MutableDesc()->Proto() =
+          *block_desc->Var(handle->Name())->Proto();
+    }
+  }
+  LOG(INFO) << "+ " << to_string(type, in_vars, *out_vars, attrs);
+}
+
+// Records one op on the tape without executing it.
+//
+// Shape and var-type inference run first, updating the descs of the variables
+// referenced by `out_vars`; the op is then appended as an OpHandle. `out_vars`
+// is taken by value, so only this function's copy of the map is passed to the
+// inference step.
+void Tape::AddOp(const std::string &type,
+                 const VariableHandleMap &in_vars,
+                 VariableHandleMap out_vars,
+                 const framework::AttributeMap &attrs) {
+  InferShapeAndVarType(type, in_vars, &out_vars, attrs);
+  tape_.emplace_back(type, in_vars, out_vars, attrs);
+}
+
+// Temporary Scope for Operator::Run()
+//
+// Exposes the tape's variables to an operator under their names by placing
+// their raw pointers into `vars_`, a member that is not declared here and so
+// comes from framework::Scope. The scope does not own these variables: the
+// destructor release()s every entry, which hands the pointers back without
+// deleting them.
+// NOTE(review): the reset()/release() calls suggest `vars_` maps names to
+// std::unique_ptr<framework::Variable> -- confirm against framework::Scope.
+class ScopeWrapper : public framework::Scope {
+ public:
+  // Registers every input and output variable once; a name that is already
+  // present (e.g. a variable used as both input and output) is skipped.
+  ScopeWrapper(const VariableHandleMap &in_vars,
+               const VariableHandleMap &out_vars) {
+    for (auto &v : in_vars) {
+      for (auto &vv : v.second) {
+        if (!vars_.count(vv->Name())) {
+          vars_[vv->Name()].reset(vv->Var());
+        }
+      }
+    }
+    for (auto &v : out_vars) {
+      for (auto &vv : v.second) {
+        if (!vars_.count(vv->Name())) {
+          vars_[vv->Name()].reset(vv->Var());
+        }
+      }
+    }
+  }
+
+  // Gives up the borrowed pointers so that destroying `vars_` afterwards does
+  // not delete variables that were only borrowed from the tape.
+  ~ScopeWrapper() {
+    for (auto &pair : vars_) {
+      pair.second.release();
+    }
+  }
+};
+
+// Executes, on the CPU, every recorded op that has not been run yet.
+//
+// `current_position_` remembers how far execution got, so repeated calls only
+// run ops added since the previous call. Must not be called once the tape has
+// been run backward.
+void Tape::Forward() {
+  LOG(INFO) << "Starting forward -------------------------";
+  PADDLE_ENFORCE(!has_been_backwarded_);
+
+  for (; current_position_ < tape_.size(); ++current_position_) {
+    OpHandle &pending = tape_[current_position_];
+
+    // Create Output Tensor, this is only necessary for OpWithKernel
+    for (auto &slot : pending.outputs_) {
+      for (auto &output : slot.second) {
+        output->InitializeVariable();
+      }
+    }
+
+    framework::OpDesc op_desc = CreateOpDesc(
+        pending.type_, pending.inputs_, pending.outputs_, pending.attrs_);
+    // Short-lived scope that exposes this op's variables by name.
+    ScopeWrapper scope(pending.inputs_, pending.outputs_);
+    auto op = framework::OpRegistry::CreateOp(op_desc);
+    op->Run(scope, platform::CPUPlace());
+  }
+
+  LOG(INFO) << "Finishing forward -------------------------";
+}
+
+// Runs the backward pass for everything recorded on this tape.
+//
+// Pending forward ops are executed first. A separate backward tape is then
+// built: a fill_constant op seeds target->Grad() with 1.0 (FP32, shape {1}),
+// and the gradient ops produced by each forward op's GradOpMaker are appended
+// in reverse recording order. Finally the backward tape is executed and this
+// tape is marked as backwarded. May be called only once per tape.
+void Tape::Backward(VariableHandle target) {
+  PADDLE_ENFORCE(!has_been_backwarded_);
+
+  Forward();
+
+  // TODO(tonyyang-svail): check output of last op is target
+  backward_tape_.reset(new Tape());
+
+  framework::AttributeMap attrs;
+
+  // FIXME(tonyyang-svail): Need to infer_data_type
+  attrs["dtype"] = framework::proto::VarType::Type::VarType_Type_FP32;
+  attrs["shape"] = std::vector<int>{1};
+  attrs["value"] = 1.0f;
+  backward_tape_->AddOp(
+      "fill_constant", {}, {{"Out", {target->Grad()}}}, attrs);
+
+  for (auto it = tape_.rbegin(); it != tape_.rend(); ++it) {
+    framework::OpDesc fwd_op_desc =
+        CreateOpDesc(it->type_, it->inputs_, it->outputs_, it->attrs_);
+    std::unordered_map<std::string, std::string> grad_to_var;
+    std::vector<std::unique_ptr<framework::OpDesc>> grad_op_descs =
+        framework::OpInfoMap::Instance()
+            .Get(fwd_op_desc.Type())
+            .GradOpMaker()(fwd_op_desc, {}, &grad_to_var, {});
+
+    // Name -> handle lookup for every variable the forward op touches. It
+    // depends only on the forward op, so it is built once per forward op
+    // rather than once per generated grad op.
+    std::unordered_map<std::string, VariableHandle> name2var;
+    for (auto &param2vars : it->inputs_) {
+      for (auto &a : param2vars.second) {
+        name2var[a->Name()] = a;
+      }
+    }
+    for (auto &param2vars : it->outputs_) {
+      for (auto &a : param2vars.second) {
+        name2var[a->Name()] = a;
+      }
+    }
+
+    // Translates the argument names of a grad op into variable handles. A name
+    // of a forward variable maps to that variable; any other name must be a
+    // forward variable name plus the gradient suffix and maps to that
+    // variable's Grad().
+    auto resolve = [&name2var](const framework::VariableNameMap &names,
+                               VariableHandleMap *handles) {
+      for (auto &p2a : names) {
+        for (auto &argu : p2a.second) {
+          auto found = name2var.find(argu);
+          if (found != name2var.end()) {
+            (*handles)[p2a.first].push_back(found->second);
+          } else {
+            PADDLE_ENFORCE(ends_with(argu, framework::kGradVarSuffix),
+                           argu.c_str());
+            std::string name = argu.substr(
+                0, argu.size() - std::strlen(framework::kGradVarSuffix));
+            PADDLE_ENFORCE(name2var.count(name), name.c_str());
+            (*handles)[p2a.first].push_back(name2var[name]->Grad());
+          }
+        }
+      }
+    };
+
+    for (auto &grad_op_desc : grad_op_descs) {
+      VariableHandleMap in_vars;
+      VariableHandleMap out_vars;
+      // Inputs are always resolved before outputs, so the order no longer
+      // depends on the addresses of the two name maps.
+      resolve(grad_op_desc->Inputs(), &in_vars);
+      resolve(grad_op_desc->Outputs(), &out_vars);
+
+      backward_tape_->AddOp(
+          grad_op_desc->Type(), in_vars, out_vars, grad_op_desc->GetAttrMap());
+    }
+
+    // TODO(tonyyang-svail): how to fill empty grad?
+    // TODO(tonyyang-svail): Sum var grad is necessary
+  }
+
+  backward_tape_->Forward();
+  has_been_backwarded_ = true;
+}
+
+// Returns the single global tape, a function-local static that is constructed
+// on the first call.
+Tape &get_global_tape() {
+  static Tape T;
+  return T;
+}
+
+// Discards everything recorded on the global tape by assigning it a
+// default-constructed Tape.
+void reset_global_tape() { get_global_tape() = Tape(); }
+}
+}
--- a/paddle/contrib/tape/tape.h
+++ b/paddle/contrib/tape/tape.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "paddle/contrib/tape/variable.h"
+
+namespace paddle {
+namespace tape {
+
+using VariableHandleMap = std::map<std::string, std::vector<VariableHandle>>;
+
+// One recorded op: its type name, the variables bound to each input and output
+// slot, and its attributes. The constructor stores copies of all arguments.
+struct OpHandle {
+  OpHandle(const std::string &type,
+           const VariableHandleMap &in_vars,
+           const VariableHandleMap &out_vars,
+           const framework::AttributeMap &attrs)
+      : type_(type), inputs_(in_vars), outputs_(out_vars), attrs_(attrs) {}
+
+  std::string type_;               // operator type name, e.g. "mul"
+  VariableHandleMap inputs_;       // input slot name -> variables
+  VariableHandleMap outputs_;      // output slot name -> variables
+  framework::AttributeMap attrs_;  // operator attributes
+};
+
+// An ordered record of ops that can be executed forward and then differentiated.
+class Tape {
+ public:
+  // Runs shape/var-type inference for the op and appends it to the tape. The op
+  // is not executed until Forward() or Backward() is called.
+  void AddOp(const std::string &type,
+             const VariableHandleMap &in_vars,
+             VariableHandleMap out_vars,
+             const framework::AttributeMap &attrs);
+
+  // Executes every recorded op that has not been run yet.
+  void Forward();
+
+  // Runs pending forward ops, then builds and executes the backward tape,
+  // seeding the gradient of `target`. May be called only once per tape.
+  void Backward(VariableHandle target);
+
+  // True once Backward() has completed. Declared const because it only reads
+  // state, which also lets it be called through a const Tape reference.
+  bool HasBeenBackwarded() const { return has_been_backwarded_; }
+
+ private:
+  bool has_been_backwarded_ = false;
+  // Index of the next op in tape_ that Forward() will execute.
+  size_t current_position_ = 0;
+
+  std::vector<OpHandle> tape_;
+  // Gradient ops; created by Backward().
+  std::shared_ptr<Tape> backward_tape_;
+};
+
+// Returns a reference to the global tape.
+Tape &get_global_tape();
+
+// Resets the global tape; intended to be called at the start of each iteration.
+void reset_global_tape();
+}
+}
--- a/paddle/contrib/tape/test_tape.cc
+++ b/paddle/contrib/tape/test_tape.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "gtest/gtest.h"
+#include "paddle/contrib/tape/function.h"
+
+using namespace paddle::tape;
+
+// Smoke test: two training iterations of a two-layer MLP with a mean loss.
+// It only checks that forward, backward and the SGD updates run without
+// throwing; no numerical results are asserted.
+TEST(Tape, TestMLP) {
+  LOG(INFO) << "TestMLP";
+  Linear linear1(3, 3, "relu");
+  Linear linear2(3, 3, "relu");
+  Mean mean;
+
+  SGD sgd(0.001);
+
+  // The input is a 3x3 FP32 tensor filled with ones.
+  paddle::framework::AttributeMap fill_attrs;
+  fill_attrs["dtype"] =
+      paddle::framework::proto::VarType::Type::VarType_Type_FP32;
+  fill_attrs["shape"] = std::vector<int>{3, 3};
+  fill_attrs["value"] = 1.0f;
+  const std::string initializer = "fill_constant";
+  Fill filler(initializer, fill_attrs);
+
+  // Applies one SGD step to every parameter of a layer.
+  auto update_layer = [&sgd](Linear &layer) {
+    for (auto param : layer.Params()) {
+      sgd(param);
+    }
+  };
+
+  for (int step = 0; step < 2; ++step) {
+    reset_global_tape();
+
+    VariableHandle input(new Variable("input"));
+    filler(input);
+
+    VariableHandle hidden = linear1(input);
+    VariableHandle output = linear2(hidden);
+    VariableHandle loss = mean(output);
+
+    get_global_tape().Backward(loss);
+
+    update_layer(linear1);
+    update_layer(linear2);
+  }
+}
+
+int main(int argc, char** argv) {
+  // Initialize the device context pool with a single CPU place before the
+  // tests are run.
+  std::vector<paddle::platform::Place> cpu_places;
+  cpu_places.emplace_back(paddle::platform::CPUPlace());
+  paddle::platform::DeviceContextPool::Init(cpu_places);
+
+  testing::InitGoogleTest(&argc, argv);
+  const int exit_code = RUN_ALL_TESTS();
+  return exit_code;
+}
--- a/paddle/contrib/tape/variable.cc
+++ b/paddle/contrib/tape/variable.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/contrib/tape/variable.h"
+
+namespace paddle {
+namespace tape {
+
+// Instantiates the payload of `var_` according to the type recorded in
+// `desc_`: a LoDTensor for LOD_TENSOR, a SelectedRows for SELECTED_ROWS.
+// Any other type is rejected via PADDLE_THROW.
+void Variable::InitializeVariable() {
+  // Read the type once and reuse it for both the log line and the dispatch.
+  const framework::proto::VarType::Type var_type = desc_.GetType();
+  LOG(INFO) << "Initializing " << desc_.Name() << " as " << var_type;
+  if (var_type == framework::proto::VarType::LOD_TENSOR) {
+    var_.GetMutable<framework::LoDTensor>();
+  } else if (var_type == framework::proto::VarType::SELECTED_ROWS) {
+    var_.GetMutable<framework::SelectedRows>();
+  } else {
+    PADDLE_THROW("Variable type %d is not in [LOD_TENSOR, SELECTED_ROWS]",
+                 var_type);
+  }
+}
+}
+}
--- a/paddle/contrib/tape/variable.h
+++ b/paddle/contrib/tape/variable.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <memory>
+
+#include "paddle/fluid/framework/operator.h"  // framework::kGradVarSuffix
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/variable.h"
+
+namespace paddle {
+namespace tape {
+
+class Variable;
+using VariableHandle = std::shared_ptr<Variable>;
+
+/*
+ * Combination of
+ *     framework::VarDesc desc_;
+ *     framework::Variable var_;
+ */
+class Variable {
+ public:
+  // Names the variable `pre_fix` followed by the next value of the shared
+  // running counter (see count()).
+  Variable(const std::string& pre_fix)
+      : desc_(pre_fix + std::to_string(count())) {}
+
+  // When `is_grad` is true the name is `pre_fix` + framework::kGradVarSuffix
+  // and the counter is left untouched; otherwise this behaves like the
+  // single-argument constructor.
+  Variable(const std::string& pre_fix, bool is_grad)
+      : desc_(pre_fix + (is_grad ? framework::kGradVarSuffix
+                                 : std::to_string(count()))) {}
+
+  ~Variable() { LOG(INFO) << "Deleting " << Name(); }
+
+  // Instantiate LoDTensor/SelectedRows
+  void InitializeVariable();
+
+  // Returns the gradient variable, creating it on demand. Only a weak
+  // reference is stored here, so the gradient stays alive only while some
+  // caller holds the returned handle; after that, the next call creates a
+  // new gradient variable.
+  VariableHandle Grad() {
+    // lock() tests and acquires in one step. Checking expired() first and
+    // then building a shared_ptr from the weak_ptr leaves a window in which
+    // the construction can throw std::bad_weak_ptr.
+    VariableHandle grad = grad_.lock();
+    if (!grad) {
+      grad.reset(new Variable(desc_.Name(), true));
+      grad_ = grad;
+    }
+    return grad;
+  }
+
+  // Stochastic Gradient Descent with Momentum
+  //  VariableHandle Momentum ();
+
+  //  void init(const std::string& initializer,
+  //            const framework::AttributeMap& attrs);
+
+  // void value() {};
+
+  const framework::VarDesc& Desc() const { return desc_; }
+  framework::VarDesc* MutableDesc() { return &desc_; }
+
+  // TODO(tonyyang-svail): No need to expose name
+  std::string Name() const { return desc_.Name(); }
+
+  framework::Variable* Var() { return &var_; }
+
+ private:
+  // Hands out consecutive integers (0, 1, 2, ...) used as name suffixes.
+  // Static because it reads no instance state and is called from the
+  // constructors' initializer lists. The counter is not synchronized.
+  static int count() {
+    static int counter = 0;
+    return counter++;
+  }
+
+  framework::VarDesc desc_;
+  framework::Variable var_;
+
+  // Weak so that a variable does not keep its gradient alive by itself.
+  std::weak_ptr<Variable> grad_;
+};
+}
+}
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -84,7 +84,7 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
 cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)

 if(WITH_DISTRIBUTE)
-  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr)
+  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr)
  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
  set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 else()

--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -330,8 +330,12 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
  }

  for (auto& op : ctx->ops_) {
-    VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);
+    VLOG(4) << place_ << " " << op->DebugStringEx(local_scope);
    op->Run(*local_scope, place_);
+    // NOTE! Please do not delete this line. It is useful because the debug
+    // strings before and after op.run differ: after the run, the outputs
+    // have their correct shapes, which helps debugging.
+    VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);

    if (FLAGS_benchmark) {
      VLOG(2) << "Memory used after operator " + op->Type() + " running: "
@@ -402,6 +406,9 @@ void Executor::EnableMKLDNN(const ProgramDesc& program) {
      }
    }
  }
+#else
+  LOG(WARNING)
+      << "'MKLDNN' is not supported, Please re-compile with WITH_MKLDNN option";
 #endif
 }


--- a/paddle/fluid/framework/init.cc
+++ b/paddle/fluid/framework/init.cc
@@ -18,6 +18,7 @@ limitations under the License. */

 #include "paddle/fluid/framework/init.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/string/piece.h"
@@ -113,6 +114,9 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
  }
  places.emplace_back(platform::CPUPlace());
  platform::DeviceContextPool::Init(places);
+#ifndef PADDLE_WITH_MKLDNN
+  operators::math::SetNumThreads(1);
+#endif
 }

 void InitGLOG(const std::string &prog_name) {

--- a/paddle/fluid/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
@@ -410,5 +410,38 @@ void LoDTensor::MergeLoDTensor(
  }
 }

+LoD ConvertToLengthBasedLoD(const LoD &offset_lod) {
+  LoD length_lod;
+  length_lod.reserve(offset_lod.size());
+  for (size_t lvl = 0; lvl < offset_lod.size(); ++lvl) {
+    const auto &offsets = offset_lod[lvl];
+    std::vector<size_t> level;
+    level.reserve(offsets.size() > 0 ? offsets.size() - 1 : 0);
+    // `idx + 1 < size()`: an empty level must not wrap unsigned `size() - 1`.
+    for (size_t idx = 0; idx + 1 < offsets.size(); ++idx) {
+      level.push_back(offsets[idx + 1] - offsets[idx]);
+    }
+    length_lod.push_back(level);
+  }
+  return length_lod;
+}
+
+// Each output level is the running sum of its input lengths, starting at 0.
+LoD ConvertToOffsetBasedLoD(const LoD &length_lod) {
+  LoD result;
+  result.reserve(length_lod.size());
+  for (const auto &lengths : length_lod) {
+    std::vector<size_t> offsets{0};
+    offsets.reserve(lengths.size() + 1);
+    size_t running = 0;
+    for (size_t i = 0; i < lengths.size(); ++i) {
+      running += lengths[i];
+      offsets.push_back(running);
+    }
+    result.push_back(offsets);
+  }
+  return result;
+}
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/lod_tensor.h
+++ b/paddle/fluid/framework/lod_tensor.h
@@ -226,5 +226,19 @@ extern void WriteToRecordIO(recordio::Writer* writer,
 extern std::vector<LoDTensor> ReadFromRecordIO(
    recordio::Scanner* scanner, const platform::DeviceContext& dev_ctx);

+/*
+ * Convert between length-based LoD and offset-based LoD.
+ * The implementation of the LoDTensor class uses offset-based LoD.
+ * However, we want to expose the more user-friendly length-based
+ * LoD to the Python side instead.
+ *
+ * Example:
+ * If offset_lod = [[0, 2, 3],[0, 3, 5, 9]]
+ * then length_lod = [[2, 1], [3, 2, 4]]
+ */
+LoD ConvertToLengthBasedLoD(const LoD& offset_lod);
+
+LoD ConvertToOffsetBasedLoD(const LoD& length_lod);
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/lod_tensor_test.cc
+++ b/paddle/fluid/framework/lod_tensor_test.cc
@@ -228,6 +228,38 @@ TEST(LoD, CheckAbsLoD) {
  ASSERT_FALSE(CheckAbsLoD(abs_lod0));
 }

+TEST(LoD, ConvertToLengthBasedLoD) {
+  LoD offsets;
+  offsets.push_back(std::vector<size_t>({0, 2}));
+  offsets.push_back(std::vector<size_t>({0, 1, 3}));
+  offsets.push_back(std::vector<size_t>({0, 2, 4, 5}));
+
+  // Each expected length is the gap between two adjacent offsets.
+  LoD want;
+  want.push_back(std::vector<size_t>({2}));
+  want.push_back(std::vector<size_t>({1, 2}));
+  want.push_back(std::vector<size_t>({2, 2, 1}));
+
+  LoD got = ConvertToLengthBasedLoD(offsets);
+  EXPECT_EQ(got, want);
+}
+
+TEST(LoD, ConvertToOffsetBasedLoD) {
+  LoD lengths;
+  lengths.push_back(std::vector<size_t>({2}));
+  lengths.push_back(std::vector<size_t>({1, 2}));
+  lengths.push_back(std::vector<size_t>({2, 2, 1}));
+
+  // Each expected level is 0 followed by the running sums of the lengths.
+  LoD want;
+  want.push_back(std::vector<size_t>({0, 2}));
+  want.push_back(std::vector<size_t>({0, 1, 3}));
+  want.push_back(std::vector<size_t>({0, 2, 4, 5}));
+
+  LoD got = ConvertToOffsetBasedLoD(lengths);
+  EXPECT_EQ(got, want);
+}
+
 template <typename T>
 static void TestRecordIO() {
  LoDTensor tensor;

--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -69,6 +69,19 @@ static DDim GetDims(const Scope& scope, const std::string& name,
  }
 }

+// Row count of the SelectedRows variable `name` in `scope`; -1 when the
+// variable cannot be found or holds some other type.
+static int GetRowSize(const Scope& scope, const std::string& name) {
+  Variable* found = scope.FindVar(name);
+
+  const bool holds_selected_rows =
+      found != nullptr && found->IsType<SelectedRows>();
+  if (!holds_selected_rows) {
+    return -1;
+  }
+  return found->Get<SelectedRows>().rows().size();
+}
+
 static LoD GetLoD(const Scope& scope, const std::string& name) {
  Variable* var = scope.FindVar(name);
  auto default_lod = LoD({{}});
@@ -85,6 +98,7 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
 }

 void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
+  VLOG(10) << "- " << DebugStringEx(&scope);
  if (platform::is_gpu_place(place)) {
 #ifndef PADDLE_WITH_CUDA
    PADDLE_THROW("Cannot run operator on place %s", place);
@@ -94,6 +108,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
 #endif
  }
  RunImpl(scope, place);
+  VLOG(10) << "+ " << DebugStringEx(&scope);
 }

 bool OperatorBase::HasInputs(const std::string& name) const {
@@ -153,6 +168,10 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
    for (size_t i = 0; i < input.second.size(); ++i) {
      ss << input.second[i];
      if (scope) {
+        int row_size = GetRowSize(*scope, input.second[i]);
+        if (row_size >= 0) {
+          ss << "[row_size=" << row_size << "]";
+        }
        ss << "[" << GetDims(*scope, input.second[i], true) << "]";
        ss << "(" << GetLoD(*scope, input.second[i]) << ")";
      }
@@ -173,6 +192,10 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
    for (size_t i = 0; i < output.second.size(); ++i) {
      ss << output.second[i];
      if (scope) {
+        int row_size = GetRowSize(*scope, output.second[i]);
+        if (row_size >= 0) {
+          ss << "[row_size=" << row_size << "]";
+        }
        ss << "[" << GetDims(*scope, output.second[i], true) << "]";
        ss << "(" << GetLoD(*scope, output.second[i]) << ")";
      }

--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -145,9 +145,9 @@ void ParallelExecutor::BCastParamsToGPUs(
    auto &dims = main_tensor.dims();
    if (paddle::platform::is_gpu_place(main_tensor.place())) {
 #ifdef PADDLE_WITH_CUDA
+      std::vector<void *> buffers;
      size_t numel = main_tensor.numel();
      ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
-      platform::NCCLGroupGuard guard;
      for (size_t i = 0; i < member_->places_.size(); ++i) {
        auto place = member_->places_[i];
        void *buffer;
@@ -159,11 +159,21 @@ void ParallelExecutor::BCastParamsToGPUs(
          t->Resize(dims);
          buffer = t->mutable_data(place, main_tensor.type());
        }
-        auto &nccl_ctx = member_->nccl_ctxs_->at(place);
-        platform::dynload::ncclBcast(buffer, numel, data_type, 0,
-                                     nccl_ctx.comm_, nccl_ctx.stream());
+        buffers.push_back(buffer);
      }
-      member_->nccl_ctxs_->WaitAll();
+
+      PADDLE_ENFORCE_EQ(member_->places_.size(), buffers.size(),
+                        "variables' buffer size to bcast NOT equal to places");
+      {
+        platform::NCCLGroupGuard guard;
+        for (size_t i = 0; i < member_->places_.size(); ++i) {
+          auto &nccl_ctx = member_->nccl_ctxs_->at(member_->places_[i]);
+          platform::dynload::ncclBcast(buffers[i], numel, data_type, 0,
+                                       nccl_ctx.comm_, nccl_ctx.stream());
+        }
+        member_->nccl_ctxs_->WaitAll();
+      }
+
 #else
      PADDLE_THROW("Not compiled with CUDA");
 #endif

--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -43,48 +43,29 @@ Scope& Scope::NewScope() const {
 }

 Variable* Scope::Var(const std::string& name) {
-  // acquire the lock when new var under this scope
  std::unique_lock<std::mutex> lock(mutex_);
-  auto* v = FindVarLocally(name);
-  if (v != nullptr) return v;
-
-  v = new Variable();
-  vars_[name].reset(v);
-  VLOG(3) << "Create variable " << name;
-  v->name_ = &(vars_.find(name)->first);
-  return v;
+  return VarInternal(name);
 }

 Variable* Scope::Var(std::string* name) {
-  auto var_name = string::Sprintf("%p.%d", this, vars_.size());
+  std::unique_lock<std::mutex> lock(mutex_);
+  auto new_name = string::Sprintf("%p.%d", this, vars_.size());
  if (name != nullptr) {
-    *name = var_name;
+    *name = new_name;
  }
-  return Var(var_name);
+  return VarInternal(new_name);
 }

 Variable* Scope::FindVar(const std::string& name) const {
-  // acquire the lock when find var
  std::unique_lock<std::mutex> lock(mutex_);
  return FindVarInternal(name);
 }

-Variable* Scope::FindVarInternal(const std::string& name) const {
-  auto var = FindVarLocally(name);
-  if (var != nullptr) {
-    return var;
-  }
-  return (parent_ == nullptr) ? nullptr : parent_->FindVarInternal(name);
-}
-
 const Scope* Scope::FindScope(const Variable* var) const {
-  for (auto& kv : vars_) {
-    if (kv.second.get() == var) {
-      return this;
-    }
-  }
-  return (parent_ == nullptr) ? nullptr : parent_->FindScope(var);
+  std::unique_lock<std::mutex> lock(mutex_);
+  return FindScopeInternal(var);
 }
+
 void Scope::DropKids() {
  std::unique_lock<std::mutex> lock(mutex_);
  for (Scope* s : kids_) delete s;
@@ -92,6 +73,7 @@ void Scope::DropKids() {
 }

 std::vector<std::string> Scope::LocalVarNames() const {
+  std::unique_lock<std::mutex> lock(mutex_);
  std::vector<std::string> known_vars;
  known_vars.reserve(this->vars_.size());
  for (auto& p : vars_) {
@@ -127,6 +109,39 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) {

 void Scope::Rename(const std::string& origin_name,
                   const std::string& new_name) const {
+  std::unique_lock<std::mutex> lock(mutex_);
+  RenameInternal(origin_name, new_name);
+}
+
+std::string Scope::Rename(const std::string& origin_name) const {
+  std::unique_lock<std::mutex> guard(mutex_);  // held for the whole rename
+  auto fresh_name = string::Sprintf("%p.%d", this, vars_.size());
+  RenameInternal(origin_name, fresh_name);
+  return fresh_name;  // the generated name is reported back to the caller
+}
+
+Variable* Scope::VarInternal(const std::string& name) {
+  // Reuse the variable when this scope already holds one under `name`.
+  if (auto* existing = FindVarLocally(name)) return existing;
+  auto* created = new Variable();
+  vars_[name].reset(created);
+  VLOG(3) << "Create variable " << name;
+  // name_ refers to the key stored inside vars_, not to the caller's string.
+  created->name_ = &(vars_.find(name)->first);
+  return created;
+}
+
+const Scope* Scope::FindScopeInternal(const Variable* var) const {
+  // Search this scope's own variables first, then defer to the parent.
+  for (auto& entry : vars_) {
+    if (entry.second.get() == var) return this;
+  }
+  if (parent_ == nullptr) return nullptr;
+  return parent_->FindScope(var);
+}
+
+void Scope::RenameInternal(const std::string& origin_name,
+                           const std::string& new_name) const {
  auto origin_it = vars_.find(origin_name);
  PADDLE_ENFORCE(origin_it != vars_.end(),
                 "Cannot find original variable with name %s", origin_name);
@@ -137,10 +152,12 @@ void Scope::Rename(const std::string& origin_name,
  vars_.erase(origin_it);
 }

-std::string Scope::Rename(const std::string& origin_name) const {
-  auto var_name = string::Sprintf("%p.%d", this, vars_.size());
-  Rename(origin_name, var_name);
-  return var_name;
+Variable* Scope::FindVarInternal(const std::string& name) const {
+  auto var = FindVarLocally(name);
+  if (var != nullptr) {
+    return var;
+  }
+  return (parent_ == nullptr) ? nullptr : parent_->FindVar(name);
 }

 Variable* Scope::FindVarLocally(const std::string& name) const {

--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -81,20 +81,29 @@ class Scope {
  // Rename variable to a new name and return the new name
  std::string Rename(const std::string& origin_name) const;

+ protected:
+  mutable std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
+
 private:
  // Call Scope::NewScope for a sub-scope.
  explicit Scope(Scope const* parent) : parent_(parent) {}

+  // Called by Var.
+  Variable* VarInternal(const std::string& name);
+
+  // Called by FindScope.
+  const Scope* FindScopeInternal(const Variable* var) const;
+
+  // Called by Rename.
+  void RenameInternal(const std::string& origin_name,
+                      const std::string& new_name) const;
+
  // Called by FindVar recursively.
-  // Caller doesn't own the returned Variable.
  Variable* FindVarInternal(const std::string& name) const;

  // Called by FindVarInternal and Var.
-  // Caller doesn't own the returned Variable.
  Variable* FindVarLocally(const std::string& name) const;

-  mutable std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
-
  // Scope in `kids_` are owned by this class.
  mutable std::list<Scope*> kids_;
  Scope const* parent_{nullptr};

--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
@@ -20,16 +20,20 @@ limitations under the License. */
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/pybind/pybind.h"

 DEFINE_string(devices, "", "The devices to be used which is joined by comma.");
 DEFINE_bool(init_p2p, false, "Whether to init p2p.");
+DEFINE_int32(math_num_threads, 1,
+             "Number of threads used to run math functions.");

 namespace paddle {
 namespace inference {

 void Init(const std::vector<std::string> argv) {
  framework::InitGflags(argv);
+  operators::math::SetNumThreads(FLAGS_math_num_threads);
  // init devices
  std::vector<int> devices;
  std::string token;

--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -64,7 +64,8 @@ class OpConverter {
    (*it)(op, scope, test_mode);
  }

-  // convert fluid block to tensorrt network
+  // Convert a fluid block to a tensorrt network. NOTE: it only converts the
+  // operators; the INetwork's inputs and outputs should be specified elsewhere.
  void ConvertBlock(const framework::proto::BlockDesc& block,
                    const std::unordered_set<std::string>& parameters,
                    const framework::Scope& scope, TensorRTEngine* engine) {

--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -51,11 +51,12 @@ class TensorRTEngine : public EngineBase {
    nvinfer1::Weights w_;
  };

-  TensorRTEngine(int max_batch, int max_workspace, cudaStream_t* stream,
+  TensorRTEngine(int max_batch, int max_workspace,
+                 cudaStream_t* stream = nullptr,
                 nvinfer1::ILogger& logger = NaiveLogger::Global())
      : max_batch_(max_batch),
        max_workspace_(max_workspace),
-        stream_(stream),
+        stream_(stream ? stream : &default_stream_),
        logger_(logger) {}

  virtual ~TensorRTEngine();
@@ -121,6 +122,8 @@ class TensorRTEngine : public EngineBase {
  // the max memory size the engine uses
  int max_workspace_;
  cudaStream_t* stream_;
+  // If stream_ is not set from outside, hold its own stream.
+  cudaStream_t default_stream_;
  nvinfer1::ILogger& logger_;

  std::vector<Buffer> buffers_;
@@ -165,20 +168,31 @@ class TensorRTEngine : public EngineBase {
 */
 class TRT_EngineManager {
 public:
-  TensorRTEngine* Create(int max_batch, int max_workspace,
-                         cudaStream_t* stream) {
-    engines_.emplace_back(new TensorRTEngine(max_batch, max_workspace, stream));
-    return engines_.back().get();
+  bool HasEngine(const std::string& name) const {
+    return engines_.count(name) != 0;
+  }
+
+  // Get an engine called `name`.
+  TensorRTEngine* Get(const std::string& name) const {
+    return engines_.at(name).get();
+  }
+
+  // Create an engine called `name`, replacing any existing one.
+  TensorRTEngine* Create(int max_batch, int max_workspace, cudaStream_t* stream,
+                         const std::string& name) {
+    auto* p = new TensorRTEngine(max_batch, max_workspace, stream);
+    engines_[name].reset(p);
+    return p;
  }

  void DeleteALl() {
-    for (auto& ptr : engines_) {
-      ptr.reset(nullptr);
+    for (auto& item : engines_) {
+      item.second.reset(nullptr);
    }
  }

 private:
-  std::vector<std::unique_ptr<TensorRTEngine>> engines_;
+  std::unordered_map<std::string, std::unique_ptr<TensorRTEngine>> engines_;
 };

 }  // namespace tensorrt

--- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
@@ -29,6 +29,7 @@ DEFINE_string(data_file, "", "File of input index data.");
 DEFINE_int32(repeat, 100, "Running the inference program repeat times");
 DEFINE_bool(prepare_vars, true, "Prepare variables before executor");
 DEFINE_int32(num_threads, 1, "Number of threads should be used");
+DECLARE_bool(use_mkldnn);

 inline double GetCurrentMs() {
  struct timeval time;
@@ -103,9 +104,9 @@ void ThreadRunInfer(
    const int tid, paddle::framework::Scope* scope,
    const std::vector<std::vector<const paddle::framework::LoDTensor*>>& jobs) {
  // maybe framework:ProgramDesc is not thread-safe
+  paddle::platform::CPUPlace place;
+  paddle::framework::Executor executor(place);
  auto& sub_scope = scope->NewScope();
-  auto place = paddle::platform::CPUPlace();
-  auto executor = paddle::framework::Executor(place);
  auto inference_program =
      paddle::inference::Load(&executor, scope, FLAGS_model_path);

@@ -182,8 +183,8 @@ TEST(inference, nlp) {
    stop_ms = GetCurrentMs();
  } else {
    // 1. Define place, executor, scope
-    auto place = paddle::platform::CPUPlace();
-    auto executor = paddle::framework::Executor(place);
+    paddle::platform::CPUPlace place;
+    paddle::framework::Executor executor(place);

    // 2. Initialize the inference_program and load parameters
    std::unique_ptr<paddle::framework::ProgramDesc> inference_program;

--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -43,14 +43,16 @@ void* CPUAllocator::Alloc(size_t* index, size_t size) {

  *index = 0;  // unlock memory

-  void* p;
+  void* p = nullptr;

 #ifdef PADDLE_WITH_MKLDNN
  // refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp
  // memory alignment
-  PADDLE_ENFORCE_EQ(posix_memalign(&p, 4096ul, size), 0);
+  PADDLE_ENFORCE_EQ(posix_memalign(&p, 4096ul, size), 0, "Alloc %ld error!",
+                    size);
 #else
-  PADDLE_ENFORCE_EQ(posix_memalign(&p, 32ul, size), 0);
+  PADDLE_ENFORCE_EQ(posix_memalign(&p, 32ul, size), 0, "Alloc %ld error!",
+                    size);
 #endif
  PADDLE_ENFORCE(p, "Fail to allocate CPU memory: size = %d .", size);


--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -19,18 +19,18 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-#define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT)               \
-  class OP_NAME##OpMaker                                                \
-      : public ::paddle::framework::OpProtoAndCheckerMaker {            \
-   public:                                                              \
-    void Make() override {                                              \
-      AddInput("X", "Input of " #OP_NAME " operator");                  \
-      AddOutput("Out", "Output of " #OP_NAME " operator").Reuse("X");   \
-      AddAttr<bool>("use_mkldnn",                                       \
-                    "(bool, default false) Only used in mkldnn kernel") \
-          .SetDefault(false);                                           \
-      AddComment(OP_COMMENT);                                           \
-    }                                                                   \
+#define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT)             \
+  class OP_NAME##OpMaker                                              \
+      : public ::paddle::framework::OpProtoAndCheckerMaker {          \
+   public:                                                            \
+    void Make() override {                                            \
+      AddInput("X", "Input of " #OP_NAME " operator");                \
+      AddOutput("Out", "Output of " #OP_NAME " operator").Reuse("X"); \
+      AddAttr<bool>("use_mkldnn",                                     \
+                    "(default false) Only used in mkldnn kernel")     \
+          .SetDefault(false);                                         \
+      AddComment(OP_COMMENT);                                         \
+    }                                                                 \
  }

 #define REGISTER_ACTIVATION_OP_GRAD_MAKER(OP_NAME, KERNEL_TYPE)              \
@@ -112,7 +112,7 @@ $$out = \frac{1}{1 + e^{-x}}$$
 __attribute__((unused)) constexpr char LogSigmoidDoc[] = R"DOC(
 Logsigmoid Activation Operator

-$$out = \log \frac{1}{1 + e^{-x}}$$
+$$out = \\log \\frac{1}{1 + e^{-x}}$$

 )DOC";

@@ -133,7 +133,7 @@ $out = \max(x, 0)$
 __attribute__((unused)) constexpr char TanhDoc[] = R"DOC(
 Tanh Activation Operator.

-$$out = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
+$$out = \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$

 )DOC";

@@ -196,7 +196,7 @@ $out = [x]$
 __attribute__((unused)) constexpr char ReciprocalDoc[] = R"DOC(
 Reciprocal Activation Operator.

-$$out = \frac{1}{x}$$
+$$out = \\frac{1}{x}$$

 )DOC";

@@ -252,15 +252,14 @@ class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
    AddOutput("Out", "Output of Softshrink operator");
    AddAttr<float>("lambda", "non-negative offset").SetDefault(0.5f);
    AddComment(R"DOC(
-Softshrink Activation Operator.
+:strong:`Softshrink Activation Operator`

-$$
-out = \begin{cases} 
-    x - \lambda, \text{if } x > \lambda \\
-    x + \lambda, \text{if } x < -\lambda \\
-    0,  \text{otherwise}
-    \end{cases}
-$$
+..  math::
+    out = \begin{cases} 
+         x - \lambda, \text{if } x > \lambda \\
+         x + \lambda, \text{if } x < -\lambda \\
+         0,  \text{otherwise}
+         \end{cases}

 )DOC");
  }
@@ -271,18 +270,18 @@ class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
  void Make() override {
    AddInput("X", "Input of HardShrink operator");
    AddOutput("Out", "Output of HardShrink operator");
-    AddAttr<float>("threshold", "The value of threshold for HardShrink")
+    AddAttr<float>("threshold",
+                   "The value of threshold for HardShrink. [default: 0.5]")
        .SetDefault(0.5f);
    AddComment(R"DOC(
-HardShrink Activation Operator.
+:strong:`HardShrink activation operator`

-$$
-out = \begin{cases} 
-    x, \text{if } x > \lambda \\
-    x, \text{if } x < -\lambda \\
-    0,  \text{otherwise}
-    \end{cases}
-$$
+..  math::
+    out = \begin{cases}
+            x, \text{if } x > \lambda \\
+            x, \text{if } x < -\lambda \\
+            0,  \text{otherwise}
+          \end{cases}

 )DOC");
  }
@@ -394,18 +393,18 @@ class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker {
  void Make() override {
    AddInput("X", "Input of ThresholdedRelu operator");
    AddOutput("Out", "Output of ThresholdedRelu operator");
-    AddAttr<float>("threshold", "The threshold location of activation")
+    AddAttr<float>("threshold",
+                   "The threshold location of activation. [default 1.0].")
        .SetDefault(1.0f);
    AddComment(R"DOC(
-ThresholdedRelu Activation Operator.
+:strong:`ThresholdedRelu activation operator`

-$$
-out = \begin{cases} 
-    x, \text{if } x > threshold \\
-    0,  \text{otherwise}
-    \end{cases}
-$$
+..  math::

+    out = \begin{cases}
+             x,  \text{if } x > threshold \\
+             0,  \text{otherwise}
+          \end{cases}
 )DOC");
  }
 };
@@ -444,7 +443,7 @@ class SwishOpMaker : public framework::OpProtoAndCheckerMaker {
    AddComment(R"DOC(
 Swish Activation Operator.

-$$out = \frac{x}{1 + e^{- \beta x}}$$
+$$out = \\frac{x}{1 + e^{- \beta x}}$$

 )DOC");
  }

--- a/paddle/fluid/operators/batch_norm_mkldnn_op.cc
+++ b/paddle/fluid/operators/batch_norm_mkldnn_op.cc
@@ -19,10 +19,17 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-using Tensor = framework::Tensor;
+using batch_norm_bwd = mkldnn::batch_normalization_backward;
+using batch_norm_fwd = mkldnn::batch_normalization_forward;
+using framework::DataLayout;
+using framework::Tensor;
+using mkldnn::memory;
+using mkldnn::primitive;
+using mkldnn::reorder;
+using mkldnn::stream;
 using paddle::platform::MKLDNNDeviceContext;
 using paddle::platform::MKLDNNMemDesc;
-using mkldnn::memory;
+using platform::to_void_cast;

 template <typename T>
 using EigenArrayMap =
@@ -64,21 +71,12 @@ void run_batch_norm_op(Args &&... args) {
  mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
 }

-template <typename T>
-inline void *cast_const_to_void(const T *t) {
-  return static_cast<void *>(const_cast<T *>(t));
-}
 }  // namespace

 template <typename T>
 class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto data_layout_str = ctx.Attr<std::string>("data_layout");
-    auto data_layout = framework::StringToDataLayout(data_layout_str);
-    PADDLE_ENFORCE(data_layout == framework::DataLayout::kNCHW,
-                   "MKLDNN batch normalization handles only NCHW data layout");
-
    const float epsilon = ctx.Attr<float>("epsilon");
    const float momentum = ctx.Attr<float>("momentum");
    const bool is_test = ctx.Attr<bool>("is_test");
@@ -99,41 +97,53 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    const auto *scale = ctx.Input<Tensor>("Scale");
    const auto *shift = ctx.Input<Tensor>("Bias");

-    y->mutable_data<T>(ctx.GetPlace());
-    mean_out->mutable_data<T>(ctx.GetPlace());
-    variance_out->mutable_data<T>(ctx.GetPlace());
+    PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN &&
+                       x->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Input x tensor");
+
+    const T *x_data = x->data<T>();
+    const T *mean_data = mean->data<T>();
+    const T *variance_data = variance->data<T>();
+    T *y_data = y->mutable_data<T>(ctx.GetPlace());
+    T *mean_out_data = mean_out->mutable_data<T>(ctx.GetPlace());
+    T *variance_out_data = variance_out->mutable_data<T>(ctx.GetPlace());
+    T *batch_mean_data = nullptr;
+    T *batch_variance_data = nullptr;

    if (!is_test) {
-      batch_mean->mutable_data<T>(ctx.GetPlace());
-      batch_variance->mutable_data<T>(ctx.GetPlace());
+      batch_mean_data = batch_mean->mutable_data<T>(ctx.GetPlace());
+      batch_variance_data = batch_variance->mutable_data<T>(ctx.GetPlace());
    }

    auto propagation = is_test == true ? mkldnn::prop_kind::forward_scoring
                                       : mkldnn::prop_kind::forward_training;

-    auto dims = paddle::framework::vectorize2int(x->dims());
-
-    auto src_md =
-        MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
-    auto dst_md =
-        MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
-
-    auto src_pd = mkldnn::memory::primitive_desc{src_md, mkldnn_engine};
-    auto dst_pd = mkldnn::memory::primitive_desc{dst_md, mkldnn_engine};
-
-    auto src = mkldnn::memory{src_pd, cast_const_to_void(x->data<T>())};
-    auto dst = mkldnn::memory{dst_pd, y->data<T>()};
+    auto src_tz = paddle::framework::vectorize2int(x->dims());
+    auto scale_tz = paddle::framework::vectorize2int(scale->dims());
+    PADDLE_ENFORCE(scale_tz.size() == 1, "Dims of scale tensor is NOT 1");
+    const unsigned int ic = scale_tz[0];

    unsigned flags = mkldnn::use_scale_shift;
    if (is_test) flags |= mkldnn::use_global_stats;

+    // create mkldnn memory from input x tensor
+    auto src_memory =
+        memory({{{src_tz}, memory::data_type::f32, x->format()}, mkldnn_engine},
+               to_void_cast(x_data));
+
+    // create primitive descriptor for batch norm forward
    using bn_fwd_types = bn_type_traits<mkldnn::batch_normalization_forward>;
-    auto batch_norm_fwd_desc =
-        bn_fwd_types::op_desc{propagation, src_md, epsilon, flags};
-    auto batch_norm_fwd_pd =
-        bn_fwd_types::op_prim{batch_norm_fwd_desc, mkldnn_engine};
+    auto batch_norm_fwd_desc = bn_fwd_types::op_desc{
+        propagation, src_memory.get_primitive_desc().desc(), epsilon, flags};
+    std::shared_ptr<batch_norm_fwd::primitive_desc> batch_norm_fwd_pd =
+        std::shared_ptr<batch_norm_fwd::primitive_desc>(
+            new batch_norm_fwd::primitive_desc(batch_norm_fwd_desc,
+                                               mkldnn_engine));

-    const unsigned int ic = dims[1];
+    // Save the pd to be used in backward pass
+    const std::string key = ctx.op().Output("SavedMean");
+    const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd";
+    dev_ctx.SetBlob(key_batch_norm_fwd_pd, batch_norm_fwd_pd);

    // MKLDNN requires a single piece of memory for scale and shift/bias data
    const size_t scaleshift_size = 2 * ic;
@@ -143,73 +153,58 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    copy_to_weights(scale->data<T>(), scale->data<T>() + ic, shift->data<T>(),
                    shift->data<T>() + ic, &scaleshift_data);

-    auto scaleshift_memory = mkldnn::memory{
-        batch_norm_fwd_pd.weights_primitive_desc(), scaleshift_data.data()};
+    // create mkldnn memory for weights(scale/shift)
+    auto scaleshift_memory = memory(batch_norm_fwd_pd->weights_primitive_desc(),
+                                    scaleshift_data.data());

-    if (is_test) {
-      auto mean_memory = mkldnn::memory{batch_norm_fwd_pd.mean_primitive_desc(),
-                                        cast_const_to_void(mean->data<T>())};
+    // create mkldnn memory for output y tensor
+    auto dst_memory = memory(batch_norm_fwd_pd->dst_primitive_desc(), y_data);

+    if (is_test) {
+      // create mkldnn memory for stats (as input)
+      auto mean_memory = memory(batch_norm_fwd_pd->mean_primitive_desc(),
+                                to_void_cast(mean_data));
      auto variance_memory =
-          mkldnn::memory{batch_norm_fwd_pd.variance_primitive_desc(),
-                         cast_const_to_void(variance->data<T>())};
+          memory(batch_norm_fwd_pd->variance_primitive_desc(),
+                 to_void_cast(variance_data));

      run_batch_norm_op<typename bn_fwd_types::op_type>(
-          batch_norm_fwd_pd, src, (const mkldnn::primitive::at &)mean_memory,
+          *batch_norm_fwd_pd, src_memory,
+          (const mkldnn::primitive::at &)mean_memory,
          (const mkldnn::primitive::at &)variance_memory, scaleshift_memory,
-          dst);
+          dst_memory);
    } else {
+      // create mkldnn memory for stats (as output)
      auto mean_memory =
-          mkldnn::memory{batch_norm_fwd_pd.mean_primitive_desc(),
-                         cast_const_to_void(batch_mean->data<T>())};
-
-      auto variance_memory =
-          mkldnn::memory{batch_norm_fwd_pd.variance_primitive_desc(),
-                         cast_const_to_void(batch_variance->data<T>())};
+          memory(batch_norm_fwd_pd->mean_primitive_desc(), batch_mean_data);
+      auto variance_memory = memory(
+          batch_norm_fwd_pd->variance_primitive_desc(), batch_variance_data);

-      run_batch_norm_op<bn_fwd_types::op_type>(batch_norm_fwd_pd, src,
-                                               scaleshift_memory, dst,
+      run_batch_norm_op<bn_fwd_types::op_type>(*batch_norm_fwd_pd, src_memory,
+                                               scaleshift_memory, dst_memory,
                                               mean_memory, variance_memory);
    }

    if (!is_test) {
-      const unsigned int in = dims[0];
-      const unsigned int sample_size = x->numel() / in / ic;
-
-      // saved_xx is use just in this batch of data
-      EigenVectorArrayMap<T> saved_mean_e(
-          batch_mean->mutable_data<T>(ctx.GetPlace()), ic);
-      EigenVectorArrayMap<T> saved_variance_e(
-          batch_variance->mutable_data<T>(ctx.GetPlace()), ic);
-      saved_mean_e.setZero();
-      saved_variance_e.setZero();
-
-      const unsigned int x_arr_size = in * ic;
-      ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, x_arr_size);
-      for (unsigned int nc = 0; nc < x_arr_size; ++nc) {
-        saved_mean_e(nc % ic) += x_arr.col(nc).sum();
-      }
-      saved_mean_e /= in * sample_size;
-      for (unsigned int nc = 0; nc < x_arr_size; ++nc) {
-        saved_variance_e(nc % ic) +=
-            (x_arr.col(nc) - saved_mean_e(nc % ic)).matrix().squaredNorm();
-      }
-      saved_variance_e /= in * sample_size;
-
-      ConstEigenVectorArrayMap<T> mean_arr{mean->data<T>(), ic};
-      ConstEigenVectorArrayMap<T> variance_arr{variance->data<T>(), ic};
-
-      EigenVectorArrayMap<T> running_mean_arr(
-          mean_out->mutable_data<T>(ctx.GetPlace()), ic);
-      EigenVectorArrayMap<T> running_var_arr(
-          variance_out->mutable_data<T>(ctx.GetPlace()), ic);
+      // mkldnn only computes stats for the current batch,
+      // so we need to compute the momentum stats via the Eigen lib
+      EigenVectorArrayMap<T> batch_mean_e(batch_mean_data, ic);
+      EigenVectorArrayMap<T> batch_variance_e(batch_variance_data, ic);
+      ConstEigenVectorArrayMap<T> mean_e(mean_data, ic);
+      ConstEigenVectorArrayMap<T> variance_e{variance_data, ic};
+
+      EigenVectorArrayMap<T> running_mean_e(mean_out_data, ic);
+      EigenVectorArrayMap<T> running_variance_e(variance_out_data, ic);

      auto one_minus_momentum = 1. - momentum;
-      running_mean_arr =
-          mean_arr * momentum + saved_mean_e * one_minus_momentum;
-      running_var_arr =
-          variance_arr * momentum + saved_variance_e * one_minus_momentum;
+      running_mean_e = mean_e * momentum + batch_mean_e * one_minus_momentum;
+      running_variance_e =
+          variance_e * momentum + batch_variance_e * one_minus_momentum;
    }
+
+    y->set_layout(DataLayout::kMKLDNN);
+    y->set_format(
+        (memory::format)dst_memory.get_primitive_desc().desc().data.format);
  }
 };

@@ -217,11 +212,6 @@ template <typename T>
 class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
 public:
  void Compute(const paddle::framework::ExecutionContext &ctx) const override {
-    auto data_layout_str = ctx.Attr<std::string>("data_layout");
-    auto data_layout = framework::StringToDataLayout(data_layout_str);
-    PADDLE_ENFORCE(data_layout == framework::DataLayout::kNCHW,
-                   "MKLDNN batch normalization handles only NCHW data layout");
-
    auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
    auto mkldnn_engine = dev_ctx.GetEngine();

@@ -238,88 +228,132 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
    auto *diff_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
    auto *diff_shift = ctx.Output<Tensor>(framework::GradVarName("Bias"));

-    diff_x->mutable_data<T>(ctx.GetPlace());
-    diff_scale->mutable_data<T>(ctx.GetPlace());
-    diff_shift->mutable_data<T>(ctx.GetPlace());
+    PADDLE_ENFORCE(diff_y->layout() == DataLayout::kMKLDNN &&
+                       diff_y->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Input diff_y tensor");
+
+    const T *x_data = x->data<T>();
+    const T *diff_y_data = diff_y->data<T>();
+    const T *batch_mean_data = batch_mean->data<T>();
+    const T *batch_variance_data = batch_variance->data<T>();
+    const T *scale_data = scale->data<T>();
+    const T *shift_data = shift->data<T>();
+    T *diff_x_data = diff_x->mutable_data<T>(ctx.GetPlace());
+    T *diff_scale_data = diff_scale->mutable_data<T>(ctx.GetPlace());
+    T *diff_shift_data = diff_shift->mutable_data<T>(ctx.GetPlace());
+
+    auto src_tz = paddle::framework::vectorize2int(x->dims());
+    auto diff_src_tz = src_tz;
+    auto dst_tz = src_tz;
+    auto diff_dst_tz = dst_tz;
+    auto scale_tz = paddle::framework::vectorize2int(scale->dims());
+    PADDLE_ENFORCE(scale_tz.size() == 1, "Dims of scale tensor is NOT 1");
+
+    const unsigned int ic = scale_tz[0];
+
+    // Retrieve bn_fwd_pd from device context
+    const std::string key = ctx.op().Input("SavedMean");
+    const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd";
+    auto batch_norm_fwd_pd =
+        std::static_pointer_cast<batch_norm_fwd::primitive_desc>(
+            dev_ctx.GetBlob(key_batch_norm_fwd_pd));
+    PADDLE_ENFORCE(batch_norm_fwd_pd != nullptr,
+                   "Fail to find batch_norm_fwd_pd in device context");

-    auto dims = paddle::framework::vectorize2int(x->dims());
-    unsigned flags = mkldnn::use_scale_shift | !mkldnn::use_global_stats;
+    using bn_bwd_types = bn_type_traits<mkldnn::batch_normalization_backward>;

-    auto src_md =
-        MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
-    auto dst_md =
-        MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
-    auto diff_src_md =
-        MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
-    auto diff_dst_md =
-        MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw);
+    // create mkldnn memory from input diff_y tensor
+    auto user_diff_dst_memory =
+        memory({{{diff_dst_tz}, memory::data_type::f32, diff_y->format()},
+                mkldnn_engine},
+               to_void_cast(diff_y_data));

-    using bn_bwd_types = bn_type_traits<mkldnn::batch_normalization_backward>;
-    using bn_fwd_types = bn_type_traits<mkldnn::batch_normalization_forward>;
+    // create mkldnn memory from input x tensor
+    auto src_memory =
+        memory({{{src_tz}, memory::data_type::f32, x->format()}, mkldnn_engine},
+               to_void_cast(x_data));

-    auto batch_norm_fwd_desc = bn_fwd_types::op_desc{
-        mkldnn::prop_kind::forward_training, src_md, epsilon, flags};
-    auto batch_norm_fwd_pd =
-        bn_fwd_types::op_prim{batch_norm_fwd_desc, mkldnn_engine};
+    // for diff_dst, try to use same format as dst in forward pass
+    auto diff_dst_pd = batch_norm_fwd_pd.get()->dst_primitive_desc();
+    auto diff_dst_md = diff_dst_pd.desc();

+    // create primitive descriptor for batch norm backward
+    unsigned flags = mkldnn::use_scale_shift;
    auto batch_norm_bwd_desc = bn_bwd_types::op_desc{
-        mkldnn::prop_kind::backward, diff_dst_md, dst_md, epsilon, flags};
+        mkldnn::prop_kind::backward, diff_dst_md,
+        src_memory.get_primitive_desc().desc(), epsilon, flags};
    auto batch_norm_bwd_pd = bn_bwd_types::op_prim{
-        batch_norm_bwd_desc, mkldnn_engine, batch_norm_fwd_pd};
-
-    auto src = mkldnn::memory{{src_md, mkldnn_engine},
-                              cast_const_to_void(x->data<T>())};
-
-    auto mean = mkldnn::memory{batch_norm_bwd_pd.mean_primitive_desc(),
-                               cast_const_to_void(batch_mean->data<T>())};
-
-    auto variance =
-        mkldnn::memory{batch_norm_bwd_pd.variance_primitive_desc(),
-                       cast_const_to_void(batch_variance->data<T>())};
-
-    auto diff_dst = mkldnn::memory{{diff_dst_md, mkldnn_engine},
-                                   cast_const_to_void(diff_y->data<T>())};
+        batch_norm_bwd_desc, mkldnn_engine, *batch_norm_fwd_pd};
+
+    // reorder user_diff_dst if it's not in preferred format
+    auto diff_dst_memory = user_diff_dst_memory;
+    primitive reorder_diff_dst;
+    bool is_diff_dst_reordered = false;
+    if (diff_dst_pd != user_diff_dst_memory.get_primitive_desc()) {
+      diff_dst_memory = memory(diff_dst_pd);
+      reorder_diff_dst = reorder(user_diff_dst_memory, diff_dst_memory);
+      is_diff_dst_reordered = true;
+    }

-    const unsigned int ic = dims[1];
+    // create mkldnn memory for input tensors (src/mean/variance)
+    auto mean_memory = memory(batch_norm_bwd_pd.mean_primitive_desc(),
+                              to_void_cast(batch_mean_data));
+    auto variance_memory = memory(batch_norm_bwd_pd.variance_primitive_desc(),
+                                  to_void_cast(batch_variance_data));

+    // MKLDNN requires a single piece of memory for scale and shift/bias data
    const size_t scaleshift_size = 2 * ic;

    std::vector<T> scaleshift_data;
    scaleshift_data.reserve(scaleshift_size);
-    copy_to_weights(scale->data<T>(), scale->data<T>() + ic, shift->data<T>(),
-                    shift->data<T>() + ic, &scaleshift_data);
+    copy_to_weights(scale_data, scale_data + ic, shift_data, shift_data + ic,
+                    &scaleshift_data);

-    auto scaleshift_memory = mkldnn::memory{
-        batch_norm_bwd_pd.weights_primitive_desc(), scaleshift_data.data()};
+    // create mkldnn memory for input tensors (scale/shift)
+    auto scaleshift_memory = memory(batch_norm_bwd_pd.weights_primitive_desc(),
+                                    scaleshift_data.data());

+    // create mkldnn memory for output diff weights (combined scale/shift)
    std::vector<T> diff_scaleshift_data;
    diff_scaleshift_data.reserve(scaleshift_size);
-    copy_to_weights(diff_scale->data<T>(), diff_scale->data<T>() + ic,
-                    diff_shift->data<T>(), diff_shift->data<T>() + ic,
-                    &diff_scaleshift_data);
-
    auto diff_scaleshift_memory =
-        mkldnn::memory{batch_norm_bwd_pd.diff_weights_primitive_desc(),
-                       diff_scaleshift_data.data()};
-
-    auto diff_src = mkldnn::memory{{diff_src_md, mkldnn_engine},
-                                   static_cast<void *>(diff_x->data<T>())};
-
-    run_batch_norm_op<bn_bwd_types::op_type>(
-        batch_norm_bwd_pd, src, mean, variance, diff_dst, scaleshift_memory,
-        diff_src, diff_scaleshift_memory);
-
+        memory(batch_norm_bwd_pd.diff_weights_primitive_desc(),
+               diff_scaleshift_data.data());
+
+    // here assume diff_src is in the same format of src
+    auto diff_src_memory = memory(src_memory.get_primitive_desc(), diff_x_data);
+
+    // finally create batch_norm backward primitive
+    auto batch_norm_bwd_prim =
+        batch_norm_bwd(batch_norm_bwd_pd, src_memory, mean_memory,
+                       variance_memory, diff_dst_memory, scaleshift_memory,
+                       diff_src_memory, diff_scaleshift_memory);
+
+    // execute optional reorder and batch_norm backward primitive
+    std::vector<primitive> pipeline;
+    if (is_diff_dst_reordered) pipeline.push_back(reorder_diff_dst);
+    pipeline.push_back(batch_norm_bwd_prim);
+    stream(stream::kind::eager).submit(pipeline).wait();
+
+    // copy back diff scale/shift to output tensors (diff scale/shift)
+    diff_scaleshift_data.resize(scaleshift_size);
    auto it = std::begin(diff_scaleshift_data);
-    std::copy(it, std::next(it, ic), diff_scale->data<T>());
+    std::copy(it, std::next(it, ic), diff_scale_data);
    std::copy(std::next(it, ic), std::end(diff_scaleshift_data),
-              diff_shift->data<T>());
+              diff_shift_data);
+
+    // set layout/format of output tensors
+    diff_x->set_layout(DataLayout::kMKLDNN);
+    diff_x->set_format((memory::format)diff_src_memory.get_primitive_desc()
+                           .desc()
+                           .data.format);
  }
 };
 }  // namespace operators
 }  // namespace paddle

 namespace ops = paddle::operators;
-REGISTER_OP_KERNEL(batch_norm, MKLDNN, paddle::platform::CPUPlace,
+REGISTER_OP_KERNEL(batch_norm, MKLDNN, ::paddle::platform::CPUPlace,
                   ops::BatchNormMKLDNNOpKernel<float>);
-REGISTER_OP_KERNEL(batch_norm_grad, MKLDNN, paddle::platform::CPUPlace,
+REGISTER_OP_KERNEL(batch_norm_grad, MKLDNN, ::paddle::platform::CPUPlace,
                   ops::BatchNormMKLDNNGradOpKernel<float>);
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
@@ -110,19 +110,19 @@ class BatchNormOp : public framework::OperatorWithKernel {
                                         ctx.Input<Tensor>("Variance")->type()),
                      "Variance input should be of float type");

-    framework::LibraryType library_{framework::LibraryType::kPlain};
    // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+    framework::LibraryType library = framework::LibraryType::kPlain;
    framework::DataLayout layout = framework::DataLayout::kAnyLayout;
-
 #ifdef PADDLE_WITH_MKLDNN
-    if (library_ == framework::LibraryType::kPlain &&
+    if (library == framework::LibraryType::kPlain &&
        platform::CanMKLDNNBeUsed(ctx)) {
-      library_ = framework::LibraryType::kMKLDNN;
+      library = framework::LibraryType::kMKLDNN;
      layout = framework::DataLayout::kMKLDNN;
    }
 #endif
+
    return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
-                                   library_);
+                                   library);
  }
 };

@@ -370,19 +370,21 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
      PADDLE_THROW("can't find Y@GRAD");
    }

-    framework::LibraryType library_{framework::LibraryType::kPlain};
    // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
-    framework::DataLayout layout_ = framework::DataLayout::kAnyLayout;
+    framework::LibraryType library = framework::LibraryType::kPlain;
+    framework::DataLayout layout = framework::DataLayout::kAnyLayout;
+
 #ifdef PADDLE_WITH_MKLDNN
-    if (library_ == framework::LibraryType::kPlain &&
+    if (library == framework::LibraryType::kPlain &&
        platform::CanMKLDNNBeUsed(ctx)) {
-      library_ = framework::LibraryType::kMKLDNN;
-      layout_ = framework::DataLayout::kMKLDNN;
+      library = framework::LibraryType::kMKLDNN;
+      layout = framework::DataLayout::kMKLDNN;
    }
 #endif
+
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
-        layout_, library_);
+        layout, library);
  }
 };


--- a/paddle/fluid/operators/chunk_eval_op.cc
+++ b/paddle/fluid/operators/chunk_eval_op.cc
@@ -91,32 +91,31 @@ class ChunkEvalOpMaker : public framework::OpProtoAndCheckerMaker {
        "(int64_t). The number of chunks both in Inference and Label on the "
        "given mini-batch.");
    AddAttr<int>("num_chunk_types",
-                 "(int). The number of chunk type. See below for details.");
-    AddAttr<std::string>(
-        "chunk_scheme",
-        "(string, default IOB). The labeling scheme indicating "
-        "how to encode the chunks. Must be IOB, IOE, IOBES or plain. See below "
-        "for details.")
+                 "The number of chunk type. See the description for details.");
+    AddAttr<std::string>("chunk_scheme",
+                         "The labeling scheme indicating "
+                         "how to encode the chunks. Must be IOB, IOE, IOBES or "
+                         "plain. See the description "
+                         "for details.")
        .SetDefault("IOB");
    AddAttr<std::vector<int>>("excluded_chunk_types",
-                              "(list<int>) A list including chunk type ids "
+                              "A list including chunk type ids "
                              "indicating chunk types that are not counted. "
-                              "See below for details.")
+                              "See the description for details.")
        .SetDefault(std::vector<int>{});
    AddComment(R"DOC(
 For some basics of chunking, please refer to
-‘Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>’.
+'Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>'.

-
-CheckEvalOp computes the precision, recall, and F1-score of chunk detection,
+ChunkEvalOp computes the precision, recall, and F1-score of chunk detection,
 and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes.
 Here is a NER example of labeling for these tagging schemes:
-
- 	     Li     Ming    works  at  Agricultural   Bank   of    China  in  Beijing.
-  IO:    I-PER  I-PER   O      O   I-ORG          I-ORG  I-ORG I-ORG  O   I-LOC
-  IOB:   B-PER  I-PER   O      O   B-ORG          I-ORG  I-ORG I-ORG  O   B-LOC
-  IOE:   I-PER  E-PER   O      O   I-ORG          I-ORG  I-ORG E-ORG  O   E-LOC
-  IOBES: B-PER  E-PER   O      O   I-ORG          I-ORG  I-ORG E-ORG  O   S-LOC
+   
+          Li     Ming    works  at  Agricultural   Bank   of    China  in  Beijing.
+   IO     I-PER  I-PER   O      O   I-ORG          I-ORG  I-ORG I-ORG  O   I-LOC
+   IOB    B-PER  I-PER   O      O   B-ORG          I-ORG  I-ORG I-ORG  O   B-LOC
+   IOE    I-PER  E-PER   O      O   I-ORG          I-ORG  I-ORG E-ORG  O   E-LOC
+   IOBES  B-PER  E-PER   O      O   I-ORG          I-ORG  I-ORG E-ORG  O   S-LOC

 There are three chunk types(named entity types) including PER(person), ORG(organization)
 and LOC(LOCATION), and we can see that the labels have the form <tag type>-<chunk type>.
@@ -124,31 +123,31 @@ and LOC(LOCATION), and we can see that the labels have the form <tag type>-<chun
 Since the calculations actually use label ids rather than labels, extra attention
 should be paid when mapping labels to ids to make CheckEvalOp work. The key point
 is that the listed equations are satisfied by ids.
-
-    tag_type = label % num_tag_type
-    chunk_type = label / num_tag_type
+   
+   tag_type = label % num_tag_type
+   chunk_type = label / num_tag_type

 where `num_tag_type` is the num of tag types in the tagging scheme, `num_chunk_type`
 is the num of chunk types, and `tag_type` get its value from the following table.
-
-    Scheme Begin Inside End   Single
-     plain   0     -      -     -
-     IOB     0     1      -     -
-     IOE     -     0      1     -
-     IOBES   0     1      2     3
+   
+   Scheme Begin Inside End   Single
+    plain   0     -      -     -
+    IOB     0     1      -     -
+    IOE     -     0      1     -
+    IOBES   0     1      2     3

 Still use NER as example, assuming the tagging scheme is IOB while chunk types are ORG,
 PER and LOC. To satisfy the above equations, the label map can be like this:

-    B-ORG  0
-    I-ORG  1
-    B-PER  2
-    I-PER  3
-    B-LOC  4
-    I-LOC  5
-    O      6
+   B-ORG  0
+   I-ORG  1
+   B-PER  2
+   I-PER  3
+   B-LOC  4
+   I-LOC  5
+   O      6

-It’s not hard to verify the equations noting that the num of chunk types
+It's not hard to verify the equations noting that the num of chunk types
 is 3 and the num of tag types in IOB scheme is 2. For example, the label
 id of I-LOC is 5, the tag type id of I-LOC is 1, and the chunk type id of
 I-LOC is 2, which consistent with the results from the equations.

--- a/paddle/fluid/operators/clip_by_norm_op.cc
+++ b/paddle/fluid/operators/clip_by_norm_op.cc
@@ -54,10 +54,19 @@ be linearly scaled to make the L2 norm of $Out$ equal to $max\_norm$, as
 shown in the following formula:

 $$
-Out = \frac{max\_norm * X}{norm(X)},
+Out = \\frac{max\\_norm * X}{norm(X)},
 $$

 where $norm(X)$ represents the L2 norm of $X$.
+
+Examples:
+        .. code-block:: python
+
+            data = fluid.layers.data(
+                name='data', shape=[2, 4, 6], dtype='float32')
+            reshaped = fluid.layers.clip_by_norm(
+                x=data, max_norm=0.5)
+
 )DOC");
  }
 };

--- a/paddle/fluid/operators/compare_op.cc
+++ b/paddle/fluid/operators/compare_op.cc
@@ -23,30 +23,26 @@ class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    OpComment comment;
-    AddInput("X",
-             string::Sprintf("(LoDTensor) the left hand operand of %s operator",
-                             comment.type));
-    AddInput("Y", string::Sprintf(
-                      "(LoDTensor) the right hand operand of %s operator",
-                      comment.type));
+    AddInput("X", string::Sprintf("the left hand operand of %s operator",
+                                  comment.type));
+    AddInput("Y", string::Sprintf("the right hand operand of %s operator",
+                                  comment.type));
    AddAttr<bool>("force_cpu",
-                  "(bool, default false) Force fill output variable to cpu "
+                  "Force fill output variable to cpu "
                  "memory. Otherwise, fill output variable to the running "
-                  "device")
-        .SetDefault(false);
-    AddOutput("Out", string::Sprintf(
-                         "(LoDTensor) n-dim bool tensor. Each element is %s",
-                         comment.equation));
-    AddComment(string::Sprintf(R"DOC(%s Operator
-
+                  "device [default true].")
+        .SetDefault(true);
+    AddOutput("Out", string::Sprintf("n-dim bool tensor. Each element is %s",
+                                     comment.equation));
+    AddComment(string::Sprintf(R"DOC(
 It operates element-wise on X and Y, and returns the Out. Each of them is a
 N-dim tensor. X and Y could be any type.  The each element of the Out tensor is
-calculated by %s
+calculated by $%s$
 )DOC",
-                               comment.type, comment.equation));
-    AddAttr<int>("axis",
-                 "(int, default -1). The start dimension index "
-                 "for broadcasting Y onto X.")
+                               comment.equation));
+    AddAttr<int>(
+        "axis",
+        "The start dimension index for broadcasting Y onto X. [default -1]")
        .SetDefault(-1)
        .EqualGreaterThan(-1);
  }

--- a/paddle/fluid/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
@@ -107,7 +107,13 @@ REGISTER_OPERATOR(concat, ops::ConcatOp, ops::ConcatOpMaker,
                      false> /* set false to disable empty grad */);
 REGISTER_OPERATOR(concat_grad, ops::ConcatOpGrad);
 REGISTER_OP_CPU_KERNEL(
-    concat, ops::ConcatKernel<paddle::platform::CPUDeviceContext, float>);
+    concat, ops::ConcatKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ConcatKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ConcatKernel<paddle::platform::CPUDeviceContext, int64_t>,
+    ops::ConcatKernel<paddle::platform::CPUDeviceContext, int>);
 REGISTER_OP_CPU_KERNEL(
    concat_grad,
-    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, float>);
+    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, int64_t>,
+    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, int>);
--- a/paddle/fluid/operators/concat_op.cu.cc
+++ b/paddle/fluid/operators/concat_op.cu.cc
@@ -15,7 +15,13 @@ limitations under the License. */
 #include "paddle/fluid/operators/concat_op.h"
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
-    concat, ops::ConcatKernel<paddle::platform::CUDADeviceContext, float>);
+    concat, ops::ConcatKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ConcatKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ConcatKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::ConcatKernel<paddle::platform::CUDADeviceContext, int>);
 REGISTER_OP_CUDA_KERNEL(
    concat_grad,
-    ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, float>);
+    ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, int>);
--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -75,9 +75,8 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
 framework::OpKernelType ConvOp::GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const {
  framework::LibraryType library{framework::LibraryType::kPlain};
-
-  std::string data_format = ctx.Attr<std::string>("data_format");
  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+  std::string data_format = ctx.Attr<std::string>("data_format");
  framework::DataLayout layout = framework::StringToDataLayout(data_format);

 #ifdef PADDLE_WITH_CUDA

--- a/paddle/fluid/operators/conv_transpose_op.cc
+++ b/paddle/fluid/operators/conv_transpose_op.cc
@@ -156,7 +156,7 @@ Parameters(strides, paddings) are two elements. These two elements represent hei
 and width, respectively.
 The input(X) size and output(Out) size may be different.

-Example:
+For an example:
  Input:
       Input shape: $(N, C_{in}, H_{in}, W_{in})$
       Filter shape: $(C_{in}, C_{out}, H_f, W_f)$

--- a/paddle/fluid/operators/cos_sim_op.cc
+++ b/paddle/fluid/operators/cos_sim_op.cc
@@ -76,9 +76,9 @@ class CosSimOpMaker : public framework::OpProtoAndCheckerMaker {
        .AsIntermediate();

    AddComment(R"DOC(
-Cosine Similarity Operator.
+**Cosine Similarity Operator**

-$Out = X^T * Y / (\sqrt{X^T * X} * \sqrt{Y^T * Y})$
+$Out = \frac{X^T * Y}{(\sqrt{X^T * X} * \sqrt{Y^T * Y})}$

 The input X and Y must have the same shape, except that the 1st dimension
 of input Y could be just 1 (different from input X), which will be

--- a/paddle/fluid/operators/crf_decoding_op.cc
+++ b/paddle/fluid/operators/crf_decoding_op.cc
@@ -53,21 +53,18 @@ sequence of observed tags.
 The output of this operator changes according to whether Input(Label) is given:

 1. Input(Label) is given:
-
-This happens in training. This operator is used to co-work with the chunk_eval
-operator.
-
-When Input(Label) is given, the crf_decoding operator returns a row vector
-with shape [N x 1] whose values are fixed to be 0, indicating an incorrect
-prediction, or 1 indicating a tag is correctly predicted. Such an output is the
-input to chunk_eval operator.
+   This happens in training. This operator is used to co-work with the chunk_eval
+   operator.
+   When Input(Label) is given, the crf_decoding operator returns a row vector
+   with shape [N x 1] whose values are fixed to be 0, indicating an incorrect
+   prediction, or 1 indicating a tag is correctly predicted. Such an output is the
+   input to chunk_eval operator.

 2. Input(Label) is not given:
-
-This is the standard decoding process.
+   This is the standard decoding process.

 The crf_decoding operator returns a row vector with shape [N x 1] whose values
-range from 0 to maximum tag number - 1. Each element indicates an index of a
+range from 0 to maximum tag number - 1. Each element indicates an index of a
 predicted tag.
 )DOC");
  }

--- a/paddle/fluid/operators/crop_op.h
+++ b/paddle/fluid/operators/crop_op.h
@@ -52,7 +52,7 @@ static std::vector<int> GetOffsets(const framework::ExecutionContext& ctx) {
  } else {
    res = ctx.Attr<std::vector<int>>("offsets");
    PADDLE_ENFORCE_EQ(
-        rank, res.size(),
+        rank, static_cast<int>(res.size()),
        "Offsets size should be equal to dimension size of input tensor.");
  }
  return res;

--- a/paddle/fluid/operators/cumsum_op.cc
+++ b/paddle/fluid/operators/cumsum_op.cc
@@ -30,19 +30,19 @@ class CumOp : public framework::OperatorWithKernel {
 class CumsumOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
-    AddInput("X", "Input of Cumsum operator");
-    AddOutput("Out", "Output of Cumsum operator");
+    AddInput("X", "Input of cumsum operator");
+    AddOutput("Out", "Output of cumsum operator");
    AddAttr<int>("axis",
-                 "(int, default -1). The dimenstion to accumulate along. "
-                 "-1 means the last dimenstion")
+                 "The dimension to accumulate along. -1 means the last "
+                 "dimension [default -1].")
        .SetDefault(-1)
        .EqualGreaterThan(-1);
    AddAttr<bool>("exclusive",
-                  "bool, default false). Whether to perform exclusive cumsum")
+                  "Whether to perform exclusive cumsum. [default false].")
        .SetDefault(false);
    AddAttr<bool>("reverse",
-                  "bool, default false). If true, the cumsum is performed in "
-                  "the reversed direction")
+                  "If true, the cumsum is performed in the reversed direction. "
+                  "[default false].")
        .SetDefault(false);
    AddComment(R"DOC(
 The cumulative sum of the elements along a given axis.

--- a/paddle/fluid/operators/detail/grpc_client.cc
+++ b/paddle/fluid/operators/detail/grpc_client.cc
@@ -245,7 +245,7 @@ void GRPCClient::Proceed() {
    if (c->status_.ok()) {
      c->Process();
    } else {
-      LOG(ERROR) << "var: " << c->var_h_.String()
+      LOG(FATAL) << "var: " << c->var_h_.String()
                 << " grpc error:" << c->status_.error_message();
    }
    delete c;

--- a/paddle/fluid/operators/detail/grpc_server.cc
+++ b/paddle/fluid/operators/detail/grpc_server.cc
@@ -169,7 +169,8 @@ class RequestPrefetch final : public RequestBase {

    auto scope = request_->GetMutableLocalScope();
    auto invar = scope->FindVar(in_var_name);
-    framework::Variable* outvar = scope->FindVar(out_var_name);
+    // out var must be created in local scope!
+    framework::Variable* outvar = scope->Var(out_var_name);

    request_handler_->Handle(in_var_name, scope, invar, &outvar, out_var_name);


--- a/paddle/fluid/operators/detection/box_coder_op.cc
+++ b/paddle/fluid/operators/detection/box_coder_op.cc
@@ -106,23 +106,36 @@ class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker {
              "and M represents the number of deocded boxes.");

    AddComment(R"DOC(
-Bounding Box Coder Operator.
+
+Bounding Box Coder.
+
 Encode/Decode the target bounding box with the priorbox information.
+
 The Encoding schema described below:
-ox = (tx - px) / pw / pxv
-oy = (ty - py) / ph / pyv
-ow = log(abs(tw / pw)) / pwv 
-oh = log(abs(th / ph)) / phv 
+
+    ox = (tx - px) / pw / pxv
+
+    oy = (ty - py) / ph / pyv
+
+    ow = log(abs(tw / pw)) / pwv 
+
+    oh = log(abs(th / ph)) / phv 
+
 The Decoding schema described below:
-ox = (pw * pxv * tx * + px) - tw / 2
-oy = (ph * pyv * ty * + py) - th / 2
-ow = exp(pwv * tw) * pw + tw / 2
-oh = exp(phv * th) * ph + th / 2
-where tx, ty, tw, th denote the target box's center coordinates, width and
-height respectively. Similarly, px, py, pw, ph denote the priorbox's(anchor)
-center coordinates, width and height. pxv, pyv, pwv, phv denote the variance
-of the priorbox and ox, oy, ow, oh denote the encoded/decoded coordinates,
-width and height.
+
+    ox = (pw * pxv * tx * + px) - tw / 2
+
+    oy = (ph * pyv * ty * + py) - th / 2
+
+    ow = exp(pwv * tw) * pw + tw / 2
+
+    oh = exp(phv * th) * ph + th / 2
+
+where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, width
+and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote the
+priorbox's (anchor) center coordinates, width and height. `pxv`, `pyv`, `pwv`,
+`phv` denote the variance of the priorbox and `ox`, `oy`, `ow`, `oh` denote the
+encoded/decoded coordinates, width and height.
 )DOC");
  }
 };

--- a/paddle/fluid/operators/detection/iou_similarity_op.cc
+++ b/paddle/fluid/operators/detection/iou_similarity_op.cc
@@ -68,15 +68,16 @@ class IOUSimilarityOpMaker : public framework::OpProtoAndCheckerMaker {
              "representing pairwise iou scores.");

    AddComment(R"DOC(
-IOU Similarity Operator.
+**IOU Similarity Operator**
+
 Computes intersection-over-union (IOU) between two box lists.
- Box list 'X' should be a LoDTensor and 'Y' is a common Tensor,
- boxes in 'Y' are shared by all instance of the batched inputs of X.
- Given two boxes A and B, the calculation of IOU is as follows:
+Box list 'X' should be a LoDTensor and 'Y' is a common Tensor,
+boxes in 'Y' are shared by all instance of the batched inputs of X.
+Given two boxes A and B, the calculation of IOU is as follows:

 $$
 IOU(A, B) = 
-\frac{area(A\cap B)}{area(A)+area(B)-area(A\cap B)}
+\\frac{area(A\\cap B)}{area(A)+area(B)-area(A\\cap B)}
 $$

 )DOC");

--- a/paddle/fluid/operators/detection/polygon_box_transform_op.cc
+++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cc
@@ -83,11 +83,13 @@ class PolygonBoxTransformOpMaker : public framework::OpProtoAndCheckerMaker {

    AddComment(R"DOC(
 PolygonBoxTransform Operator.
+
+PolygonBoxTransform Operator is used to transform the coordinate shift to the real coordinate.
+
 The input is the final geometry output in detection network.
 We use 2*n numbers to denote the coordinate shift from n corner vertices of
 the polygon_box to the pixel location. As each distance offset contains two numbers (xi, yi),
 the geometry output contains 2*n channels.
-PolygonBoxTransform Operator is used to transform the coordinate shift to the real coordinate.
 )DOC");
  }
 };

--- a/paddle/fluid/operators/elementwise_mul_op.cc
+++ b/paddle/fluid/operators/elementwise_mul_op.cc
@@ -15,7 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/elementwise_mul_op.h"
 #include "paddle/fluid/operators/elementwise_op.h"
 namespace ops = paddle::operators;
-REGISTER_ELEMWISE_OP(elementwise_mul, "Mul", "Out = X \\odot\\ Y");
+REGISTER_ELEMWISE_OP(elementwise_mul, "Mul", "Out = X \\\\odot Y");
 REGISTER_OP_CPU_KERNEL(
    elementwise_mul,
    ops::ElementwiseMulKernel<paddle::platform::CPUDeviceContext, float>,

--- a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc
+++ b/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc
@@ -36,11 +36,12 @@ class GaussianRandomBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker {
  void Apply() override {
    AddAttr<float>("mean",
                   "(float, default 0.0) "
-                   "mean of random tensor.")
+                   "The mean (or center) of the gaussian distribution.")
        .SetDefault(.0f);
    AddAttr<float>("std",
                   "(float, default 1.0) "
-                   "std of random tensor.")
+                   "The standard deviation (std, or spread) of the "
+                   "gaussian distribution.")
        .SetDefault(1.0f);
    AddAttr<int>("seed",
                 "(int, default 0) "
@@ -55,9 +56,11 @@ class GaussianRandomBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker {
        .SetDefault(framework::proto::VarType::FP32);

    AddComment(R"DOC(
-GaussianRandom Operator.

 Used to initialize tensors with gaussian random generator.
+The default mean of the distribution is 0.0 and the default standard
+deviation (std) of the distribution is 1.0. Users can set mean and std
+by input arguments.
 )DOC");
  }
 };

--- a/paddle/fluid/operators/gen_nccl_id_op.cc
+++ b/paddle/fluid/operators/gen_nccl_id_op.cc
@@ -67,6 +67,10 @@ class GenNCCLIdOp : public framework::OperatorBase {
      client->AsyncSendVar(ep, dev_ctx, *scope, NCCL_ID_VARNAME);
    }
    client->Wait();
+    for (auto& ep : endpoint_list) {
+      client->AsyncSendBatchBarrier(ep);
+    }
+    client->Wait();
    VLOG(3) << "sending completed...";
  }


--- a/paddle/fluid/operators/get_places_op.cc
+++ b/paddle/fluid/operators/get_places_op.cc
@@ -85,7 +85,7 @@ class GetPlacesOpProtoMaker : public framework::OpProtoAndCheckerMaker {
        .InEnum({"CUDA", "CPU", "AUTO"})
        .SetDefault("AUTO");
    AddComment(R"DOC(
-Returns a list of places based on flags. The list will be used for parallel
+Returns a list of places based on arguments. The list will be used for parallel
 execution.
 )DOC");
  }

--- a/paddle/fluid/operators/layer_norm_op.cc
+++ b/paddle/fluid/operators/layer_norm_op.cc
@@ -62,36 +62,33 @@ class LayerNormOp : public framework::OperatorWithKernel {
 class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
-    AddInput("X", "(LoDTensor) The input tensor.");
+    AddInput("X", "The input tensor.");
    AddInput("Scale",
-             "(Tensor, optional) Scale is a 1-dimensional tensor of size "
+             "(optional) Scale is a 1-dimensional tensor of size "
             "H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])."
             "It is applied to the output.")
        .AsDispensable();
    AddInput("Bias",
-             "(Tensor, optional) Bias is a 1-dimensional tensor of size "
+             "(optional) Bias is a 1-dimensional tensor of size "
             "H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])."
             "It is applied to the output.")
        .AsDispensable();
-    AddOutput("Y", "(LoDTensor) Result after normalization.");
-    AddOutput("Mean", "(Tensor) Mean of the current mini batch.")
-        .AsIntermediate();
-    AddOutput("Variance", "(Tensor) Variance of the current mini batch.")
+    AddOutput("Y", "Result after normalization.");
+    AddOutput("Mean", "Mean of the current mini batch.").AsIntermediate();
+    AddOutput("Variance", "Variance of the current mini batch.")
        .AsIntermediate();

    AddAttr<float>("epsilon",
-                   "(float, default 1e-5) Constant for "
-                   "numerical stability")
+                   "Constant for numerical stability [default 1e-5].")
        .SetDefault(1e-5)
        .AddCustomChecker([](const float &epsilon) {
          PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f,
                         "'epsilon' should be between 0.0 and 0.001.");
        });
    AddAttr<int>("begin_norm_axis",
-                 "(int default:1), the "
-                 "axis of `begin_norm_axis ... Rank(X) - 1` will be "
+                 "the axis of `begin_norm_axis ... Rank(X) - 1` will be "
                 "normalized. `begin_norm_axis` splits the tensor(`X`) to a "
-                 "matrix [N,H].")
+                 "matrix [N,H]. [default 1].")
        .SetDefault(1)
        .AddCustomChecker([](const int &begin_norm_axis) {
          PADDLE_ENFORCE_GT(begin_norm_axis, 0,
@@ -99,10 +96,14 @@ class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker {
        });

    AddComment(R"DOC(
-Layer Normalization.
-Layer Norm has been implemented as discussed in the paper:
-https://arxiv.org/abs/1607.06450
-...
+Assume feature vectors exist on dimensions
+:attr:`begin_norm_axis ... rank(input)` and calculate the moment statistics
+along these dimensions for each feature vector :math:`a` with size
+:math:`H`, then normalize each feature vector using the corresponding
+statistics. After that, apply learnable gain and bias on the normalized
+tensor to scale and shift if :attr:`scale` and :attr:`shift` are set.
+
+Refer to `Layer Normalization <https://arxiv.org/pdf/1607.06450v1.pdf>`_
 )DOC");
  }
 };

--- a/paddle/fluid/operators/linear_chain_crf_op.cc
+++ b/paddle/fluid/operators/linear_chain_crf_op.cc
@@ -84,6 +84,7 @@ CRF. Please refer to http://www.cs.columbia.edu/~mcollins/fb.pdf and
 http://cseweb.ucsd.edu/~elkan/250Bwinter2012/loglinearCRFs.pdf for details.

 Equation:
+
 1. Denote Input(Emission) to this operator as $x$ here.
 2. The first D values of Input(Transition) to this operator are for starting
 weights, denoted as $a$ here.
@@ -106,6 +107,7 @@ Finally, the linear chain CRF operator outputs the logarithm of the conditional
 likelihood of each training sample in a mini-batch.

 NOTE:
+
 1. The feature function for a CRF is made up of the emission features and the
 transition features. The emission feature weights are NOT computed in
 this operator. They MUST be computed first before this operator is called.

--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -348,7 +348,8 @@ class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
 };

 void SignalHandler::StopAndExit(int signal_num) {
-  VLOG(3) << "Catch interrupt signal: " << signal_num << ", program will exit";
+  // Do not use VLOG here because the logging device may already have been
+  // released. exit() will release internally allocated resources.
  exit(0);
 }


--- a/paddle/fluid/operators/lstm_op.cc
+++ b/paddle/fluid/operators/lstm_op.cc
@@ -184,34 +184,32 @@ Long-Short Term Memory (LSTM) Operator.
 The defalut implementation is diagonal/peephole connection
 (https://arxiv.org/pdf/1402.1128.pdf), the formula is as follows:

-$$
-i_t = \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + W_{ic}c_{t-1} + b_i) \\
+$$ i_t = \\sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + W_{ic}c_{t-1} + b_i) $$

-f_t = \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + W_{fc}c_{t-1} + b_f) \\
+$$ f_t = \\sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + W_{fc}c_{t-1} + b_f) $$

-\tilde{c_t} = act_g(W_{cx}x_t + W_{ch}h_{t-1} + b_c) \\
+$$ \\tilde{c_t} = act_g(W_{cx}x_t + W_{ch}h_{t-1} + b_c) $$

-o_t = \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + W_{oc}c_t + b_o) \\
+$$ o_t = \\sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + W_{oc}c_t + b_o) $$

-c_t = f_t \odot c_{t-1} + i_t \odot \tilde{c_t} \\
+$$ c_t = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} $$

-h_t = o_t \odot act_h(c_t)
-$$
+$$ h_t = o_t \\odot act_h(c_t) $$

-where the W terms denote weight matrices (e.g. $W_{xi}$ is the matrix
-of weights from the input gate to the input), $W_{ic}, W_{fc}, W_{oc}$
-are diagonal weight matrices for peephole connections. In our implementation,
-we use vectors to reprenset these diagonal weight matrices. The b terms
-denote bias vectors ($b_i$ is the input gate bias vector), $\sigma$
-is the non-line activations, such as logistic sigmoid function, and
-$i, f, o$ and $c$ are the input gate, forget gate, output gate,
-and cell activation vectors, respectively, all of which have the same size as
-the cell output activation vector $h$.
-
-The $\odot$ is the element-wise product of the vectors. $act_g$ and $act_h$
-are the cell input and cell output activation functions and `tanh` is usually
-used for them. $\tilde{c_t}$ is also called candidate hidden state,
-which is computed based on the current input and the previous hidden state.
+- W terms denote weight matrices (e.g. $W_{xi}$ is the matrix
+  of weights from the input gate to the input), $W_{ic}, W_{fc}, W_{oc}$
+  are diagonal weight matrices for peephole connections. In our implementation,
+  we use vectors to represent these diagonal weight matrices.
+- The b terms denote bias vectors ($b_i$ is the input gate bias vector).
+- $\sigma$ is the non-line activations, such as logistic sigmoid function.
+- $i, f, o$ and $c$ are the input gate, forget gate, output gate,
+  and cell activation vectors, respectively, all of which have the same size as
+  the cell output activation vector $h$.
+- The $\odot$ is the element-wise product of the vectors.
+- $act_g$ and $act_h$ are the cell input and cell output activation functions
+  and `tanh` is usually used for them.
+- $\tilde{c_t}$ is also called candidate hidden state,
+  which is computed based on the current input and the previous hidden state.

 Set `use_peepholes` False to disable peephole connection. The formula
 is omitted here, please refer to the paper

--- a/paddle/fluid/operators/math/blas.h
+++ b/paddle/fluid/operators/math/blas.h
@@ -20,13 +20,16 @@
 #ifdef PADDLE_WITH_MKLML
 #include <mkl_cblas.h>
 #include <mkl_lapacke.h>
+#include <mkl_service.h>
 #include <mkl_vml_functions.h>
 #endif

 #ifdef PADDLE_USE_OPENBLAS
 #include <cblas.h>
+#ifdef LAPACK_FOUND
 #include <lapacke.h>
 #endif
+#endif

 #ifndef LAPACK_FOUND
 extern "C" {
@@ -46,6 +49,18 @@ namespace paddle {
 namespace operators {
 namespace math {

+// Set the BLAS thread count; requests below 1 are clamped to a single thread.
+static void SetNumThreads(int num_threads) {
+#ifdef PADDLE_USE_OPENBLAS
+  openblas_set_num_threads(num_threads < 1 ? 1 : num_threads);
+#elif defined(PADDLE_WITH_MKLML)
+  mkl_set_num_threads(num_threads < 1 ? 1 : num_threads);
+#else
+  // Neither PADDLE_USE_OPENBLAS nor PADDLE_WITH_MKLML is defined here.
+  PADDLE_ENFORCE(false, "To be implemented.");
+#endif
+}
+
 /**
 * Matrix Descriptor of a memory buffer.
 *

--- a/paddle/fluid/operators/math/math_function.h
+++ b/paddle/fluid/operators/math/math_function.h
@@ -21,8 +21,10 @@ limitations under the License. */

 #ifdef PADDLE_USE_OPENBLAS
 #include <cblas.h>
+#ifdef LAPACK_FOUND
 #include <lapacke.h>
 #endif
+#endif

 #ifndef LAPACK_FOUND
 extern "C" {

--- a/paddle/fluid/operators/mean_iou_op.cc
+++ b/paddle/fluid/operators/mean_iou_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/mean_iou_op.h"
+
+namespace paddle {
+namespace operators {
+
+// Operator definition for mean_iou: verifies that the required inputs and
+// outputs are present and declares the static output shapes.
+class MeanIoUOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  // Requires Predictions/Labels as inputs and OutMeanIou/OutWrong/OutCorrect
+  // as outputs. OutMeanIou is shaped [1]; OutWrong and OutCorrect are shaped
+  // [num_classes], taken from the "num_classes" attribute.
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Predictions"),
+                   "Input (Predictions) of MeanIoU op should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Labels"),
+                   "Input (Labels) of MeanIoU op should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("OutMeanIou"),
+                   "Output (OutMeanIou) of MeanIoU op should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("OutWrong"),
+                   "Output (OutWrong) of MeanIoU op should not be null.");
+    // Report the variable that is actually being checked (was "OutWrong").
+    PADDLE_ENFORCE(ctx->HasOutput("OutCorrect"),
+                   "Output (OutCorrect) of MeanIoU op should not be null.");
+
+    int64_t num_classes =
+        static_cast<int64_t>(ctx->Attrs().Get<int>("num_classes"));
+
+    ctx->SetOutputDim("OutMeanIou", {1});
+    ctx->SetOutputDim("OutWrong", {num_classes});
+    ctx->SetOutputDim("OutCorrect", {num_classes});
+  }
+
+ protected:
+  // The kernel is selected by the element type of Input(Predictions) and the
+  // place the op is executed on.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("Predictions")->type()),
+        ctx.GetPlace());
+  }
+};
+
+// Declares the inputs, outputs, attributes and user-facing documentation of
+// the mean_iou operator.
+class MeanIoUOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Predictions",
+             "(Tensor), A Tensor of prediction results for semantic labels"
+             " with type int32 or int64. The rank should be greater than 1.");
+    // The continuation string starts with a space so the two sentences are
+    // not glued together ("int64.Its") in the generated documentation.
+    AddInput(
+        "Labels",
+        "(Tensor), A Tensor of ground truth labels with type int32 or int64."
+        " Its shape should be the same as Input(Predictions).");
+    // The three In* inputs are optional lists holding results from earlier
+    // batches; they are accumulated into the corresponding outputs.
+    AddInput("InWrongs",
+             "(vector<Tensor>), A list of Tensor with shape "
+             "[num_classes]. They are used to collect wrong number among "
+             "batches. Empty list is also valid here.")
+        .AsDuplicable()
+        .AsDispensable();
+    AddInput(
+        "InCorrects",
+        "(vector<Tensor>), A list of Tensor with shape "
+        "[num_classes]. They are used to collect correct number among batches. "
+        "Empty list is also valid here.")
+        .AsDuplicable()
+        .AsDispensable();
+    AddInput("InMeanIou",
+             "(vector<Tensor>), A list of Tensor that Output(OutMeanIou) "
+             "should be added to. Empty list is also valid here.")
+        .AsDuplicable()
+        .AsDispensable();
+    // OutMeanIou is a single tensor (InferShape sets its dim to {1}), not a
+    // list of tensors.
+    AddOutput("OutMeanIou",
+              "(Tensor), A Tensor representing the"
+              " mean intersection-over-union with shape [1].");
+    AddOutput("OutWrong", "(Tensor), A Tensor with shape [num_classes]. ");
+    AddOutput("OutCorrect", "(Tensor), A Tensor with shape [num_classes]. ");
+    AddAttr<int>("num_classes", "(int), The possible number of labels.");
+
+    AddComment(R"DOC(
+mean-IOU Operator.
+Mean Intersection-Over-Union is a common evaluation metric for
+semantic image segmentation, which first computes the IOU for each
+semantic class and then computes the average over classes. 
+IOU is defined as follows: 
+    IOU = true_positive / (true_positive + false_positive + false_negative).
+It is based on pixel level area while "IOU Similarity Operator" 
+is based on area of rectangle.
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(mean_iou, ops::MeanIoUOp, ops::MeanIoUOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+// One CPU kernel per supported label type. `int` and `int32_t` name the same
+// type, so the kernel is registered once for int32 and once for int64.
+REGISTER_OP_CPU_KERNEL(mean_iou, ops::MeanIoUKernel<int32_t>,
+                       ops::MeanIoUKernel<int64_t>);
--- a/paddle/fluid/operators/mean_iou_op.cu
+++ b/paddle/fluid/operators/mean_iou_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/mean_iou_op.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/gpu_info.h"
+
+namespace paddle {
+namespace operators {
+
+using platform::PADDLE_CUDA_NUM_THREADS;
+
+#define CUDA_1D_KERNEL_LOOP(i, n)                              \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+       i += blockDim.x * gridDim.x)
+
+// Accumulates per-class "wrong" and "correct" counts for `count` elements.
+// Each block first tallies into dynamic shared memory and then adds its
+// tallies to the global `wrong` / `correct` arrays with atomics.
+//
+// The dynamic shared-memory buffer is indexed as
+//   [0, num_classes)               -> wrong counts
+//   [num_classes, 2 * num_classes) -> correct counts
+// so the launch has to provide at least 2 * num_classes ints.
+// NOTE(review): prediction/label values are used directly as indices with no
+// range check; presumably callers guarantee 0 <= value < num_classes.
+template <typename T>
+__global__ void CountCUDAKernel(const int num_classes, const int count,
+                                const T* predictions, const T* labels,
+                                int* wrong, int* correct) {
+  extern __shared__ int block_cache[];
+  int* const wrong_c = block_cache;
+  int* const correct_c = block_cache + num_classes;
+
+  // Zero both halves of the per-block cache.
+  const int cache_len = num_classes * 2;
+  for (int k = threadIdx.x; k < cache_len; k += blockDim.x) {
+    block_cache[k] = 0;
+  }
+  __syncthreads();
+
+  CUDA_1D_KERNEL_LOOP(idx, count) {
+    const T pred = predictions[idx];
+    const T label = labels[idx];
+    if (pred != label) {
+      // A mismatch counts against both the predicted and the true class.
+      atomicAdd(wrong_c + pred, 1);
+      atomicAdd(wrong_c + label, 1);
+    } else {
+      atomicAdd(correct_c + pred, 1);
+    }
+  }
+
+  // All tallies of this block must be complete before they are published.
+  __syncthreads();
+
+  for (int k = threadIdx.x; k < num_classes; k += blockDim.x) {
+    atomicAdd(wrong + k, wrong_c[k]);
+    atomicAdd(correct + k, correct_c[k]);
+  }
+}
+
+// Turns the per-class counts into per-class IoU values (written to `ious`)
+// and adds their mean over the classes that occurred to iou[0].
+//
+// The valid-class counter lives in shared memory and the final sum is done by
+// thread 0 after a block-level barrier, so the result is only complete when
+// the kernel runs as a single block.
+// NOTE(review): if no class has a positive count, the final division is by
+// zero; confirm whether callers can reach that case.
+__global__ void ComputeIoUCUDAKernel(const int num_classes, int* wrong,
+                                     int* correct, float* ious, float* iou) {
+  __shared__ int valid_count_c;
+  if (threadIdx.x == 0) {
+    valid_count_c = 0;
+  }
+  __syncthreads();
+
+  CUDA_1D_KERNEL_LOOP(cls, num_classes) {
+    const int wrong_n = wrong[cls];
+    const int correct_n = correct[cls];
+    const int total_n = wrong_n + correct_n;
+    if (total_n <= 0) {
+      // Class never appeared: contributes 0 and is not counted as valid.
+      ious[cls] = 0;
+      continue;
+    }
+    atomicAdd(&valid_count_c, 1);
+    ious[cls] = static_cast<float>(correct_n) / total_n;
+  }
+
+  // Every ious[] entry and the counter must be final before the reduction.
+  __syncthreads();
+
+  if (threadIdx.x != 0) {
+    return;
+  }
+  float iou_sum = 0;
+  for (int cls = 0; cls < num_classes; ++cls) {
+    iou_sum += ious[cls];
+  }
+  iou[0] += iou_sum / valid_count_c;
+}
+
+// CUDA implementation of the mean_iou operator.
+//
+// Compute() zeroes the three outputs, adds in any previously accumulated
+// InMeanIou / InWrongs / InCorrects tensors, counts wrong/correct predictions
+// per class on the device, and finally adds the mean IoU of the current
+// counts to OutMeanIou.
+template <typename T>
+class MeanIoUCUDAOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& place = *ctx.template device_context<platform::CUDADeviceContext>()
+                       .eigen_device();
+    // get input and output tensor
+    auto* predictions = ctx.Input<Tensor>("Predictions");
+    auto* labels = ctx.Input<Tensor>("Labels");
+    auto* out_mean_iou = ctx.Output<Tensor>("OutMeanIou");
+    auto* out_wrong = ctx.Output<Tensor>("OutWrong");
+    auto* out_correct = ctx.Output<Tensor>("OutCorrect");
+    int num_classes = static_cast<int>(ctx.Attr<int>("num_classes"));
+
+    // Get data ptr
+    const T* predictions_data = predictions->data<T>();
+    const T* labels_data = labels->data<T>();
+    int* out_wrong_data = out_wrong->mutable_data<int>(ctx.GetPlace());
+    int* out_correct_data = out_correct->mutable_data<int>(ctx.GetPlace());
+    float* out_mean_iou_data =
+        out_mean_iou->mutable_data<float>(ctx.GetPlace());
+
+    // Get Eigen tensor
+    auto out_mean_iou_t = EigenTensor<float, 1>::From(*out_mean_iou);
+    auto out_wrong_t = EigenTensor<int, 1>::From(*out_wrong);
+    auto out_correct_t = EigenTensor<int, 1>::From(*out_correct);
+
+    // Temporary tensor holding one IoU value per class.
+    Tensor ious;
+    float* ious_data = ious.mutable_data<float>(
+        {static_cast<int64_t>(num_classes)}, ctx.GetPlace());
+
+    // Init out_wrong, out_correct and out_mean_iou
+    out_wrong_t.device(place) = out_wrong_t.constant(0);
+    out_correct_t.device(place) = out_correct_t.constant(0);
+    out_mean_iou_t.device(place) = out_mean_iou_t.constant(0.0f);
+
+    // Collect previous wrong, correct and mean_iou. The indices are size_t to
+    // match the container's size() type, as in the CPU kernel.
+    auto in_mean_ious = ctx.MultiInput<Tensor>("InMeanIou");
+    for (size_t i = 0; i < in_mean_ious.size(); ++i) {
+      out_mean_iou_t.device(place) +=
+          EigenTensor<float, 1>::From(*in_mean_ious[i]);
+    }
+    auto in_wrongs = ctx.MultiInput<Tensor>("InWrongs");
+    for (size_t i = 0; i < in_wrongs.size(); ++i) {
+      out_wrong_t.device(place) += EigenTensor<int, 1>::From(*in_wrongs[i]);
+    }
+    auto in_corrects = ctx.MultiInput<Tensor>("InCorrects");
+    for (size_t i = 0; i < in_corrects.size(); ++i) {
+      out_correct_t.device(place) += EigenTensor<int, 1>::From(*in_corrects[i]);
+    }
+
+    // compute
+    auto stream = ctx.cuda_device_context().stream();
+    int block = PADDLE_CUDA_NUM_THREADS;
+    int grid = (predictions->numel() + block - 1) / block;
+    // Dynamic shared memory for the counting kernel: it indexes
+    // 2 * num_classes ints; the size requested here is one int larger.
+    int cache_size = (num_classes * 2 + 1) * sizeof(int);
+    CountCUDAKernel<T><<<grid, block, cache_size, stream>>>(
+        num_classes, predictions->numel(), predictions_data, labels_data,
+        out_wrong_data, out_correct_data);
+    // NOTE(review): presumably blocks until the queued device work is done so
+    // the counts are final before the reduction kernel reads them.
+    ctx.device_context().Wait();
+    // Launched with a single block: the kernel keeps its valid-class counter
+    // in shared memory and reduces on thread 0.
+    ComputeIoUCUDAKernel<<<1, block, 0, stream>>>(num_classes, out_wrong_data,
+                                                  out_correct_data, ious_data,
+                                                  out_mean_iou_data);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+// One CUDA kernel per supported label type. `int` and `int32_t` name the same
+// type, so the kernel is registered once for int32 and once for int64.
+REGISTER_OP_CUDA_KERNEL(mean_iou, ops::MeanIoUCUDAOpKernel<int32_t>,
+                        ops::MeanIoUCUDAOpKernel<int64_t>);
--- a/paddle/fluid/operators/mean_iou_op.h
+++ b/paddle/fluid/operators/mean_iou_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <algorithm>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+
+template <typename T, int D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+
+// CPU implementation of the mean_iou operator.
+//
+// Compute() zeroes the outputs, adds in any InMeanIou / InWrongs / InCorrects
+// tensors from earlier batches, tallies wrong/correct predictions per class
+// for the current batch, and adds the mean IoU over the classes that occurred
+// to OutMeanIou.
+// NOTE(review): prediction/label values index the count buffers directly with
+// no range check; presumably callers guarantee 0 <= value < num_classes.
+template <typename T>
+class MeanIoUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& place = *ctx.template device_context<platform::CPUDeviceContext>()
+                       .eigen_device();
+
+    // Operator inputs, outputs and attribute.
+    auto* predictions = ctx.Input<Tensor>("Predictions");
+    auto* labels = ctx.Input<Tensor>("Labels");
+    auto* out_mean_iou = ctx.Output<Tensor>("OutMeanIou");
+    auto* out_wrong = ctx.Output<Tensor>("OutWrong");
+    auto* out_correct = ctx.Output<Tensor>("OutCorrect");
+    int num_classes = static_cast<int>(ctx.Attr<int>("num_classes"));
+
+    // Raw buffers behind the inputs and outputs.
+    const T* pred_ptr = predictions->data<T>();
+    const T* label_ptr = labels->data<T>();
+    float* mean_iou_ptr = out_mean_iou->mutable_data<float>(ctx.GetPlace());
+    int* wrong_ptr = out_wrong->mutable_data<int>(ctx.GetPlace());
+    int* correct_ptr = out_correct->mutable_data<int>(ctx.GetPlace());
+
+    // Eigen views over the outputs.
+    auto out_mean_iou_t = EigenTensor<float, 1>::From(*out_mean_iou);
+    auto out_wrong_t = EigenTensor<int, 1>::From(*out_wrong);
+    auto out_correct_t = EigenTensor<int, 1>::From(*out_correct);
+
+    // Scratch tensors: per-class denominators, number of classes that
+    // occurred, and the sum of per-class IoU values.
+    Tensor denominator;
+    Tensor valid_count;
+    Tensor iou_sum;
+    int* denominator_ptr = denominator.mutable_data<int>(
+        {static_cast<int64_t>(num_classes)}, ctx.GetPlace());
+    int* valid_count_ptr = valid_count.mutable_data<int>({1}, ctx.GetPlace());
+    float* iou_sum_ptr = iou_sum.mutable_data<float>({1}, ctx.GetPlace());
+    auto denominator_t = EigenTensor<int, 1>::From(denominator);
+    auto valid_count_t = EigenTensor<int, 1>::From(valid_count);
+    auto iou_sum_t = EigenTensor<float, 1>::From(iou_sum);
+
+    // Start every output from zero.
+    out_wrong_t = out_wrong_t.constant(0);
+    out_correct_t = out_correct_t.constant(0);
+    out_mean_iou_t = out_mean_iou_t.constant(0);
+
+    // Fold in the results carried over from earlier batches.
+    auto in_mean_ious = ctx.MultiInput<Tensor>("InMeanIou");
+    for (size_t k = 0; k < in_mean_ious.size(); ++k) {
+      out_mean_iou_t.device(place) +=
+          EigenTensor<float, 1>::From(*in_mean_ious[k]);
+    }
+    auto in_wrongs = ctx.MultiInput<Tensor>("InWrongs");
+    for (size_t k = 0; k < in_wrongs.size(); ++k) {
+      out_wrong_t.device(place) += EigenTensor<int, 1>::From(*in_wrongs[k]);
+    }
+    auto in_corrects = ctx.MultiInput<Tensor>("InCorrects");
+    for (size_t k = 0; k < in_corrects.size(); ++k) {
+      out_correct_t.device(place) += EigenTensor<int, 1>::From(*in_corrects[k]);
+    }
+
+    // Tally the current batch: a match counts as correct for its class, a
+    // mismatch counts as wrong for both the true and the predicted class.
+    const int64_t sample_count = predictions->numel();
+    for (int64_t n = 0; n < sample_count; ++n) {
+      const T pred = pred_ptr[n];
+      const T label = label_ptr[n];
+      if (pred != label) {
+        wrong_ptr[label] += 1;
+        wrong_ptr[pred] += 1;
+      } else {
+        correct_ptr[pred] += 1;
+      }
+    }
+
+    denominator_t = out_wrong_t + out_correct_t;
+    valid_count_t =
+        (denominator_t > denominator_t.constant(0.0f)).cast<int>().sum();
+
+    // Classes that never occurred get denominator 1 so their IoU term is
+    // 0 / 1 instead of 0 / 0; they are not included in valid_count.
+    for (int c = 0; c < num_classes; ++c) {
+      if (denominator_ptr[c] == 0) {
+        denominator_ptr[c] = 1;
+      }
+    }
+
+    iou_sum_t =
+        (out_correct_t.cast<float>() / denominator_t.cast<float>()).sum();
+    // NOTE(review): valid_count is 0 when no class occurred at all, which
+    // makes this a division by zero; confirm whether callers can reach that.
+    mean_iou_ptr[0] += (iou_sum_ptr[0] / valid_count_ptr[0]);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/mean_op.cc
+++ b/paddle/fluid/operators/mean_op.cc
@@ -33,12 +33,10 @@ class MeanOp : public framework::OperatorWithKernel {
 class MeanOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
-    AddInput("X", "The input of mean op");
-    AddOutput("Out", "The output of mean op").Reuse("X");
+    AddInput("X", "(Tensor) The input of mean op");
+    AddOutput("Out", "(Tensor) The output of mean op").Reuse("X");
    AddComment(R"DOC(
-Mean Operator.
-
-Out is a scalar which is the mean of all elements in X. 
+Mean Operator calculates the mean of all elements in X.

 )DOC");
  }

--- a/paddle/fluid/operators/merge_ids_op.cc
+++ b/paddle/fluid/operators/merge_ids_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/merge_ids_op.h"
+
+namespace paddle {
+namespace operators {
+
+// Proto maker for the merge_ids operator: declares its inputs ("Ids" and the
+// duplicable "X"), its output ("Out") and the user-facing operator comment.
+class MergeIdsOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  // Registers the input/output slots and the documentation string.
+  void Make() override {
+    AddInput("Ids", "(LoDTensor) the input ids with shape{batch_num, 1}");
+    // "X" is duplicable: one tensor per shard is passed under the same name.
+    AddInput(
+        "X",
+        "(LoDTensors) multi input tensor with shape{batch_num, N}, N is the "
+        "size of embedding table")
+        .AsDuplicable();
+    AddOutput("Out", "(LoDTensor) The merged outputs of the input tensors.");
+
+    // The text below is the operator's documentation; it walks through a
+    // split_ids_op -> prefetch_op -> merge_ids_op example with three shards.
+    AddComment(R"DOC(
+Merge multi LoDTensor's into one according to Ids's shard num.
+
+
+split_ids_op -> prefetch_op -> merge_ids_op
+
+
+merge_ids_op should be used after split_ids_op and prefetch_op, split_ids_op
+ will split input Ids into multiple tensors according to Id's shard number.
+prefetch_op will send them to parameter server to prefetch embedding value
+back. During split, the order of ids is disordered. In merge_ids_op we use
+the original Ids to restore the order of the fetched embedding value and
+ also pass the lod information to the merged output.
+
+
+Example:
+
+    Ids = [1,2,3,4,5,6] # 3 shared
+
+split_ids_op ->
+
+    Id0 = [3, 6] # id % 3 == 0
+    Id1 = [1, 4] # id % 3 == 1
+    Id2 = [2, 5] # id % 3 == 2
+
+prefetch_op ->
+
+    X0 = [[0.3 0.3]   # 3
+          [0.6 0.6]]  # 6
+    X1 = [[0.1 0.1]   # 1
+          [0.4 0.4]]  # 4
+    X2 = [[0.2 0.2]   # 2
+          [0.5 0.5]]  # 5
+
+merge_ids_op ->
+
+    Out = [[0.1 0.1]  # 1
+           [0.2 0.2]  # 2
+           [0.3 0.3]  # 3
+           [0.4 0.4]  # 4
+           [0.5 0.5]  # 5
+           [0.6 0.6]] # 6
+)DOC");
+  }
+};
+
+// Operator definition for merge_ids. Validates the presence and kind of its
+// inputs/outputs during shape inference and selects the kernel from the data
+// type of the first "X" input.
+class MergeIdsOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  // Requires "Ids", "X" and "Out" to be present; when "Ids" is a LoDTensor it
+  // must be 2-D with a second dimension of 1; every "X" must be a LoDTensor.
+  // Finally the LoD of "Ids" is shared with "Out".
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Ids"), "MergeIdsOp must has input Ids.");
+    PADDLE_ENFORCE(ctx->HasInputs("X"), "MergeIdsOp must has input X.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "MergeIdsOp must has output Out.");
+
+    auto ids_type = ctx->GetInputsVarType("Ids").front();
+    auto ids_shape = ctx->GetInputDim("Ids");
+    const bool ids_is_lod_tensor =
+        (ids_type == framework::proto::VarType::LOD_TENSOR);
+    if (ids_is_lod_tensor) {
+      PADDLE_ENFORCE_EQ(ids_shape.size(), 2);
+      PADDLE_ENFORCE_EQ(ids_shape[1], 1);
+    }
+
+    auto x_types = ctx->GetInputsVarType("X");
+    for (auto &x_type : x_types) {
+      PADDLE_ENFORCE_EQ(x_type, framework::proto::VarType::LOD_TENSOR,
+                        "input X only support lod tensors");
+    }
+
+    ctx->ShareLoD("Ids", "Out");
+  }
+
+ private:
+  // The kernel data type follows the element type of the first "X" tensor;
+  // the place is taken from the execution context.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    auto x_inputs = ctx.MultiInput<framework::Tensor>("X");
+    auto data_type = framework::ToDataType(x_inputs.front()->type());
+    return framework::OpKernelType(data_type, ctx.GetPlace());
+  }
+};
+
+// Variable-type inference for merge_ids: every "Out" variable takes the
+// variable type of the first "Ids" input.
+class MergeIdsOpInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const override {
+    auto *ids_var = block->Var(op_desc.Input("Ids")[0]);
+    const auto ids_var_type = ids_var->GetType();
+    for (auto &out_name : op_desc.Output("Out")) {
+      auto *out_var = block->Var(out_name);
+      out_var->SetType(ids_var_type);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+// Register the merge_ids operator together with its proto maker and its
+// variable-type inference, then register its CPU kernel instantiated for
+// float element data.
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(merge_ids, ops::MergeIdsOp, ops::MergeIdsOpMaker,
+                  ops::MergeIdsOpInferVarType);
+REGISTER_OP_CPU_KERNEL(
+    merge_ids, ops::MergeIdsOpKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/fluid/operators/merge_ids_op.h
+++ b/paddle/fluid/operators/merge_ids_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
+
+namespace paddle {
+namespace operators {
+
+// CPU kernel of the merge_ids operator.
+//
+// Inputs:  "Ids" - LoDTensor of int64 ids, one id per output row.
+//          "X"   - one LoDTensor per shard; shard k is expected to hold the
+//                  rows for the ids with id % shard_num == k, in the order
+//                  those ids appear in "Ids".
+// Output:  "Out" - rows of all shards re-assembled in the order of "Ids".
+template <typename DeviceContext, typename T>
+class MergeIdsOpKernel : public framework::OpKernel<T> {
+ public:
+  // Merges the shard tensors into "Out". Raises through PADDLE_THROW /
+  // PADDLE_ENFORCE when run on a non-CPU place, when "Ids" is not a
+  // LoDTensor, when the shards disagree on embedding size, when the total
+  // row count differs from the number of ids, or when the ids do not match
+  // the per-shard row counts.
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto place = ctx.GetPlace();
+    if (!platform::is_cpu_place(place)) {
+      PADDLE_THROW("MergeIds do not support GPU kernel");
+    }
+    VLOG(3) << "run in MergeIdsOpKernel";
+
+    const auto *ids_var = ctx.InputVar("Ids");
+    PADDLE_ENFORCE(ids_var->IsType<framework::LoDTensor>(),
+                   "only support to merge Ids of LoDTensor");
+
+    const auto &ids_tensor = ids_var->Get<framework::LoDTensor>();
+    const auto &ids_dims = ids_tensor.dims();
+    const int64_t *ids = ids_tensor.data<int64_t>();
+
+    auto x_tensors = ctx.MultiInput<framework::LoDTensor>("X");
+
+    auto *out = ctx.Output<framework::LoDTensor>("Out");
+
+    // Accumulate the total number of rows over all non-empty shards and make
+    // sure they agree on the row width (embedding size).
+    int batch_size = 0;
+    int embedding_size = 0;
+    for (auto &input : x_tensors) {
+      if (framework::product(input->dims()) != 0) {
+        if (embedding_size == 0) {
+          embedding_size = input->dims()[1];
+        }
+        PADDLE_ENFORCE_EQ(embedding_size, input->dims()[1],
+                          "embedding size of all input should be the same");
+        batch_size += input->dims()[0];
+      }
+    }
+    PADDLE_ENFORCE_EQ(
+        batch_size, ids_dims[0],
+        "the batch size of ids and merged embedding value should be the same");
+
+    const size_t shard_num = x_tensors.size();
+
+    if (shard_num == 1) {
+      VLOG(3) << "only one shard, we can copy the data directly";
+      TensorCopy(*x_tensors[0], place, out);
+    } else {
+      // in_indexs[k] is the next unread row of shard k.
+      std::vector<int> in_indexs(shard_num, 0);
+      auto *out_data = out->mutable_data<T>(
+          framework::make_ddim({batch_size, embedding_size}), place);
+      // copy data from ins[shard_num] to out.
+      for (int64_t i = 0; i < ids_dims[0]; ++i) {
+        int64_t id = ids[i];
+        size_t shard_id = static_cast<size_t>(id) % shard_num;
+        int index = in_indexs[shard_id];
+        // Validate the source row before touching it. The total row count
+        // matching the id count does not guarantee that each individual
+        // shard holds as many rows as ids are routed to it, and reading
+        // first would run past the end of the shard's buffer.
+        PADDLE_ENFORCE(
+            index < x_tensors[shard_id]->dims()[0],
+            "more ids are mapped to a shard than the shard has rows");
+        memcpy(out_data + embedding_size * i,
+               x_tensors[shard_id]->data<T>() + index * embedding_size,
+               sizeof(T) * embedding_size);
+        in_indexs[shard_id] += 1;
+      }
+
+      // Every row of every shard must have been consumed exactly once.
+      for (size_t i = 0; i < shard_num; ++i) {
+        PADDLE_ENFORCE_EQ(in_indexs[i], x_tensors[i]->dims()[0],
+                          "after merge, all data in x_tensor should be used");
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/multiplex_op.cc
+++ b/paddle/fluid/operators/multiplex_op.cc
--- a/paddle/fluid/operators/nce_op.cc
+++ b/paddle/fluid/operators/nce_op.cc
--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
--- a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
--- a/paddle/fluid/operators/reader/reader_op_registry.cc
+++ b/paddle/fluid/operators/reader/reader_op_registry.cc
--- a/paddle/fluid/operators/roi_pool_op.cc
+++ b/paddle/fluid/operators/roi_pool_op.cc
--- a/paddle/fluid/operators/row_conv_op.cc
+++ b/paddle/fluid/operators/row_conv_op.cc
--- a/paddle/fluid/operators/scale_op.cc
+++ b/paddle/fluid/operators/scale_op.cc
--- a/paddle/fluid/operators/shape_op.cc
+++ b/paddle/fluid/operators/shape_op.cc
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
--- a/paddle/fluid/operators/slice_op.cc
+++ b/paddle/fluid/operators/slice_op.cc
--- a/paddle/fluid/operators/split_op.cc
+++ b/paddle/fluid/operators/split_op.cc
--- a/paddle/fluid/operators/split_op.cu.cc
+++ b/paddle/fluid/operators/split_op.cu.cc
--- a/paddle/fluid/operators/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op.cc
--- a/paddle/fluid/operators/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt_engine_op.h
--- a/paddle/fluid/operators/tensorrt_engine_op_test.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc
--- a/paddle/fluid/operators/uniform_random_batch_size_like_op.cc
+++ b/paddle/fluid/operators/uniform_random_batch_size_like_op.cc
--- a/paddle/fluid/operators/uniform_random_op.cc
+++ b/paddle/fluid/operators/uniform_random_op.cc
--- a/paddle/fluid/platform/cpu_info.cc
+++ b/paddle/fluid/platform/cpu_info.cc
--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
--- a/paddle/fluid/platform/dynload/cupti.h
+++ b/paddle/fluid/platform/dynload/cupti.h
--- a/paddle/fluid/platform/nccl_helper.h
+++ b/paddle/fluid/platform/nccl_helper.h
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
--- a/paddle/math/MathFunctions.h
+++ b/paddle/math/MathFunctions.h
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
--- a/paddle/testing/paddle_gtest_main.cc
+++ b/paddle/testing/paddle_gtest_main.cc
--- a/python/paddle/batch.py
+++ b/python/paddle/batch.py
--- a/python/paddle/fluid/data_feeder.py
+++ b/python/paddle/fluid/data_feeder.py
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
--- a/python/paddle/fluid/initializer.py
+++ b/python/paddle/fluid/initializer.py
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
--- a/python/paddle/fluid/layers/layer_function_generator.py
+++ b/python/paddle/fluid/layers/layer_function_generator.py
--- a/python/paddle/fluid/layers/learning_rate_scheduler.py
+++ b/python/paddle/fluid/layers/learning_rate_scheduler.py
--- a/python/paddle/fluid/layers/metric.py
+++ b/python/paddle/fluid/layers/metric.py
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
--- a/python/paddle/fluid/lod_tensor.py
+++ b/python/paddle/fluid/lod_tensor.py
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
--- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py
+++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py
--- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
--- a/python/paddle/fluid/tests/book/test_recognize_digits.py
+++ b/python/paddle/fluid/tests/book/test_recognize_digits.py
--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
--- a/python/paddle/fluid/tests/test_data_feeder.py
+++ b/python/paddle/fluid/tests/test_data_feeder.py
--- a/python/paddle/fluid/tests/test_lod_tensor.py
+++ b/python/paddle/fluid/tests/test_lod_tensor.py
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
--- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
--- a/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py
+++ b/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py
--- a/python/paddle/fluid/tests/unittests/test_beam_search_op.py
+++ b/python/paddle/fluid/tests/unittests/test_beam_search_op.py
--- a/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py
--- a/python/paddle/fluid/tests/unittests/test_box_coder_op.py
+++ b/python/paddle/fluid/tests/unittests/test_box_coder_op.py
--- a/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py
+++ b/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py
--- a/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py
+++ b/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py
--- a/python/paddle/fluid/tests/unittests/test_ctc_align.py
+++ b/python/paddle/fluid/tests/unittests/test_ctc_align.py
--- a/python/paddle/fluid/tests/unittests/test_detection_map_op.py
+++ b/python/paddle/fluid/tests/unittests/test_detection_map_op.py
--- a/python/paddle/fluid/tests/unittests/test_dist_train.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_train.py
--- a/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py
+++ b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py
--- a/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py
+++ b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py
--- a/python/paddle/fluid/tests/unittests/test_edit_distance_op.py
+++ b/python/paddle/fluid/tests/unittests/test_edit_distance_op.py
--- a/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py
+++ b/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py
--- a/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py
--- a/python/paddle/fluid/tests/unittests/test_gru_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gru_op.py
--- a/python/paddle/fluid/tests/unittests/test_initializer.py
+++ b/python/paddle/fluid/tests/unittests/test_initializer.py
--- a/python/paddle/fluid/tests/unittests/test_iou_similarity_op.py
+++ b/python/paddle/fluid/tests/unittests/test_iou_similarity_op.py
--- a/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py
+++ b/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py
--- a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
--- a/python/paddle/fluid/tests/unittests/test_lod_rank_table.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_rank_table.py
--- a/python/paddle/fluid/tests/unittests/test_lod_reset_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_reset_op.py
--- a/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py
--- a/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py
--- a/python/paddle/fluid/tests/unittests/test_lstm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lstm_op.py
--- a/python/paddle/fluid/tests/unittests/test_lstmp_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lstmp_op.py
--- a/python/paddle/fluid/tests/unittests/test_mean_iou.py
+++ b/python/paddle/fluid/tests/unittests/test_mean_iou.py
--- a/python/paddle/fluid/tests/unittests/test_merge_ids_op.py
+++ b/python/paddle/fluid/tests/unittests/test_merge_ids_op.py
--- a/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py
+++ b/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py
--- a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
+++ b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
--- a/python/paddle/fluid/tests/unittests/test_one_hot_op.py
+++ b/python/paddle/fluid/tests/unittests/test_one_hot_op.py
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
--- a/python/paddle/fluid/tests/unittests/test_print_op.py
+++ b/python/paddle/fluid/tests/unittests/test_print_op.py
--- a/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py
+++ b/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py
--- a/python/paddle/fluid/tests/unittests/test_roi_pool_op.py
+++ b/python/paddle/fluid/tests/unittests/test_roi_pool_op.py
--- a/python/paddle/fluid/tests/unittests/test_row_conv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_row_conv_op.py
--- a/python/paddle/fluid/tests/unittests/test_seq_concat_op.py
+++ b/python/paddle/fluid/tests/unittests/test_seq_concat_op.py
--- a/python/paddle/fluid/tests/unittests/test_seq_conv.py
+++ b/python/paddle/fluid/tests/unittests/test_seq_conv.py
--- a/python/paddle/fluid/tests/unittests/test_seq_pool.py
+++ b/python/paddle/fluid/tests/unittests/test_seq_pool.py
--- a/python/paddle/fluid/tests/unittests/test_sequence_erase_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_erase_op.py
--- a/python/paddle/fluid/tests/unittests/test_sequence_expand.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_expand.py
--- a/python/paddle/fluid/tests/unittests/test_sequence_reshape.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_reshape.py
--- a/python/paddle/fluid/tests/unittests/test_sequence_slice_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_slice_op.py
--- a/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py
--- a/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py
+++ b/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py
--- a/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py
--- a/python/paddle/fluid/tests/unittests/test_target_assign_op.py
+++ b/python/paddle/fluid/tests/unittests/test_target_assign_op.py
--- a/python/paddle/fluid/tests/unittests/test_tensor.py
+++ b/python/paddle/fluid/tests/unittests/test_tensor.py
--- a/python/paddle/fluid/tests/unittests/test_warpctc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_warpctc_op.py
--- a/python/paddle/fluid/tests/unittests/test_weight_normalization.py
+++ b/python/paddle/fluid/tests/unittests/test_weight_normalization.py
--- a/python/paddle/fluid/tests/unittests/testsuite.py
+++ b/python/paddle/fluid/tests/unittests/testsuite.py
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
--- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
--- a/python/paddle/trainer_config_helpers/attrs.py
+++ b/python/paddle/trainer_config_helpers/attrs.py
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
--- a/python/paddle/v2/minibatch.py
+++ b/python/paddle/v2/minibatch.py
--- a/tools/codestyle/cpplint_pre_commit.hook
+++ b/tools/codestyle/cpplint_pre_commit.hook