Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into expose_Parameter_2

d0a8eea2 · fengjiayi · e9ed62bf · a64844ad · d0a8eea2 · d0a8eea2
92 changed files
--- a/Dockerfile
+++ b/Dockerfile
@@ -76,7 +76,8 @@ RUN easy_install -U pip && \
    pip install sphinx-rtd-theme==0.1.9 recommonmark
 RUN pip install pre-commit 'ipython==5.3.0' && \
-    pip install 'ipykernel==4.6.0' 'jupyter==1.0.0'
+    pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+    pip install opencv-python
 #For docstring checker
 RUN pip install pylint pytest astroid isort

--- a/cmake/external/grpc.cmake
+++ b/cmake/external/grpc.cmake
@@ -40,12 +40,12 @@ ExternalProject_Add(
    # NOTE(wuyi):
    # this package is generated by following steps:
    # 1. git clone -b v1.8.x https://github.com/grpc/grpc.git
-    # 2. submodule update --init
+    # 2. git submodule update --init
    # 3. keep only zlib, cares, protobuf, boringssl under "third_party",
    #    checkout and clean other dirs under third_party
    # 4. remove .git, and package the directory.
-    URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.8.x.tar.gz"
+    URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.10.x.tar.gz"
-    URL_MD5  "c9c58ee7d0e8929a63155af6a2ecdbd0"
+    URL_MD5  "1f268a2aff6759839dccd256adcc91cf"
    PREFIX          ${GRPC_SOURCES_DIR}
    UPDATE_COMMAND  ""
    CONFIGURE_COMMAND ""

--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -54,7 +54,7 @@ ExternalProject_Add(
    ${EXTERNAL_PROJECT_LOG_ARGS}
    DEPENDS             ${MKLDNN_DEPENDS}
    GIT_REPOSITORY      "https://github.com/01org/mkl-dnn.git"
-    GIT_TAG             "db3424ad44901513c03a1ea31ccaacdf633fbe9f"
+    GIT_TAG             "a29d8487a63afca3d5b8c5bbdbb473cf8ccc6e51"
    PREFIX              ${MKLDNN_SOURCES_DIR}
    UPDATE_COMMAND      ""
    CMAKE_ARGS          -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}

--- a/doc/fluid/api/evaluator.rst
+++ b/doc/fluid/api/evaluator.rst
 ..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!
-=========
+=============
-evaluator
+fluid.average
-=========
+=============
+.. _api_fluid_average_WeightedAverage:
+WeightedAverage
+---------------
+..  autoclass:: paddle.fluid.average.WeightedAverage
+    :members:
+    :noindex:
--- a/doc/fluid/api/backward.rst
+++ b/doc/fluid/api/backward.rst
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+==============
+fluid.backward
+==============
+.. _api_fluid_backward_append_backward:
+append_backward
+---------------
+..  autofunction:: paddle.fluid.backward.append_backward
+    :noindex:
+.. _api_fluid_backward_calc_gradient:
+calc_gradient
+-------------
+..  autofunction:: paddle.fluid.backward.calc_gradient
+    :noindex:
--- a/doc/fluid/api/clip.rst
+++ b/doc/fluid/api/clip.rst
 ..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!
-====
+==========
-clip
+fluid.clip
-====
+==========
+.. _api_fluid_clip_ErrorClipByValue:
 ErrorClipByValue
 ----------------
@@ -12,6 +14,8 @@ ErrorClipByValue
    :members:
    :noindex:
+.. _api_fluid_clip_GradientClipByValue:
 GradientClipByValue
 -------------------
@@ -19,6 +23,8 @@ GradientClipByValue
    :members:
    :noindex:
+.. _api_fluid_clip_GradientClipByNorm:
 GradientClipByNorm
 ------------------
@@ -26,6 +32,8 @@ GradientClipByNorm
    :members:
    :noindex:
+.. _api_fluid_clip_GradientClipByGlobalNorm:
 GradientClipByGlobalNorm
 ------------------------
@@ -33,15 +41,3 @@ GradientClipByGlobalNorm
    :members:
    :noindex:
-append_gradient_clip_ops
------------------------
-..  autofunction:: paddle.fluid.clip.append_gradient_clip_ops
-    :noindex:
-error_clip_callback
-------------------
-..  autofunction:: paddle.fluid.clip.error_clip_callback
-    :noindex:
--- a/doc/fluid/api/data.rst
+++ b/doc/fluid/api/data.rst
-==================================
-Data Reader Interface and DataSets
-==================================
-..  toctree::
-    :maxdepth: 1
-    data/data_reader.rst
-    data/image.rst
-    data/dataset.rst
--- a/doc/fluid/api/data_feeder.rst
+++ b/doc/fluid/api/data_feeder.rst
 ..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!
-===========
+=================
-data_feeder
+fluid.data_feeder
-===========
+=================
+.. _api_fluid_data_feeder_DataFeeder:
 DataFeeder
 ----------

--- a/doc/fluid/api/detection.rst
+++ b/doc/fluid/api/detection.rst
--- a/doc/fluid/api/executor.rst
+++ b/doc/fluid/api/executor.rst
 ..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!
-========
+==============
-executor
+fluid.executor
-========
+==============
+.. _api_fluid_executor_Executor:
 Executor
 --------
@@ -12,24 +14,32 @@ Executor
    :members:
    :noindex:
+.. _api_fluid_executor_global_scope:
 global_scope
 ------------
 ..  autofunction:: paddle.fluid.executor.global_scope
    :noindex:
+.. _api_fluid_executor_scope_guard:
 scope_guard
 -----------
 ..  autofunction:: paddle.fluid.executor.scope_guard
    :noindex:
-switch_scope
+.. _api_fluid_executor__switch_scope:
------------
+_switch_scope
+-------------
-..  autofunction:: paddle.fluid.executor.switch_scope
+..  autofunction:: paddle.fluid.executor._switch_scope
    :noindex:
+.. _api_fluid_executor_fetch_var:
 fetch_var
 ---------

--- a/doc/fluid/api/fluid.rst
+++ b/doc/fluid/api/fluid.rst
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+=====
+fluid
+=====
+.. _api_fluid_Block:
+Block
+-----
+..  autoclass:: paddle.fluid.Block
+    :members:
+    :noindex:
+.. _api_fluid_Variable:
+Variable
+--------
+..  autoclass:: paddle.fluid.Variable
+    :members:
+    :noindex:
+.. _api_fluid_Program:
+Program
+-------
+..  autoclass:: paddle.fluid.Program
+    :members:
+    :noindex:
+.. _api_fluid_Operator:
+Operator
+--------
+..  autoclass:: paddle.fluid.Operator
+    :members:
+    :noindex:
+.. _api_fluid_default_startup_program:
+default_startup_program
+-----------------------
+..  autofunction:: paddle.fluid.default_startup_program
+    :noindex:
+.. _api_fluid_default_main_program:
+default_main_program
+--------------------
+..  autofunction:: paddle.fluid.default_main_program
+    :noindex:
+.. _api_fluid_program_guard:
+program_guard
+-------------
+..  autofunction:: paddle.fluid.program_guard
+    :noindex:
+.. _api_fluid_get_var:
+get_var
+-------
+..  autofunction:: paddle.fluid.get_var
+    :noindex:
+.. _api_fluid_Executor:
+Executor
+--------
+..  autoclass:: paddle.fluid.Executor
+    :members:
+    :noindex:
+.. _api_fluid_global_scope:
+global_scope
+------------
+..  autofunction:: paddle.fluid.global_scope
+    :noindex:
+.. _api_fluid_scope_guard:
+scope_guard
+-----------
+..  autofunction:: paddle.fluid.scope_guard
+    :noindex:
+.. _api_fluid__switch_scope:
+_switch_scope
+-------------
+..  autofunction:: paddle.fluid._switch_scope
+    :noindex:
+.. _api_fluid_fetch_var:
+fetch_var
+---------
+..  autofunction:: paddle.fluid.fetch_var
+    :noindex:
+.. _api_fluid_Go:
+Go
+--
+..  autoclass:: paddle.fluid.Go
+    :members:
+    :noindex:
+.. _api_fluid_make_channel:
+make_channel
+------------
+..  autofunction:: paddle.fluid.make_channel
+    :noindex:
+.. _api_fluid_channel_send:
+channel_send
+------------
+..  autofunction:: paddle.fluid.channel_send
+    :noindex:
+.. _api_fluid_channel_recv:
+channel_recv
+------------
+..  autofunction:: paddle.fluid.channel_recv
+    :noindex:
+.. _api_fluid_channel_close:
+channel_close
+-------------
+..  autofunction:: paddle.fluid.channel_close
+    :noindex:
+.. _api_fluid_Select:
+Select
+------
+..  autoclass:: paddle.fluid.Select
+    :members:
+    :noindex:
+.. _api_fluid_Trainer:
+Trainer
+-------
+..  autoclass:: paddle.fluid.Trainer
+    :members:
+    :noindex:
+.. _api_fluid_BeginEpochEvent:
+BeginEpochEvent
+---------------
+..  autoclass:: paddle.fluid.BeginEpochEvent
+    :members:
+    :noindex:
+.. _api_fluid_EndEpochEvent:
+EndEpochEvent
+-------------
+..  autoclass:: paddle.fluid.EndEpochEvent
+    :members:
+    :noindex:
+.. _api_fluid_BeginStepEvent:
+BeginStepEvent
+--------------
+..  autoclass:: paddle.fluid.BeginStepEvent
+    :members:
+    :noindex:
+.. _api_fluid_EndStepEvent:
+EndStepEvent
+------------
+..  autoclass:: paddle.fluid.EndStepEvent
+    :members:
+    :noindex:
+.. _api_fluid_CheckpointConfig:
+CheckpointConfig
+----------------
+..  autoclass:: paddle.fluid.CheckpointConfig
+    :members:
+    :noindex:
+.. _api_fluid_Inferencer:
+Inferencer
+----------
+..  autoclass:: paddle.fluid.Inferencer
+    :members:
+    :noindex:
+.. _api_fluid_DistributeTranspiler:
+DistributeTranspiler
+--------------------
+..  autoclass:: paddle.fluid.DistributeTranspiler
+    :members:
+    :noindex:
+.. _api_fluid_memory_optimize:
+memory_optimize
+---------------
+..  autofunction:: paddle.fluid.memory_optimize
+    :noindex:
+.. _api_fluid_release_memory:
+release_memory
+--------------
+..  autofunction:: paddle.fluid.release_memory
+    :noindex:
+.. _api_fluid_ParallelExecutor:
+ParallelExecutor
+----------------
+..  autoclass:: paddle.fluid.ParallelExecutor
+    :members:
+    :noindex:
+.. _api_fluid_ExecutionStrategy:
+ExecutionStrategy
+-----------------
+..  autoclass:: paddle.fluid.ExecutionStrategy
+    :members:
+    :noindex:
+.. _api_fluid_BuildStrategy:
+BuildStrategy
+-------------
+..  autoclass:: paddle.fluid.BuildStrategy
+    :members:
+    :noindex:
+.. _api_fluid_create_lod_tensor:
+create_lod_tensor
+-----------------
+..  autofunction:: paddle.fluid.create_lod_tensor
+    :noindex:
+.. _api_fluid_create_random_int_lodtensor:
+create_random_int_lodtensor
+---------------------------
+..  autofunction:: paddle.fluid.create_random_int_lodtensor
+    :noindex:
+.. _api_fluid_LoDTensor:
+LoDTensor
+---------
+..  autoclass:: paddle.fluid.LoDTensor
+    :members:
+    :noindex:
+.. _api_fluid_CPUPlace:
+CPUPlace
+--------
+..  autoclass:: paddle.fluid.CPUPlace
+    :members:
+    :noindex:
+.. _api_fluid_CUDAPlace:
+CUDAPlace
+---------
+..  autoclass:: paddle.fluid.CUDAPlace
+    :members:
+    :noindex:
+.. _api_fluid_CUDAPinnedPlace:
+CUDAPinnedPlace
+---------------
+..  autoclass:: paddle.fluid.CUDAPinnedPlace
+    :members:
+    :noindex:
+.. _api_fluid_Tensor:
+Tensor
+------
+..  autoclass:: paddle.fluid.Tensor
+    :members:
+    :noindex:
+.. _api_fluid_ParamAttr:
+ParamAttr
+---------
+..  autoclass:: paddle.fluid.ParamAttr
+    :members:
+    :noindex:
+.. _api_fluid_WeightNormParamAttr:
+WeightNormParamAttr
+-------------------
+..  autoclass:: paddle.fluid.WeightNormParamAttr
+    :members:
+    :noindex:
+.. _api_fluid_DataFeeder:
+DataFeeder
+----------
+..  autoclass:: paddle.fluid.DataFeeder
+    :members:
+    :noindex:
+.. _api_fluid_Scope:
+Scope
+-----
+..  autoclass:: paddle.fluid.Scope
+    :members:
+    :noindex:
--- a/doc/fluid/api/gen_doc.py
+++ b/doc/fluid/api/gen_doc.py
@@ -29,19 +29,27 @@ def parse_arg():
 class DocGenerator(object):
-    def __init__(self, module_name, stream=sys.stdout):
+    def __init__(self, module_name=None, stream=sys.stdout):
+        if module_name == "":
+            module_name = None
        self.stream = stream
-        self.module_name = module_name
+        if module_name is None:
-        if not hasattr(fluid, module_name):
+            self.module_name = "fluid"
-            raise ValueError("Cannot find fluid.{0}".format(module_name))
        else:
-            self.module = getattr(fluid, module_name)
+            self.module_name = "fluid." + module_name
+        if module_name is None:
+            self.module = fluid
+        else:
+            if not hasattr(fluid, module_name):
+                raise ValueError("Cannot find fluid.{0}".format(module_name))
+            else:
+                self.module = getattr(fluid, module_name)
        self.stream.write('''..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!
 ''')
-        self._print_header_(module_name, dot='=', is_title=True)
+        self._print_header_(self.module_name, dot='=', is_title=True)
    def print_submodule(self, submodule_name):
        submodule = getattr(self.module, submodule_name)
@@ -60,25 +68,29 @@ class DocGenerator(object):
        self._print_header_(name, dot='=', is_title=False)
    def print_item(self, name):
-        item = getattr(self.module, name)
+        item = getattr(self.module, name, None)
+        if item is None:
+            return
        if isinstance(item, types.TypeType):
            self.print_class(name)
        elif isinstance(item, types.FunctionType):
            self.print_method(name)
        else:
-            raise RuntimeError("Unsupported item {0}".format(name))
+            pass
    def print_class(self, name):
+        self._print_ref_(name)
        self._print_header_(name, dot='-', is_title=False)
-        self.stream.write('''..  autoclass:: paddle.fluid.{0}.{1}
+        self.stream.write('''..  autoclass:: paddle.{0}.{1}
    :members:
    :noindex:
 '''.format(self.module_name, name))
    def print_method(self, name):
+        self._print_ref_(name)
        self._print_header_(name, dot='-', is_title=False)
-        self.stream.write('''..  autofunction:: paddle.fluid.{0}.{1}
+        self.stream.write('''..  autofunction:: paddle.{0}.{1}
    :noindex:
 '''.format(self.module_name, name))
@@ -94,6 +106,10 @@ class DocGenerator(object):
        self.stream.write('\n')
        self.stream.write('\n')
+    def _print_ref_(self, name):
+        self.stream.write(".. _api_{0}_{1}:\n\n".format("_".join(
+            self.module_name.split(".")), name))
 def main():
    args = parse_arg()

--- a/doc/fluid/api/gen_doc.sh
+++ b/doc/fluid/api/gen_doc.sh
 #!/bin/bash
-python gen_doc.py layers --submodules control_flow device io nn ops tensor detection learning_rate_scheduler metric > layers.rst
+python gen_doc.py layers --submodules control_flow device io nn ops tensor learning_rate_scheduler detection metric_op > layers.rst
-for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer transpiler
+for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer transpiler recordio_writer backward average
 do
  python gen_doc.py ${module} > ${module}.rst
 done
+python gen_doc.py "" > fluid.rst
--- a/doc/fluid/api/index_en.rst
+++ b/doc/fluid/api/index_en.rst
-======================
+=============
-Fluid
+API Reference
-======================
+=============
 ..  toctree::
    :maxdepth: 1
+    fluid.rst
    layers.rst
    data_feeder.rst
    executor.rst
@@ -18,3 +19,8 @@ Fluid
    regularizer.rst
    io.rst
    data.rst
+    transpiler.rst
+    recordio_writer.rst
+    backward.rst
+    average.rst
+    profiler.rst
--- a/doc/fluid/api/initializer.rst
+++ b/doc/fluid/api/initializer.rst
 ..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!
-===========
+=================
-initializer
+fluid.initializer
-===========
+=================
+.. _api_fluid_initializer_Constant:
 Constant
 --------
@@ -12,6 +14,8 @@ Constant
    :members:
    :noindex:
+.. _api_fluid_initializer_Uniform:
 Uniform
 -------
@@ -19,6 +23,8 @@ Uniform
    :members:
    :noindex:
+.. _api_fluid_initializer_Normal:
 Normal
 ------
@@ -26,6 +32,8 @@ Normal
    :members:
    :noindex:
+.. _api_fluid_initializer_Xavier:
 Xavier
 ------
@@ -33,6 +41,8 @@ Xavier
    :members:
    :noindex:
+.. _api_fluid_initializer_Bilinear:
 Bilinear
 --------
@@ -40,18 +50,33 @@ Bilinear
    :members:
    :noindex:
+.. _api_fluid_initializer_MSRA:
+MSRA
+----
+..  autoclass:: paddle.fluid.initializer.MSRA
+    :members:
+    :noindex:
+.. _api_fluid_initializer_force_init_on_cpu:
 force_init_on_cpu
 -----------------
 ..  autofunction:: paddle.fluid.initializer.force_init_on_cpu
    :noindex:
+.. _api_fluid_initializer_init_on_cpu:
 init_on_cpu
 -----------
 ..  autofunction:: paddle.fluid.initializer.init_on_cpu
    :noindex:
+.. _api_fluid_initializer_ConstantInitializer:
 ConstantInitializer
 -------------------
@@ -59,6 +84,8 @@ ConstantInitializer
    :members:
    :noindex:
+.. _api_fluid_initializer_UniformInitializer:
 UniformInitializer
 ------------------
@@ -66,6 +93,8 @@ UniformInitializer
    :members:
    :noindex:
+.. _api_fluid_initializer_NormalInitializer:
 NormalInitializer
 -----------------
@@ -73,6 +102,8 @@ NormalInitializer
    :members:
    :noindex:
+.. _api_fluid_initializer_XavierInitializer:
 XavierInitializer
 -----------------
@@ -80,6 +111,8 @@ XavierInitializer
    :members:
    :noindex:
+.. _api_fluid_initializer_BilinearInitializer:
 BilinearInitializer
 -------------------
@@ -87,3 +120,12 @@ BilinearInitializer
    :members:
    :noindex:
+.. _api_fluid_initializer_MSRAInitializer:
+MSRAInitializer
+---------------
+..  autoclass:: paddle.fluid.initializer.MSRAInitializer
+    :members:
+    :noindex:
--- a/doc/fluid/api/io.rst
+++ b/doc/fluid/api/io.rst
 ..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!
-==
+========
-io
+fluid.io
-==
+========
+.. _api_fluid_io_save_vars:
 save_vars
 ---------
@@ -11,84 +13,112 @@ save_vars
 ..  autofunction:: paddle.fluid.io.save_vars
    :noindex:
+.. _api_fluid_io_save_params:
 save_params
 -----------
 ..  autofunction:: paddle.fluid.io.save_params
    :noindex:
+.. _api_fluid_io_save_persistables:
 save_persistables
 -----------------
 ..  autofunction:: paddle.fluid.io.save_persistables
    :noindex:
+.. _api_fluid_io_load_vars:
 load_vars
 ---------
 ..  autofunction:: paddle.fluid.io.load_vars
    :noindex:
+.. _api_fluid_io_load_params:
 load_params
 -----------
 ..  autofunction:: paddle.fluid.io.load_params
    :noindex:
+.. _api_fluid_io_load_persistables:
 load_persistables
 -----------------
 ..  autofunction:: paddle.fluid.io.load_persistables
    :noindex:
+.. _api_fluid_io_save_inference_model:
 save_inference_model
 --------------------
 ..  autofunction:: paddle.fluid.io.save_inference_model
    :noindex:
+.. _api_fluid_io_load_inference_model:
 load_inference_model
 --------------------
 ..  autofunction:: paddle.fluid.io.load_inference_model
    :noindex:
+.. _api_fluid_io_get_inference_program:
 get_inference_program
 ---------------------
 ..  autofunction:: paddle.fluid.io.get_inference_program
    :noindex:
+.. _api_fluid_io_save_checkpoint:
 save_checkpoint
 ---------------
 ..  autofunction:: paddle.fluid.io.save_checkpoint
    :noindex:
+.. _api_fluid_io_load_checkpoint:
 load_checkpoint
 ---------------
 ..  autofunction:: paddle.fluid.io.load_checkpoint
    :noindex:
+.. _api_fluid_io_clean_checkpoint:
 clean_checkpoint
 ----------------
 ..  autofunction:: paddle.fluid.io.clean_checkpoint
    :noindex:
+.. _api_fluid_io_load_persist_vars_without_grad:
 load_persist_vars_without_grad
 ------------------------------
 ..  autofunction:: paddle.fluid.io.load_persist_vars_without_grad
    :noindex:
+.. _api_fluid_io_save_persist_vars_without_grad:
 save_persist_vars_without_grad
 ------------------------------
 ..  autofunction:: paddle.fluid.io.save_persist_vars_without_grad
    :noindex:
+.. _api_fluid_io_get_latest_checkpoint_serial:
 get_latest_checkpoint_serial
 ----------------------------

--- a/doc/fluid/api/layers.rst
+++ b/doc/fluid/api/layers.rst
 ..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!
-======
+============
-layers
+fluid.layers
-======
+============
 control_flow
 ============
+.. _api_fluid_layers_split_lod_tensor:
 split_lod_tensor
 ----------------
 ..  autofunction:: paddle.fluid.layers.split_lod_tensor
    :noindex:
+.. _api_fluid_layers_merge_lod_tensor:
 merge_lod_tensor
 ----------------
 ..  autofunction:: paddle.fluid.layers.merge_lod_tensor
    :noindex:
+.. _api_fluid_layers_BlockGuard:
 BlockGuard
 ----------
@@ -27,6 +33,8 @@ BlockGuard
    :members:
    :noindex:
+.. _api_fluid_layers_BlockGuardWithCompletion:
 BlockGuardWithCompletion
 ------------------------
@@ -34,12 +42,7 @@ BlockGuardWithCompletion
    :members:
    :noindex:
-StaticRNNMemoryLink
+.. _api_fluid_layers_WhileGuard:
-------------------
-..  autoclass:: paddle.fluid.layers.StaticRNNMemoryLink
-    :members:
-    :noindex:
 WhileGuard
 ----------
@@ -48,6 +51,8 @@ WhileGuard
    :members:
    :noindex:
+.. _api_fluid_layers_While:
 While
 -----
@@ -55,6 +60,8 @@ While
    :members:
    :noindex:
+.. _api_fluid_layers_Switch:
 Switch
 ------
@@ -62,78 +69,104 @@ Switch
    :members:
    :noindex:
+.. _api_fluid_layers_lod_rank_table:
 lod_rank_table
 --------------
 ..  autofunction:: paddle.fluid.layers.lod_rank_table
    :noindex:
+.. _api_fluid_layers_max_sequence_len:
 max_sequence_len
 ----------------
 ..  autofunction:: paddle.fluid.layers.max_sequence_len
    :noindex:
+.. _api_fluid_layers_lod_tensor_to_array:
 lod_tensor_to_array
 -------------------
 ..  autofunction:: paddle.fluid.layers.lod_tensor_to_array
    :noindex:
+.. _api_fluid_layers_array_to_lod_tensor:
 array_to_lod_tensor
 -------------------
 ..  autofunction:: paddle.fluid.layers.array_to_lod_tensor
    :noindex:
+.. _api_fluid_layers_increment:
 increment
 ---------
 ..  autofunction:: paddle.fluid.layers.increment
    :noindex:
+.. _api_fluid_layers_array_write:
 array_write
 -----------
 ..  autofunction:: paddle.fluid.layers.array_write
    :noindex:
+.. _api_fluid_layers_create_array:
 create_array
 ------------
 ..  autofunction:: paddle.fluid.layers.create_array
    :noindex:
+.. _api_fluid_layers_less_than:
 less_than
 ---------
 ..  autofunction:: paddle.fluid.layers.less_than
    :noindex:
+.. _api_fluid_layers_equal:
 equal
 -----
 ..  autofunction:: paddle.fluid.layers.equal
    :noindex:
+.. _api_fluid_layers_array_read:
 array_read
 ----------
 ..  autofunction:: paddle.fluid.layers.array_read
    :noindex:
+.. _api_fluid_layers_shrink_memory:
 shrink_memory
 -------------
 ..  autofunction:: paddle.fluid.layers.shrink_memory
    :noindex:
+.. _api_fluid_layers_array_length:
 array_length
 ------------
 ..  autofunction:: paddle.fluid.layers.array_length
    :noindex:
+.. _api_fluid_layers_IfElse:
 IfElse
 ------
@@ -141,6 +174,8 @@ IfElse
    :members:
    :noindex:
+.. _api_fluid_layers_DynamicRNN:
 DynamicRNN
 ----------
@@ -148,6 +183,8 @@ DynamicRNN
    :members:
    :noindex:
+.. _api_fluid_layers_ConditionalBlock:
 ConditionalBlock
 ----------------
@@ -155,6 +192,8 @@ ConditionalBlock
    :members:
    :noindex:
+.. _api_fluid_layers_StaticRNN:
 StaticRNN
 ---------
@@ -162,12 +201,16 @@ StaticRNN
    :members:
    :noindex:
+.. _api_fluid_layers_reorder_lod_tensor_by_rank:
 reorder_lod_tensor_by_rank
 --------------------------
 ..  autofunction:: paddle.fluid.layers.reorder_lod_tensor_by_rank
    :noindex:
+.. _api_fluid_layers_ParallelDo:
 ParallelDo
 ----------
@@ -175,12 +218,16 @@ ParallelDo
    :members:
    :noindex:
+.. _api_fluid_layers_Print:
 Print
 -----
 ..  autofunction:: paddle.fluid.layers.Print
    :noindex:
+.. _api_fluid_layers_is_empty:
 is_empty
 --------
@@ -190,6 +237,8 @@ is_empty
 device
 ======
+.. _api_fluid_layers_get_places:
 get_places
 ----------
@@ -199,12 +248,16 @@ get_places
 io
 ==
+.. _api_fluid_layers_data:
 data
 ----
 ..  autofunction:: paddle.fluid.layers.data
    :noindex:
+.. _api_fluid_layers_BlockGuardServ:
 BlockGuardServ
 --------------
@@ -212,6 +265,8 @@ BlockGuardServ
    :members:
    :noindex:
+.. _api_fluid_layers_ListenAndServ:
 ListenAndServ
 -------------
@@ -219,60 +274,80 @@ ListenAndServ
    :members:
    :noindex:
+.. _api_fluid_layers_Send:
 Send
 ----
 ..  autofunction:: paddle.fluid.layers.Send
    :noindex:
+.. _api_fluid_layers_Recv:
 Recv
 ----
 ..  autofunction:: paddle.fluid.layers.Recv
    :noindex:
+.. _api_fluid_layers_open_recordio_file:
 open_recordio_file
 ------------------
 ..  autofunction:: paddle.fluid.layers.open_recordio_file
    :noindex:
+.. _api_fluid_layers_open_files:
 open_files
 ----------
 ..  autofunction:: paddle.fluid.layers.open_files
    :noindex:
+.. _api_fluid_layers_read_file:
 read_file
 ---------
 ..  autofunction:: paddle.fluid.layers.read_file
    :noindex:
+.. _api_fluid_layers_shuffle:
 shuffle
 -------
 ..  autofunction:: paddle.fluid.layers.shuffle
    :noindex:
+.. _api_fluid_layers_batch:
 batch
 -----
 ..  autofunction:: paddle.fluid.layers.batch
    :noindex:
+.. _api_fluid_layers_double_buffer:
 double_buffer
 -------------
 ..  autofunction:: paddle.fluid.layers.double_buffer
    :noindex:
+.. _api_fluid_layers_random_data_generator:
 random_data_generator
 ---------------------
 ..  autofunction:: paddle.fluid.layers.random_data_generator
    :noindex:
+.. _api_fluid_layers_Preprocessor:
 Preprocessor
 ------------
@@ -280,6 +355,8 @@ Preprocessor
    :members:
    :noindex:
+.. _api_fluid_layers_load:
 load
 ----
@@ -289,584 +366,802 @@ load
 nn
 ==
+.. _api_fluid_layers_fc:
 fc
 --
 ..  autofunction:: paddle.fluid.layers.fc
    :noindex:
+.. _api_fluid_layers_embedding:
 embedding
 ---------
 ..  autofunction:: paddle.fluid.layers.embedding
    :noindex:
+.. _api_fluid_layers_dynamic_lstm:
 dynamic_lstm
 ------------
 ..  autofunction:: paddle.fluid.layers.dynamic_lstm
    :noindex:
+.. _api_fluid_layers_dynamic_lstmp:
 dynamic_lstmp
 -------------
 ..  autofunction:: paddle.fluid.layers.dynamic_lstmp
    :noindex:
+.. _api_fluid_layers_dynamic_gru:
 dynamic_gru
 -----------
 ..  autofunction:: paddle.fluid.layers.dynamic_gru
    :noindex:
+.. _api_fluid_layers_gru_unit:
 gru_unit
 --------
 ..  autofunction:: paddle.fluid.layers.gru_unit
    :noindex:
+.. _api_fluid_layers_linear_chain_crf:
 linear_chain_crf
 ----------------
 ..  autofunction:: paddle.fluid.layers.linear_chain_crf
    :noindex:
+.. _api_fluid_layers_crf_decoding:
 crf_decoding
 ------------
 ..  autofunction:: paddle.fluid.layers.crf_decoding
    :noindex:
+.. _api_fluid_layers_cos_sim:
 cos_sim
 -------
 ..  autofunction:: paddle.fluid.layers.cos_sim
    :noindex:
+.. _api_fluid_layers_cross_entropy:
 cross_entropy
 -------------
 ..  autofunction:: paddle.fluid.layers.cross_entropy
    :noindex:
+.. _api_fluid_layers_square_error_cost:
 square_error_cost
 -----------------
 ..  autofunction:: paddle.fluid.layers.square_error_cost
    :noindex:
+.. _api_fluid_layers_chunk_eval:
 chunk_eval
 ----------
 ..  autofunction:: paddle.fluid.layers.chunk_eval
    :noindex:
+.. _api_fluid_layers_sequence_conv:
 sequence_conv
 -------------
 ..  autofunction:: paddle.fluid.layers.sequence_conv
    :noindex:
+.. _api_fluid_layers_conv2d:
 conv2d
 ------
 ..  autofunction:: paddle.fluid.layers.conv2d
    :noindex:
+.. _api_fluid_layers_conv3d:
 conv3d
 ------
 ..  autofunction:: paddle.fluid.layers.conv3d
    :noindex:
+.. _api_fluid_layers_sequence_pool:
 sequence_pool
 -------------
 ..  autofunction:: paddle.fluid.layers.sequence_pool
    :noindex:
+.. _api_fluid_layers_sequence_softmax:
 sequence_softmax
 ----------------
 ..  autofunction:: paddle.fluid.layers.sequence_softmax
    :noindex:
+.. _api_fluid_layers_softmax:
 softmax
 -------
 ..  autofunction:: paddle.fluid.layers.softmax
    :noindex:
+.. _api_fluid_layers_pool2d:
 pool2d
 ------
 ..  autofunction:: paddle.fluid.layers.pool2d
    :noindex:
+.. _api_fluid_layers_pool3d:
 pool3d
 ------
 ..  autofunction:: paddle.fluid.layers.pool3d
    :noindex:
+.. _api_fluid_layers_batch_norm:
 batch_norm
 ----------
 ..  autofunction:: paddle.fluid.layers.batch_norm
    :noindex:
+.. _api_fluid_layers_beam_search_decode:
 beam_search_decode
 ------------------
 ..  autofunction:: paddle.fluid.layers.beam_search_decode
    :noindex:
+.. _api_fluid_layers_conv2d_transpose:
 conv2d_transpose
 ----------------
 ..  autofunction:: paddle.fluid.layers.conv2d_transpose
    :noindex:
+.. _api_fluid_layers_conv3d_transpose:
 conv3d_transpose
 ----------------
 ..  autofunction:: paddle.fluid.layers.conv3d_transpose
    :noindex:
+.. _api_fluid_layers_sequence_expand:
 sequence_expand
 ---------------
 ..  autofunction:: paddle.fluid.layers.sequence_expand
    :noindex:
+.. _api_fluid_layers_lstm_unit:
 lstm_unit
 ---------
 ..  autofunction:: paddle.fluid.layers.lstm_unit
    :noindex:
+.. _api_fluid_layers_reduce_sum:
 reduce_sum
 ----------
 ..  autofunction:: paddle.fluid.layers.reduce_sum
    :noindex:
+.. _api_fluid_layers_reduce_mean:
 reduce_mean
 -----------
 ..  autofunction:: paddle.fluid.layers.reduce_mean
    :noindex:
+.. _api_fluid_layers_reduce_max:
 reduce_max
 ----------
 ..  autofunction:: paddle.fluid.layers.reduce_max
    :noindex:
+.. _api_fluid_layers_reduce_min:
 reduce_min
 ----------
 ..  autofunction:: paddle.fluid.layers.reduce_min
    :noindex:
+.. _api_fluid_layers_reduce_prod:
 reduce_prod
 -----------
 ..  autofunction:: paddle.fluid.layers.reduce_prod
    :noindex:
+.. _api_fluid_layers_sequence_first_step:
 sequence_first_step
 -------------------
 ..  autofunction:: paddle.fluid.layers.sequence_first_step
    :noindex:
+.. _api_fluid_layers_sequence_last_step:
 sequence_last_step
 ------------------
 ..  autofunction:: paddle.fluid.layers.sequence_last_step
    :noindex:
+.. _api_fluid_layers_dropout:
 dropout
 -------
 ..  autofunction:: paddle.fluid.layers.dropout
    :noindex:
+.. _api_fluid_layers_split:
 split
 -----
 ..  autofunction:: paddle.fluid.layers.split
    :noindex:
+.. _api_fluid_layers_ctc_greedy_decoder:
 ctc_greedy_decoder
 ------------------
 ..  autofunction:: paddle.fluid.layers.ctc_greedy_decoder
    :noindex:
+.. _api_fluid_layers_edit_distance:
 edit_distance
 -------------
 ..  autofunction:: paddle.fluid.layers.edit_distance
    :noindex:
+.. _api_fluid_layers_l2_normalize:
 l2_normalize
 ------------
 ..  autofunction:: paddle.fluid.layers.l2_normalize
    :noindex:
+.. _api_fluid_layers_matmul:
 matmul
 ------
 ..  autofunction:: paddle.fluid.layers.matmul
    :noindex:
+.. _api_fluid_layers_topk:
 topk
 ----
 ..  autofunction:: paddle.fluid.layers.topk
    :noindex:
+.. _api_fluid_layers_warpctc:
 warpctc
 -------
 ..  autofunction:: paddle.fluid.layers.warpctc
    :noindex:
+.. _api_fluid_layers_sequence_reshape:
 sequence_reshape
 ----------------
 ..  autofunction:: paddle.fluid.layers.sequence_reshape
    :noindex:
+.. _api_fluid_layers_transpose:
 transpose
 ---------
 ..  autofunction:: paddle.fluid.layers.transpose
    :noindex:
+.. _api_fluid_layers_im2sequence:
 im2sequence
 -----------
 ..  autofunction:: paddle.fluid.layers.im2sequence
    :noindex:
+.. _api_fluid_layers_nce:
 nce
 ---
 ..  autofunction:: paddle.fluid.layers.nce
    :noindex:
+.. _api_fluid_layers_beam_search:
 beam_search
 -----------
 ..  autofunction:: paddle.fluid.layers.beam_search
    :noindex:
+.. _api_fluid_layers_row_conv:
 row_conv
 --------
 ..  autofunction:: paddle.fluid.layers.row_conv
    :noindex:
+.. _api_fluid_layers_multiplex:
 multiplex
 ---------
 ..  autofunction:: paddle.fluid.layers.multiplex
    :noindex:
+.. _api_fluid_layers_layer_norm:
 layer_norm
 ----------
 ..  autofunction:: paddle.fluid.layers.layer_norm
    :noindex:
+.. _api_fluid_layers_softmax_with_cross_entropy:
 softmax_with_cross_entropy
 --------------------------
 ..  autofunction:: paddle.fluid.layers.softmax_with_cross_entropy
    :noindex:
+.. _api_fluid_layers_smooth_l1:
 smooth_l1
 ---------
 ..  autofunction:: paddle.fluid.layers.smooth_l1
    :noindex:
+.. _api_fluid_layers_one_hot:
 one_hot
 -------
 ..  autofunction:: paddle.fluid.layers.one_hot
    :noindex:
+.. _api_fluid_layers_autoincreased_step_counter:
 autoincreased_step_counter
 --------------------------
 ..  autofunction:: paddle.fluid.layers.autoincreased_step_counter
    :noindex:
+.. _api_fluid_layers_reshape:
 reshape
 -------
 ..  autofunction:: paddle.fluid.layers.reshape
    :noindex:
+.. _api_fluid_layers_lod_reset:
 lod_reset
 ---------
 ..  autofunction:: paddle.fluid.layers.lod_reset
    :noindex:
+.. _api_fluid_layers_lrn:
 lrn
 ---
 ..  autofunction:: paddle.fluid.layers.lrn
    :noindex:
+.. _api_fluid_layers_pad:
 pad
 ---
 ..  autofunction:: paddle.fluid.layers.pad
    :noindex:
+.. _api_fluid_layers_label_smooth:
 label_smooth
 ------------
 ..  autofunction:: paddle.fluid.layers.label_smooth
    :noindex:
+.. _api_fluid_layers_roi_pool:
 roi_pool
 --------
 ..  autofunction:: paddle.fluid.layers.roi_pool
    :noindex:
+.. _api_fluid_layers_dice_loss:
 dice_loss
 ---------
 ..  autofunction:: paddle.fluid.layers.dice_loss
    :noindex:
+.. _api_fluid_layers_image_resize:
 image_resize
 ------------
 ..  autofunction:: paddle.fluid.layers.image_resize
    :noindex:
+.. _api_fluid_layers_image_resize_short:
 image_resize_short
 ------------------
 ..  autofunction:: paddle.fluid.layers.image_resize_short
    :noindex:
+.. _api_fluid_layers_resize_bilinear:
 resize_bilinear
 ---------------
 ..  autofunction:: paddle.fluid.layers.resize_bilinear
    :noindex:
+.. _api_fluid_layers_gather:
 gather
 ------
 ..  autofunction:: paddle.fluid.layers.gather
    :noindex:
+.. _api_fluid_layers_random_crop:
 random_crop
 -----------
 ..  autofunction:: paddle.fluid.layers.random_crop
    :noindex:
+.. _api_fluid_layers_mean_iou:
 mean_iou
 --------
 ..  autofunction:: paddle.fluid.layers.mean_iou
    :noindex:
+.. _api_fluid_layers_relu:
+relu
+----
+..  autofunction:: paddle.fluid.layers.relu
+    :noindex:
+.. _api_fluid_layers_log:
+log
+---
+..  autofunction:: paddle.fluid.layers.log
+    :noindex:
+.. _api_fluid_layers_crop:
+crop
+----
+..  autofunction:: paddle.fluid.layers.crop
+    :noindex:
 ops
 ===
+.. _api_fluid_layers_mean:
 mean
 ----
 ..  autofunction:: paddle.fluid.layers.mean
    :noindex:
+.. _api_fluid_layers_mul:
 mul
 ---
 ..  autofunction:: paddle.fluid.layers.mul
    :noindex:
+.. _api_fluid_layers_scale:
 scale
 -----
 ..  autofunction:: paddle.fluid.layers.scale
    :noindex:
+.. _api_fluid_layers_sigmoid_cross_entropy_with_logits:
 sigmoid_cross_entropy_with_logits
 ---------------------------------
 ..  autofunction:: paddle.fluid.layers.sigmoid_cross_entropy_with_logits
    :noindex:
+.. _api_fluid_layers_elementwise_add:
 elementwise_add
 ---------------
 ..  autofunction:: paddle.fluid.layers.elementwise_add
    :noindex:
+.. _api_fluid_layers_elementwise_div:
 elementwise_div
 ---------------
 ..  autofunction:: paddle.fluid.layers.elementwise_div
    :noindex:
+.. _api_fluid_layers_elementwise_sub:
 elementwise_sub
 ---------------
 ..  autofunction:: paddle.fluid.layers.elementwise_sub
    :noindex:
+.. _api_fluid_layers_elementwise_mul:
 elementwise_mul
 ---------------
 ..  autofunction:: paddle.fluid.layers.elementwise_mul
    :noindex:
+.. _api_fluid_layers_elementwise_max:
 elementwise_max
 ---------------
 ..  autofunction:: paddle.fluid.layers.elementwise_max
    :noindex:
+.. _api_fluid_layers_elementwise_min:
 elementwise_min
 ---------------
 ..  autofunction:: paddle.fluid.layers.elementwise_min
    :noindex:
+.. _api_fluid_layers_elementwise_pow:
 elementwise_pow
 ---------------
 ..  autofunction:: paddle.fluid.layers.elementwise_pow
    :noindex:
+.. _api_fluid_layers_clip:
 clip
 ----
 ..  autofunction:: paddle.fluid.layers.clip
    :noindex:
+.. _api_fluid_layers_clip_by_norm:
 clip_by_norm
 ------------
 ..  autofunction:: paddle.fluid.layers.clip_by_norm
    :noindex:
+.. _api_fluid_layers_logical_and:
 logical_and
 -----------
 ..  autofunction:: paddle.fluid.layers.logical_and
    :noindex:
+.. _api_fluid_layers_logical_or:
 logical_or
 ----------
 ..  autofunction:: paddle.fluid.layers.logical_or
    :noindex:
+.. _api_fluid_layers_logical_xor:
 logical_xor
 -----------
 ..  autofunction:: paddle.fluid.layers.logical_xor
    :noindex:
+.. _api_fluid_layers_logical_not:
 logical_not
 -----------
 ..  autofunction:: paddle.fluid.layers.logical_not
    :noindex:
+.. _api_fluid_layers_uniform_random_batch_size_like:
 uniform_random_batch_size_like
 ------------------------------
 ..  autofunction:: paddle.fluid.layers.uniform_random_batch_size_like
    :noindex:
+.. _api_fluid_layers_gaussian_random:
 gaussian_random
 ---------------
 ..  autofunction:: paddle.fluid.layers.gaussian_random
    :noindex:
+.. _api_fluid_layers_gaussian_random_batch_size_like:
 gaussian_random_batch_size_like
 -------------------------------
 ..  autofunction:: paddle.fluid.layers.gaussian_random_batch_size_like
    :noindex:
+.. _api_fluid_layers_scatter:
 scatter
 -------
 ..  autofunction:: paddle.fluid.layers.scatter
    :noindex:
+.. _api_fluid_layers_sum:
 sum
 ---
 ..  autofunction:: paddle.fluid.layers.sum
    :noindex:
+.. _api_fluid_layers_slice:
 slice
 -----
 ..  autofunction:: paddle.fluid.layers.slice
    :noindex:
+.. _api_fluid_layers_polygon_box_transform:
 polygon_box_transform
 ---------------------
 ..  autofunction:: paddle.fluid.layers.polygon_box_transform
    :noindex:
+.. _api_fluid_layers_shape:
 shape
 -----
 ..  autofunction:: paddle.fluid.layers.shape
    :noindex:
+.. _api_fluid_layers_iou_similarity:
+iou_similarity
+--------------
+..  autofunction:: paddle.fluid.layers.iou_similarity
+    :noindex:
+.. _api_fluid_layers_maxout:
 maxout
 ------
 ..  autofunction:: paddle.fluid.layers.maxout
    :noindex:
+.. _api_fluid_layers_sigmoid:
 sigmoid
 -------
 ..  autofunction:: paddle.fluid.layers.sigmoid
    :noindex:
+.. _api_fluid_layers_logsigmoid:
 logsigmoid
 ----------
 ..  autofunction:: paddle.fluid.layers.logsigmoid
    :noindex:
+.. _api_fluid_layers_exp:
 exp
 ---
 ..  autofunction:: paddle.fluid.layers.exp
    :noindex:
-relu
+.. _api_fluid_layers_tanh:
----
-..  autofunction:: paddle.fluid.layers.relu
-    :noindex:
 tanh
 ----
@@ -874,71 +1169,87 @@ tanh
 ..  autofunction:: paddle.fluid.layers.tanh
    :noindex:
+.. _api_fluid_layers_tanh_shrink:
 tanh_shrink
 -----------
 ..  autofunction:: paddle.fluid.layers.tanh_shrink
    :noindex:
+.. _api_fluid_layers_softshrink:
 softshrink
 ----------
 ..  autofunction:: paddle.fluid.layers.softshrink
    :noindex:
+.. _api_fluid_layers_sqrt:
 sqrt
 ----
 ..  autofunction:: paddle.fluid.layers.sqrt
    :noindex:
+.. _api_fluid_layers_abs:
 abs
 ---
 ..  autofunction:: paddle.fluid.layers.abs
    :noindex:
+.. _api_fluid_layers_ceil:
 ceil
 ----
 ..  autofunction:: paddle.fluid.layers.ceil
    :noindex:
+.. _api_fluid_layers_floor:
 floor
 -----
 ..  autofunction:: paddle.fluid.layers.floor
    :noindex:
+.. _api_fluid_layers_cos:
 cos
 ---
 ..  autofunction:: paddle.fluid.layers.cos
    :noindex:
+.. _api_fluid_layers_sin:
 sin
 ---
 ..  autofunction:: paddle.fluid.layers.sin
    :noindex:
+.. _api_fluid_layers_round:
 round
 -----
 ..  autofunction:: paddle.fluid.layers.round
    :noindex:
+.. _api_fluid_layers_reciprocal:
 reciprocal
 ----------
 ..  autofunction:: paddle.fluid.layers.reciprocal
    :noindex:
-log
+.. _api_fluid_layers_square:
---
-..  autofunction:: paddle.fluid.layers.log
-    :noindex:
 square
 ------
@@ -946,90 +1257,120 @@ square
 ..  autofunction:: paddle.fluid.layers.square
    :noindex:
+.. _api_fluid_layers_softplus:
 softplus
 --------
 ..  autofunction:: paddle.fluid.layers.softplus
    :noindex:
+.. _api_fluid_layers_softsign:
 softsign
 --------
 ..  autofunction:: paddle.fluid.layers.softsign
    :noindex:
+.. _api_fluid_layers_brelu:
 brelu
 -----
 ..  autofunction:: paddle.fluid.layers.brelu
    :noindex:
+.. _api_fluid_layers_leaky_relu:
 leaky_relu
 ----------
 ..  autofunction:: paddle.fluid.layers.leaky_relu
    :noindex:
+.. _api_fluid_layers_soft_relu:
 soft_relu
 ---------
 ..  autofunction:: paddle.fluid.layers.soft_relu
    :noindex:
+.. _api_fluid_layers_elu:
 elu
 ---
 ..  autofunction:: paddle.fluid.layers.elu
    :noindex:
+.. _api_fluid_layers_relu6:
 relu6
 -----
 ..  autofunction:: paddle.fluid.layers.relu6
    :noindex:
+.. _api_fluid_layers_pow:
 pow
 ---
 ..  autofunction:: paddle.fluid.layers.pow
    :noindex:
+.. _api_fluid_layers_stanh:
 stanh
 -----
 ..  autofunction:: paddle.fluid.layers.stanh
    :noindex:
+.. _api_fluid_layers_hard_sigmoid:
 hard_sigmoid
 ------------
 ..  autofunction:: paddle.fluid.layers.hard_sigmoid
    :noindex:
+.. _api_fluid_layers_swish:
 swish
 -----
 ..  autofunction:: paddle.fluid.layers.swish
    :noindex:
+.. _api_fluid_layers_uniform_random:
 uniform_random
 --------------
 ..  autofunction:: paddle.fluid.layers.uniform_random
    :noindex:
+.. _api_fluid_layers_hard_shrink:
 hard_shrink
 -----------
 ..  autofunction:: paddle.fluid.layers.hard_shrink
    :noindex:
+.. _api_fluid_layers_cumsum:
 cumsum
 ------
 ..  autofunction:: paddle.fluid.layers.cumsum
    :noindex:
+.. _api_fluid_layers_thresholded_relu:
 thresholded_relu
 ----------------
@@ -1039,192 +1380,383 @@ thresholded_relu
 tensor
 ======
+.. _api_fluid_layers_create_tensor:
 create_tensor
 -------------
 ..  autofunction:: paddle.fluid.layers.create_tensor
    :noindex:
+.. _api_fluid_layers_create_parameter:
 create_parameter
 ----------------
 ..  autofunction:: paddle.fluid.layers.create_parameter
    :noindex:
+.. _api_fluid_layers_create_global_var:
 create_global_var
 -----------------
 ..  autofunction:: paddle.fluid.layers.create_global_var
    :noindex:
+.. _api_fluid_layers_cast:
 cast
 ----
 ..  autofunction:: paddle.fluid.layers.cast
    :noindex:
+.. _api_fluid_layers_concat:
 concat
 ------
 ..  autofunction:: paddle.fluid.layers.concat
    :noindex:
+.. _api_fluid_layers_sums:
 sums
 ----
 ..  autofunction:: paddle.fluid.layers.sums
    :noindex:
+.. _api_fluid_layers_assign:
 assign
 ------
 ..  autofunction:: paddle.fluid.layers.assign
    :noindex:
+.. _api_fluid_layers_fill_constant_batch_size_like:
 fill_constant_batch_size_like
 -----------------------------
 ..  autofunction:: paddle.fluid.layers.fill_constant_batch_size_like
    :noindex:
+.. _api_fluid_layers_fill_constant:
 fill_constant
 -------------
 ..  autofunction:: paddle.fluid.layers.fill_constant
    :noindex:
+.. _api_fluid_layers_argmin:
 argmin
 ------
 ..  autofunction:: paddle.fluid.layers.argmin
    :noindex:
+.. _api_fluid_layers_argmax:
 argmax
 ------
 ..  autofunction:: paddle.fluid.layers.argmax
    :noindex:
+.. _api_fluid_layers_ones:
 ones
 ----
 ..  autofunction:: paddle.fluid.layers.ones
    :noindex:
+.. _api_fluid_layers_zeros:
 zeros
 -----
 ..  autofunction:: paddle.fluid.layers.zeros
    :noindex:
+.. _api_fluid_layers_reverse:
+reverse
+-------
+..  autofunction:: paddle.fluid.layers.reverse
+    :noindex:
+learning_rate_scheduler
+=======================
+.. _api_fluid_layers_exponential_decay:
+exponential_decay
+-----------------
+..  autofunction:: paddle.fluid.layers.exponential_decay
+    :noindex:
+.. _api_fluid_layers_natural_exp_decay:
+natural_exp_decay
+-----------------
+..  autofunction:: paddle.fluid.layers.natural_exp_decay
+    :noindex:
+.. _api_fluid_layers_inverse_time_decay:
+inverse_time_decay
+------------------
+..  autofunction:: paddle.fluid.layers.inverse_time_decay
+    :noindex:
+.. _api_fluid_layers_polynomial_decay:
+polynomial_decay
+----------------
+..  autofunction:: paddle.fluid.layers.polynomial_decay
+    :noindex:
+.. _api_fluid_layers_piecewise_decay:
+piecewise_decay
+---------------
+..  autofunction:: paddle.fluid.layers.piecewise_decay
+    :noindex:
+.. _api_fluid_layers_noam_decay:
+noam_decay
+----------
+..  autofunction:: paddle.fluid.layers.noam_decay
+    :noindex:
+.. _api_fluid_layers_append_LARS:
+append_LARS
+-----------
+..  autofunction:: paddle.fluid.layers.append_LARS
+    :noindex:
 detection
 =========
+.. _api_fluid_layers_prior_box:
 prior_box
 ---------
 ..  autofunction:: paddle.fluid.layers.prior_box
    :noindex:
+.. _api_fluid_layers_multi_box_head:
 multi_box_head
 --------------
 ..  autofunction:: paddle.fluid.layers.multi_box_head
    :noindex:
+.. _api_fluid_layers_bipartite_match:
 bipartite_match
 ---------------
 ..  autofunction:: paddle.fluid.layers.bipartite_match
    :noindex:
+.. _api_fluid_layers_target_assign:
 target_assign
 -------------
 ..  autofunction:: paddle.fluid.layers.target_assign
    :noindex:
+.. _api_fluid_layers_detection_output:
 detection_output
 ----------------
 ..  autofunction:: paddle.fluid.layers.detection_output
    :noindex:
+.. _api_fluid_layers_ssd_loss:
 ssd_loss
 --------
 ..  autofunction:: paddle.fluid.layers.ssd_loss
    :noindex:
+.. _api_fluid_layers_detection_map:
 detection_map
 -------------
 ..  autofunction:: paddle.fluid.layers.detection_map
    :noindex:
+.. _api_fluid_layers_iou_similarity:
 iou_similarity
 --------------
 ..  autofunction:: paddle.fluid.layers.iou_similarity
    :noindex:
+.. _api_fluid_layers_box_coder:
 box_coder
 ---------
 ..  autofunction:: paddle.fluid.layers.box_coder
    :noindex:
-learning_rate_scheduler
+metric_op
-=======================
+=========
-exponential_decay
+.. _api_fluid_layers_accuracy:
-----------------
-..  autofunction:: paddle.fluid.layers.exponential_decay
+accuracy
+--------
+..  autofunction:: paddle.fluid.layers.accuracy
    :noindex:
-natural_exp_decay
+.. _api_fluid_layers_auc:
-----------------
-..  autofunction:: paddle.fluid.layers.natural_exp_decay
+auc
+---
+..  autofunction:: paddle.fluid.layers.auc
    :noindex:
-inverse_time_decay
+tensor
------------------
+======
-..  autofunction:: paddle.fluid.layers.inverse_time_decay
+.. _api_fluid_layers_create_tensor:
+create_tensor
+-------------
+..  autofunction:: paddle.fluid.layers.create_tensor
    :noindex:
-polynomial_decay
+.. _api_fluid_layers_create_parameter:
+create_parameter
 ----------------
-..  autofunction:: paddle.fluid.layers.polynomial_decay
+..  autofunction:: paddle.fluid.layers.create_parameter
    :noindex:
-piecewise_decay
+.. _api_fluid_layers_create_global_var:
---------------
-..  autofunction:: paddle.fluid.layers.piecewise_decay
+create_global_var
+-----------------
+..  autofunction:: paddle.fluid.layers.create_global_var
    :noindex:
-noam_decay
+.. _api_fluid_layers_cast:
----------
-..  autofunction:: paddle.fluid.layers.noam_decay
+cast
+----
+..  autofunction:: paddle.fluid.layers.cast
    :noindex:
-metric
+.. _api_fluid_layers_concat:
-======
-accuracy
+concat
--------
+------
-..  autofunction:: paddle.fluid.layers.accuracy
+..  autofunction:: paddle.fluid.layers.concat
    :noindex:
-auc
+.. _api_fluid_layers_sums:
---
-..  autofunction:: paddle.fluid.layers.auc
+sums
+----
+..  autofunction:: paddle.fluid.layers.sums
+    :noindex:
+.. _api_fluid_layers_assign:
+assign
+------
+..  autofunction:: paddle.fluid.layers.assign
+    :noindex:
+.. _api_fluid_layers_fill_constant_batch_size_like:
+fill_constant_batch_size_like
+-----------------------------
+..  autofunction:: paddle.fluid.layers.fill_constant_batch_size_like
+    :noindex:
+.. _api_fluid_layers_fill_constant:
+fill_constant
+-------------
+..  autofunction:: paddle.fluid.layers.fill_constant
+    :noindex:
+.. _api_fluid_layers_argmin:
+argmin
+------
+..  autofunction:: paddle.fluid.layers.argmin
+    :noindex:
+.. _api_fluid_layers_argmax:
+argmax
+------
+..  autofunction:: paddle.fluid.layers.argmax
+    :noindex:
+.. _api_fluid_layers_ones:
+ones
+----
+..  autofunction:: paddle.fluid.layers.ones
+    :noindex:
+.. _api_fluid_layers_zeros:
+zeros
+-----
+..  autofunction:: paddle.fluid.layers.zeros
+    :noindex:
+.. _api_fluid_layers_reverse:
+reverse
+-------
+..  autofunction:: paddle.fluid.layers.reverse
    :noindex:
--- a/doc/fluid/api/metrics.rst
+++ b/doc/fluid/api/metrics.rst
 ..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!
-=======
+=============
-metrics
+fluid.metrics
-=======
+=============
+.. _api_fluid_metrics_MetricBase:
 MetricBase
 ----------
@@ -12,6 +14,8 @@ MetricBase
    :members:
    :noindex:
+.. _api_fluid_metrics_CompositeMetric:
 CompositeMetric
 ---------------
@@ -19,6 +23,26 @@ CompositeMetric
    :members:
    :noindex:
+.. _api_fluid_metrics_Precision:
+Precision
+---------
+..  autoclass:: paddle.fluid.metrics.Precision
+    :members:
+    :noindex:
+.. _api_fluid_metrics_Recall:
+Recall
+------
+..  autoclass:: paddle.fluid.metrics.Recall
+    :members:
+    :noindex:
+.. _api_fluid_metrics_Accuracy:
 Accuracy
 --------
@@ -26,6 +50,8 @@ Accuracy
    :members:
    :noindex:
+.. _api_fluid_metrics_ChunkEvaluator:
 ChunkEvaluator
 --------------
@@ -33,6 +59,8 @@ ChunkEvaluator
    :members:
    :noindex:
+.. _api_fluid_metrics_EditDistance:
 EditDistance
 ------------
@@ -40,6 +68,8 @@ EditDistance
    :members:
    :noindex:
+.. _api_fluid_metrics_DetectionMAP:
 DetectionMAP
 ------------
@@ -47,6 +77,8 @@ DetectionMAP
    :members:
    :noindex:
+.. _api_fluid_metrics_Auc:
 Auc
 ---

--- a/doc/fluid/api/nets.rst
+++ b/doc/fluid/api/nets.rst
 ..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!
-====
+==========
-nets
+fluid.nets
-====
+==========
+.. _api_fluid_nets_simple_img_conv_pool:
 simple_img_conv_pool
 --------------------
@@ -11,18 +13,24 @@ simple_img_conv_pool
 ..  autofunction:: paddle.fluid.nets.simple_img_conv_pool
    :noindex:
+.. _api_fluid_nets_sequence_conv_pool:
 sequence_conv_pool
 ------------------
 ..  autofunction:: paddle.fluid.nets.sequence_conv_pool
    :noindex:
+.. _api_fluid_nets_glu:
 glu
 ---
 ..  autofunction:: paddle.fluid.nets.glu
    :noindex:
+.. _api_fluid_nets_scaled_dot_product_attention:
 scaled_dot_product_attention
 ----------------------------

--- a/doc/fluid/api/optimizer.rst
+++ b/doc/fluid/api/optimizer.rst
 ..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!
-=========
+===============
-optimizer
+fluid.optimizer
-=========
+===============
+.. _api_fluid_optimizer_SGD:
 SGD
 ---
@@ -12,6 +14,8 @@ SGD
    :members:
    :noindex:
+.. _api_fluid_optimizer_Momentum:
 Momentum
 --------
@@ -19,6 +23,8 @@ Momentum
    :members:
    :noindex:
+.. _api_fluid_optimizer_Adagrad:
 Adagrad
 -------
@@ -26,6 +32,8 @@ Adagrad
    :members:
    :noindex:
+.. _api_fluid_optimizer_Adam:
 Adam
 ----
@@ -33,6 +41,8 @@ Adam
    :members:
    :noindex:
+.. _api_fluid_optimizer_Adamax:
 Adamax
 ------
@@ -40,6 +50,8 @@ Adamax
    :members:
    :noindex:
+.. _api_fluid_optimizer_DecayedAdagrad:
 DecayedAdagrad
 --------------
@@ -47,6 +59,17 @@ DecayedAdagrad
    :members:
    :noindex:
+.. _api_fluid_optimizer_Ftrl:
+Ftrl
+----
+..  autoclass:: paddle.fluid.optimizer.Ftrl
+    :members:
+    :noindex:
+.. _api_fluid_optimizer_SGDOptimizer:
 SGDOptimizer
 ------------
@@ -54,6 +77,8 @@ SGDOptimizer
    :members:
    :noindex:
+.. _api_fluid_optimizer_MomentumOptimizer:
 MomentumOptimizer
 -----------------
@@ -61,6 +86,8 @@ MomentumOptimizer
    :members:
    :noindex:
+.. _api_fluid_optimizer_AdagradOptimizer:
 AdagradOptimizer
 ----------------
@@ -68,6 +95,8 @@ AdagradOptimizer
    :members:
    :noindex:
+.. _api_fluid_optimizer_AdamOptimizer:
 AdamOptimizer
 -------------
@@ -75,6 +104,8 @@ AdamOptimizer
    :members:
    :noindex:
+.. _api_fluid_optimizer_AdamaxOptimizer:
 AdamaxOptimizer
 ---------------
@@ -82,6 +113,8 @@ AdamaxOptimizer
    :members:
    :noindex:
+.. _api_fluid_optimizer_DecayedAdagradOptimizer:
 DecayedAdagradOptimizer
 -----------------------
@@ -89,6 +122,8 @@ DecayedAdagradOptimizer
    :members:
    :noindex:
+.. _api_fluid_optimizer_RMSPropOptimizer:
 RMSPropOptimizer
 ----------------
@@ -96,6 +131,17 @@ RMSPropOptimizer
    :members:
    :noindex:
+.. _api_fluid_optimizer_FtrlOptimizer:
+FtrlOptimizer
+-------------
+..  autoclass:: paddle.fluid.optimizer.FtrlOptimizer
+    :members:
+    :noindex:
+.. _api_fluid_optimizer_Adadelta:
 Adadelta
 --------
@@ -103,6 +149,8 @@ Adadelta
    :members:
    :noindex:
+.. _api_fluid_optimizer_ModelAverage:
 ModelAverage
 ------------
@@ -110,6 +158,8 @@ ModelAverage
    :members:
    :noindex:
+.. _api_fluid_optimizer_Optimizer:
 Optimizer
 ---------
@@ -117,3 +167,12 @@ Optimizer
    :members:
    :noindex:
+.. _api_fluid_optimizer_RMSPropOptimizer:
+RMSPropOptimizer
+----------------
+..  autoclass:: paddle.fluid.optimizer.RMSPropOptimizer
+    :members:
+    :noindex:
--- a/doc/fluid/api/param_attr.rst
+++ b/doc/fluid/api/param_attr.rst
 ..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!
-==========
+================
-param_attr
+fluid.param_attr
-==========
+================
+.. _api_fluid_param_attr_ParamAttr:
 ParamAttr
 ---------
@@ -12,6 +14,8 @@ ParamAttr
    :members:
    :noindex:
+.. _api_fluid_param_attr_WeightNormParamAttr:
 WeightNormParamAttr
 -------------------

--- a/doc/fluid/api/profiler.rst
+++ b/doc/fluid/api/profiler.rst
 ..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!
-========
+==============
-profiler
+fluid.profiler
-========
+==============
+.. _api_fluid_profiler_cuda_profiler:
 cuda_profiler
 -------------
@@ -11,24 +13,32 @@ cuda_profiler
 ..  autofunction:: paddle.fluid.profiler.cuda_profiler
    :noindex:
+.. _api_fluid_profiler_reset_profiler:
 reset_profiler
 --------------
 ..  autofunction:: paddle.fluid.profiler.reset_profiler
    :noindex:
+.. _api_fluid_profiler_profiler:
 profiler
 --------
 ..  autofunction:: paddle.fluid.profiler.profiler
    :noindex:
+.. _api_fluid_profiler_start_profiler:
 start_profiler
 --------------
 ..  autofunction:: paddle.fluid.profiler.start_profiler
    :noindex:
+.. _api_fluid_profiler_stop_profiler:
 stop_profiler
 -------------

--- a/doc/fluid/api/recordio_writer.rst
+++ b/doc/fluid/api/recordio_writer.rst
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+=====================
+fluid.recordio_writer
+=====================
+.. _api_fluid_recordio_writer_convert_reader_to_recordio_file:
+convert_reader_to_recordio_file
+-------------------------------
+..  autofunction:: paddle.fluid.recordio_writer.convert_reader_to_recordio_file
+    :noindex:
+.. _api_fluid_recordio_writer_convert_reader_to_recordio_files:
+convert_reader_to_recordio_files
+--------------------------------
+..  autofunction:: paddle.fluid.recordio_writer.convert_reader_to_recordio_files
+    :noindex:
--- a/doc/fluid/api/regularizer.rst
+++ b/doc/fluid/api/regularizer.rst
 ..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!
-===========
+=================
-regularizer
+fluid.regularizer
-===========
+=================
+.. _api_fluid_regularizer_append_regularization_ops:
 append_regularization_ops
 -------------------------
@@ -11,12 +13,7 @@ append_regularization_ops
 ..  autofunction:: paddle.fluid.regularizer.append_regularization_ops
    :noindex:
-WeightDecayRegularizer
+.. _api_fluid_regularizer_L1Decay:
----------------------
-..  autoclass:: paddle.fluid.regularizer.WeightDecayRegularizer
-    :members:
-    :noindex:
 L1Decay
 -------
@@ -25,6 +22,8 @@ L1Decay
    :members:
    :noindex:
+.. _api_fluid_regularizer_L2Decay:
 L2Decay
 -------
@@ -32,6 +31,8 @@ L2Decay
    :members:
    :noindex:
+.. _api_fluid_regularizer_L1DecayRegularizer:
 L1DecayRegularizer
 ------------------
@@ -39,6 +40,8 @@ L1DecayRegularizer
    :members:
    :noindex:
+.. _api_fluid_regularizer_L2DecayRegularizer:
 L2DecayRegularizer
 ------------------

--- a/doc/fluid/api/transpiler.rst
+++ b/doc/fluid/api/transpiler.rst
 ..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!
-==========
+================
-transpiler
+fluid.transpiler
-==========
+================
+.. _api_fluid_transpiler_DistributeTranspiler:
 DistributeTranspiler
 --------------------
@@ -12,12 +14,7 @@ DistributeTranspiler
    :members:
    :noindex:
-InferenceTranspiler
+.. _api_fluid_transpiler_memory_optimize:
-------------------
-..  autoclass:: paddle.fluid.transpiler.InferenceTranspiler
-    :members:
-    :noindex:
 memory_optimize
 ---------------
@@ -25,12 +22,16 @@ memory_optimize
 ..  autofunction:: paddle.fluid.transpiler.memory_optimize
    :noindex:
+.. _api_fluid_transpiler_release_memory:
 release_memory
 --------------
 ..  autofunction:: paddle.fluid.transpiler.release_memory
    :noindex:
+.. _api_fluid_transpiler_HashName:
 HashName
 --------
@@ -38,9 +39,12 @@ HashName
    :members:
    :noindex:
+.. _api_fluid_transpiler_RoundRobin:
 RoundRobin
 ----------
 ..  autoclass:: paddle.fluid.transpiler.RoundRobin
    :members:
    :noindex:
--- a/doc/v2/faq/build_and_install/index_cn.rst
+++ b/doc/v2/faq/build_and_install/index_cn.rst
@@ -213,3 +213,12 @@ virtualenv本身也是Python的一个包，可以用pip进行安装：
 保存并关闭文件。
 这样，每次打开终端时就会自动启动名为‘paddle’的Python环境了。
+10. 通过pip安装的PaddlePaddle在  :code:`import paddle.fluid` 报找不到 :code:`libmkldnn.so` 或 :code:`libmklml_intel.so`
+------------------------------------------------------------------------------------------------------------------------
+出现这种问题的原因是在导入 :code:`paddle.fluid` 时需要加载 :code:`libmkldnn.so` 和 :code:`libmklml_intel.so`，
+但是系统没有找到该文件。一般通过pip安装PaddlePaddle时会将 :code:`libmkldnn.so` 和 :code:`libmklml_intel.so`
+拷贝到 :code:`/usr/local/lib` 路径下，所以解决办法是将该路径加到 :code:`LD_LIBRARY_PATH` 环境变量下，
+即： :code:`export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH` 。
+**注意**：如果是在虚拟环境中安装PaddlePaddle， :code:`libmkldnn.so` 和 :code:`libmklml_intel.so` 可能不在 :code:`/usr/local/lib` 路径下。
\ No newline at end of file
--- a/paddle/fluid/framework/data_layout_transform.cc
+++ b/paddle/fluid/framework/data_layout_transform.cc
@@ -147,10 +147,9 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
                 "Input tensor type is not supported: ", in.type().name());
  memory::data_type out_type = in_type;
-  memory::format in_format =
+  auto in_format = MKLDNNFormatForSize(in_tz.size(), in.format());
-      in_tz.size() == 2 ? memory::format::nc : in.format();
+  auto out_format =
-  memory::format out_format =
+      MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout));
-      out_tz.size() == 2 ? memory::format::nc : ToMKLDNNFormat(out_layout);
  void* in_data = GetDataFromTensor(in, in_type);

--- a/paddle/fluid/framework/data_layout_transform.h
+++ b/paddle/fluid/framework/data_layout_transform.h
@@ -61,6 +61,13 @@ inline MKLDNNDataType ToMKLDNNDataType(const std::type_index type) {
  if (iter != dict.end()) return iter->second;
  return MKLDNNDataType::data_undef;
 }
+inline MKLDNNFormat MKLDNNFormatForSize(size_t dims_size,
+                                        MKLDNNFormat default_format) {
+  return (dims_size == 1
+              ? mkldnn::memory::format::x
+              : dims_size == 2 ? mkldnn::memory::format::nc : default_format);
+}
 #endif
 void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,

--- a/paddle/fluid/framework/data_transform.cc
+++ b/paddle/fluid/framework/data_transform.cc
@@ -47,9 +47,13 @@ void DataTransform(const OpKernelType& expected_kernel_type,
 #ifdef PADDLE_WITH_MKLDNN
        // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel
        // Just set layout/format. No real transform occur
+        auto out_format =
+            MKLDNNFormatForSize(in.dims().size(), ToMKLDNNFormat(lin));
        out.ShareDataWith(input_tensor);
        out.set_layout(DataLayout::kMKLDNN);
-        out.set_format(ToMKLDNNFormat(lin));
+        out.set_format(out_format);
 #endif
      } else {
        // Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel

--- a/paddle/fluid/framework/details/broadcast_op_handle.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle.cc
@@ -103,50 +103,23 @@ void BroadcastOpHandle::RunImpl() {
          });
    }
-    // FIXME(zcd): a temporary fix for some language model that has sparse
+    this->RunAndRecordEvent([&] {
-    // parameter.
+      {
-    bool use_mutex = true;
+        platform::NCCLGroupGuard guard;
-    if (in_var->IsType<paddle::framework::SelectedRows>()) {
+        for (auto &call : broadcast_calls) {
-      use_mutex = false;
+          call();
-    }
-    if (use_mutex) {
-      this->RunAndRecordEvent([&] {
-        {
-          platform::NCCLGroupGuard guard;
-          for (auto &call : broadcast_calls) {
-            call();
-          }
-        }
-        if (!out_handle->IsTheSameVar(*in_var_handle)) {
-          auto out_var = var_scopes.at(in_var_handle->scope_idx_)
-                             ->FindVar(out_var_handles[0]->name_);
-          paddle::framework::TensorCopy(
-              in_tensor, in_var_handle->place_,
-              *(dev_ctxes_.at(in_var_handle->place_)),
-              &VariableVisitor::GetMutableTensor(out_var));
-        }
-      });
-    } else {
-      this->RunAndRecordEventNoMutex([&] {
-        {
-          platform::NCCLGroupGuard guard;
-          for (auto &call : broadcast_calls) {
-            call();
-          }
-        }
-        if (!out_handle->IsTheSameVar(*in_var_handle)) {
-          auto out_var = var_scopes.at(in_var_handle->scope_idx_)
-                             ->FindVar(out_var_handles[0]->name_);
-          paddle::framework::TensorCopy(
-              in_tensor, in_var_handle->place_,
-              *(dev_ctxes_.at(in_var_handle->place_)),
-              &VariableVisitor::GetMutableTensor(out_var));
        }
-      });
+      }
-    }
+      if (!out_handle->IsTheSameVar(*in_var_handle)) {
+        auto out_var = var_scopes.at(in_var_handle->scope_idx_)
+                           ->FindVar(out_var_handles[0]->name_);
+        paddle::framework::TensorCopy(
+            in_tensor, in_var_handle->place_,
+            *(dev_ctxes_.at(in_var_handle->place_)),
+            &VariableVisitor::GetMutableTensor(out_var));
+      }
+    });
 #else
    PADDLE_THROW("CUDA is not enabled.");
 #endif

--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -470,7 +470,7 @@ void MultiDevSSAGraphBuilder::ConnectOp(SSAGraph *result, OpHandleBase *op,
 void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result,
                                                const OpDesc &op) const {
  int op_dev_id = -1;
-  if (op.Type() == "split_byref") {
+  if (op.Type() == "split_byref" || op.Type() == "split_selected_rows") {
    op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]);
    if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
      op_dev_id = GetAppropriateDeviceID(op.InputArgumentNames());

--- a/paddle/fluid/framework/details/multi_devices_graph_builder.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h
@@ -47,7 +47,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
 #endif
  std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const override;
-  int GetVarDeviceID(const std::string &varname) const;
+  int GetVarDeviceID(const std::string &varname) const override;
 private:
  void CreateOpHandleIOs(SSAGraph *result, const OpDesc &op,

--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -11,8 +11,8 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/framework/details/op_handle_base.h"
+#include <map>
 namespace paddle {
 namespace framework {
@@ -122,35 +122,17 @@ void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) {
 #ifdef PADDLE_WITH_CUDA
  if (!events_.empty()) {  // Use event
    std::function<void()> method = callback;
+    // NOTE(zcd): device contexts must be ordered here because RecordEvent
+    // uses a mutex to ensure thread safety.
+    std::map<platform::DeviceContext *, platform::Place> ordered_ctxes;
    for (auto &p : dev_ctxes_) {
-      method = [method, p, this]() {
+      ordered_ctxes.emplace(p.second, p.first);
-        static_cast<platform::CUDADeviceContext *>(p.second)->RecordEvent(
-            events_.at(boost::get<platform::CUDAPlace>(p.first).device),
-            method);
-      };
    }
-    method();
+    for (auto &p : ordered_ctxes) {
-  } else {
-#endif
-    callback();
-#ifdef PADDLE_WITH_CUDA
-  }
-#endif
-}
-void OpHandleBase::RunAndRecordEventNoMutex(
-    const std::function<void()> &callback) {
-#ifdef PADDLE_WITH_CUDA
-  if (!events_.empty()) {  // Use event
-    std::function<void()> method = callback;
-    for (auto &p : dev_ctxes_) {
      method = [method, p, this]() {
-        static_cast<platform::CUDADeviceContext *>(p.second)
+        static_cast<platform::CUDADeviceContext *>(p.first)->RecordEvent(
-            ->RecordEventNoMutex(
+            events_.at(boost::get<platform::CUDAPlace>(p.second).device),
-                events_.at(boost::get<platform::CUDAPlace>(p.first).device),
+            method);
-                method);
      };
    }
    method();

--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -85,10 +85,6 @@ class OpHandleBase {
 protected:
  void RunAndRecordEvent(const std::function<void()> &callback);
-  // FIXME(zcd): A temporary fix for some language model that has sparse
-  // parameter.
-  void RunAndRecordEventNoMutex(const std::function<void()> &callback);
  void RunAndRecordEvent(platform::Place p,
                         const std::function<void()> &callback);

--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@@ -80,9 +80,7 @@ void ReduceOpHandle::RunImpl() {
  }
  if (pre_in_var->IsType<framework::SelectedRows>()) {
-    // FIXME(zcd): A temporary fix for some language model that has sparse
+    this->RunAndRecordEvent([&] {
-    // parameter.
-    this->RunAndRecordEventNoMutex([&] {
      std::vector<const SelectedRows *> in_selected_rows =
          GetInputValues<SelectedRows>(in_var_handles, var_scopes);
      GatherSelectedRows(in_selected_rows, in_places, dev_ctxes_, t_out_p,

--- a/paddle/fluid/framework/framework.proto
+++ b/paddle/fluid/framework/framework.proto
@@ -27,6 +27,7 @@ enum AttrType {
  BOOLEANS = 7;
  BLOCK = 8;
  LONG = 9;
+  BLOCKS = 10;
 }
 // OpDesc describes an instance of a C++ framework::OperatorBase
@@ -46,6 +47,7 @@ message OpDesc {
    repeated bool bools = 11;
    optional int32 block_idx = 12;
    optional int64 l = 13;
+    repeated int32 blocks_idx = 14;
  };
  message Var {

--- a/paddle/fluid/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
@@ -51,8 +51,6 @@ std::ostream &operator<<(std::ostream &os, const LoD &lod) {
 }
 std::ostream &operator<<(std::ostream &os, const LoDTensor &t) {
-  PADDLE_ENFORCE(t.type().hash_code() == typeid(float).hash_code());
  if (!platform::is_cpu_place(t.place())) {
    LoDTensor tt;
    framework::TensorCopy(t, platform::CPUPlace(), &tt);
@@ -70,7 +68,13 @@ std::ostream &operator<<(std::ostream &os, const LoDTensor &t) {
  // only print first ten elements
  int64_t size = t.numel() < 10 ? t.numel() : 10;
  for (int64_t i = 0; i < size; ++i) {
-    os << t.data<float>()[i] << " ";
+    if (t.type().hash_code() == typeid(float).hash_code()) {
+      os << t.data<float>()[i] << " ";
+    } else if (t.type().hash_code() == typeid(int64_t).hash_code()) {
+      os << t.data<int64_t>()[i] << " ";
+    } else {
+      PADDLE_THROW("LoDTensor data type not in [float, int64_t]");
+    }
  }
  return os;

--- a/paddle/fluid/framework/lod_tensor_test.cc
+++ b/paddle/fluid/framework/lod_tensor_test.cc
@@ -26,6 +26,20 @@
 namespace paddle {
 namespace framework {
+// Smoke test for operator<<(std::ostream&, const LoDTensor&): streams one
+// float tensor and one int64_t tensor to the log.
+TEST(LoD, PrintLoDTensor) {
+  // Each tensor is given a shape of two elements before allocation, because
+  // the test writes (and the printer reads) elements 0 and 1.
+  LoDTensor tensor1;
+  tensor1.Resize({2});
+  tensor1.mutable_data<float>(platform::CPUPlace());
+  tensor1.data<float>()[0] = 0.2;
+  tensor1.data<float>()[1] = 0.5;
+  LOG(INFO) << tensor1;
+  LoDTensor tensor2;
+  tensor2.Resize({2});
+  tensor2.mutable_data<int64_t>(platform::CPUPlace());
+  tensor2.data<int64_t>()[0] = 1;
+  tensor2.data<int64_t>()[1] = 2;
+  LOG(INFO) << tensor2;
+}
 TEST(LoD, data) {
  LoD lod{{0, 1, 2}};
  lod.push_back({0, 2, 4, 5});
@@ -37,7 +51,7 @@ TEST(LoD, data) {
  }
 }
-TEST(LodExpand, test) {
+TEST(LoD, ExpandLoD) {
  LoD lod{{0, 2}};
  LoDTensor tensor;
  tensor.set_lod(lod);

--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -211,6 +211,12 @@ void OpDesc::SetBlockAttr(const std::string &name, BlockDesc *block) {
  need_update_ = true;
 }
+// Stores the given list of blocks under attribute `name` and flags this
+// OpDesc as needing an update.
+void OpDesc::SetBlocksAttr(const std::string &name,
+                           std::vector<BlockDesc *> blocks) {
+  auto &slot = this->attrs_[name];
+  slot = blocks;
+  need_update_ = true;
+}
 void OpDesc::SetAttrMap(
    const std::unordered_map<std::string, Attribute> &attr_map) {
  attrs_ = attr_map;
@@ -305,6 +311,13 @@ struct SetAttrDescVisitor : public boost::static_visitor<void> {
  void operator()(const std::vector<bool> &v) const {
    VectorToRepeated(v, attr_->mutable_bools());
  }
+  // Writes a list of blocks into the repeated `blocks_idx` field by storing
+  // each block's ID() in the same order as the input vector.
+  void operator()(const std::vector<BlockDesc *> &v) const {
+    std::vector<int> ids;
+    for (size_t i = 0; i < v.size(); ++i) {
+      ids.push_back(v[i]->ID());
+    }
+    VectorToRepeated(ids, attr_->mutable_blocks_idx());
+  }
  void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->ID()); }
  void operator()(int64_t v) const { attr_->set_l(v); }
  void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); }

--- a/paddle/fluid/framework/op_desc.h
+++ b/paddle/fluid/framework/op_desc.h
@@ -77,6 +77,8 @@ class OpDesc {
  void SetBlockAttr(const std::string &name, BlockDesc *block);
+  void SetBlocksAttr(const std::string &name, std::vector<BlockDesc *> blocks);
  Attribute GetAttr(const std::string &name) const;
  Attribute GetNullableAttr(const std::string &name) const;

--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -121,7 +121,7 @@ ParallelExecutor::ParallelExecutor(
 #endif
  }
-  builder_ = std::move(builder_factory.Create());
+  builder_ = builder_factory.Create();
  member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
      exec_strategy, member_->local_scopes_, places,
      builder_->Build(main_program)));

--- a/paddle/fluid/framework/type_defs.h
+++ b/paddle/fluid/framework/type_defs.h
@@ -35,7 +35,8 @@ using VariableNameMap = std::map<std::string, std::vector<std::string>>;
 using Attribute =
    boost::variant<boost::blank, int, float, std::string, std::vector<int>,
                   std::vector<float>, std::vector<std::string>, bool,
-                   std::vector<bool>, BlockDesc*, int64_t>;
+                   std::vector<bool>, BlockDesc*, int64_t,
+                   std::vector<BlockDesc*>>;
 using AttributeMap = std::unordered_map<std::string, Attribute>;

--- a/paddle/fluid/operators/assign_value_op.cc
+++ b/paddle/fluid/operators/assign_value_op.cc
@@ -70,6 +70,7 @@ $$Out = values$$
 namespace ops = paddle::operators;
-REGISTER_OPERATOR(assign_value, ops::AssignValueOp, ops::AssignValueOpMaker);
+REGISTER_OPERATOR(assign_value, ops::AssignValueOp, ops::AssignValueOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
 REGISTER_OP_CPU_KERNEL(assign_value, ops::AssignValueKernel<int>,
                       ops::AssignValueKernel<float>);
--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <limits>
+#include "glog/logging.h"  // For VLOG
 #include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/operators/distributed/request_handler.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -75,6 +76,9 @@ bool GRPCClient::AsyncSendVar(const std::string& ep,
    var_h.scope = p_scope;
    var_h.name = var_name_val;
    var_h.ctx = p_ctx;
+    var_h.method = "Send";
+    VLOG(3) << var_h.String() << " begin";
    // stub context
    SendProcessor* s = new SendProcessor(ch);
@@ -129,6 +133,9 @@ bool GRPCClient::AsyncGetVar(const std::string& ep,
    var_h.scope = p_scope;
    var_h.name = var_name_val;
    var_h.ctx = p_ctx;
+    var_h.method = "Get";
+    VLOG(3) << var_h.String() << " begin";
    // stub context
    GetProcessor* s = new GetProcessor(ch);
@@ -172,6 +179,9 @@ bool GRPCClient::AsyncPrefetchVar(const std::string& ep,
    var_h.scope = p_scope;
    var_h.name = out_var_name_val;
    var_h.ctx = p_ctx;
+    var_h.method = "Prefetch";
+    VLOG(3) << var_h.String() << " begin";
    // stub context
    GetProcessor* s = new GetProcessor(ch);
@@ -243,10 +253,11 @@ void GRPCClient::Proceed() {
    GPR_ASSERT(ok);
    PADDLE_ENFORCE(c);
    if (c->status_.ok()) {
+      VLOG(3) << c->var_h_.String() << " process";
      c->Process();
    } else {
-      LOG(FATAL) << "var: " << c->var_h_.String()
+      LOG(FATAL) << c->var_h_.String()
-                 << " grpc error:" << c->status_.error_message();
+                 << " meets grpc error:" << c->status_.error_message();
    }
    delete c;
    {
@@ -258,14 +269,15 @@ void GRPCClient::Proceed() {
 }
 std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) {
-  // TODO(Yancey1989): make grpc client completely thread-safe
  std::lock_guard<std::mutex> guard(chan_mutex_);
  auto it = channels_.find(ep);
  if (it != channels_.end()) {
    return it->second;
  }
+  // Channel configurations:
  grpc::ChannelArguments args;
+  args.SetInt(GRPC_ARG_MAX_RECONNECT_BACKOFF_MS, 2000);
  args.SetCompressionAlgorithm(GRPC_COMPRESS_NONE);
  args.SetMaxSendMessageSize(std::numeric_limits<int>::max());
  args.SetMaxReceiveMessageSize(std::numeric_limits<int>::max());

--- a/paddle/fluid/operators/distributed/grpc_client.h
+++ b/paddle/fluid/operators/distributed/grpc_client.h
@@ -47,14 +47,18 @@ namespace operators {
 namespace distributed {
 struct VarHandle {
+  // RPC endpoint.
  std::string ep;
  const platform::DeviceContext* ctx;
  const framework::Scope* scope;
+  // Variable name.
  std::string name;
+  // RPC method name.
+  std::string method;
  std::string String() const {
    std::ostringstream s;
-    s << "name:[" << name << "] ep:[" << ep << "]";
+    s << method << " name:[" << name << "], ep:[" << ep << "]";
    return s.str();
  }
 };
@@ -72,6 +76,7 @@ class BaseProcessor {
  virtual void Prepare(const VarHandle& var_info, int64_t time_out) {
    context_.reset(new grpc::ClientContext());
    var_h_ = var_info;
+    context_->set_wait_for_ready(true);
    std::chrono::system_clock::time_point deadline =
        std::chrono::system_clock::now() + std::chrono::milliseconds(time_out);
@@ -81,6 +86,7 @@ class BaseProcessor {
  virtual void Prepare(int64_t time_out) {
    context_.reset(new grpc::ClientContext());
+    context_->set_wait_for_ready(true);
    std::chrono::system_clock::time_point deadline =
        std::chrono::system_clock::now() + std::chrono::milliseconds(time_out);
@@ -172,26 +178,24 @@ class GRPCClient : public RPCClient {
  bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx,
                    const framework::Scope& scope, const std::string& var_name,
-                    int64_t time_out = RPCClient::rpc_time_out) override;
+                    int64_t time_out = FLAGS_grpc_deadline) override;
  bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx,
                   const framework::Scope& scope, const std::string& var_name,
-                   int64_t time_out = RPCClient::rpc_time_out) override;
+                   int64_t time_out = FLAGS_grpc_deadline) override;
  bool AsyncPrefetchVar(const std::string& ep,
                        const platform::DeviceContext& ctx,
                        const framework::Scope& scope,
                        const std::string& in_var_name,
                        const std::string& out_var_name,
-                        int64_t time_out = RPCClient::rpc_time_out) override;
+                        int64_t time_out = FLAGS_grpc_deadline) override;
-  void AsyncSendBatchBarrier(
+  void AsyncSendBatchBarrier(const std::string& ep,
-      const std::string& ep,
+                             int64_t time_out = FLAGS_grpc_deadline) override;
-      int64_t time_out = RPCClient::rpc_time_out) override;
-  void AsyncSendFetchBarrier(
+  void AsyncSendFetchBarrier(const std::string& ep,
-      const std::string& ep,
+                             int64_t time_out = FLAGS_grpc_deadline) override;
-      int64_t time_out = RPCClient::rpc_time_out) override;
  void Wait() override;
@@ -207,7 +211,7 @@ class GRPCClient : public RPCClient {
  void Proceed();
  void AsyncSendComplete(const std::string& ep,
-                         int64_t time_out = RPCClient::rpc_time_out);
+                         int64_t time_out = FLAGS_grpc_deadline);
  std::shared_ptr<grpc::Channel> GetChannel(const std::string& ep);

--- a/paddle/fluid/operators/distributed/grpc_server.cc
+++ b/paddle/fluid/operators/distributed/grpc_server.cc
@@ -41,6 +41,19 @@ class RequestBase {
  virtual ~RequestBase() {}
  virtual void Process() = 0;
+  // Builds a one-line description of this request for logging: the RPC
+  // method name, the request's variable name, the peer address, the current
+  // state ("Process" or "Finish") and the request id.
+  std::string Status2String(const std::string& method) {
+    // Read the state through Status() so this read takes status_mu_ the
+    // same way Status() does, instead of touching status_ unsynchronized.
+    std::string status = "Process";
+    if (Status() == FINISH) {
+      status = "Finish";
+    }
+    std::ostringstream s;
+    s << method << " name:[" << GetReqName() << "]"
+      << ", ep:[" << ctx_.peer() << "]"
+      << " " << status << " using req_id:" << req_id_;
+    return s.str();
+  }
  CallStatus Status() const {
    std::lock_guard<std::mutex> l(status_mu_);
    return status_;
@@ -84,7 +97,7 @@ class RequestSend final : public RequestBase {
  void Process() override {
    std::string varname = GetReqName();
-    VLOG(3) << "RequestSend var_name:" << varname;
+    VLOG(4) << "RequestSend var_name:" << varname;
    auto scope = request_->GetMutableLocalScope();
    auto invar = request_->GetVar();
@@ -119,7 +132,7 @@ class RequestGet final : public RequestBase {
  void Process() override {
    // proc request.
    std::string varname = request_.varname();
-    VLOG(3) << "RequestGet " << varname;
+    VLOG(4) << "RequestGet " << varname;
    auto scope = request_handler_->scope();
    auto invar = scope->FindVar(varname);
@@ -165,7 +178,7 @@ class RequestPrefetch final : public RequestBase {
    // prefetch process...
    std::string in_var_name = request_->Varname();
    std::string out_var_name = request_->OutVarname();
-    VLOG(3) << "RequestPrefetch, in_var_name: " << in_var_name
+    VLOG(4) << "RequestPrefetch, in_var_name: " << in_var_name
            << " out_var_name: " << out_var_name;
    auto scope = request_->GetMutableLocalScope();
@@ -188,10 +201,10 @@ class RequestPrefetch final : public RequestBase {
 };
 void AsyncGRPCServer::WaitServerReady() {
-  VLOG(3) << "AsyncGRPCServer is wait server ready";
+  VLOG(4) << "AsyncGRPCServer is wait server ready";
  std::unique_lock<std::mutex> lock(this->mutex_ready_);
  condition_ready_.wait(lock, [=] { return this->ready_ == 1; });
-  VLOG(3) << "AsyncGRPCServer WaitSeverReady";
+  VLOG(4) << "AsyncGRPCServer WaitSeverReady";
 }
 void AsyncGRPCServer::StartServer() {
@@ -230,7 +243,7 @@ void AsyncGRPCServer::StartServer() {
    for (int i = 0; i < threadnum; i++) {
      rpc_threads_[rpc_name].emplace_back(new std::thread(std::bind(
          &AsyncGRPCServer::HandleRequest, this, cq.get(), rpc_name, f)));
-      VLOG(3) << t.first << " creates threads!";
+      VLOG(4) << t.first << " creates threads!";
    }
  }
@@ -247,7 +260,7 @@ void AsyncGRPCServer::StartServer() {
    auto& threads = t.second;
    for (size_t i = 0; i < threads.size(); ++i) {
      threads[i]->join();
-      VLOG(3) << t.first << " threads ends!";
+      VLOG(4) << t.first << " threads ends!";
    }
  }
 }
@@ -255,7 +268,7 @@ void AsyncGRPCServer::StartServer() {
 void AsyncGRPCServer::ShutdownQueue() {
  for (auto& t : rpc_cq_) {
    t.second->Shutdown();
-    VLOG(3) << t.first << " shutdown!";
+    VLOG(4) << t.first << " queue shutdown!";
  }
 }
@@ -264,7 +277,7 @@ void AsyncGRPCServer::ShutDownImpl() {
  is_shut_down_ = true;
  ShutdownQueue();
-  VLOG(3) << "server_ shutdown!";
+  VLOG(4) << "server_ shutdown!";
  server_->Shutdown();
 }
@@ -272,7 +285,7 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name,
                                          int req_id) {
  std::unique_lock<std::mutex> lock(cq_mutex_);
  if (is_shut_down_) {
-    VLOG(3) << "shutdown, do not TryToRegisterNewSendOne";
+    VLOG(4) << "shutdown, do not TryToRegisterNewSendOne";
    return;
  }
@@ -306,14 +319,14 @@ void AsyncGRPCServer::HandleRequest(
  bool ok = false;
  while (true) {
-    VLOG(3) << "HandleRequest " << rpc_name << " wait next";
+    VLOG(4) << "HandleRequest " << rpc_name << " wait next";
    if (!cq->Next(&tag, &ok)) {
      LOG(INFO) << "CompletionQueue " << rpc_name << " shutdown!";
      break;
    }
    int req_id = static_cast<int>(reinterpret_cast<intptr_t>(tag));
-    VLOG(3) << "HandleRequest " << rpc_name << ", req_id:" << req_id
+    VLOG(4) << "HandleRequest " << rpc_name << ", req_id:" << req_id
            << " get next";
    auto& reqs = rpc_reqs_[rpc_name];
@@ -324,22 +337,21 @@ void AsyncGRPCServer::HandleRequest(
      base = reqs[req_id];
    }
+    VLOG(3) << base->Status2String(rpc_name);
    // reference:
    // https://github.com/tensorflow/tensorflow/issues/5596
    // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM
    // https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I
    if (!ok) {
      LOG(WARNING) << "completion queue:" << rpc_name
-                   << " recv no regular event:argument name["
+                   << " recv no regular event"
-                   << base->GetReqName() << "]";
+                   << " context:" << base->Status2String(rpc_name);
      TryToRegisterNewOne(rpc_name, req_id);
      delete base;
      continue;
    }
-    VLOG(3) << "queue id:" << rpc_name << ", req_id:" << req_id
-            << ", status:" << base->Status();
    switch (base->Status()) {
      case PROCESS: {
        base->Process();

--- a/paddle/fluid/operators/distributed/rpc_client.cc
+++ b/paddle/fluid/operators/distributed/rpc_client.cc
@@ -13,6 +13,10 @@
 // limitations under the License.
 #include "paddle/fluid/operators/distributed/rpc_client.h"
+#include "gflags/gflags.h"
+// Default to 3 minutes to tolerate temporary network failures.
+DEFINE_int32(grpc_deadline, 180000, "deadline timeouts for grpc");
 namespace paddle {
 namespace operators {

--- a/paddle/fluid/operators/distributed/rpc_client.h
+++ b/paddle/fluid/operators/distributed/rpc_client.h
@@ -15,11 +15,14 @@
 #pragma once
 #include <string>
+#include "gflags/gflags.h"
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
+DECLARE_int32(grpc_deadline);
 namespace paddle {
 namespace operators {
 namespace distributed {
@@ -32,26 +35,26 @@ class RPCClient {
                            const platform::DeviceContext& ctx,
                            const framework::Scope& scope,
                            const std::string& var_name,
-                            int64_t time_out = rpc_time_out) = 0;
+                            int64_t time_out = FLAGS_grpc_deadline) = 0;
  virtual bool AsyncGetVar(const std::string& ep,
                           const platform::DeviceContext& ctx,
                           const framework::Scope& scope,
                           const std::string& var_name,
-                           int64_t time_out = rpc_time_out) = 0;
+                           int64_t time_out = FLAGS_grpc_deadline) = 0;
  virtual bool AsyncPrefetchVar(const std::string& ep,
                                const platform::DeviceContext& ctx,
                                const framework::Scope& scope,
                                const std::string& in_var_name,
                                const std::string& out_var_name,
-                                int64_t time_out = rpc_time_out) = 0;
+                                int64_t time_out = FLAGS_grpc_deadline) = 0;
-  virtual void AsyncSendBatchBarrier(const std::string& ep,
+  virtual void AsyncSendBatchBarrier(
-                                     int64_t time_out = rpc_time_out) = 0;
+      const std::string& ep, int64_t time_out = FLAGS_grpc_deadline) = 0;
-  virtual void AsyncSendFetchBarrier(const std::string& ep,
+  virtual void AsyncSendFetchBarrier(
-                                     int64_t time_out = rpc_time_out) = 0;
+      const std::string& ep, int64_t time_out = FLAGS_grpc_deadline) = 0;
  // SendComplete tells all the server that current trainer have no more data
  // to train, so that the pserver can reduce it's barrier count, and continue
@@ -60,8 +63,6 @@ class RPCClient {
  virtual void Wait() = 0;
-  static constexpr int64_t rpc_time_out = 120 * 1000;
  template <typename T>
  static RPCClient* GetInstance() {
    std::call_once(init_flag_, &RPCClient::Init<T>);

--- a/paddle/fluid/operators/distributed/rpc_server.cc
+++ b/paddle/fluid/operators/distributed/rpc_server.cc
@@ -47,11 +47,12 @@ void RPCServer::WaitBarrier(const std::string& rpc_name) {
    return (barrier_counter_[rpc_name] >= client_num_ || exit_flag_.load());
  });
-  VLOG(3) << "batch_barrier_:" << barrier_counter_[rpc_name];
+  VLOG(3) << "batch_barrier_: " << rpc_name << " "
+          << barrier_counter_[rpc_name];
 }
 void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) {
-  VLOG(3) << "RPCServer begin IncreaseBatchBarrier " << rpc_name;
+  VLOG(4) << "RPCServer begin IncreaseBatchBarrier " << rpc_name;
  int b = 0;
  std::unique_lock<std::mutex> lock(mutex_);
  b = ++barrier_counter_[rpc_name];
@@ -100,7 +101,7 @@ void RPCServer::SetCond(const std::string& rpc_name) {
 }
 void RPCServer::WaitCond(const std::string& rpc_name) {
-  VLOG(3) << "RPCServer WaitCond " << rpc_name;
+  VLOG(4) << "RPCServer WaitCond " << rpc_name;
  int cond = 0;
  {
    std::unique_lock<std::mutex> lock(mutex_);

--- a/paddle/fluid/operators/distributed/variable_response.cc
+++ b/paddle/fluid/operators/distributed/variable_response.cc
@@ -76,6 +76,8 @@ bool ReadRaw(::google::protobuf::io::CodedInputStream* input,
      if (total_written + size_to_write > length) {
        size_to_write = length - total_written;
      }
+      // This log is useful to see how large each internal rpc block is.
+      VLOG(7) << "copy " << size_to_write << " data to CUDAPlace";
      memory::Copy(boost::get<platform::CUDAPlace>(place),
                   reinterpret_cast<void*>(p), cpu, data, size_to_write,
                   gpu_dev_ctx.stream());
@@ -103,6 +105,8 @@ bool ReadRaw(::google::protobuf::io::CodedInputStream* input,
    }
    // TODO(gongwb): can we avoid copy?
    platform::CPUPlace cpu;
+    // This log is useful to see how large each internal rpc block is.
+    VLOG(7) << "copy " << size_to_write << " data to CPUPlace";
    memory::Copy(cpu, reinterpret_cast<void*>(p), cpu, data, size_to_write);
    p += size_to_write;

--- a/paddle/fluid/operators/elementwise_add_mkldnn_op.cc
+++ b/paddle/fluid/operators/elementwise_add_mkldnn_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/fluid/operators/elementwise_add_op.h"
+#include "paddle/fluid/operators/elementwise_op_function.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
+namespace paddle {
+namespace operators {
+using framework::DataLayout;
+using framework::Tensor;
+using mkldnn::memory;
+using mkldnn::reorder;
+using mkldnn::primitive;
+using mkldnn::stream;
+using mkldnn::sum;
+// Forward kernel for elementwise_add on the MKLDNN library.
+//
+// When X and Y have different dims, the addition is done by the generic CPU
+// TransformFunctor (broadcast path). When the dims match, an MKL-DNN `sum`
+// primitive with scales {1.0, 1.0} is built and executed eagerly. In both
+// cases the output tensor is tagged with the MKLDNN layout.
+template <typename T>
+class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
+ public:
+  // Computes Out = X + Y for the inputs/outputs named "X", "Y" and "Out".
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& dev_ctx =
+        ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
+    const auto& mkldnn_engine = dev_ctx.GetEngine();
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* z = ctx.Output<Tensor>("Out");
+    const T* x_data = x->data<T>();
+    const T* y_data = y->data<T>();
+    T* z_data = z->mutable_data<T>(ctx.GetPlace());
+    int axis = ctx.Attr<int>("axis");
+    auto x_dims = x->dims();
+    auto y_dims = y->dims();
+    auto z_dims = z->dims();
+    // Execute the default elementwise_add operator when
+    // broadcast operations need to be performed.
+    if (x_dims != y_dims) {
+      auto sum_func = [](T a, T b) -> T { return a + b; };
+      TransformFunctor<decltype(sum_func), T,
+                       paddle::platform::CPUDeviceContext, T>
+          functor(
+              x, y, z,
+              ctx.template device_context<paddle::platform::CPUDeviceContext>(),
+              sum_func);
+      // axis == -1 means "align Y with the trailing dimensions of X".
+      axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
+      PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
+                     "Axis should be in range [0, x_dims)");
+      trim_trailing_singular_dims(&y_dims);
+      axis = (y_dims.size() == 0) ? x_dims.size() : axis;
+      int pre, n, post;
+      get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post);
+      if (post == 1) {
+        functor.RunRowWise(n, pre);
+      } else {
+        functor.RunMidWise(n, pre, post);
+      }
+      // The broadcast result keeps X's memory format.
+      z->set_layout(DataLayout::kMKLDNN);
+      z->set_format(x->format());
+    } else {
+      PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN &&
+                         x->format() != memory::format::format_undef,
+                     "Wrong layout/format set for X tensor");
+      PADDLE_ENFORCE(y->layout() == DataLayout::kMKLDNN &&
+                         y->format() != memory::format::format_undef,
+                     "Wrong layout/format set for Y tensor");
+      std::vector<int> src_x_tz = framework::vectorize2int(x_dims);
+      std::vector<int> src_y_tz = framework::vectorize2int(y_dims);
+      std::vector<int> dst_tz = framework::vectorize2int(z_dims);
+      std::vector<memory::primitive_desc> srcs_pd;
+      std::vector<memory> srcs;
+      // Both inputs contribute with weight 1, i.e. a plain sum.
+      std::vector<float> scales = {1.0f, 1.0f};
+      auto src_x_pd = memory::primitive_desc(
+          {{src_x_tz}, memory::data_type::f32, x->format()}, mkldnn_engine);
+      auto src_y_pd = memory::primitive_desc(
+          {{src_y_tz}, memory::data_type::f32, y->format()}, mkldnn_engine);
+      auto src_x_memory =
+          memory(src_x_pd, paddle::platform::to_void_cast(x_data));
+      auto src_y_memory =
+          memory(src_y_pd, paddle::platform::to_void_cast(y_data));
+      srcs_pd.push_back(src_x_pd);
+      srcs_pd.push_back(src_y_pd);
+      srcs.push_back(src_x_memory);
+      srcs.push_back(src_y_memory);
+      // memory::format::any leaves the choice of output format to the sum
+      // primitive descriptor created below.
+      auto dst_md =
+          memory::desc({dst_tz}, memory::data_type::f32, memory::format::any);
+      // create primitive descriptor for sum
+      auto sum_pd = sum::primitive_desc(dst_md, scales, srcs_pd);
+      // create mkldnn memory for dst
+      memory dst_memory = memory(sum_pd.dst_primitive_desc(), z_data);
+      std::vector<primitive::at> inputs;
+      inputs.push_back(srcs[0]);
+      inputs.push_back(srcs[1]);
+      // create sum primitive
+      auto sum_prim = sum(sum_pd, inputs, dst_memory);
+      std::vector<primitive> pipeline;
+      pipeline.push_back(sum_prim);
+      stream(stream::kind::eager).submit(pipeline).wait();
+      // Record the format reported by the destination memory on the output.
+      z->set_layout(DataLayout::kMKLDNN);
+      z->set_format(
+          (memory::format)dst_memory.get_primitive_desc().desc().data.format);
+    }
+  }
+};
+// Backward kernel for elementwise_add on the MKLDNN library.
+//
+// When X and Y have identical dims, dOut is copied into each requested
+// gradient (dX and/or dY) and that gradient is tagged with the MKLDNN layout
+// and dOut's format. Otherwise the generic ElemwiseGradCompute with
+// IdentityGrad functors handles the broadcast case.
+template <typename T>
+class EltwiseAddMKLDNNGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    using Tensor = framework::Tensor;
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* out = ctx.Input<Tensor>("Out");
+    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    int axis = ctx.Attr<int>("axis");
+    if (x->dims() != y->dims()) {
+      // Execute default kernel when broadcast is needed
+      ElemwiseGradCompute<paddle::platform::CPUDeviceContext, T,
+                          IdentityGrad<T>, IdentityGrad<T>>(
+          ctx, *x, *y, *out, *dout, axis, dx, dy, IdentityGrad<T>(),
+          IdentityGrad<T>());
+      return;
+    }
+    auto blas = math::GetBlas<paddle::platform::CPUDeviceContext, T>(ctx);
+    // Copies dOut into `grad` (if it was requested) and propagates the
+    // MKLDNN layout together with dOut's format.
+    auto pass_through = [&](Tensor* grad) {
+      if (!grad) {
+        return;
+      }
+      blas.VCOPY(dout->numel(), dout->data<T>(),
+                 grad->mutable_data<T>(ctx.GetPlace()));
+      grad->set_layout(DataLayout::kMKLDNN);
+      grad->set_format(dout->format());
+    };
+    pass_through(dx);
+    pass_through(dy);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+// Register the float MKLDNN kernels on CPUPlace for elementwise_add and for
+// its gradient op.
+REGISTER_OP_KERNEL(elementwise_add, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::EltwiseAddMKLDNNKernel<float>)
+REGISTER_OP_KERNEL(elementwise_add_grad, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::EltwiseAddMKLDNNGradKernel<float>)
--- a/paddle/fluid/operators/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise_op.h
@@ -14,8 +14,12 @@ limitations under the License. */
 #pragma once
 #include <string>
+#include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
 namespace paddle {
 namespace operators {
@@ -40,6 +44,21 @@ class ElementwiseOp : public framework::OperatorWithKernel {
    ctx->SetOutputDim("Out", x_dim);
    ctx->ShareLoD("X", /*->*/ "Out");
  }
+  // Chooses the kernel type from the data type of input "X". When built with
+  // PADDLE_WITH_MKLDNN and platform::CanMKLDNNBeUsed(ctx) holds, the MKLDNN
+  // layout/library variant is requested; otherwise the default kernel type
+  // for the current place is returned.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    const auto* x = ctx.Input<Tensor>("X");
+    const auto data_type = framework::ToDataType(x->type());
+#ifdef PADDLE_WITH_MKLDNN
+    const bool use_mkldnn = platform::CanMKLDNNBeUsed(ctx);
+    if (use_mkldnn) {
+      return framework::OpKernelType(data_type, ctx.GetPlace(),
+                                     framework::DataLayout::kMKLDNN,
+                                     framework::LibraryType::kMKLDNN);
+    }
+#endif
+    return framework::OpKernelType(data_type, ctx.GetPlace());
+  }
 };
 class ElementwiseOpInferVarType : public framework::VarTypeInference {
@@ -65,6 +84,8 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
                 "for broadcasting Y onto X.")
        .SetDefault(-1)
        .EqualGreaterThan(-1);
+    AddAttr<bool>("use_mkldnn", "(bool, default false). Used by MKLDNN.")
+        .SetDefault(false);
    AddComment(string::Sprintf(R"DOC(
 Limited Elementwise %s Operator
@@ -138,6 +159,21 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
      ctx->SetOutputDim(y_grad_name, y_dims);
    }
  }
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    auto input_data_type =
+        framework::ToDataType(ctx.Input<Tensor>("X")->type());
+#ifdef PADDLE_WITH_MKLDNN
+    if (platform::CanMKLDNNBeUsed(ctx)) {
+      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
+                                     framework::DataLayout::kMKLDNN,
+                                     framework::LibraryType::kMKLDNN);
+    }
+#endif
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+  }
 };
 }  // namespace operators
 }  // namespace paddle

--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -101,17 +101,16 @@ void ListenAndServOp::RunSyncLoop(
    framework::Scope *recv_scope,
    const std::vector<int> &prefetch_block_id_list) const {
  size_t num_blocks = program->Size();
+  auto optimize_blocks =
+      Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks);
  PADDLE_ENFORCE_GE(num_blocks, 2,
                    "server program should have at least 2 blocks");
-  std::vector<int> optimize_block_id_list;
+  std::vector<int> optimize_blocks_idx;
-  for (int blkid = 1; blkid < num_blocks; ++blkid) {
+  for (auto blk : optimize_blocks) {
-    if (std::find(prefetch_block_id_list.begin(), prefetch_block_id_list.end(),
+    optimize_blocks_idx.push_back(blk->ID());
-                  blkid) == prefetch_block_id_list.end()) {
-      optimize_block_id_list.push_back(blkid);
-    }
  }
-  auto optimize_prepared = executor->Prepare(*program, optimize_block_id_list);
+  auto optimize_prepared = executor->Prepare(*program, optimize_blocks_idx);
  // Insert placeholder for block0 which holds current op itself.
  optimize_prepared.insert(
      optimize_prepared.begin(),
@@ -134,14 +133,14 @@ void ListenAndServOp::RunSyncLoop(
    // and this will still work.
    // The optimize blocks which have the same parent ID would run parallel
    // TODO(Yancey1989): need to use ParallelExecutor for future
-    int32_t last_parent_blkid = program->Block(1).Parent();
+    int32_t last_parent_blkid = optimize_blocks[0]->Parent();
    std::vector<size_t> parallel_blkids;
-    parallel_blkids.push_back(1);
+    parallel_blkids.push_back(optimize_blocks[0]->ID());
    double ts = GetTimestamp();
-    for (size_t i = 1; i < optimize_block_id_list.size(); ++i) {
+    for (size_t i = 1; i < optimize_blocks.size(); ++i) {
      // skip the first optimize block because it is already in the
      // parallel_blkids.
-      int blkid = optimize_block_id_list[i];
+      int blkid = optimize_blocks[i]->ID();
      if (program->Block(blkid).Parent() != last_parent_blkid) {
        ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared,
                              program, recv_scope);
@@ -164,8 +163,8 @@ void ListenAndServOp::RunSyncLoop(
 }
 void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
-                                   framework::ProgramDesc *program) const {
+                                   framework::ProgramDesc *program,
-  VLOG(3) << "RunAsyncLoop in";
+                                   framework::Scope *recv_scope) const {
  // grad name to block id
  std::unordered_map<std::string, int32_t> grad_to_block_id;
  std::unordered_map<int32_t, std::string> id_to_grad;
@@ -192,6 +191,10 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
    block_list.push_back(blkid);
  }
  auto optimize_prepared = executor->Prepare(*program, block_list);
+  // execute global block if needed
+  if (block_list[0] == 1 && id_to_grad.count(1) == 0) {
+    executor->RunPreparedContext(optimize_prepared[0].get(), recv_scope);
+  }
  std::unordered_map<std::string,
                     std::shared_ptr<framework::ExecutorPrepareContext>>
      grad_to_prepared_ctx;
@@ -203,7 +206,6 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
  request_get_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
  request_prefetch_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
-  VLOG(3) << "RunAsyncLoop into while";
  while (true) {
    if (rpc_service_->IsExit()) {
      LOG(INFO) << "get exit!rpc_processor break!";
@@ -261,8 +263,11 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
  rpc_service_->RegisterRPC(distributed::kRequestPrefetch,
                            request_prefetch_handler_.get());
-  auto *optimize_block = Attr<framework::BlockDesc *>(kOptimizeBlock);
+  auto optimize_blocks =
-  auto *program = optimize_block->Program();
+      Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks);
+  PADDLE_ENFORCE(optimize_blocks.size() >= 1,
+                 "optimize blocks should be 1 at least on the pserver side.");
+  auto *program = optimize_blocks[0]->Program();
  framework::Executor executor(dev_place);
  // prepare for prefetch
@@ -317,7 +322,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
  if (sync_mode) {
    RunSyncLoop(&executor, program, &recv_scope, prefetch_block_id_list);
  } else {
-    RunAsyncLoop(&executor, program);
+    RunAsyncLoop(&executor, program, &recv_scope);
  }
 }
@@ -339,8 +344,9 @@ class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
        "a map from grad name to it's optimize block id")
        .SetDefault({});
    AddAttr<bool>("sync_mode", "if works at sync_mode or not").SetDefault(true);
-    AddAttr<framework::BlockDesc *>(kOptimizeBlock,
+    AddAttr<std::vector<framework::BlockDesc *>>(
-                                    "BlockID to run on server side.");
+        kOptimizeBlocks, "Optimize blocks to run on server side.")
+        .SetDefault({});
    AddAttr<std::vector<std::string>>(kPrefetchVarNameToBlockId,
                                      "prefetch blocks to run on server side.")
        .SetDefault({});

--- a/paddle/fluid/operators/listen_and_serv_op.h
+++ b/paddle/fluid/operators/listen_and_serv_op.h
@@ -30,7 +30,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-constexpr char kOptimizeBlock[] = "OptimizeBlock";
+constexpr char kOptimizeBlocks[] = "optimize_blocks";
 constexpr char kPrefetchVarNameToBlockId[] = "prefetch_var_name_to_block_id";
 void RunServer(std::shared_ptr<distributed::RPCServer> service);
@@ -50,7 +50,8 @@ class ListenAndServOp : public framework::OperatorBase {
                   const std::vector<int>& prefetch_block_id_list) const;
  void RunAsyncLoop(framework::Executor* executor,
-                    framework::ProgramDesc* program) const;
+                    framework::ProgramDesc* program,
+                    framework::Scope* recv_scope) const;
  void SavePort() const;

--- a/paddle/fluid/operators/parallel_do_op.cc
+++ b/paddle/fluid/operators/parallel_do_op.cc
@@ -295,7 +295,7 @@ class ParallelDoGradOp : public framework::OperatorBase {
        auto sum_op = framework::OpRegistry::CreateOp(
            "sum", {{"X", {s, tmp_name}}}, {{"Out", {s}}},
-            framework::AttributeMap{});
+            framework::AttributeMap{{"use_mkldnn", {false}}});
        VLOG(10) << sum_op->DebugStringEx(sub_scopes[0]);
        sum_op->Run(*sub_scopes[0], places[0]);
        WaitOnPlace(places[0]);

--- a/paddle/fluid/operators/random_crop_op.cc
+++ b/paddle/fluid/operators/random_crop_op.cc
@@ -37,6 +37,11 @@ class RandomCropOpMaker : public framework::OpProtoAndCheckerMaker {
    AddOutput("SeedOut", "The random seed after random cropping.")
        .AsIntermediate();
    AddAttr<std::vector<int>>("shape", "The shape of a cropped instance.");
+    AddAttr<int>("startup_seed",
+                 "If the input 'Seed' is not initialized, the 'startup_seed' "
+                 "will be used to replace it. Even so, the seed after random "
+                 "crop will also be outputed to the 'SeedOut'.")
+        .SetDefault(0);
    AddComment(R"DOC(
      This operator takes a batch of instance, and do random cropping on each instance.
      It means that cropping positions differs on each instance, which is determined
@@ -49,8 +54,6 @@ class RandomCropOpMaker : public framework::OpProtoAndCheckerMaker {
 class RandomCropOpInferShape : public framework::InferShapeBase {
 public:
  void operator()(framework::InferShapeContext* ctx) const override {
-    auto seed_dim = ctx->GetInputDim("Seed");
-    PADDLE_ENFORCE(seed_dim.size() == 1 && seed_dim[0] == 1);
    auto shape = ctx->Attrs().Get<std::vector<int>>("shape");
    auto x_dim = ctx->GetInputDim("X");
    PADDLE_ENFORCE_GT(x_dim.size(), static_cast<int64_t>(shape.size()));
@@ -62,7 +65,6 @@ class RandomCropOpInferShape : public framework::InferShapeBase {
      out_dim[x_i] = shape[shape_i];
    }
    ctx->SetOutputDim("Out", framework::make_ddim(out_dim));
-    ctx->SetOutputDim("SeedOut", framework::make_ddim({1}));
  }
 };

--- a/paddle/fluid/operators/random_crop_op.h
+++ b/paddle/fluid/operators/random_crop_op.h
@@ -142,16 +142,22 @@ template <typename DeviceContext, typename T>
 class RandomCropKernel : public framework::OpKernel<T> {
 public:
  virtual void Compute(const framework::ExecutionContext& ctx) const {
-    auto& seed_tensor = detail::Ref(ctx.Input<framework::LoDTensor>("Seed"));
    int64_t seed = 0;
-    if (platform::is_cpu_place(seed_tensor.place())) {
+    auto& seed_tensor = detail::Ref(ctx.Input<framework::LoDTensor>("Seed"));
-      seed = *seed_tensor.data<int64_t>();
+    if (seed_tensor.IsInitialized()) {
+      if (platform::is_cpu_place(seed_tensor.place())) {
+        seed = *seed_tensor.data<int64_t>();
+      } else {
+        LOG(WARNING) << "It is slow to place seed in GPU memory. Please verify "
+                        "your program";
+        framework::LoDTensor cpu_seed;
+        framework::TensorCopySync(seed_tensor, platform::CPUPlace(), &cpu_seed);
+        seed = *cpu_seed.data<int64_t>();
+      }
    } else {
-      LOG(WARNING) << "It is slow to place seed in GPU memory. Please verify "
+      VLOG(5) << "WARNING: The input 'Seed' is not initialized, use attribute "
-                      "your program";
+                 "'startup_seed' instead.";
-      framework::LoDTensor cpu_seed;
+      seed = ctx.Attr<int>("startup_seed");
-      framework::TensorCopySync(seed_tensor, platform::CPUPlace(), &cpu_seed);
-      seed = *cpu_seed.data<int64_t>();
    }
    auto shape = ctx.Attr<std::vector<int>>("shape");
    auto& x = detail::Ref(ctx.Input<framework::LoDTensor>("X"));
@@ -171,7 +177,7 @@ class RandomCropKernel : public framework::OpKernel<T> {
    engine.discard(functor.prod_batchsize_dims_ *
                   (functor.rank_ - functor.num_batchsize_dims_));
    *ctx.Output<framework::LoDTensor>("SeedOut")->mutable_data<int64_t>(
-        platform::CPUPlace()) = engine();
+        framework::make_ddim({1}), platform::CPUPlace()) = engine();
  }
 };

--- a/paddle/fluid/operators/reader/create_custom_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_custom_reader_op.cc
@@ -39,6 +39,7 @@ class CustomReader : public framework::DecoratedReader {
  const framework::ProgramDesc program_;
  int sub_block_id_;
  framework::Executor exe_;
+  framework::Scope scope_;
  std::vector<std::string> source_var_names_;
  std::vector<std::string> sink_var_names_;
@@ -158,23 +159,24 @@ void CustomReader::ReadNext(std::vector<framework::LoDTensor>* out) {
  // The scope for CustomReader's sub-block should be independent and shouldn't
  // be any other computation scope's child. Otherwise, data preprocessing and
  // compution cannot be concurrent.
-  framework::Scope scope;
+  framework::Scope* exe_scope = &scope_.NewScope();
  // 1. Copy LoDTensors from underlying reader's output to source variables.
  for (size_t i = 0; i < source_var_names_.size(); ++i) {
-    framework::Variable* var = scope.Var(source_var_names_[i]);
+    framework::Variable* var = exe_scope->Var(source_var_names_[i]);
    framework::LoDTensor* tensor = var->GetMutable<framework::LoDTensor>();
    tensor->ShareDataWith(underlying_outs[i]);
    tensor->set_lod(underlying_outs[i].lod());
  }
  // 2. Run the sub-block.
-  exe_.Run(program_, &scope, sub_block_id_, false, true);
+  exe_.Run(program_, exe_scope, sub_block_id_, false, true);
  // 3. Copy LoDTensors from sink variables to out.
  out->resize(sink_var_names_.size());
  for (size_t i = 0; i < sink_var_names_.size(); ++i) {
-    const auto& tensor = detail::Ref(scope.FindVar(sink_var_names_[i]))
+    const auto& tensor = detail::Ref(exe_scope->FindVar(sink_var_names_[i]))
                             .Get<framework::LoDTensor>();
    framework::TensorCopySync(tensor, platform::CPUPlace(), &(*out)[i]);
  }
+  scope_.DeleteScope(exe_scope);
 }
 }  // namespace reader

--- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
@@ -23,13 +23,13 @@ namespace reader {
 // 'Double buffer' means we shall maintain two batches of input data at the same
 // time. So the kCacheSize shoul be at least 2.
-static constexpr size_t kCacheSize = 3;
+static constexpr size_t kCacheSize = 5;
 // There will be two bacthes out of the channel during training:
 // 1. the one waiting to be sent to the channel
 // 2. the one just be received from the channel, which is also being used by
 // subsequent operators.
 // So the channel size should be kChacheSize - 2
-static constexpr size_t kChannelSize = 1;  // kCacheSize - 2
+static constexpr size_t kChannelSize = 3;  // kCacheSize - 2
 class DoubleBufferReader : public framework::DecoratedReader {
 public:

--- a/paddle/fluid/operators/recurrent_op.cc
+++ b/paddle/fluid/operators/recurrent_op.cc
@@ -429,7 +429,8 @@ class RecurrentGradOp : public RecurrentBase {
          auto sum_op = framework::OpRegistry::CreateOp(
              "sum", {{"X", {pg_names[param_id], new_inside_name}}},
-              {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{});
+              {{"Out", {pg_names[param_id]}}},
+              framework::AttributeMap{{"use_mkldnn", {false}}});
          sum_op->Run(cur_scope, place);
          cur_scope.Rename(new_inside_name, inside_grad_name);

--- a/paddle/fluid/operators/send_recv_op_test.cc
+++ b/paddle/fluid/operators/send_recv_op_test.cc
@@ -129,7 +129,10 @@ void StartServerNet(bool is_sparse, std::atomic<bool> *initialized) {
  // sub program run in listen_and_serv_op, for simple test we use sum
  f::ProgramDesc program;
  const auto &root_block = program.Block(0);
+  std::vector<framework::BlockDesc *> optimize_blocks;
  auto *optimize_block = program.AppendBlock(root_block);
+  optimize_blocks.push_back(optimize_block);
  auto *prefetch_block = program.AppendBlock(root_block);
  // X for server side tensors, RX for received tensors, must be of same shape.
  AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, optimize_block,
@@ -139,7 +142,7 @@ void StartServerNet(bool is_sparse, std::atomic<bool> *initialized) {
  attrs.insert({"Fanin", 1});
  attrs.insert({"ParamList", std::vector<std::string>({"Out"})});
  attrs.insert({"GradList", std::vector<std::string>({"x1"})});
-  attrs.insert({"OptimizeBlock", optimize_block});
+  attrs.insert({"optimize_blocks", optimize_blocks});
  attrs.insert({"PrefetchBlock", prefetch_block});
  attrs.insert({"grad_to_block_id", std::vector<std::string>({""})});
  attrs.insert({"sync_mode", true});

--- a/paddle/fluid/operators/softmax_mkldnn_op.cc
+++ b/paddle/fluid/operators/softmax_mkldnn_op.cc
@@ -27,8 +27,81 @@ using paddle::platform::MKLDNNMemDesc;
 using mkldnn::memory;  // Note: paddle has also "memory" namespace
 using mkldnn::primitive;
 using mkldnn::softmax_forward;
+using mkldnn::softmax_backward;
 using mkldnn::prop_kind;
 using mkldnn::stream;
+using platform::to_void_cast;
+class SoftmaxMKLDNNHandler : public platform::MKLDNNHandler {
+ public:
+  SoftmaxMKLDNNHandler(
+      std::shared_ptr<mkldnn::softmax_forward::primitive_desc> softmax_pd,
+      const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
+      const std::string& base_key)
+      : platform::MKLDNNHandler(dev_ctx, engine, base_key),
+        softmax_pd_(softmax_pd) {}
+  SoftmaxMKLDNNHandler(
+      std::shared_ptr<mkldnn::softmax_forward::primitive_desc> softmax_pd,
+      std::shared_ptr<mkldnn::softmax_backward::primitive_desc> softmax_bwd_pd,
+      const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
+      const std::string& base_key)
+      : platform::MKLDNNHandler(dev_ctx, engine, base_key),
+        softmax_pd_(softmax_pd),
+        softmax_bwd_pd_(softmax_bwd_pd) {
+    // If we are in Grad operator then update a key with BWD suffix to
+    // distinguish from FWD memory primitives
+    key_ += "-BWD";
+  }
+  std::shared_ptr<mkldnn::softmax_forward> AcquireSoftmax(
+      std::shared_ptr<mkldnn::memory> dst_memory_p,
+      std::shared_ptr<mkldnn::memory> src_memory_p) {
+    /*Generate key*/
+    auto prim_key = key_ + "@softmax_p";
+    auto softmax_p = std::static_pointer_cast<mkldnn::softmax_forward>(
+        dev_ctx_.GetBlob(prim_key));
+    PADDLE_ENFORCE((softmax_p != nullptr) || (is_reusing_ == false),
+                   "Fail to find softmax primitive in device context");
+    if (softmax_p == nullptr) {
+      softmax_p = std::make_shared<mkldnn::softmax_forward>(
+          *(softmax_pd_.get()),
+          *(static_cast<mkldnn::memory*>(src_memory_p.get())),
+          *(static_cast<mkldnn::memory*>(dst_memory_p.get())));
+      dev_ctx_.SetBlob(prim_key, softmax_p);
+    } else {
+      is_reusing_ = true;
+    }
+    return softmax_p;
+  }
+  std::shared_ptr<mkldnn::softmax_backward> AcquireSoftmaxBackward(
+      std::shared_ptr<mkldnn::memory> dst_memory_p,
+      std::shared_ptr<mkldnn::memory> diff_dst_memory_p,
+      std::shared_ptr<mkldnn::memory> diff_src_memory_p) {
+    auto prim_key = key_ + "@softmax_bwd_p";
+    auto softmax_bwd_p = std::static_pointer_cast<mkldnn::softmax_backward>(
+        dev_ctx_.GetBlob(prim_key));
+    PADDLE_ENFORCE((softmax_bwd_p != nullptr) || (is_reusing_ == false),
+                   "Fail to find softmax backward primitive in device context");
+    if (softmax_bwd_p == nullptr) {
+      softmax_bwd_p = std::make_shared<mkldnn::softmax_backward>(
+          *softmax_bwd_pd_, *(dst_memory_p.get()), *(diff_dst_memory_p.get()),
+          *(diff_src_memory_p.get()));
+      dev_ctx_.SetBlob(prim_key, softmax_bwd_p);
+    } else {
+      is_reusing_ = true;
+    }
+    return softmax_bwd_p;
+  }
+ private:
+  std::shared_ptr<mkldnn::softmax_forward::primitive_desc> softmax_pd_;
+  std::shared_ptr<mkldnn::softmax_backward::primitive_desc> softmax_bwd_pd_;
+};
 template <typename T>
 class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> {
@@ -54,56 +127,27 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> {
    // Same memory descriptor to be used for input and output
    memory::dims softmax_tz = {src_tz[0], src_tz[1]};
    // Generate keys for storing/retriving primitives for this operator
-    // TODO(jczaja): Each MKLDNN operator may have diffrent hashing function
+    const std::string key =
-    auto gethash = [](memory::dims& operand_dims) {
+        platform::MKLDNNHandler::GetHash(softmax_tz, ctx.op().Output("Out"));
-      return std::string(std::to_string(operand_dims[0]) + "-" +
+    const std::string key_softmax_pd = key + "@softmax_pd";
-                         std::to_string(operand_dims[1]));
-    };
+    // Currently only NC data format is supported
-    const std::string key = gethash(softmax_tz);
+    auto softmax_md = MKLDNNMemDesc(
-    const std::string key_softmax_p = key + "@softmax_p";
+        {softmax_tz}, platform::MKLDNNGetDataType<T>(), memory::format::nc);
-    const std::string key_softmax_src_mem_p = key + "@softmax_src_mem_p";
+    // Normalization is made after innermost dimension eg. C out of NC
-    const std::string key_softmax_dst_mem_p = key + "@softmax_dst_mem_p";
+    auto softmax_desc = softmax_forward::desc(prop_kind::forward_scoring,
+                                              softmax_md, 1 /*dim: C*/);
-    std::shared_ptr<void> softmax_p = dev_ctx.GetBlob(key_softmax_p);
+    auto softmax_pd = std::make_shared<mkldnn::softmax_forward::primitive_desc>(
-    if (softmax_p == nullptr) {
+        softmax_desc, mkldnn_engine);
-      // Currently only NC data format is supported
+    dev_ctx.SetBlob(key_softmax_pd, softmax_pd);
-      auto softmax_md =
-          MKLDNNMemDesc({softmax_tz}, memory::f32, memory::format::nc);
+    SoftmaxMKLDNNHandler handler(softmax_pd, dev_ctx, mkldnn_engine, key);
-      // Normalization is made after innermost dimension eg. C out of NC
+    auto softmax_src_memory_p =
-      auto softmax_desc = softmax_forward::desc(prop_kind::forward_scoring,
+        handler.AcquireSrcMemory(softmax_md, to_void_cast<T>(input_data));
-                                                softmax_md, 1 /*dim: C*/);
+    auto softmax_dst_memory_p =
-      // create memory primitives
+        handler.AcquireDstMemory(softmax_md, to_void_cast<T>(output_data));
-      auto softmax_src_memory_p = std::make_shared<memory>(
+    auto softmax_p =
-          memory::primitive_desc{softmax_md, mkldnn_engine},
+        handler.AcquireSoftmax(softmax_dst_memory_p, softmax_src_memory_p);
-          static_cast<void*>(const_cast<T*>(input_data)));
-      dev_ctx.SetBlob(key_softmax_src_mem_p, softmax_src_memory_p);
-      auto softmax_dst_memory_p = std::make_shared<memory>(
-          memory::primitive_desc{softmax_md, mkldnn_engine},
-          static_cast<void*>(output_data));
-      dev_ctx.SetBlob(key_softmax_dst_mem_p, softmax_dst_memory_p);
-      auto softmax_forward_pd =
-          std::make_shared<softmax_forward::primitive_desc>(softmax_desc,
-                                                            mkldnn_engine);
-      softmax_p = std::make_shared<softmax_forward>(
-          *(softmax_forward_pd.get()),
-          *(static_cast<memory*>(softmax_src_memory_p.get())),
-          *(static_cast<memory*>(softmax_dst_memory_p.get())));
-      dev_ctx.SetBlob(key_softmax_p, softmax_p);
-    } else {
-      // Primitives already exist
-      auto src_memory_p = std::static_pointer_cast<memory>(
-          dev_ctx.GetBlob(key_softmax_src_mem_p));
-      PADDLE_ENFORCE(src_memory_p != nullptr,
-                     "Fail to find softmax src mem_p in device context");
-      auto dst_memory_p = std::static_pointer_cast<memory>(
-          dev_ctx.GetBlob(key_softmax_dst_mem_p));
-      PADDLE_ENFORCE(dst_memory_p != nullptr,
-                     "Fail to find softmax dst mem_p in device context");
-      src_memory_p->set_data_handle(
-          reinterpret_cast<void*>(const_cast<T*>(input_data)));
-      dst_memory_p->set_data_handle(output_data);
-    }
    std::vector<primitive> pipeline{
        *(static_cast<softmax_forward::primitive*>(softmax_p.get()))};
@@ -120,6 +164,77 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> {
  }
 };
+template <typename T>
+class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
+                   "It must use CPUPlace.");
+    auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
+    auto mkldnn_engine = dev_ctx.GetEngine();
+    const Tensor* output = ctx.Input<Tensor>("Out");
+    const T* dst_data = output->data<T>();
+    auto* dout = ctx.template Input<Tensor>(framework::GradVarName("Out"));
+    const auto* diff_dst_ptr = dout->template data<T>();
+    auto* dx =
+        ctx.template Output<framework::Tensor>(framework::GradVarName("X"));
+    T* diff_src_ptr = dx->template mutable_data<T>(ctx.GetPlace());
+    std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
+    std::vector<int> src_tz(dst_tz);
+    PADDLE_ENFORCE(output->dims().size() == 2UL,
+                   "The input of softmax op must be a 2D matrix.");
+    // MKL-DNN supports softmax over a selected axis. For a 2D tensor,
+    // normalization is done over the final axis, i.e. axis 1.
+    PADDLE_ENFORCE(((src_tz[0] == dst_tz[0]) && (src_tz[1] == dst_tz[1])),
+                   "Softmax input and output dimensions should match");
+    // Same memory descriptor to be used for input and output
+    memory::dims softmax_tz = {src_tz[0], src_tz[1]};
+    // Currently only supports NC data format
+    // retrieve softmax primitive desc from device context
+    const std::string key =
+        platform::MKLDNNHandler::GetHash(softmax_tz, ctx.op().Input("Out"));
+    const std::string key_softmax_pd = key + "@softmax_pd";
+    auto softmax_pd =
+        std::static_pointer_cast<mkldnn::softmax_forward::primitive_desc>(
+            dev_ctx.GetBlob(key_softmax_pd));
+    PADDLE_ENFORCE(softmax_pd != nullptr,
+                   "Fail to find softmax_pd in device context");
+    // TODO(jczaja): Add layouts support when there is a need to do so
+    // Two dimensional softmax does support NC format
+    auto data_softmax_md = MKLDNNMemDesc(
+        {softmax_tz}, platform::MKLDNNGetDataType<T>(), memory::format::nc);
+    auto diff_softmax_md = MKLDNNMemDesc(
+        {softmax_tz}, platform::MKLDNNGetDataType<T>(), memory::format::nc);
+    // Normalization is made after innermost dimension eg. C out of NC
+    auto softmax_bwd_desc =
+        softmax_backward::desc(diff_softmax_md, data_softmax_md, 1 /* dim: C*/);
+    auto softmax_bwd_pd =
+        std::make_shared<mkldnn::softmax_backward::primitive_desc>(
+            softmax_bwd_desc, mkldnn_engine, *softmax_pd);
+    SoftmaxMKLDNNHandler handler(softmax_pd, softmax_bwd_pd, dev_ctx,
+                                 mkldnn_engine, key);
+    auto dst_memory_p =
+        handler.AcquireDstMemory(data_softmax_md, to_void_cast<T>(dst_data));
+    auto diff_dst_memory_p = handler.AcquireDiffDstMemory(
+        diff_softmax_md, to_void_cast<T>(diff_dst_ptr));
+    auto diff_src_memory_p = handler.AcquireDiffSrcMemory(
+        diff_softmax_md, to_void_cast<T>(diff_src_ptr));
+    // Get primitive from device context
+    auto softmax_bwd_p = handler.AcquireSoftmaxBackward(
+        dst_memory_p, diff_dst_memory_p, diff_src_memory_p);
+    std::vector<primitive> pipeline{*softmax_bwd_p};
+    stream(stream::kind::eager).submit(pipeline).wait();
+  }
+};
 }  // namespace operators
 }  // namespace paddle
@@ -127,3 +242,5 @@ namespace ops = paddle::operators;
 REGISTER_OP_KERNEL(softmax, MKLDNN, ::paddle::platform::CPUPlace,
                   ops::SoftmaxMKLDNNKernel<float>);
+REGISTER_OP_KERNEL(softmax_grad, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::SoftmaxMKLDNNGradKernel<float>);
--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
@@ -145,16 +145,30 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
      const framework::ExecutionContext& ctx) const override {
    // choose cudnn kernel if the runtime supported.
    framework::LibraryType library_{framework::LibraryType::kPlain};
+    std::string data_format = ctx.Attr<std::string>("data_format");
+    framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
 #ifdef PADDLE_WITH_CUDA
    if (platform::CanCUDNNBeUsed(ctx)) {
      library_ = framework::LibraryType::kCUDNN;
    }
 #endif
-    std::string data_format = ctx.Attr<std::string>("data_format");
+#ifdef PADDLE_WITH_MKLDNN
-    return framework::OpKernelType(
+    if (library_ == framework::LibraryType::kPlain &&
-        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
+        platform::CanMKLDNNBeUsed(ctx)) {
-        framework::StringToDataLayout(data_format), library_);
+      library_ = framework::LibraryType::kMKLDNN;
+      layout_ = framework::DataLayout::kMKLDNN;
+    }
+#endif
+    auto input_data_type =
+        framework::ToDataType(ctx.Input<Tensor>("X")->type());
+    if (input_data_type == framework::proto::VarType::FP16) {
+      PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                     "float16 can only be used on GPU place");
+    }
+    return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_,
+                                   library_);
  }
 };

--- a/paddle/fluid/operators/sum_mkldnn_op.cc
+++ b/paddle/fluid/operators/sum_mkldnn_op.cc
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+/*Licensed under the Apache License, Version 2.0(the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+      http://www.apache.org/licenses/LICENSE-2.0
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License. */
+#include "mkldnn.hpp"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
+#include "paddle/fluid/operators/sum_op.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
+namespace paddle {
+namespace operators {
+using paddle::framework::Tensor;
+using paddle::platform::MKLDNNDeviceContext;
+using paddle::platform::CPUDeviceContext;
+using framework::DataLayout;
+using mkldnn::memory;
+using mkldnn::primitive;
+using mkldnn::stream;
+using mkldnn::sum;
+using mkldnn::reorder;
+using platform::to_void_cast;
+// MKLDNN kernel of the `sum` operator.
+//
+// Compute() dispatches on the runtime type of the "Out" variable:
+//   * LoDTensor      - summed with the mkldnn::sum primitive;
+//   * SelectedRows   - rows are appended with math::SelectedRowsAddTo on the
+//                      CPU device context (no MKLDNN primitive is used yet);
+//   * LoDTensorArray - summed element-wise with Eigen (no MKLDNN primitive is
+//                      used yet).
+// Any other output type raises through PADDLE_THROW.
+//
+// The MKLDNN memory descriptors below are hard-coded to f32; the kernel is
+// only registered for T = float.
+template <typename T>
+class SumMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
+                   "It must use CPUPlace.");
+    auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
+    const auto& mkldnn_engine = dev_ctx.GetEngine();
+    auto in_vars = ctx.MultiInputVar("X");
+    const int N = in_vars.size();
+    auto out_var = ctx.OutputVar("Out");
+    // True when the output variable is the very same variable as X[0].
+    bool in_place = out_var == in_vars[0];
+    if (out_var->IsType<framework::LoDTensor>()) {
+      LoDTensor* output = ctx.Output<LoDTensor>("Out");
+      T* output_data = output->mutable_data<T>(ctx.GetPlace());
+      std::vector<int> dst_tz = framework::vectorize2int(output->dims());
+      // Every source is described with the output dimensions.
+      auto src_tz = dst_tz;
+      memory::format output_format{memory::format::format_undef};
+      std::vector<float> scales;
+      std::vector<memory::primitive_desc> srcs_mpd;
+      std::vector<mkldnn::memory> srcs_mem;
+      PADDLE_ENFORCE(in_vars[0]->IsType<LoDTensor>(),
+                     "Input[0] must be LoDTensors");
+      auto& input0 = in_vars[0]->Get<LoDTensor>();
+      PADDLE_ENFORCE(input0.layout() == DataLayout::kMKLDNN &&
+                         input0.format() != memory::format::format_undef,
+                     "Wrong layout/format for inputs[0]");
+      // The format of X[0] is used for all sources. A 4-D format tag cannot
+      // describe a 1-D or 2-D tensor, so it is mapped to x / nc.
+      memory::format input_format = input0.format();
+      if (src_tz.size() == 1 && (input_format == memory::format::nchw ||
+                                 input_format == memory::format::nhwc)) {
+        input_format = memory::format::x;
+      }
+      if (src_tz.size() == 2 && (input_format == memory::format::nchw ||
+                                 input_format == memory::format::nhwc)) {
+        input_format = memory::format::nc;
+      }
+      // Always start from X[0], also when running in place: the primitive
+      // writes into a separate destination buffer (see below), so X[0] has to
+      // be one of the sources, otherwise its contribution would be lost when
+      // the result is copied back over it.
+      for (int i = 0; i < N; i++) {
+        PADDLE_ENFORCE(in_vars[i]->IsType<LoDTensor>(),
+                       "all inputs must be all LoDTensors");
+        auto& input = in_vars[i]->Get<LoDTensor>();
+        PADDLE_ENFORCE(input.layout() == DataLayout::kMKLDNN &&
+                           input.format() != memory::format::format_undef,
+                       "Wrong layout/format for inputs");
+        // Empty inputs do not take part in the sum.
+        if (input.numel() == 0) {
+          continue;
+        }
+        const T* input_data = input.data<T>();
+        auto src_md =
+            memory::desc(src_tz, memory::data_type::f32, input_format);
+        auto src_mpd = memory::primitive_desc(src_md, mkldnn_engine);
+        auto src_mem = memory(src_mpd, to_void_cast(input_data));
+        srcs_mpd.push_back(src_mpd);
+        srcs_mem.push_back(src_mem);
+        // Plain (unweighted) sum.
+        scales.push_back(1.0);
+      }
+      // format::any lets the primitive pick the destination format.
+      auto dst_md =
+          memory::desc(dst_tz, memory::data_type::f32, memory::format::any);
+      auto sum_pd = sum::primitive_desc(dst_md, scales, srcs_mpd);
+      std::shared_ptr<memory> dst_mem;
+      if (in_place) {
+        // Out aliases X[0]: sum into a buffer that is not output_data and
+        // reorder the result into output_data afterwards.
+        dst_mem.reset(new memory(sum_pd.dst_primitive_desc()));
+      } else {
+        dst_mem.reset(new memory(sum_pd.dst_primitive_desc(), output_data));
+      }
+      std::vector<mkldnn::primitive::at> inputs;
+      for (size_t i = 0; i < srcs_mem.size(); ++i) {
+        inputs.push_back(srcs_mem[i]);
+      }
+      auto sum_prim = mkldnn::sum(sum_pd, inputs, *dst_mem);
+      output_format = (memory::format)platform::GetMKLDNNFormat(sum_pd);
+      primitive reorder_prim;
+      std::shared_ptr<memory> target_mem;
+      if (in_place) {
+        // The in-place result keeps the input format.
+        output_format = input_format;
+        target_mem.reset(new memory(
+            {{{src_tz}, memory::data_type::f32, output_format}, mkldnn_engine},
+            output_data));
+        reorder_prim = reorder(*dst_mem, *target_mem);
+      }
+      std::vector<primitive> pipeline;
+      pipeline.push_back(sum_prim);
+      if (in_place) pipeline.push_back(reorder_prim);
+      stream(stream::kind::eager).submit(pipeline).wait();
+      output->set_layout(DataLayout::kMKLDNN);
+      output->set_format(output_format);
+    } else if (out_var->IsType<framework::SelectedRows>()) {
+      // TODO(@mozga-intel) Add MKLDNN SelectedRows support
+      std::unique_ptr<framework::SelectedRows> in0;
+      if (in_place) {
+        // In place: keep a view of input[0] in in0 (rows copied, value
+        // sharing its data) because the output is cleared and resized below.
+        auto& in_sel0 = in_vars[0]->Get<SelectedRows>();
+        auto& rows = in_sel0.rows();
+        in0.reset(new framework::SelectedRows(rows, in_sel0.height()));
+        in0->mutable_value()->ShareDataWith(in_sel0.value());
+      }
+      // Returns the i-th input, substituting the saved copy for input[0].
+      auto get_selected_row = [&](size_t i) -> const SelectedRows& {
+        if (i == 0 && in0) {
+          return *in0.get();
+        } else {
+          return in_vars[i]->Get<SelectedRows>();
+        }
+      };
+      auto* out = ctx.Output<SelectedRows>("Out");
+      out->mutable_rows()->clear();
+      auto* out_value = out->mutable_value();
+      // Runtime InferShape: the output holds the rows of all inputs.
+      size_t first_dim = 0;
+      for (int i = 0; i < N; i++) {
+        auto& sel_row = get_selected_row(i);
+        first_dim += sel_row.rows().size();
+      }
+      auto in_dim =
+          framework::vectorize(get_selected_row(N - 1).value().dims());
+      in_dim[0] = static_cast<int64_t>(first_dim);
+      out_value->Resize(framework::make_ddim(in_dim));
+      // if all the input sparse vars are empty, no need to
+      // merge these vars.
+      if (first_dim == 0UL) {
+        return;
+      }
+      out_value->mutable_data<T>(ctx.GetPlace());
+      math::SelectedRowsAddTo<CPUDeviceContext, T> functor;
+      // Running element offset into the output value tensor.
+      int64_t offset = 0;
+      for (int i = 0; i < N; i++) {
+        auto& sel_row = get_selected_row(i);
+        if (sel_row.rows().size() == 0) {
+          continue;
+        }
+        PADDLE_ENFORCE_EQ(out->height(), sel_row.height());
+        functor(ctx.template device_context<CPUDeviceContext>(), sel_row,
+                offset, out);
+        offset += sel_row.value().numel();
+      }
+    } else if (out_var->IsType<framework::LoDTensorArray>()) {
+      // TODO(@mozga-intel) Add MKLDNN LoDTensorArray support
+      auto& out_array = *out_var->GetMutable<framework::LoDTensorArray>();
+      // In place, out_array already holds input[0] and the remaining inputs
+      // are accumulated into it, so input[0] is skipped here.
+      for (size_t i = in_place ? 1 : 0; i < in_vars.size(); ++i) {
+        PADDLE_ENFORCE(in_vars[i]->IsType<framework::LoDTensorArray>(),
+                       "Only support all inputs are TensorArray");
+        auto& in_array = in_vars[i]->Get<framework::LoDTensorArray>();
+        for (size_t j = 0; j < in_array.size(); ++j) {
+          if (in_array[j].numel() != 0) {
+            if (j >= out_array.size()) {
+              out_array.resize(j + 1);
+            }
+            if (out_array[j].numel() == 0) {
+              // First non-empty tensor for this slot: copy it over.
+              framework::TensorCopy(in_array[j], in_array[j].place(),
+                                    ctx.device_context(), &out_array[j]);
+              out_array[j].set_lod(in_array[j].lod());
+            } else {
+              // Slot already populated: accumulate element-wise.
+              PADDLE_ENFORCE(out_array[j].lod() == in_array[j].lod());
+              auto in = EigenVector<T>::Flatten(in_array[j]);
+              auto result = EigenVector<T>::Flatten(out_array[j]);
+              result.device(*ctx.template device_context<MKLDNNDeviceContext>()
+                                 .eigen_device()) = result + in;
+            }
+          }
+        }
+      }
+    } else {
+      PADDLE_THROW("Unexpected branch, output variable type is %s",
+                   out_var->Type().name());
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+REGISTER_OP_KERNEL(sum, MKLDNN, ::paddle::platform::CPUPlace,
+                   paddle::operators::SumMKLDNNOpKernel<float>);
--- a/paddle/fluid/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
@@ -18,6 +18,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/var_type_inference.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
 namespace paddle {
 namespace operators {
 using framework::Tensor;
@@ -63,6 +67,18 @@ class SumOp : public framework::OperatorWithKernel {
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    auto x_vars = ctx.MultiInputVar("X");
+    framework::LibraryType library{framework::LibraryType::kPlain};
+    framework::DataLayout layout{framework::DataLayout::kAnyLayout};
+#ifdef PADDLE_WITH_MKLDNN
+    if (library == framework::LibraryType::kPlain &&
+        platform::CanMKLDNNBeUsed(ctx)) {
+      library = framework::LibraryType::kMKLDNN;
+      layout = framework::DataLayout::kMKLDNN;
+    }
+#endif
    if (x_vars[0]->IsType<framework::LoDTensor>()) {
      int dtype = -1;
      for (auto& x_var : x_vars) {
@@ -80,26 +96,27 @@ class SumOp : public framework::OperatorWithKernel {
                        "Sum operator should have at least one tensor");
      return framework::OpKernelType(
-          static_cast<framework::proto::VarType::Type>(dtype),
+          static_cast<framework::proto::VarType::Type>(dtype), ctx.GetPlace(),
-          ctx.device_context());
+          layout, library);
    } else if (x_vars[0]->IsType<framework::SelectedRows>()) {
      for (auto& var : x_vars) {
        auto& value = var->Get<framework::SelectedRows>().value();
        if (value.IsInitialized()) {
          return framework::OpKernelType(framework::ToDataType(value.type()),
-                                         ctx.device_context());
+                                         ctx.device_context(), layout, library);
        }
      }
      // if input sparse vars are not initialized, use an default kernel type.
      return framework::OpKernelType(framework::proto::VarType::FP32,
-                                     ctx.device_context());
+                                     ctx.device_context(), layout, library);
    } else if (x_vars[0]->IsType<framework::LoDTensorArray>()) {
      for (auto& x_var : x_vars) {
        auto& array = x_var->Get<framework::LoDTensorArray>();
        for (auto& each : array) {
          if (each.numel() != 0) {
            return framework::OpKernelType(framework::ToDataType(each.type()),
-                                           ctx.device_context());
+                                           ctx.device_context(), layout,
+                                           library);
          }
        }
      }
@@ -116,6 +133,9 @@ class SumOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("X", "(vector<Tensor>) The input tensors of sum operator.")
        .AsDuplicable();
    AddOutput("Out", "(Tensor) The output tensor of sum operator.").Reuse("X");
+    AddAttr<bool>("use_mkldnn",
+                  "(bool, default false) Only used in mkldnn kernel")
+        .SetDefault(false);
    AddComment(R"DOC(
 Sum operator.
@@ -132,7 +152,6 @@ class SumOpVarTypeInference : public framework::VarTypeInference {
                  framework::BlockDesc* block) const override {
    auto& inputs = op_desc.Input("X");
    auto var_type = framework::proto::VarType::SELECTED_ROWS;
    for (auto& name : op_desc.Input("X")) {
      VLOG(10) << name << " "
               << block->FindRecursiveOrCreateVar(name).GetType();
@@ -206,6 +225,7 @@ namespace ops = paddle::operators;
 REGISTER_OPERATOR(sum, ops::SumOp, ops::SumOpMaker, ops::SumGradMaker,
                  ops::SumOpVarTypeInference);
 REGISTER_OP_CPU_KERNEL(
    sum, ops::SumKernel<paddle::platform::CPUDeviceContext, float>,
    ops::SumKernel<paddle::platform::CPUDeviceContext, double>,

--- a/paddle/fluid/operators/while_op.cc
+++ b/paddle/fluid/operators/while_op.cc
@@ -203,11 +203,11 @@ class WhileGradOp : public framework::OperatorBase {
                ->set_lod(inside_tensor.lod());
          }
        }
        auto new_inside_name = cur_scope.Rename(inside_grad_name);
        auto sum_op = framework::OpRegistry::CreateOp(
            "sum", {{"X", {pg_names[param_id], new_inside_name}}},
-            {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{});
+            {{"Out", {pg_names[param_id]}}},
+            framework::AttributeMap{{"use_mkldnn", {false}}});
        sum_op->Run(cur_scope, dev_place);
        cur_scope.Rename(new_inside_name, inside_grad_name);
      }

--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -106,14 +106,6 @@ class CUDADeviceContext : public DeviceContext {
    PADDLE_ENFORCE(cudaEventRecord(ev, stream_));
  }
-  // FIXME(zcd): A temporary fix for some language model that has sparse
-  // parameter.
-  template <typename Callback>
-  void RecordEventNoMutex(cudaEvent_t ev, Callback callback) {
-    callback();
-    PADDLE_ENFORCE(cudaEventRecord(ev, stream_));
-  }
 private:
  CUDAPlace place_;

--- a/paddle/fluid/platform/dynload/CMakeLists.txt
+++ b/paddle/fluid/platform/dynload/CMakeLists.txt
 cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)
-list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc nccl.cc)
+list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc)
+# There is no macOS version of NCCL.
+if (NOT APPLE)
+  list(APPEND CUDA_SRCS nccl.cc)
+endif()
 if (TENSORRT_FOUND)
  list(APPEND CUDA_SRCS tensorrt.cc)
 endif()
 configure_file(cupti_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/cupti_lib_path.h)
 if (CUPTI_FOUND)
    list(APPEND CUDA_SRCS cupti.cc)

--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -44,8 +44,10 @@ limitations under the License. */
 #include "paddle/fluid/platform/dynload/cublas.h"
 #include "paddle/fluid/platform/dynload/cudnn.h"
 #include "paddle/fluid/platform/dynload/curand.h"
+#ifndef __APPLE__
 #include "paddle/fluid/platform/dynload/nccl.h"
-#endif
+#endif  // __APPLE__
+#endif  // PADDLE_WITH_CUDA
 namespace paddle {
 namespace platform {
@@ -174,6 +176,7 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
  throw std::runtime_error(err + string::Sprintf(args...));
 }
+#ifndef __APPLE__
 template <typename... Args>
 inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
    ncclResult_t stat, const Args&... args) {
@@ -184,7 +187,7 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
                             string::Sprintf(args...));
  }
 }
+#endif  // __APPLE__
 #endif  // PADDLE_WITH_CUDA
 template <typename T>

--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
@@ -99,5 +99,143 @@ inline mkldnn::memory::format GetMKLDNNFormat(const mkldnn::memory memory) {
      memory.get_primitive_desc().desc().data.format);
 }
+// Returns the memory format of the destination described by the given sum
+// primitive descriptor.
+inline mkldnn::memory::format GetMKLDNNFormat(
+    const mkldnn::sum::primitive_desc& sum_pd) {
+  const auto dst_desc = sum_pd.dst_primitive_desc().desc();
+  return static_cast<mkldnn::memory::format>(dst_desc.data.format);
+}
+// Helper that creates MKLDNN memory objects (and the reorders needed to feed
+// them) once and stores them as blobs in the MKLDNNDeviceContext, so that
+// later calls with the same key reuse them instead of creating new ones.
+//
+// Every blob is stored under `base_key + suffix`. The first lookup that hits
+// the cache sets is_reusing_; from then on every lookup made through this
+// handler must hit as well (all objects of an operator instance are expected
+// to be reused or none of them), otherwise PADDLE_ENFORCE fails.
+class MKLDNNHandler {
+ public:
+  MKLDNNHandler(const MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
+                const std::string& base_key)
+      : dev_ctx_(dev_ctx),
+        engine_(engine),
+        key_(base_key),
+        is_reusing_(false) {}
+  // The Acquire*Memory(md, ptr) helpers below differ only in the key suffix
+  // under which the memory object is cached.
+  std::shared_ptr<mkldnn::memory> AcquireSrcMemory(
+      const mkldnn::memory::desc& md, void* ptr) {
+    return this->AcquireMemory(md, ptr, "@user_src_mem_p");
+  }
+  std::shared_ptr<mkldnn::memory> AcquireWeightsMemory(
+      const mkldnn::memory::desc& md, void* ptr) {
+    return this->AcquireMemory(md, ptr, "@user_weights_mem_p");
+  }
+  std::shared_ptr<mkldnn::memory> AcquireDstMemory(
+      const mkldnn::memory::desc& md, void* ptr) {
+    return this->AcquireMemory(md, ptr, "@user_dst_mem_p");
+  }
+  std::shared_ptr<mkldnn::memory> AcquireDiffDstMemory(
+      const mkldnn::memory::desc& md, void* ptr) {
+    return this->AcquireMemory(md, ptr, "@user_diff_dst_mem_p");
+  }
+  std::shared_ptr<mkldnn::memory> AcquireDiffSrcMemory(
+      const mkldnn::memory::desc& md, void* ptr) {
+    return this->AcquireMemory(md, ptr, "@user_diff_src_mem_p");
+  }
+  // Same as AcquireMemory(md, ptr, suffix), but the memory object is built
+  // from an already existing memory primitive descriptor.
+  std::shared_ptr<mkldnn::memory> AcquireMemoryFromPrimitive(
+      mkldnn::memory::primitive_desc mdp, void* ptr,
+      const std::string& suffix) {
+    auto local_key = key_ + suffix;
+    auto mem_p =
+        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
+    PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false),
+                   "Fail to find mem primitive in device context");
+    if (mem_p == nullptr) {
+      mem_p = std::make_shared<mkldnn::memory>(mdp, ptr);
+      dev_ctx_.SetBlob(local_key, mem_p);
+    } else {
+      // Cached object: only the data pointer has to be refreshed.
+      mem_p->set_data_handle(ptr);
+      // Mark that reusing happened. All primitives from operator instance
+      // should be reused or none of them. So we check consistency
+      is_reusing_ = true;
+    }
+    return mem_p;
+  }
+  // Returns the memory object cached under key_ + suffix with its data handle
+  // set to ptr. If nothing is cached yet, a memory object described by md on
+  // engine_ is created around ptr and stored in the device context.
+  std::shared_ptr<mkldnn::memory> AcquireMemory(const mkldnn::memory::desc& md,
+                                                void* ptr,
+                                                const std::string& suffix) {
+    /*Generate key*/
+    auto local_key = key_ + suffix;
+    auto mem_p =
+        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
+    PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false),
+                   "Fail to find mem primitive in device context");
+    if (mem_p == nullptr) {
+      mem_p = std::make_shared<mkldnn::memory>(
+          mkldnn::memory::primitive_desc{md, engine_}, ptr);
+      dev_ctx_.SetBlob(local_key, mem_p);
+    } else {
+      // Cached object: only the data pointer has to be refreshed.
+      mem_p->set_data_handle(ptr);
+      // Mark that reusing happened. All primitives from operator instance
+      // should be reused or none of them. So we check consistency
+      is_reusing_ = true;
+    }
+    return mem_p;
+  }
+  // Returns the memory a primitive should consume for the user memory
+  // user_memory_p. When the primitive's descriptor (mpd) differs from the
+  // user's one (user_mpd), a new memory object and a reorder from the user
+  // memory into it are created, cached, and the reorder is appended to
+  // pipeline; otherwise the user memory itself is cached and returned. On
+  // reuse, the cached memory is returned and the cached reorder, if any, is
+  // appended to pipeline again.
+  std::shared_ptr<mkldnn::memory> AcquireMemory(
+      mkldnn::memory::primitive_desc& mpd,
+      mkldnn::memory::primitive_desc& user_mpd,
+      const std::shared_ptr<mkldnn::memory> user_memory_p,
+      const std::string& suffix, std::vector<mkldnn::primitive>& pipeline) {
+    // create reorder primitive if the input format is not the preferred one
+    auto local_key = key_ + suffix;
+    auto key_reorder_p = key_ + suffix + "reorder_p";
+    auto target_memory_p =
+        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
+    PADDLE_ENFORCE((target_memory_p != nullptr) || (is_reusing_ == false),
+                   "Fail to find mem primitive in device context");
+    if (target_memory_p == nullptr) {
+      target_memory_p = user_memory_p;
+      if (mpd != user_mpd) {
+        target_memory_p = std::make_shared<mkldnn::memory>(mpd);
+        auto reorder_p =
+            std::make_shared<mkldnn::reorder>(*user_memory_p, *target_memory_p);
+        dev_ctx_.SetBlob(key_reorder_p, reorder_p);
+        pipeline.push_back(*reorder_p);
+      }
+      dev_ctx_.SetBlob(local_key, target_memory_p);
+    } else {
+      // Make reorder if needed
+      auto reorder_p = std::static_pointer_cast<mkldnn::reorder>(
+          dev_ctx_.GetBlob(key_reorder_p));
+      if (reorder_p != nullptr) {
+        pipeline.push_back(*reorder_p);
+      }
+      is_reusing_ = true;
+    }
+    return target_memory_p;
+  }
+  // Builds a key of the form "<d0>-<d1>-...-<suffix>" from the given
+  // dimensions, e.g. dims {2, 3} and suffix "x" give "2-3-x".
+  static std::string GetHash(mkldnn::memory::dims& operand_dims,
+                             const std::string& suffix) {
+    auto dims2str = [](const mkldnn::memory::dims& operand_dims) {
+      std::string dstr = "";
+      for (size_t i = 0; i < operand_dims.size(); ++i) {
+        dstr += std::to_string(operand_dims[i]) + "-";
+      }
+      return dstr;
+    };
+    return dims2str(operand_dims) + suffix;
+  }
+ protected:
+  const MKLDNNDeviceContext& dev_ctx_;  // owner of the cached blobs
+  mkldnn::engine engine_;
+  std::string key_;  // base key; a suffix is appended per cached object
+  bool is_reusing_;  // set once any lookup has been served from the cache
+};
 }  // namespace platform
 }  // namespace paddle
--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
@@ -268,7 +268,8 @@ void BindOpDesc(pybind11::module *m) {
      .value("STRINGS", pd::proto::AttrType::STRINGS)
      .value("BOOL", pd::proto::AttrType::BOOLEAN)
      .value("BOOLS", pd::proto::AttrType::BOOLEANS)
-      .value("BLOCK", pd::proto::AttrType::BLOCK);
+      .value("BLOCK", pd::proto::AttrType::BLOCK)
+      .value("BLOCKS", pd::proto::AttrType::BLOCKS);
  pybind11::class_<pd::OpDesc> op_desc(*m, "OpDesc", "");
  op_desc
@@ -293,6 +294,7 @@ void BindOpDesc(pybind11::module *m) {
      .def("set_attr", &pd::OpDesc::SetAttr)
      .def("attr", &pd::OpDesc::GetAttr)
      .def("set_block_attr", &pd::OpDesc::SetBlockAttr)
+      .def("set_blocks_attr", &pd::OpDesc::SetBlocksAttr)
      .def("set_serialized_attr",
           [](pd::OpDesc &self, const std::string &name,
              const pybind11::bytes &seriralized) {

--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -167,9 +167,6 @@ PYBIND11_PLUGIN(core) {
      .def("set_lod",
           [](LoDTensor &self, const std::vector<std::vector<size_t>> &lod) {
             // the input lod is offset-based level-of-detail info
-             LOG(WARNING)
-                 << "set_lod is deprecated and will be removed by 9.2018, "
-                    "please switch to set_recursive_sequence_lengths.";
             LoD new_lod;
             new_lod.reserve(lod.size());
             std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
@@ -196,8 +193,6 @@ PYBIND11_PLUGIN(core) {
      .def("lod",
           [](LoDTensor &self) -> std::vector<std::vector<size_t>> {
             // output the offset-based lod info
-             LOG(WARNING) << "lod is deprecated and will be removed by 9.2018, "
-                             "please switch to recursive_sequence_lengths.";
             LoD lod = self.lod();
             std::vector<std::vector<size_t>> new_lod;
             new_lod.reserve(lod.size());

--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -22,7 +22,7 @@
 function print_usage() {
    echo -e "\n${RED}Usage${NONE}:
    ${BOLD}${SCRIPT_NAME}${NONE} [OPTION]"
    echo -e "\n${RED}Options${NONE}:
    ${BLUE}build${NONE}: run build for x86 platform
    ${BLUE}build_android${NONE}: run build for android platform
@@ -133,7 +133,7 @@ EOF
        -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \
        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
        -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \
-        -DWITH_ANAKIN=ON
+        -DWITH_ANAKIN=${WITH_ANAKIN:-ON}
 }
 function abort(){
@@ -198,7 +198,7 @@ function build_android() {
    fi
    ANDROID_STANDALONE_TOOLCHAIN=$ANDROID_TOOLCHAINS_DIR/$ANDROID_ARCH-android-$ANDROID_API
    cat <<EOF
    ============================================
    Generating the standalone toolchain ...
@@ -212,13 +212,13 @@ EOF
          --arch=$ANDROID_ARCH \
          --platform=android-$ANDROID_API \
          --install-dir=$ANDROID_STANDALONE_TOOLCHAIN
    BUILD_ROOT=${PADDLE_ROOT}/build_android
    DEST_ROOT=${PADDLE_ROOT}/install_android
    mkdir -p $BUILD_ROOT
    cd $BUILD_ROOT
    if [ $ANDROID_ABI == "armeabi-v7a" ]; then
      cmake -DCMAKE_SYSTEM_NAME=Android \
            -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
@@ -286,7 +286,7 @@ function build_ios() {
          -DWITH_TESTING=OFF \
          -DWITH_SWIG_PY=OFF \
          -DCMAKE_BUILD_TYPE=Release
    make -j 2
 }
@@ -331,14 +331,14 @@ EOF
 function bind_test() {
    # the number of process to run tests
    NUM_PROC=6
    # calculate and set the memory usage for each process
    MEM_USAGE=$(printf "%.2f" `echo "scale=5; 1.0 / $NUM_PROC" | bc`)
    export FLAGS_fraction_of_gpu_memory_to_use=$MEM_USAGE
    # get the CUDA device count
    CUDA_DEVICE_COUNT=$(nvidia-smi -L | wc -l)
    for (( i = 0; i < $NUM_PROC; i++ )); do
        cuda_list=()
        for (( j = 0; j < $CUDA_DEVICE_COUNT; j++ )); do

--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -132,9 +132,9 @@ def _addup_repetitive_outputs_(op_descs):
    for idx, op_desc in enumerate(op_descs):
        for var_name in op_desc.input_arg_names():
            if len(renamed_vars[var_name]) > 1:
-                pending_sum_ops.append(
+                pending_sum_ops.append((_create_op_desc_(
-                    (_create_op_desc_("sum", {"X": renamed_vars[var_name]},
+                    "sum", {"X": renamed_vars[var_name]}, {"Out": [var_name]},
-                                      {"Out": [var_name]}, {}), idx))
+                    {"use_mkldnn": False}), idx))
                renamed_vars[var_name] = [var_name]
        for var_name in op_desc.output_arg_names():
            if var_name == core.empty_var_name(
@@ -161,8 +161,9 @@ def _addup_repetitive_outputs_(op_descs):
                renamed_vars[var_name].append(new_name)
    for var_name, inputs in renamed_vars.iteritems():
        if len(inputs) > 1:
-            pending_sum_ops.append((_create_op_desc_(
+            pending_sum_ops.append(
-                "sum", {"X": inputs}, {"Out": [var_name]}, {}), len(op_descs)))
+                (_create_op_desc_("sum", {"X": inputs}, {"Out": [var_name]},
+                                  {"use_mkldnn": False}), len(op_descs)))
    # sum_op descs are sorted according to their insert position
    for p in reversed(pending_sum_ops):
        op_descs.insert(p[1], p[0])

--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -78,6 +78,8 @@ def as_numpy(tensor):
    Returns:
        numpy.ndarray
    """
+    if isinstance(tensor, core.LoDTensorArray):
+        return [as_numpy(t) for t in tensor]
    if isinstance(tensor, list):
        return [as_numpy(t) for t in tensor]
    assert isinstance(tensor, core.LoDTensor)

--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -559,15 +559,9 @@ class Operator(object):
                if (attr_name not in self.attrs) or (
                        self.attrs[attr_name] is None):
                    continue
-                if isinstance(self.attrs[attr_name], Block):
+                attr_val = self.attrs[attr_name]
-                    self.desc.set_block_attr(attr_name,
+                self._update_desc_attr(attr_name, attr_val)
-                                             self.attrs[attr_name].desc)
-                elif isinstance(self.attrs[attr_name], core.BlockDesc) or \
-                        isinstance(self.attrs[attr_name], core.ProgramDesc):
-                    self.desc.set_serialized_attr(
-                        attr_name, self.attrs[attr_name].serialize_to_string())
-                else:
-                    self.desc.set_attr(attr_name, self.attrs[attr_name])
        self.desc.check_attrs()
        if self.has_kernel(type):
            self.desc.infer_var_type(self.block.desc)
@@ -714,8 +708,24 @@ class Operator(object):
            ValueError: If the type of value doesn't match with desc.attr_type(name).
        """
        self.attrs[name] = val
+        self._update_desc_attr(name, val)
+    def _update_desc_attr(self, name, val):
+        """
+        Update the value of desc's attribute by attribute's name.
+        Args:
+            name(str): the attribute name.
+            val(bool|int|str|float|list): the value of the attribute.
+        Raises:
+            ValueError: If the type of value doesn't match with desc.attr_type(name).
+        """
        if isinstance(val, Block):
            self.desc.set_block_attr(name, val.desc)
+        elif isinstance(val, list) and val and all(
+                isinstance(v, Block) for v in val):
+            self.desc.set_blocks_attr(name, [v.desc for v in val])
        elif isinstance(val, core.BlockDesc) or \
                isinstance(val, core.ProgramDesc):
            self.desc.set_serialized_attr(name, val.serialize_to_string())
@@ -1388,7 +1398,11 @@ class Program(object):
        * Set for_test to True when we want to clone the program for testing.
        Notes: This API DOES NOT prune any operator. Use
-        :code:`clone(for_test=True)` before backward and optimization please.
+        :code:`clone(for_test=True)` before backward and optimization please. e.g.
+            >>> test_program = fluid.default_main_program().clone(for_test=True)
+            >>> optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
+            >>> optimizer.minimize()
        Args:
            for_test(bool): True if change the :code:`is_test` attribute of

--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -110,7 +110,7 @@ class BlockGuardServ(BlockGuard):
 class ListenAndServ(object):
    """
    **ListenAndServ Layer**
    ListenAndServ is used to create a rpc server bind and listen
    on specific TCP port, this server will run the sub-block when
    received variables from clients.
@@ -186,7 +186,6 @@ class ListenAndServ(object):
        main_program = self.helper.main_program
        current_block = main_program.current_block()
        parent_block = self.parent_block()
-        empty_block = Program().global_block()
        parent_block.append_op(
            type='listen_and_serv',
@@ -195,8 +194,9 @@ class ListenAndServ(object):
            attrs={
                'endpoint': self.endpoint,
                'Fanin': self.fan_in,
-                'OptimizeBlock': current_block,
+                'optimize_blocks': [
-                'PrefetchBlock': empty_block,
+                    current_block
+                ],  # did not support multiple optimize blocks in layers
                'sync_mode': True,  # did not support async now in layers
                'grad_to_block_id': [""]
            })
@@ -212,7 +212,7 @@ def Send(endpoints, send_vars, sync=True):
                   of send_vars to send
        send_vars (list): variables to send to server
        sync (bool): whether to wait the request finish
    """
    assert (type(send_vars) == list)
@@ -469,10 +469,13 @@ def open_files(filenames,
       lod_levels(list): List of ints which declaring data lod_level.
       dtypes(list): List of strs which declaring data type.
       thread_num(int): The maximal concurrent prefetch thread number.
-       buffer_size(int): The size of prefetch buffer.
+       buffer_size(int|None): The size of prefetch buffer. If it is set to None,
+            buffer size will be thread_num * 3.
+            Default: None
       pass_num(int): Number of passes to run.
       for_parallel(Bool): Set it as True if you are going to run 
            subsequent operators in parallel.
+            Default: True
    Returns:
       Variable: A Reader Variable via which we can get file data.
@@ -492,7 +495,7 @@ def open_files(filenames,
         image, label = fluid.layers.io.read_file(reader)
    """
    if buffer_size is None:
-        buffer_size = thread_num
+        buffer_size = thread_num * 3
    if isinstance(filenames, basestring):
        filenames = [filenames]
    dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-All layers just related to the neural network. 
+All layers just related to the neural network.
 """
 from ..layer_helper import LayerHelper
@@ -23,6 +23,7 @@ from layer_function_generator import autodoc, templatedoc
 from tensor import concat
 import utils
 import random
+from .. import unique_name
 __all__ = [
    'fc',
@@ -109,14 +110,14 @@ def fc(input,
    """
    **Fully Connected Layer**
-    This function creates a fully connected layer in the network. It can take 
+    This function creates a fully connected layer in the network. It can take
-    multiple tensors as its inputs. It creates a variable called weights for 
+    multiple tensors as its inputs. It creates a variable called weights for
-    each input tensor, which represents a fully connected weight matrix from 
+    each input tensor, which represents a fully connected weight matrix from
-    each input unit to each output unit. The fully connected layer multiplies 
+    each input unit to each output unit. The fully connected layer multiplies
-    each input tensor with its coresponding weight to produce an output Tensor. 
+    each input tensor with its coresponding weight to produce an output Tensor.
-    If multiple input tensors are given, the results of multiple multiplications 
+    If multiple input tensors are given, the results of multiple multiplications
-    will be sumed up. If bias_attr is not None, a bias variable will be created 
+    will be sumed up. If bias_attr is not None, a bias variable will be created
-    and added to the output. Finally, if activation is not None, it will be applied 
+    and added to the output. Finally, if activation is not None, it will be applied
    to the output as well.
    This process can be formulated as follows:
@@ -198,7 +199,10 @@ def fc(input,
    else:
        pre_bias = helper.create_tmp_variable(dtype)
        helper.append_op(
-            type="sum", inputs={"X": mul_results}, outputs={"Out": pre_bias})
+            type="sum",
+            inputs={"X": mul_results},
+            outputs={"Out": pre_bias},
+            attrs={"use_mkldnn": use_mkldnn})
    # add bias
    pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)
    # add activation
@@ -847,7 +851,7 @@ def crf_decoding(input, param_attr, label=None):
    Returns:
        Variable: ${viterbi_path_comment}
    Examples:
        .. code-block:: python
@@ -1085,7 +1089,7 @@ def chunk_eval(input,
    Here is a NER example of labeling for these tagging schemes:
    .. code-block:: python
       ====== ====== ======  =====  ==  ============   =====  ===== =====  ==  =========
              Li     Ming    works  at  Agricultural   Bank   of    China  in  Beijing.
       ====== ====== ======  =====  ==  ============   =====  ===== =====  ==  =========
@@ -1111,7 +1115,7 @@ def chunk_eval(input,
    is the num of chunk types, and `tag_type` get its value from the following table.
    .. code-block:: python
       Scheme Begin Inside End   Single
        plain   0     -      -     -
        IOB     0     1      -     -
@@ -1147,7 +1151,7 @@ def chunk_eval(input,
        tuple: tuple containing: precision, recall, f1_score,
        num_infer_chunks, num_label_chunks,
        num_correct_chunks
    Examples:
        .. code-block:: python
@@ -1247,7 +1251,7 @@ def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=True):
    """
    This function computes the softmax activation among all time-steps for each
    sequence. The dimension of each time-step should be 1. Thus, the shape of
-    input Tensor can be either :math:`[N, 1]` or :math:`[N]`, where :math:`N` 
+    input Tensor can be either :math:`[N, 1]` or :math:`[N]`, where :math:`N`
    is the sum of the length of all sequences.
    For i-th sequence in a mini-batch:
@@ -1267,7 +1271,7 @@ def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=True):
        param_attr (ParamAttr|None): attributes for parameter
        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn \
        library is installed. Default: True
    Returns:
        Variable: output of sequence_softmax
@@ -1828,11 +1832,11 @@ def pool2d(input,
    ${comment}
    Args:
-        input (Variable): The input tensor of pooling operator. The format of 
+        input (Variable): The input tensor of pooling operator. The format of
-                          input tensor is NCHW, where N is batch size, C is 
+                          input tensor is NCHW, where N is batch size, C is
-                          the number of channels, H is the height of the 
+                          the number of channels, H is the height of the
                          feature, and W is the width of the feature.
-        pool_size (int): The side length of pooling windows. All pooling 
+        pool_size (int): The side length of pooling windows. All pooling
                         windows are squares with pool_size on a side.
        pool_type: ${pooling_type_comment}
        pool_stride (int): stride of the pooling layer.
@@ -1841,7 +1845,7 @@ def pool2d(input,
        use_cudnn: ${use_cudnn_comment}
        ceil_mode: ${ceil_mode_comment}
        use_mkldnn: ${use_mkldnn_comment}
-        name (str|None): A name for this layer(optional). If set None, the 
+        name (str|None): A name for this layer(optional). If set None, the
                        layer will be named automatically.
    Returns:
@@ -1859,10 +1863,10 @@ def pool2d(input,
          data = fluid.layers.data(
              name='data', shape=[3, 32, 32], dtype='float32')
          conv2d = fluid.layers.pool2d(
-                            input=data, 
+                            input=data,
-                            pool_size=2, 
+                            pool_size=2,
-                            pool_type='max', 
+                            pool_type='max',
-                            pool_stride=1, 
+                            pool_stride=1,
                            global_pooling=False)
    """
    if pool_type not in ["max", "avg"]:
@@ -2227,14 +2231,14 @@ def beam_search_decode(ids, scores, name=None):
    This layers is to pack the output of beam search layer into sentences and
    associated scores. It is usually called after the beam search layer.
    Typically, the output of beam search layer is a tensor of selected ids, with
-    a tensor of the score of each id. Beam search layer's output ids, however, 
+    a tensor of the score of each id. Beam search layer's output ids, however,
-    are generated directly during the tree search, and they are stacked by each 
+    are generated directly during the tree search, and they are stacked by each
-    level of the search tree. Thus we need to reorganize them into sentences, 
+    level of the search tree. Thus we need to reorganize them into sentences,
    based on the score of each id. This layer takes the output of beam search
    layer as input and repack them into sentences.
    Args:
-        ids (Variable): The selected ids, output of beam search layer. 
+        ids (Variable): The selected ids, output of beam search layer.
        scores (Variable): The associated scores of the ids, out put of beam
            search layer.
        name (str): The name of this layer. It is optional.
@@ -2242,7 +2246,7 @@ def beam_search_decode(ids, scores, name=None):
    Returns:
        tuple(Variable): a tuple of two output tensors: sentence_ids, sentence_scores.
        sentence_ids is a tensor with shape [size, length], where size is the
-        beam size of beam search, and length is the length of each sentence. 
+        beam size of beam search, and length is the length of each sentence.
        Note that the length of sentences may vary.
        sentence_scores is a tensor with the same shape as sentence_ids.
@@ -2919,7 +2923,7 @@ def reduce_mean(input, dim=None, keep_dim=False, name=None):
            `None`, compute the mean over all elements of :attr:`input`
            and return a variable with a single element, otherwise it
            must be in the range :math:`[-rank(input), rank(input))`. If
-            :math:`dim[i] < 0`, the dimension to reduce is 
+            :math:`dim[i] < 0`, the dimension to reduce is
            :math:`rank(input) + dim[i]`.
        keep_dim (bool): Whether to reserve the reduced dimension in the
            output Tensor. The result tensor will have one fewer dimension
@@ -3390,16 +3394,16 @@ def topk(input, k, name=None):
    Args:
        input(Variable): The input variable which can be a vector or Tensor with
            higher rank.
-        k(int):  The number of top elements to look for along the last dimension 
+        k(int):  The number of top elements to look for along the last dimension
                 of input.
        name(str|None): A name for this layer(optional). If set None, the layer
-                       will be named automatically. 
+                       will be named automatically.
                       Default: None
    Returns:
-        Tuple[Variable]: A tuple with two elements. Each element is a Variable. 
+        Tuple[Variable]: A tuple with two elements. Each element is a Variable.
-        The first one is k largest elements along each last 
+        The first one is k largest elements along each last
-        dimensional slice. The second one is indices of values 
+        dimensional slice. The second one is indices of values
        within the last dimension of input.
    Raises:
@@ -3594,15 +3598,15 @@ def warpctc(input, label, blank=0, norm_by_times=False):
         It's shape is [Lp, num_classes + 1], where Lp is the sum of all input
         sequences' length and num_classes is the true number of classes.
         (not including the blank label).
-       label (Variable): The ground truth of variable-length sequence, 
+       label (Variable): The ground truth of variable-length sequence,
         which is a 2-D Tensor with LoD information. It is of the shape [Lg, 1],
         where Lg is th sum of all labels' length.
       blank (int, default 0): The blank label index of Connectionist
         Temporal Classification (CTC) loss, which is in the
         half-opened interval [0, num_classes + 1).
-       norm_by_times(bool, default false): Whether to normalize the gradients 
+       norm_by_times(bool, default false): Whether to normalize the gradients
-         by the number of time-step, which is also the sequence's length. 
+         by the number of time-step, which is also the sequence's length.
-         There is no need to normalize the gradients if warpctc layer was 
+         There is no need to normalize the gradients if warpctc layer was
         follewed by a mean_op.
    Returns:
@@ -3708,8 +3712,8 @@ def nce(input,
        input (Variable): input variable.
        label (Variable): label.
        num_total_classes (int):${num_total_classes_comment}
-        sample_weight (Variable|None): A Variable of shape [batch_size, 1] 
+        sample_weight (Variable|None): A Variable of shape [batch_size, 1]
-            storing a weight for each sample. The default weight for each 
+            storing a weight for each sample. The default weight for each
            sample is 1.0.
        param_attr (ParamAttr|None): attributes for parameter
        bias_attr (ParamAttr|None): attributes for bias
@@ -4099,7 +4103,7 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
    This layer computes the smooth L1 loss for Variable :attr:`x` and :attr:`y`.
    It takes the first dimension of :attr:`x` and :attr:`y` as batch size.
    For each instance, it computes the smooth L1 loss element by element first
-    and then sums all the losses. So the shape of ouput Variable is 
+    and then sums all the losses. So the shape of ouput Variable is
    [batch_size, 1].
    Args:
@@ -4108,14 +4112,14 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
        y (Variable): A tensor with rank at least 2. The target value of smooth
            L1 loss op with same shape as :attr:`x`.
        inside_weight (Variable|None):  A tensor with rank at least 2. This
-            input is optional and should have same shape with :attr:`x`. If 
+            input is optional and should have same shape with :attr:`x`. If
-            provided, the result of (:attr:`x` - :attr:`y`) will be multiplied 
+            provided, the result of (:attr:`x` - :attr:`y`) will be multiplied
            by this tensor element by element.
        outside_weight (Variable|None): A tensor with rank at least 2. This
-            input is optional and should have same shape with :attr:`x`. If 
+            input is optional and should have same shape with :attr:`x`. If
-            provided, the out smooth L1 loss will be multiplied by this tensor 
+            provided, the out smooth L1 loss will be multiplied by this tensor
            element by element.
-        sigma (float|None): Hyper parameter of smooth L1 loss layer. A float 
+        sigma (float|None): Hyper parameter of smooth L1 loss layer. A float
           scalar with default value 1.0.
    Returns:
@@ -4161,7 +4165,7 @@ def one_hot(input, depth):
    Examples:
        .. code-block:: python
            label = layers.data(name="label", shape=[1], dtype="float32")
            one_hot_label = layers.one_hot(input=label, depth=10)
    """
@@ -4263,14 +4267,18 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
                                say :attr:`actual_shape` has a higher priority
                                than :attr:`shape`.
        act (str): The non-linear activation to be applied to output variable.
-        inplace(bool): If this flag is set true, a new output tensor is created
+        inplace(bool): If this flag is set true, the output
-                       whose data is copied from input x, otherwise the output
+                       shares data with input without copying, otherwise
-                       shares data with input without copying.
+                       a new output tensor is created
+                       whose data is copied from input x.
        name (str): The name of this layer. It is optional.
    Returns:
        Variable: The output tensor.
+    Raises:
+        TypeError: if actual_shape is neither Variable nor None.
    Examples:
        .. code-block:: python
@@ -4282,6 +4290,11 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
    if not (isinstance(shape, list) or isinstance(shape, tuple)):
        raise ValueError("Input shape must be a python lsit or tuple.")
+    inputs = {"X": x}
+    if isinstance(actual_shape, Variable):
+        inputs["Shape"] = actual_shape
+    elif actual_shape is not None:
+        raise TypeError("actual_shape should either be Variable or None")
    # Validate the shape
    unk_dim_idx = -1
@@ -4302,9 +4315,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
    reshaped = helper.create_tmp_variable(dtype=x.dtype)
    helper.append_op(
        type="reshape",
-        inputs={"X": x,
+        inputs=inputs,
-                "Shape": actual_shape}
-        if isinstance(actual_shape, Variable) else {"X": x},
        attrs={"shape": shape,
               "inplace": inplace},
        outputs={"Out": reshaped})
@@ -4315,10 +4326,10 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
 def lod_reset(x, y=None, target_lod=None):
    """
    Set LoD of :attr:`x` to a new one specified by :attr:`y` or
-    :attr:`target_lod`. When :attr:`y` provided, :attr:`y.lod` would be 
+    :attr:`target_lod`. When :attr:`y` provided, :attr:`y.lod` would be
-    considered as target LoD first, otherwise :attr:`y.data` would be 
+    considered as target LoD first, otherwise :attr:`y.data` would be
-    considered as target LoD. If :attr:`y` is not provided, target LoD should 
+    considered as target LoD. If :attr:`y` is not provided, target LoD should
-    be specified by :attr:`target_lod`. If target LoD is specified by 
+    be specified by :attr:`target_lod`. If target LoD is specified by
    :attr:`Y.data` or :attr:`target_lod`, only one level LoD is supported.
    .. code-block:: text
@@ -4372,7 +4383,7 @@ def lod_reset(x, y=None, target_lod=None):
    Args:
        x (Variable): Input variable which could be a Tensor or LodTensor.
-        y (Variable|None): If provided, output's LoD would be derived 
+        y (Variable|None): If provided, output's LoD would be derived
                           from :attr:`y`.
        target_lod (list|tuple|None): One level LoD which should be considered
                                      as target LoD when :attr:`y` not provided.
@@ -4688,7 +4699,7 @@ def image_resize(input,
    """
    **Resize a Batch of Images**
-    The input must be a tensor of the shape (num_batches, channels, in_h, in_w), 
+    The input must be a tensor of the shape (num_batches, channels, in_h, in_w),
    and the resizing only applies on the last two dimensions(hight and width).
    Supporting resample methods:
@@ -4784,9 +4795,9 @@ def resize_bilinear(input, out_shape=None, scale=None, name=None):
 def image_resize_short(input, out_short_len, resample='BILINEAR'):
    """
-    Resize a batch of images. The short edge of input images will be 
+    Resize a batch of images. The short edge of input images will be
-    resized to the given 'out_short_len'. The long edge of input images 
+    resized to the given 'out_short_len'. The long edge of input images
-    will be resized proportionately to make images' length-width ratio 
+    will be resized proportionately to make images' length-width ratio
    constant.
    Args:
@@ -4819,7 +4830,7 @@ def gather(input, index):
    """
    **Gather Layer**
-    Output is obtained by gathering entries of the outer-most dimension 
+    Output is obtained by gathering entries of the outer-most dimension
    of X indexed by `index` and concatenate them together.
    .. math::
@@ -4844,7 +4855,7 @@ def gather(input, index):
                       [5, 6]]
    Args:
-        input (Variable): The source input with rank>=1. 
+        input (Variable): The source input with rank>=1.
        index (Variable): The index input with rank=1.
    Returns:
@@ -4880,40 +4891,32 @@ def random_crop(x, shape, seed=None):
    Returns:
        ${out_comment}
    Examples:
        >>> img = fluid.layers.data("img", [3, 256, 256])
        >>> cropped_img = fluid.layers.random_crop(img, shape=[3, 224, 224])
    """
    helper = LayerHelper("random_crop", **locals())
-    dtype = helper.input_dtype()
+    dtype = x.dtype
    out = helper.create_tmp_variable(dtype)
    if seed is None:
        seed = random.randint(-65536, 65535)
+    op_attrs = {"shape": shape}
    if isinstance(seed, int):
-        seed_value = seed
+        op_attrs["startup_seed"] = seed
-        seed = helper.create_tmp_variable(dtype="int64")
+        seed = helper.create_variable(
-        helper.append_op(
+            name=unique_name.generate("random_crop_seed"),
-            type="fill_constant",
+            dtype="int64",
-            inputs={},
+            persistable=True)
-            outputs={"Out": seed},
-            attrs={
-                "dtype": seed.dtype,
-                "shape": [1],
-                "value": float(seed_value),
-                "force_cpu": True
-            })
    elif not isinstance(seed, Variable):
        raise ValueError("'seed' must be a Variable or an int.")
-    seed_out = helper.create_tmp_variable(dtype="int64")
    helper.append_op(
        type="random_crop",
        inputs={"X": x,
                "Seed": seed},
        outputs={"Out": out,
-                 "SeedOut": seed_out},
+                 "SeedOut": seed},
-        attrs={"shape": shape})
+        attrs=op_attrs)
    return out
@@ -4926,7 +4929,7 @@ def log(x):
        Out = \\ln(x)
    Args:
-        x (Variable): Input tensor. 
+        x (Variable): Input tensor.
    Returns:
        Variable: The natural log of the input tensor computed element-wise.
@@ -4955,7 +4958,7 @@ def relu(x):
        Out = \\max(0, x)
    Args:
-        x (Variable): The input tensor. 
+        x (Variable): The input tensor.
    Returns:
        Variable: The output tensor with the same shape as input.
@@ -4976,15 +4979,15 @@ def relu(x):
 def mean_iou(input, label, num_classes):
    """
    Mean Intersection-Over-Union is a common evaluation metric for
-    semantic image segmentation, which first computes the IOU for each 
+    semantic image segmentation, which first computes the IOU for each
-    semantic class and then computes the average over classes. 
+    semantic class and then computes the average over classes.
-    IOU is defined as follows: 
+    IOU is defined as follows:
    .. math::
        IOU = \\frac{true\_positiv}{(true\_positive + false\_positive + false\_negative)}.
-    The predictions are accumulated in a confusion matrix and mean-IOU 
+    The predictions are accumulated in a confusion matrix and mean-IOU
    is then calculated from it.
@@ -4997,12 +5000,12 @@ def mean_iou(input, label, num_classes):
    Returns:
        mean_iou (Variable): A Tensor representing the mean intersection-over-union with shape [1].
        out_wrong(Variable): A Tensor with shape [num_classes]. The wrong numbers of each class.
-        out_correct(Variable): A Tensor with shape [num_classes]. The correct numbers of each class. 
+        out_correct(Variable): A Tensor with shape [num_classes]. The correct numbers of each class.
    Examples:
        .. code-block:: python
            iou, wrongs, corrects = fluid.layers.mean_iou(predict, label, num_classes)
    """
    helper = LayerHelper('mean_iou', **locals())

--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -155,7 +155,7 @@ def cast(x, dtype):
    Examples:
        .. code-block:: python
            data = fluid.layers.data(name='x', shape=[13], dtype='float32')
            result = fluid.layers.cast(x=data, dtype='float64')
    """
@@ -188,7 +188,7 @@ def concat(input, axis=0, name=None):
    Examples:
        .. code-block:: python
           out = fluid.layers.concat(input=[Efirst, Esecond, Ethird, Efourth])
    """
    helper = LayerHelper('concat', **locals())
@@ -230,11 +230,15 @@ def sums(input, out=None):
    helper = LayerHelper('sum', **locals())
    if out is None:
        out = helper.create_tmp_variable(dtype=helper.input_dtype())
-    helper.append_op(type='sum', inputs={'X': input}, outputs={'Out': out})
+    helper.append_op(
+        type='sum',
+        inputs={'X': input},
+        outputs={'Out': out},
+        attrs={'use_mkldnn': False})
    return out
-def assign(input, output):
+def assign(input, output=None):
    """
    **Assign**
@@ -242,7 +246,7 @@ def assign(input, output):
    Args:
        input(Variable|numpy.ndarray): The source variable
-        output(Variable): The destination variable
+        output(Variable|None): The destination variable
    Returns:
        Variable: The destination variable that was supplied as the *output*.
@@ -255,6 +259,8 @@ def assign(input, output):
          fluid.layers.assign(hidden, out)
    """
    helper = LayerHelper('assign', **locals())
+    if output is None:
+        output = helper.create_tmp_variable(dtype=input.dtype)
    if isinstance(input, Variable):
        helper.append_op(
            type='assign', inputs={'X': [input]}, outputs={'Out': [output]})
@@ -380,7 +386,7 @@ def argmin(x, axis=0):
    """
    **argmin**
-    This function computes the indices of the min elements 
+    This function computes the indices of the min elements
    of the input tensor's element along the provided axis.
    Args:
@@ -395,7 +401,7 @@ def argmin(x, axis=0):
        .. code-block:: python
          out = fluid.layers.argmin(x=in, axis=0)
-          out = fluid.layers.argmin(x=in, axis=-1)  
+          out = fluid.layers.argmin(x=in, axis=-1)
    """
    helper = LayerHelper("arg_min", **locals())
    out = helper.create_tmp_variable(VarDesc.VarType.INT64)
@@ -411,7 +417,7 @@ def argmax(x, axis=0):
    """
    **argmax**
-    This function computes the indices of the max elements 
+    This function computes the indices of the max elements
    of the input tensor's element along the provided axis.
    Args:
@@ -426,7 +432,7 @@ def argmax(x, axis=0):
        .. code-block:: python
          out = fluid.layers.argmax(x=in, axis=0)
-          out = fluid.layers.argmax(x=in, axis=-1)  
+          out = fluid.layers.argmax(x=in, axis=-1)
    """
    helper = LayerHelper("arg_max", **locals())
    out = helper.create_tmp_variable(VarDesc.VarType.INT64)
@@ -495,9 +501,9 @@ def reverse(x, axis):
    Args:
        x(Vairbale): the input to be reversed.
-        axis(int|tuple|list): Axis that along which order of elements 
+        axis(int|tuple|list): Axis that along which order of elements
-                    is reversed. If it is a tuple or a list, reversing 
+                    is reversed. If it is a tuple or a list, reversing
-                    will be apply on each axis in the tuple or list.  
+                    will be apply on each axis in the tuple or list.
    Returns:
        Variable: The reversed tensor.
@@ -528,9 +534,9 @@ def save(x, file_path, overwrite=True):
    Args:
        x(variable): The Tensor/LoDTensor to be saved.
        file_path(str): The file path where the variable will be saved.
-        overwrite(bool): Whether or not cover the given file when it has already 
+        overwrite(bool): Whether or not cover the given file when it has already
-            existed. If it's set 'False' and the file is existed, a runtime 
+            existed. If it's set 'False' and the file is existed, a runtime
-            error will be thrown. 
+            error will be thrown.
    """
    helper = LayerHelper("save", **locals())
    helper.append_op(
@@ -550,8 +556,8 @@ def save_combine(x, file_path, overwrite=True):
                 a single file.
        file_path(str): The file path where variables will be saved.
        overwrite(bool): Whether or not cover the given file when it has already
-            existed. If it's set 'False' and the file is existed, a runtime 
+            existed. If it's set 'False' and the file is existed, a runtime
-            error will be thrown. 
+            error will be thrown.
    Returns:
        There is no return value.

--- a/python/paddle/fluid/metrics.py
+++ b/python/paddle/fluid/metrics.py
@@ -596,12 +596,12 @@ class Auc(MetricBase):
            tp, fn, tn, fp = 0, 0, 0, 0
            for i, lbl in enumerate(labels):
                if lbl:
-                    if predictions[i, 1] >= thresh:
+                    if preds[i, 1] >= thresh:
                        tp += 1
                    else:
                        fn += 1
                else:
-                    if predictions[i, 1] >= thresh:
+                    if preds[i, 1] >= thresh:
                        fp += 1
                    else:
                        tn += 1

--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -160,7 +160,7 @@ class ParallelExecutor(object):
            build_strategy, num_trainers, trainer_id)
        self.scope = scope
-    def run(self, fetch_list, feed=None, feed_dict=None):
+    def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=False):
        """
        Run a parallel executor with fetch_list.
@@ -196,6 +196,8 @@ class ParallelExecutor(object):
                to each device. Default None.
            feed_dict: Alias for feed parameter, for backward compatibility.
                This parameter has been deprecated. Default None.
+            return_numpy(bool): Whether to convert the fetched tensor to numpy.
+                Default: False.
        Returns:
            List: The fetched result list.
@@ -270,6 +272,9 @@ class ParallelExecutor(object):
        if self.is_dist:
            self.bcast_params()
+        if return_numpy:
+            return executor.as_numpy(arr)
        return [arr[i] for i in range(len(arr))]
    def bcast_params(self):

--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -15,7 +15,7 @@ if(NOT WITH_DISTRIBUTE)
 endif(NOT WITH_DISTRIBUTE)
 list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290
-list(REMOVE_ITEM TEST_OPS test_modified_huber_loss_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5184 
+list(REMOVE_ITEM TEST_OPS test_modified_huber_loss_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5184
 list(REMOVE_ITEM TEST_OPS test_lstm_unit_op) # # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5185
 list(REMOVE_ITEM TEST_OPS test_nce) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/7778
 list(REMOVE_ITEM TEST_OPS test_recurrent_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/6152
@@ -43,8 +43,6 @@ list(REMOVE_ITEM TEST_OPS test_warpctc_op)
 list(REMOVE_ITEM TEST_OPS test_dist_train)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed)
-# TODO(wuyi): this test hungs on CI, will add it back later
-list(REMOVE_ITEM TEST_OPS test_listen_and_serv_op)
 foreach(TEST_OP ${TEST_OPS})
    py_test_modules(${TEST_OP} MODULES ${TEST_OP})
 endforeach(TEST_OP)
@@ -52,3 +50,4 @@ py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=$
 py_test_modules(test_dist_train MODULES test_dist_train SERIAL)
 py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
 py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
+set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20)
--- a/python/paddle/fluid/tests/unittests/test_elementwise_add_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_mkldnn_op.py
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+from op_test import OpTest
+from test_elementwise_add_op import *
+'''
+Some tests differ from the tests defined in test_elementwise_add_op.py
+because MKLDNN does not support 3-dimensional tensors.
+Such tensors cause exceptions in the MKLDNN reorder primitive.
+'''
+class TestMKLDNNElementwiseAddOp(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype)
+        self.y = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype)
+        self.out = np.add(self.x, self.y)
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+class TestMKLDNNElementwiseAddOp_scalar(TestElementwiseAddOp_scalar):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
+        self.y = np.random.rand(1).astype(self.dtype)
+        self.out = self.x + self.y
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+class TestMKLDNNElementwiseAddOp_scalar2(TestElementwiseAddOp_scalar2):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
+        self.y = np.random.rand(1, 1).astype(self.dtype)
+        self.out = self.x + self.y
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+class TestMKLDNNElementwiseAddOp_Vector(TestElementwiseAddOp_Vector):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+class TesMKLDNNtElementwiseAddOp_broadcast_0(TestElementwiseAddOp_broadcast_0):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
+        self.y = np.random.rand(2).astype(self.dtype)
+        self.out = self.x + self.y.reshape(2, 1, 1, 1)
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+class TestMKLDNNElementwiseAddOp_broadcast_1(TestElementwiseAddOp_broadcast_1):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
+        self.y = np.random.rand(3).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 3, 1, 1)
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+class TestMKLDNNElementwiseAddOp_broadcast_2(TestElementwiseAddOp_broadcast_2):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(4).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 1, 1, 4)
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+class TestMKLDNNElementwiseAddOp_broadcast_3(TestElementwiseAddOp_broadcast_3):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+class TestMKLDNNElementwiseAddOp_broadcast_4(TestElementwiseAddOp_broadcast_4):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+class TestMKLDNNElementwiseAddOp_rowwise_add_0(
+        TestElementwiseAddOp_rowwise_add_0):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
+        self.y = np.random.rand(3, 4).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 3, 4, 1)
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+class TestMKLDNNElementwiseAddOp_rowwise_add_1(
+        TestElementwiseAddOp_rowwise_add_1):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+class TestMKLDNNElementwiseAddOp_channelwise_add(
+        TestElementwiseAddOp_channelwise_add):
+    def init_input_output(self):
+        self.x = np.random.rand(3, 5, 20, 20).astype(self.dtype)
+        self.y = np.random.rand(3, 1, 1, 1).astype(self.dtype)
+        self.out = self.x + self.y
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
@@ -18,19 +18,23 @@ from op_test import OpTest
 class TestElementwiseAddOp(OpTest):
+    def init_kernel_type(self):
+        self.use_mkldnn = False
    def setUp(self):
        self.op_type = "elementwise_add"
        self.dtype = np.float32
        self.axis = -1
        self.init_dtype()
        self.init_input_output()
+        self.init_kernel_type()
        self.init_axis()
        self.inputs = {
            'X': OpTest.np_dtype_to_fluid_dtype(self.x),
            'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
        }
-        self.attrs = {'axis': self.axis}
+        self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn}
        self.outputs = {'Out': self.out}
    def test_check_output(self):

--- a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
@@ -94,7 +94,7 @@ class TestListenAndServOp(OpTest):
        self._wait_ps_ready(p1.pid)
-        # raise SIGTERM to pserver
+        # raise SIGINT to pserver
-        os.kill(p1.pid, signal.SIGKILL)
+        os.kill(p1.pid, signal.SIGINT)
        p1.join()
        # run pserver on CPU in async mode
@@ -102,7 +102,7 @@ class TestListenAndServOp(OpTest):
        self._wait_ps_ready(p2.pid)
        # raise SIGTERM to pserver
-        os.kill(p2.pid, signal.SIGKILL)
+        os.kill(p2.pid, signal.SIGTERM)
        p2.join()

--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
@@ -75,7 +75,9 @@ class TestFetchOp(unittest.TestCase):
                    fetch_list.append(k)
            for data in train_inputs:
-                ret = pe.run(fetch_list, feed=feeder.feed(data))
+                ret = pe.run(fetch_list,
+                             feed=feeder.feed(data),
+                             return_numpy=True)
                for i in range(len(fetch_list)):
                    assert not math.isnan(np.sum(ret[i])) and \
                           not math.isinf(np.sum(ret[i]))

--- a/python/paddle/fluid/tests/unittests/test_sum_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sum_mkldnn_op.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+from test_sum_op import TestSumOp
+class TestMKLDNN(TestSumOp):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_sum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sum_op.py
@@ -20,12 +20,15 @@ from op_test import OpTest
 class TestSumOp(OpTest):
    def setUp(self):
        self.op_type = "sum"
+        self.use_mkldnn = False
+        self.init_kernel_type()
        x0 = np.random.random((3, 4)).astype('float32')
        x1 = np.random.random((3, 4)).astype('float32')
        x2 = np.random.random((3, 4)).astype('float32')
        self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]}
        y = x0 + x1 + x2
        self.outputs = {'Out': y}
+        self.attrs = {'use_mkldnn': self.use_mkldnn}
    def test_check_output(self):
        self.check_output()
@@ -33,6 +36,9 @@ class TestSumOp(OpTest):
    def test_check_grad(self):
        self.check_grad(['x0'], 'Out')
+    def init_kernel_type(self):
+        pass
 if __name__ == "__main__":
    unittest.main()
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -396,7 +396,7 @@ class DistributeTranspiler(object):
                    return varname
            return ""
-        def __clone_lr_op_sub_block__(op, program, new_block):
+        def __clone_lr_op_sub_block__(op, program, lr_block):
            if not op.has_attr('sub_block'):
                return
@@ -405,36 +405,41 @@ class DistributeTranspiler(object):
            assert isinstance(origin_block, Block)
            # we put the new sub block to new block to follow the block
            # hierarchy of the original blocks
-            new_sub_block = program.create_block(new_block.idx)
+            new_sub_block = program.create_block(lr_block.idx)
            # clone vars
            for var in origin_block.vars:
                new_sub_block.clone_variable(var)
            # clone ops
-            for op in origin_block.ops:
+            for origin_op in origin_block.ops:
-                self._clone_lr_op(program, new_sub_block, op)
+                cloned_op = self._clone_lr_op(program, new_sub_block, origin_op)
                # clone sub_block of op
-                __clone_lr_op_sub_block__(op, program, new_sub_block)
+                __clone_lr_op_sub_block__(cloned_op, program, new_sub_block)
            # reset the block of op
            op.set_attr('sub_block', new_sub_block)
        # append lr decay ops to the child block if exists
        lr_ops = self._get_lr_ops()
+        # record optimize blocks so the pserver can run them in parallel
+        optimize_blocks = []
        if len(lr_ops) > 0:
            lr_decay_block = pserver_program.create_block(
                pserver_program.num_blocks - 1)
+            optimize_blocks.append(lr_decay_block)
            for _, op in enumerate(lr_ops):
-                self._append_pserver_non_opt_ops(lr_decay_block, op)
+                cloned_op = self._append_pserver_non_opt_ops(lr_decay_block, op)
                # append sub blocks to pserver_program in lr_decay_op
-                __clone_lr_op_sub_block__(op, pserver_program, lr_decay_block)
+                __clone_lr_op_sub_block__(cloned_op, pserver_program,
+                                          lr_decay_block)
        # append op to the current block
        grad_to_block_id = []
        pre_block_idx = pserver_program.num_blocks - 1
        for idx, opt_op in enumerate(opt_op_on_pserver):
            per_opt_block = pserver_program.create_block(pre_block_idx)
+            optimize_blocks.append(per_opt_block)
            # append grad merging ops before clip and weight decay
            for _, op in enumerate(self.optimize_ops):
                # find the origin @GRAD var before clipping
@@ -453,6 +458,7 @@ class DistributeTranspiler(object):
        if global_ops:
            opt_state_block = pserver_program.create_block(
                pserver_program.num_blocks - 1)
+            optimize_blocks.append(opt_state_block)
            for glb_op in global_ops:
                __append_optimize_op__(glb_op, opt_state_block,
                                       grad_to_block_id, None)
@@ -474,11 +480,11 @@ class DistributeTranspiler(object):
            assert len(prefetch_var_name_to_block_id) == 0
        attrs = {
-            "OptimizeBlock": pserver_program.block(1),
+            "optimize_blocks": optimize_blocks,
            "endpoint": endpoint,
            "Fanin": self.trainer_num,
            "sync_mode": self.sync_mode,
-            "grad_to_block_id": grad_to_block_id
+            "grad_to_block_id": grad_to_block_id,
        }
        if len(prefetch_var_name_to_block_id) > 0:
            attrs['prefetch_var_name_to_block_id'] \
@@ -872,7 +878,8 @@ class DistributeTranspiler(object):
            table_opt_block.append_op(
                type="sum",
                inputs={"X": pserver_side_table_grad_list},
-                outputs={"Out": [grad_var]})
+                outputs={"Out": [grad_var]},
+                attrs={"use_mkldnn": False})
        else:
            # in async_mode, for table gradient, it also need to be splited to each parameter server
            origin_grad_name = grad_var.name
@@ -1104,7 +1111,8 @@ class DistributeTranspiler(object):
            optimize_block.append_op(
                type="sum",
                inputs={"X": vars2merge},
-                outputs={"Out": merged_var})
+                outputs={"Out": merged_var},
+                attrs={"use_mkldnn": False})
            # TODO(panyx0718): What if it's SELECTED_ROWS.
            if not merged_var.type == core.VarDesc.VarType.SELECTED_ROWS:
                optimize_block.append_op(
@@ -1209,7 +1217,7 @@ class DistributeTranspiler(object):
                if var not in program.global_block().vars:
                    block.clone_variable(var)
-        block.append_op(
+        return block.append_op(
            type=op.type, inputs=inputs, outputs=outputs, attrs=op.attrs)
    def _append_pserver_non_opt_ops(self, optimize_block, opt_op):
@@ -1247,7 +1255,7 @@ class DistributeTranspiler(object):
                elif not program.global_block().vars.has_key(var.name):
                    program.global_block().clone_variable(var)
-        optimize_block.append_op(
+        return optimize_block.append_op(
            type=opt_op.type,
            inputs=inputs,
            outputs=outputs,
@@ -1291,16 +1299,6 @@ class DistributeTranspiler(object):
                    ufind.union(op1, op2)
        return ufind
-    def _is_opt_role_op(self, op):
-        # NOTE: depend on oprole to find out whether this op is for
-        # optimize
-        op_maker = core.op_proto_and_checker_maker
-        optimize_role = core.op_proto_and_checker_maker.OpRole.Optimize
-        if op_maker.kOpRoleAttrName() in op.attrs and \
-            int(op.attrs[op_maker.kOpRoleAttrName()]) == int(optimize_role):
-            return True
-        return False
    def _is_optimizer_op(self, op):
        if "Param" in op.input_names and \
            "LearningRate" in op.input_names:
@@ -1391,7 +1389,10 @@ class DistributeTranspiler(object):
        params_grads = []
        origin_var_dict = self.origin_program.global_block().vars
        for op in block.ops:
-            if self._is_opt_role_op(op):
+            # NOTE(Yancey1989): we cannot use the op role to tell whether an op
+            # is an optimizer op, because all ops in the optimizer sub-graph
+            # are assigned the optimizer op role
+            if self._is_optimizer_op(op):
                opt_ops.append(op)
                # HACK(wuyi): if we find grad vars from input of optimize
                # ops, we may get the output of clip op. Use syntax "@GRAD"

--- a/python/paddle/reader/decorator.py
+++ b/python/paddle/reader/decorator.py
@@ -336,7 +336,7 @@ def _buf2lines(buf, line_break="\n"):
 class PipeReader:
    """
-        PipeReader read data by stream from a command, take it's 
+        PipeReader read data by stream from a command, take its
        stdout into a pipe buffer and redirect it to the parser to
        parse, then yield data as your desired format.
@@ -352,7 +352,7 @@ class PipeReader:
        An example:
        .. code-block:: python
           def example_reader():
               for f in myfiles:
                   pr = PipeReader("cat %s"%f)

--- a/python/paddle/v2/dataset/cifar.py
+++ b/python/paddle/v2/dataset/cifar.py
@@ -43,7 +43,7 @@ CIFAR100_URL = URL_PREFIX + 'cifar-100-python.tar.gz'
 CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85'
-def reader_creator(filename, sub_name):
+def reader_creator(filename, sub_name, cycle=False):
    def read_batch(batch):
        data = batch['data']
        labels = batch.get('labels', batch.get('fine_labels', None))
@@ -56,10 +56,13 @@ def reader_creator(filename, sub_name):
            names = (each_item.name for each_item in f
                     if sub_name in each_item.name)
-            for name in names:
+            while True:
-                batch = cPickle.load(f.extractfile(name))
+                for name in names:
-                for item in read_batch(batch):
+                    batch = cPickle.load(f.extractfile(name))
-                    yield item
+                    for item in read_batch(batch):
+                        yield item
+                if not cycle:
+                    break
    return reader
@@ -94,34 +97,40 @@ def test100():
        'test')
-def train10():
+def train10(cycle=False):
    """
    CIFAR-10 training set creator.
    It returns a reader creator, each sample in the reader is image pixels in
    [0, 1] and label in [0, 9].
+    :param cycle: whether to cycle through the dataset
+    :type cycle: bool
    :return: Training reader creator
    :rtype: callable
    """
    return reader_creator(
        paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
-        'data_batch')
+        'data_batch',
+        cycle=cycle)
-def test10():
+def test10(cycle=False):
    """
    CIFAR-10 test set creator.
    It returns a reader creator, each sample in the reader is image pixels in
    [0, 1] and label in [0, 9].
+    :param cycle: whether to cycle through the dataset
+    :type cycle: bool
    :return: Test reader creator.
    :rtype: callable
    """
    return reader_creator(
        paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
-        'test_batch')
+        'test_batch',
+        cycle=cycle)
 def fetch():

--- a/python/paddle/v2/dataset/flowers.py
+++ b/python/paddle/v2/dataset/flowers.py
@@ -76,7 +76,8 @@ def reader_creator(data_file,
                   dataset_name,
                   mapper,
                   buffered_size=1024,
-                   use_xmap=True):
+                   use_xmap=True,
+                   cycle=False):
    '''
    1. read images from tar file and
        merge images into batch files in 102flowers.tgz_batch/
@@ -96,6 +97,8 @@ def reader_creator(data_file,
    :type mapper: callable
    :param buffered_size: the size of buffer used to process images
    :type buffered_size: int
+    :param cycle: whether to cycle through the dataset
+    :type cycle: bool
    :return: data reader
    :rtype: callable
    '''
@@ -108,15 +111,18 @@ def reader_creator(data_file,
    file_list = batch_images_from_tar(data_file, dataset_name, img2label)
    def reader():
-        for file in open(file_list):
+        while True:
-            file = file.strip()
+            for file in open(file_list):
-            batch = None
+                file = file.strip()
-            with open(file, 'r') as f:
+                batch = None
-                batch = cPickle.load(f)
+                with open(file, 'r') as f:
-            data = batch['data']
+                    batch = cPickle.load(f)
-            labels = batch['label']
+                data = batch['data']
-            for sample, label in itertools.izip(data, batch['label']):
+                labels = batch['label']
-                yield sample, int(label) - 1
+                for sample, label in itertools.izip(data, batch['label']):
+                    yield sample, int(label) - 1
+            if not cycle:
+                break
    if use_xmap:
        cpu_num = int(os.environ.get('CPU_NUM', cpu_count()))
@@ -125,7 +131,7 @@ def reader_creator(data_file,
        return map_readers(mapper, reader)
-def train(mapper=train_mapper, buffered_size=1024, use_xmap=True):
+def train(mapper=train_mapper, buffered_size=1024, use_xmap=True, cycle=False):
    '''
    Create flowers training set reader.
    It returns a reader, each sample in the reader is
@@ -138,17 +144,23 @@ def train(mapper=train_mapper, buffered_size=1024, use_xmap=True):
    :type mapper: callable
    :param buffered_size: the size of buffer used to process images
    :type buffered_size: int
+    :param cycle: whether to cycle through the dataset
+    :type cycle: bool
    :return: train data reader
    :rtype: callable
    '''
    return reader_creator(
        download(DATA_URL, 'flowers', DATA_MD5),
        download(LABEL_URL, 'flowers', LABEL_MD5),
-        download(SETID_URL, 'flowers', SETID_MD5), TRAIN_FLAG, mapper,
+        download(SETID_URL, 'flowers', SETID_MD5),
-        buffered_size, use_xmap)
+        TRAIN_FLAG,
+        mapper,
+        buffered_size,
+        use_xmap,
+        cycle=cycle)
-def test(mapper=test_mapper, buffered_size=1024, use_xmap=True):
+def test(mapper=test_mapper, buffered_size=1024, use_xmap=True, cycle=False):
    '''
    Create flowers test set reader.
    It returns a reader, each sample in the reader is
@@ -161,14 +173,20 @@ def test(mapper=test_mapper, buffered_size=1024, use_xmap=True):
    :type mapper: callable
    :param buffered_size: the size of buffer used to process images
    :type buffered_size: int
+    :param cycle: whether to cycle through the dataset
+    :type cycle: bool
    :return: test data reader
    :rtype: callable
    '''
    return reader_creator(
        download(DATA_URL, 'flowers', DATA_MD5),
        download(LABEL_URL, 'flowers', LABEL_MD5),
-        download(SETID_URL, 'flowers', SETID_MD5), TEST_FLAG, mapper,
+        download(SETID_URL, 'flowers', SETID_MD5),
-        buffered_size, use_xmap)
+        TEST_FLAG,
+        mapper,
+        buffered_size,
+        use_xmap,
+        cycle=cycle)
 def valid(mapper=test_mapper, buffered_size=1024, use_xmap=True):