Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into expose_Parameter_2

d0a8eea2 · fengjiayi · e9ed62bf · a64844ad · d0a8eea2 · d0a8eea2
92 changed files
--- a/Dockerfile
+++ b/Dockerfile
@@ -76,7 +76,8 @@ RUN easy_install -U pip && \
    pip install sphinx-rtd-theme==0.1.9 recommonmark

 RUN pip install pre-commit 'ipython==5.3.0' && \
-    pip install 'ipykernel==4.6.0' 'jupyter==1.0.0'
+    pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+    pip install opencv-python

 #For docstring checker
 RUN pip install pylint pytest astroid isort

--- a/cmake/external/grpc.cmake
+++ b/cmake/external/grpc.cmake
@@ -40,12 +40,12 @@ ExternalProject_Add(
    # NOTE(wuyi):
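-    # this package is generated by following steps:
-    # 1. git clone -b v1.8.x https://github.com/grpc/grpc.git
-    # 2. submodule update --init
+    # this package is generated by the following steps:
+    # 1. git clone -b v1.10.x https://github.com/grpc/grpc.git
+    # 2. git submodule update --init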
    # 3. keep only zlib, cares, protobuf, boringssl under "third_party",
    #    checkout and clean other dirs under third_party
    # 4. remove .git, and package the directory.
-    URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.8.x.tar.gz"
-    URL_MD5  "c9c58ee7d0e8929a63155af6a2ecdbd0"
+    URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.10.x.tar.gz"
+    URL_MD5  "1f268a2aff6759839dccd256adcc91cf"
    PREFIX          ${GRPC_SOURCES_DIR}
    UPDATE_COMMAND  ""
    CONFIGURE_COMMAND ""

--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -54,7 +54,7 @@ ExternalProject_Add(
    ${EXTERNAL_PROJECT_LOG_ARGS}
    DEPENDS             ${MKLDNN_DEPENDS}
    GIT_REPOSITORY      "https://github.com/01org/mkl-dnn.git"
-    GIT_TAG             "db3424ad44901513c03a1ea31ccaacdf633fbe9f"
+    GIT_TAG             "a29d8487a63afca3d5b8c5bbdbb473cf8ccc6e51"
    PREFIX              ${MKLDNN_SOURCES_DIR}
    UPDATE_COMMAND      ""
    CMAKE_ARGS          -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}

--- a/doc/fluid/api/evaluator.rst
+++ b/doc/fluid/api/evaluator.rst
 ..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

-=========
-evaluator
-=========
+=============
+fluid.average
+=============
+
+.. _api_fluid_average_WeightedAverage:
+
+WeightedAverage
+---------------
+
+..  autoclass:: paddle.fluid.average.WeightedAverage
+    :members:
+    :noindex:

--- a/doc/fluid/api/backward.rst
+++ b/doc/fluid/api/backward.rst
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+==============
+fluid.backward
+==============
+
+.. _api_fluid_backward_append_backward:
+
+append_backward
+---------------
+
+..  autofunction:: paddle.fluid.backward.append_backward
+    :noindex:
+
+.. _api_fluid_backward_calc_gradient:
+
+calc_gradient
+-------------
+
+..  autofunction:: paddle.fluid.backward.calc_gradient
+    :noindex:
+
--- a/doc/fluid/api/clip.rst
+++ b/doc/fluid/api/clip.rst
 ..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

-====
-clip
-====
+==========
+fluid.clip
+==========
+
+.. _api_fluid_clip_ErrorClipByValue:

 ErrorClipByValue
 ----------------
@@ -12,6 +14,8 @@ ErrorClipByValue
    :members:
    :noindex:

+.. _api_fluid_clip_GradientClipByValue:
+
 GradientClipByValue
 -------------------

@@ -19,6 +23,8 @@ GradientClipByValue
    :members:
    :noindex:

+.. _api_fluid_clip_GradientClipByNorm:
+
 GradientClipByNorm
 ------------------

@@ -26,6 +32,8 @@ GradientClipByNorm
    :members:
    :noindex:

+.. _api_fluid_clip_GradientClipByGlobalNorm:
+
 GradientClipByGlobalNorm
 ------------------------

@@ -33,15 +41,3 @@ GradientClipByGlobalNorm
    :members:
    :noindex:

-append_gradient_clip_ops
------------------------
-
-..  autofunction:: paddle.fluid.clip.append_gradient_clip_ops
-    :noindex:
-
-error_clip_callback
-------------------
-
-..  autofunction:: paddle.fluid.clip.error_clip_callback
-    :noindex:
-
--- a/doc/fluid/api/data.rst
+++ b/doc/fluid/api/data.rst
-==================================
-Data Reader Interface and DataSets
-==================================
-
-..  toctree::
-    :maxdepth: 1
-
-    data/data_reader.rst
-    data/image.rst
-    data/dataset.rst
--- a/doc/fluid/api/data_feeder.rst
+++ b/doc/fluid/api/data_feeder.rst
 ..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

-===========
-data_feeder
-===========
+=================
+fluid.data_feeder
+=================
+
+.. _api_fluid_data_feeder_DataFeeder:

 DataFeeder
 ----------

--- a/doc/fluid/api/detection.rst
+++ b/doc/fluid/api/detection.rst
--- a/doc/fluid/api/executor.rst
+++ b/doc/fluid/api/executor.rst
 ..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

-========
-executor
-========
+==============
+fluid.executor
+==============
+
+.. _api_fluid_executor_Executor:

 Executor
 --------
@@ -12,24 +14,32 @@ Executor
    :members:
    :noindex:

+.. _api_fluid_executor_global_scope:
+
 global_scope
 ------------

 ..  autofunction:: paddle.fluid.executor.global_scope
    :noindex:

+.. _api_fluid_executor_scope_guard:
+
 scope_guard
 -----------

 ..  autofunction:: paddle.fluid.executor.scope_guard
    :noindex:

-switch_scope
------------
+.. _api_fluid_executor__switch_scope:
+
+_switch_scope
+-------------

-..  autofunction:: paddle.fluid.executor.switch_scope
+..  autofunction:: paddle.fluid.executor._switch_scope
    :noindex:

+.. _api_fluid_executor_fetch_var:
+
 fetch_var
 ---------


--- a/doc/fluid/api/fluid.rst
+++ b/doc/fluid/api/fluid.rst
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+=====
+fluid
+=====
+
+.. _api_fluid_Block:
+
+Block
+-----
+
+..  autoclass:: paddle.fluid.Block
+    :members:
+    :noindex:
+
+.. _api_fluid_Variable:
+
+Variable
+--------
+
+..  autoclass:: paddle.fluid.Variable
+    :members:
+    :noindex:
+
+.. _api_fluid_Program:
+
+Program
+-------
+
+..  autoclass:: paddle.fluid.Program
+    :members:
+    :noindex:
+
+.. _api_fluid_Operator:
+
+Operator
+--------
+
+..  autoclass:: paddle.fluid.Operator
+    :members:
+    :noindex:
+
+.. _api_fluid_default_startup_program:
+
+default_startup_program
+-----------------------
+
+..  autofunction:: paddle.fluid.default_startup_program
+    :noindex:
+
+.. _api_fluid_default_main_program:
+
+default_main_program
+--------------------
+
+..  autofunction:: paddle.fluid.default_main_program
+    :noindex:
+
+.. _api_fluid_program_guard:
+
+program_guard
+-------------
+
+..  autofunction:: paddle.fluid.program_guard
+    :noindex:
+
+.. _api_fluid_get_var:
+
+get_var
+-------
+
+..  autofunction:: paddle.fluid.get_var
+    :noindex:
+
+.. _api_fluid_Executor:
+
+Executor
+--------
+
+..  autoclass:: paddle.fluid.Executor
+    :members:
+    :noindex:
+
+.. _api_fluid_global_scope:
+
+global_scope
+------------
+
+..  autofunction:: paddle.fluid.global_scope
+    :noindex:
+
+.. _api_fluid_scope_guard:
+
+scope_guard
+-----------
+
+..  autofunction:: paddle.fluid.scope_guard
+    :noindex:
+
+.. _api_fluid__switch_scope:
+
+_switch_scope
+-------------
+
+..  autofunction:: paddle.fluid._switch_scope
+    :noindex:
+
+.. _api_fluid_fetch_var:
+
+fetch_var
+---------
+
+..  autofunction:: paddle.fluid.fetch_var
+    :noindex:
+
+.. _api_fluid_Go:
+
+Go
+--
+
+..  autoclass:: paddle.fluid.Go
+    :members:
+    :noindex:
+
+.. _api_fluid_make_channel:
+
+make_channel
+------------
+
+..  autofunction:: paddle.fluid.make_channel
+    :noindex:
+
+.. _api_fluid_channel_send:
+
+channel_send
+------------
+
+..  autofunction:: paddle.fluid.channel_send
+    :noindex:
+
+.. _api_fluid_channel_recv:
+
+channel_recv
+------------
+
+..  autofunction:: paddle.fluid.channel_recv
+    :noindex:
+
+.. _api_fluid_channel_close:
+
+channel_close
+-------------
+
+..  autofunction:: paddle.fluid.channel_close
+    :noindex:
+
+.. _api_fluid_Select:
+
+Select
+------
+
+..  autoclass:: paddle.fluid.Select
+    :members:
+    :noindex:
+
+.. _api_fluid_Trainer:
+
+Trainer
+-------
+
+..  autoclass:: paddle.fluid.Trainer
+    :members:
+    :noindex:
+
+.. _api_fluid_BeginEpochEvent:
+
+BeginEpochEvent
+---------------
+
+..  autoclass:: paddle.fluid.BeginEpochEvent
+    :members:
+    :noindex:
+
+.. _api_fluid_EndEpochEvent:
+
+EndEpochEvent
+-------------
+
+..  autoclass:: paddle.fluid.EndEpochEvent
+    :members:
+    :noindex:
+
+.. _api_fluid_BeginStepEvent:
+
+BeginStepEvent
+--------------
+
+..  autoclass:: paddle.fluid.BeginStepEvent
+    :members:
+    :noindex:
+
+.. _api_fluid_EndStepEvent:
+
+EndStepEvent
+------------
+
+..  autoclass:: paddle.fluid.EndStepEvent
+    :members:
+    :noindex:
+
+.. _api_fluid_CheckpointConfig:
+
+CheckpointConfig
+----------------
+
+..  autoclass:: paddle.fluid.CheckpointConfig
+    :members:
+    :noindex:
+
+.. _api_fluid_Inferencer:
+
+Inferencer
+----------
+
+..  autoclass:: paddle.fluid.Inferencer
+    :members:
+    :noindex:
+
+.. _api_fluid_DistributeTranspiler:
+
+DistributeTranspiler
+--------------------
+
+..  autoclass:: paddle.fluid.DistributeTranspiler
+    :members:
+    :noindex:
+
+.. _api_fluid_memory_optimize:
+
+memory_optimize
+---------------
+
+..  autofunction:: paddle.fluid.memory_optimize
+    :noindex:
+
+.. _api_fluid_release_memory:
+
+release_memory
+--------------
+
+..  autofunction:: paddle.fluid.release_memory
+    :noindex:
+
+.. _api_fluid_ParallelExecutor:
+
+ParallelExecutor
+----------------
+
+..  autoclass:: paddle.fluid.ParallelExecutor
+    :members:
+    :noindex:
+
+.. _api_fluid_ExecutionStrategy:
+
+ExecutionStrategy
+-----------------
+
+..  autoclass:: paddle.fluid.ExecutionStrategy
+    :members:
+    :noindex:
+
+.. _api_fluid_BuildStrategy:
+
+BuildStrategy
+-------------
+
+..  autoclass:: paddle.fluid.BuildStrategy
+    :members:
+    :noindex:
+
+.. _api_fluid_create_lod_tensor:
+
+create_lod_tensor
+-----------------
+
+..  autofunction:: paddle.fluid.create_lod_tensor
+    :noindex:
+
+.. _api_fluid_create_random_int_lodtensor:
+
+create_random_int_lodtensor
+---------------------------
+
+..  autofunction:: paddle.fluid.create_random_int_lodtensor
+    :noindex:
+
+.. _api_fluid_LoDTensor:
+
+LoDTensor
+---------
+
+..  autoclass:: paddle.fluid.LoDTensor
+    :members:
+    :noindex:
+
+.. _api_fluid_CPUPlace:
+
+CPUPlace
+--------
+
+..  autoclass:: paddle.fluid.CPUPlace
+    :members:
+    :noindex:
+
+.. _api_fluid_CUDAPlace:
+
+CUDAPlace
+---------
+
+..  autoclass:: paddle.fluid.CUDAPlace
+    :members:
+    :noindex:
+
+.. _api_fluid_CUDAPinnedPlace:
+
+CUDAPinnedPlace
+---------------
+
+..  autoclass:: paddle.fluid.CUDAPinnedPlace
+    :members:
+    :noindex:
+
+.. _api_fluid_Tensor:
+
+Tensor
+------
+
+..  autoclass:: paddle.fluid.Tensor
+    :members:
+    :noindex:
+
+.. _api_fluid_ParamAttr:
+
+ParamAttr
+---------
+
+..  autoclass:: paddle.fluid.ParamAttr
+    :members:
+    :noindex:
+
+.. _api_fluid_WeightNormParamAttr:
+
+WeightNormParamAttr
+-------------------
+
+..  autoclass:: paddle.fluid.WeightNormParamAttr
+    :members:
+    :noindex:
+
+.. _api_fluid_DataFeeder:
+
+DataFeeder
+----------
+
+..  autoclass:: paddle.fluid.DataFeeder
+    :members:
+    :noindex:
+
+.. _api_fluid_Scope:
+
+Scope
+-----
+
+..  autoclass:: paddle.fluid.Scope
+    :members:
+    :noindex:
+
--- a/doc/fluid/api/gen_doc.py
+++ b/doc/fluid/api/gen_doc.py
@@ -29,9 +29,17 @@ def parse_arg():


 class DocGenerator(object):
-    def __init__(self, module_name, stream=sys.stdout):
+    def __init__(self, module_name=None, stream=sys.stdout):
+        if module_name == "":
+            module_name = None
        self.stream = stream
-        self.module_name = module_name
+        if module_name is None:
+            self.module_name = "fluid"
+        else:
+            self.module_name = "fluid." + module_name
+        if module_name is None:
+            self.module = fluid
+        else:
            if not hasattr(fluid, module_name):
                raise ValueError("Cannot find fluid.{0}".format(module_name))
            else:
@@ -41,7 +49,7 @@ class DocGenerator(object):

 ''')

-        self._print_header_(module_name, dot='=', is_title=True)
+        self._print_header_(self.module_name, dot='=', is_title=True)

    def print_submodule(self, submodule_name):
        submodule = getattr(self.module, submodule_name)
@@ -60,25 +68,29 @@ class DocGenerator(object):
        self._print_header_(name, dot='=', is_title=False)

    def print_item(self, name):
-        item = getattr(self.module, name)
+        item = getattr(self.module, name, None)
+        if item is None:
+            return
        if isinstance(item, types.TypeType):
            self.print_class(name)
        elif isinstance(item, types.FunctionType):
            self.print_method(name)
        else:
-            raise RuntimeError("Unsupported item {0}".format(name))
+            pass

    def print_class(self, name):
+        self._print_ref_(name)
        self._print_header_(name, dot='-', is_title=False)
-        self.stream.write('''..  autoclass:: paddle.fluid.{0}.{1}
+        self.stream.write('''..  autoclass:: paddle.{0}.{1}
    :members:
    :noindex:

 '''.format(self.module_name, name))

    def print_method(self, name):
+        self._print_ref_(name)
        self._print_header_(name, dot='-', is_title=False)
-        self.stream.write('''..  autofunction:: paddle.fluid.{0}.{1}
+        self.stream.write('''..  autofunction:: paddle.{0}.{1}
    :noindex:

 '''.format(self.module_name, name))
@@ -94,6 +106,10 @@ class DocGenerator(object):
        self.stream.write('\n')
        self.stream.write('\n')

+    def _print_ref_(self, name):
+        self.stream.write(".. _api_{0}_{1}:\n\n".format("_".join(
+            self.module_name.split(".")), name))
+

 def main():
    args = parse_arg()

--- a/doc/fluid/api/gen_doc.sh
+++ b/doc/fluid/api/gen_doc.sh
 #!/bin/bash
-python gen_doc.py layers --submodules control_flow device io nn ops tensor detection learning_rate_scheduler metric > layers.rst
+# Regenerate layers.rst. Each submodule is named exactly once; a repeated
+# name would be passed to gen_doc.py a second time.
+python gen_doc.py layers --submodules control_flow device io nn ops tensor learning_rate_scheduler detection metric_op > layers.rst

-for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer transpiler
+# Write one <module>.rst page per fluid submodule (each module listed once).
+for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer transpiler recordio_writer backward average
 do
   python gen_doc.py ${module} > ${module}.rst
 done
+
+# An empty module name makes gen_doc.py document the top-level `fluid` package.
+python gen_doc.py "" > fluid.rst
--- a/doc/fluid/api/index_en.rst
+++ b/doc/fluid/api/index_en.rst
-======================
-Fluid
-======================
+=============
+API Reference
+=============

 ..  toctree::
    :maxdepth: 1

+    fluid.rst
    layers.rst
    data_feeder.rst
    executor.rst
@@ -18,3 +19,8 @@ Fluid
    regularizer.rst
    io.rst
    data.rst
+    transpiler.rst
+    recordio_writer.rst
+    backward.rst
+    average.rst
+    profiler.rst
--- a/doc/fluid/api/initializer.rst
+++ b/doc/fluid/api/initializer.rst
 ..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

-===========
-initializer
-===========
+=================
+fluid.initializer
+=================
+
+.. _api_fluid_initializer_Constant:

 Constant
 --------
@@ -12,6 +14,8 @@ Constant
    :members:
    :noindex:

+.. _api_fluid_initializer_Uniform:
+
 Uniform
 -------

@@ -19,6 +23,8 @@ Uniform
    :members:
    :noindex:

+.. _api_fluid_initializer_Normal:
+
 Normal
 ------

@@ -26,6 +32,8 @@ Normal
    :members:
    :noindex:

+.. _api_fluid_initializer_Xavier:
+
 Xavier
 ------

@@ -33,6 +41,8 @@ Xavier
    :members:
    :noindex:

+.. _api_fluid_initializer_Bilinear:
+
 Bilinear
 --------

@@ -40,18 +50,33 @@ Bilinear
    :members:
    :noindex:

+.. _api_fluid_initializer_MSRA:
+
+MSRA
+----
+
+..  autoclass:: paddle.fluid.initializer.MSRA
+    :members:
+    :noindex:
+
+.. _api_fluid_initializer_force_init_on_cpu:
+
 force_init_on_cpu
 -----------------

 ..  autofunction:: paddle.fluid.initializer.force_init_on_cpu
    :noindex:

+.. _api_fluid_initializer_init_on_cpu:
+
 init_on_cpu
 -----------

 ..  autofunction:: paddle.fluid.initializer.init_on_cpu
    :noindex:

+.. _api_fluid_initializer_ConstantInitializer:
+
 ConstantInitializer
 -------------------

@@ -59,6 +84,8 @@ ConstantInitializer
    :members:
    :noindex:

+.. _api_fluid_initializer_UniformInitializer:
+
 UniformInitializer
 ------------------

@@ -66,6 +93,8 @@ UniformInitializer
    :members:
    :noindex:

+.. _api_fluid_initializer_NormalInitializer:
+
 NormalInitializer
 -----------------

@@ -73,6 +102,8 @@ NormalInitializer
    :members:
    :noindex:

+.. _api_fluid_initializer_XavierInitializer:
+
 XavierInitializer
 -----------------

@@ -80,6 +111,8 @@ XavierInitializer
    :members:
    :noindex:

+.. _api_fluid_initializer_BilinearInitializer:
+
 BilinearInitializer
 -------------------

@@ -87,3 +120,12 @@ BilinearInitializer
    :members:
    :noindex:

+.. _api_fluid_initializer_MSRAInitializer:
+
+MSRAInitializer
+---------------
+
+..  autoclass:: paddle.fluid.initializer.MSRAInitializer
+    :members:
+    :noindex:
+
--- a/doc/fluid/api/io.rst
+++ b/doc/fluid/api/io.rst
 ..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

-==
-io
-==
+========
+fluid.io
+========
+
+.. _api_fluid_io_save_vars:

 save_vars
 ---------
@@ -11,84 +13,112 @@ save_vars
 ..  autofunction:: paddle.fluid.io.save_vars
    :noindex:

+.. _api_fluid_io_save_params:
+
 save_params
 -----------

 ..  autofunction:: paddle.fluid.io.save_params
    :noindex:

+.. _api_fluid_io_save_persistables:
+
 save_persistables
 -----------------

 ..  autofunction:: paddle.fluid.io.save_persistables
    :noindex:

+.. _api_fluid_io_load_vars:
+
 load_vars
 ---------

 ..  autofunction:: paddle.fluid.io.load_vars
    :noindex:

+.. _api_fluid_io_load_params:
+
 load_params
 -----------

 ..  autofunction:: paddle.fluid.io.load_params
    :noindex:

+.. _api_fluid_io_load_persistables:
+
 load_persistables
 -----------------

 ..  autofunction:: paddle.fluid.io.load_persistables
    :noindex:

+.. _api_fluid_io_save_inference_model:
+
 save_inference_model
 --------------------

 ..  autofunction:: paddle.fluid.io.save_inference_model
    :noindex:

+.. _api_fluid_io_load_inference_model:
+
 load_inference_model
 --------------------

 ..  autofunction:: paddle.fluid.io.load_inference_model
    :noindex:

+.. _api_fluid_io_get_inference_program:
+
 get_inference_program
 ---------------------

 ..  autofunction:: paddle.fluid.io.get_inference_program
    :noindex:

+.. _api_fluid_io_save_checkpoint:
+
 save_checkpoint
 ---------------

 ..  autofunction:: paddle.fluid.io.save_checkpoint
    :noindex:

+.. _api_fluid_io_load_checkpoint:
+
 load_checkpoint
 ---------------

 ..  autofunction:: paddle.fluid.io.load_checkpoint
    :noindex:

+.. _api_fluid_io_clean_checkpoint:
+
 clean_checkpoint
 ----------------

 ..  autofunction:: paddle.fluid.io.clean_checkpoint
    :noindex:

+.. _api_fluid_io_load_persist_vars_without_grad:
+
 load_persist_vars_without_grad
 ------------------------------

 ..  autofunction:: paddle.fluid.io.load_persist_vars_without_grad
    :noindex:

+.. _api_fluid_io_save_persist_vars_without_grad:
+
 save_persist_vars_without_grad
 ------------------------------

 ..  autofunction:: paddle.fluid.io.save_persist_vars_without_grad
    :noindex:

+.. _api_fluid_io_get_latest_checkpoint_serial:
+
 get_latest_checkpoint_serial
 ----------------------------


--- a/doc/fluid/api/layers.rst
+++ b/doc/fluid/api/layers.rst
 ..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

-======
-layers
-======
+============
+fluid.layers
+============

 control_flow
 ============

+.. _api_fluid_layers_split_lod_tensor:
+
 split_lod_tensor
 ----------------

 ..  autofunction:: paddle.fluid.layers.split_lod_tensor
    :noindex:

+.. _api_fluid_layers_merge_lod_tensor:
+
 merge_lod_tensor
 ----------------

 ..  autofunction:: paddle.fluid.layers.merge_lod_tensor
    :noindex:

+.. _api_fluid_layers_BlockGuard:
+
 BlockGuard
 ----------

@@ -27,6 +33,8 @@ BlockGuard
    :members:
    :noindex:

+.. _api_fluid_layers_BlockGuardWithCompletion:
+
 BlockGuardWithCompletion
 ------------------------

@@ -34,12 +42,7 @@ BlockGuardWithCompletion
    :members:
    :noindex:

-StaticRNNMemoryLink
-------------------
-
-..  autoclass:: paddle.fluid.layers.StaticRNNMemoryLink
-    :members:
-    :noindex:
+.. _api_fluid_layers_WhileGuard:

 WhileGuard
 ----------
@@ -48,6 +51,8 @@ WhileGuard
    :members:
    :noindex:

+.. _api_fluid_layers_While:
+
 While
 -----

@@ -55,6 +60,8 @@ While
    :members:
    :noindex:

+.. _api_fluid_layers_Switch:
+
 Switch
 ------

@@ -62,78 +69,104 @@ Switch
    :members:
    :noindex:

+.. _api_fluid_layers_lod_rank_table:
+
 lod_rank_table
 --------------

 ..  autofunction:: paddle.fluid.layers.lod_rank_table
    :noindex:

+.. _api_fluid_layers_max_sequence_len:
+
 max_sequence_len
 ----------------

 ..  autofunction:: paddle.fluid.layers.max_sequence_len
    :noindex:

+.. _api_fluid_layers_lod_tensor_to_array:
+
 lod_tensor_to_array
 -------------------

 ..  autofunction:: paddle.fluid.layers.lod_tensor_to_array
    :noindex:

+.. _api_fluid_layers_array_to_lod_tensor:
+
 array_to_lod_tensor
 -------------------

 ..  autofunction:: paddle.fluid.layers.array_to_lod_tensor
    :noindex:

+.. _api_fluid_layers_increment:
+
 increment
 ---------

 ..  autofunction:: paddle.fluid.layers.increment
    :noindex:

+.. _api_fluid_layers_array_write:
+
 array_write
 -----------

 ..  autofunction:: paddle.fluid.layers.array_write
    :noindex:

+.. _api_fluid_layers_create_array:
+
 create_array
 ------------

 ..  autofunction:: paddle.fluid.layers.create_array
    :noindex:

+.. _api_fluid_layers_less_than:
+
 less_than
 ---------

 ..  autofunction:: paddle.fluid.layers.less_than
    :noindex:

+.. _api_fluid_layers_equal:
+
 equal
 -----

 ..  autofunction:: paddle.fluid.layers.equal
    :noindex:

+.. _api_fluid_layers_array_read:
+
 array_read
 ----------

 ..  autofunction:: paddle.fluid.layers.array_read
    :noindex:

+.. _api_fluid_layers_shrink_memory:
+
 shrink_memory
 -------------

 ..  autofunction:: paddle.fluid.layers.shrink_memory
    :noindex:

+.. _api_fluid_layers_array_length:
+
 array_length
 ------------

 ..  autofunction:: paddle.fluid.layers.array_length
    :noindex:

+.. _api_fluid_layers_IfElse:
+
 IfElse
 ------

@@ -141,6 +174,8 @@ IfElse
    :members:
    :noindex:

+.. _api_fluid_layers_DynamicRNN:
+
 DynamicRNN
 ----------

@@ -148,6 +183,8 @@ DynamicRNN
    :members:
    :noindex:

+.. _api_fluid_layers_ConditionalBlock:
+
 ConditionalBlock
 ----------------

@@ -155,6 +192,8 @@ ConditionalBlock
    :members:
    :noindex:

+.. _api_fluid_layers_StaticRNN:
+
 StaticRNN
 ---------

@@ -162,12 +201,16 @@ StaticRNN
    :members:
    :noindex:

+.. _api_fluid_layers_reorder_lod_tensor_by_rank:
+
 reorder_lod_tensor_by_rank
 --------------------------

 ..  autofunction:: paddle.fluid.layers.reorder_lod_tensor_by_rank
    :noindex:

+.. _api_fluid_layers_ParallelDo:
+
 ParallelDo
 ----------

@@ -175,12 +218,16 @@ ParallelDo
    :members:
    :noindex:

+.. _api_fluid_layers_Print:
+
 Print
 -----

 ..  autofunction:: paddle.fluid.layers.Print
    :noindex:

+.. _api_fluid_layers_is_empty:
+
 is_empty
 --------

@@ -190,6 +237,8 @@ is_empty
 device
 ======

+.. _api_fluid_layers_get_places:
+
 get_places
 ----------

@@ -199,12 +248,16 @@ get_places
 io
 ==

+.. _api_fluid_layers_data:
+
 data
 ----

 ..  autofunction:: paddle.fluid.layers.data
    :noindex:

+.. _api_fluid_layers_BlockGuardServ:
+
 BlockGuardServ
 --------------

@@ -212,6 +265,8 @@ BlockGuardServ
    :members:
    :noindex:

+.. _api_fluid_layers_ListenAndServ:
+
 ListenAndServ
 -------------

@@ -219,60 +274,80 @@ ListenAndServ
    :members:
    :noindex:

+.. _api_fluid_layers_Send:
+
 Send
 ----

 ..  autofunction:: paddle.fluid.layers.Send
    :noindex:

+.. _api_fluid_layers_Recv:
+
 Recv
 ----

 ..  autofunction:: paddle.fluid.layers.Recv
    :noindex:

+.. _api_fluid_layers_open_recordio_file:
+
 open_recordio_file
 ------------------

 ..  autofunction:: paddle.fluid.layers.open_recordio_file
    :noindex:

+.. _api_fluid_layers_open_files:
+
 open_files
 ----------

 ..  autofunction:: paddle.fluid.layers.open_files
    :noindex:

+.. _api_fluid_layers_read_file:
+
 read_file
 ---------

 ..  autofunction:: paddle.fluid.layers.read_file
    :noindex:

+.. _api_fluid_layers_shuffle:
+
 shuffle
 -------

 ..  autofunction:: paddle.fluid.layers.shuffle
    :noindex:

+.. _api_fluid_layers_batch:
+
 batch
 -----

 ..  autofunction:: paddle.fluid.layers.batch
    :noindex:

+.. _api_fluid_layers_double_buffer:
+
 double_buffer
 -------------

 ..  autofunction:: paddle.fluid.layers.double_buffer
    :noindex:

+.. _api_fluid_layers_random_data_generator:
+
 random_data_generator
 ---------------------

 ..  autofunction:: paddle.fluid.layers.random_data_generator
    :noindex:

+.. _api_fluid_layers_Preprocessor:
+
 Preprocessor
 ------------

@@ -280,6 +355,8 @@ Preprocessor
    :members:
    :noindex:

+.. _api_fluid_layers_load:
+
 load
 ----

@@ -289,584 +366,802 @@ load
 nn
 ==

+.. _api_fluid_layers_fc:
+
 fc
 --

 ..  autofunction:: paddle.fluid.layers.fc
    :noindex:

+.. _api_fluid_layers_embedding:
+
 embedding
 ---------

 ..  autofunction:: paddle.fluid.layers.embedding
    :noindex:

+.. _api_fluid_layers_dynamic_lstm:
+
 dynamic_lstm
 ------------

 ..  autofunction:: paddle.fluid.layers.dynamic_lstm
    :noindex:

+.. _api_fluid_layers_dynamic_lstmp:
+
 dynamic_lstmp
 -------------

 ..  autofunction:: paddle.fluid.layers.dynamic_lstmp
    :noindex:

+.. _api_fluid_layers_dynamic_gru:
+
 dynamic_gru
 -----------

 ..  autofunction:: paddle.fluid.layers.dynamic_gru
    :noindex:

+.. _api_fluid_layers_gru_unit:
+
 gru_unit
 --------

 ..  autofunction:: paddle.fluid.layers.gru_unit
    :noindex:

+.. _api_fluid_layers_linear_chain_crf:
+
 linear_chain_crf
 ----------------

 ..  autofunction:: paddle.fluid.layers.linear_chain_crf
    :noindex:

+.. _api_fluid_layers_crf_decoding:
+
 crf_decoding
 ------------

 ..  autofunction:: paddle.fluid.layers.crf_decoding
    :noindex:

+.. _api_fluid_layers_cos_sim:
+
 cos_sim
 -------

 ..  autofunction:: paddle.fluid.layers.cos_sim
    :noindex:

+.. _api_fluid_layers_cross_entropy:
+
 cross_entropy
 -------------

 ..  autofunction:: paddle.fluid.layers.cross_entropy
    :noindex:

+.. _api_fluid_layers_square_error_cost:
+
 square_error_cost
 -----------------

 ..  autofunction:: paddle.fluid.layers.square_error_cost
    :noindex:

+.. _api_fluid_layers_chunk_eval:
+
 chunk_eval
 ----------

 ..  autofunction:: paddle.fluid.layers.chunk_eval
    :noindex:

+.. _api_fluid_layers_sequence_conv:
+
 sequence_conv
 -------------

 ..  autofunction:: paddle.fluid.layers.sequence_conv
    :noindex:

+.. _api_fluid_layers_conv2d:
+
 conv2d
 ------

 ..  autofunction:: paddle.fluid.layers.conv2d
    :noindex:

+.. _api_fluid_layers_conv3d:
+
 conv3d
 ------

 ..  autofunction:: paddle.fluid.layers.conv3d
    :noindex:

+.. _api_fluid_layers_sequence_pool:
+
 sequence_pool
 -------------

 ..  autofunction:: paddle.fluid.layers.sequence_pool
    :noindex:

+.. _api_fluid_layers_sequence_softmax:
+
 sequence_softmax
 ----------------

 ..  autofunction:: paddle.fluid.layers.sequence_softmax
    :noindex:

+.. _api_fluid_layers_softmax:
+
 softmax
 -------

 ..  autofunction:: paddle.fluid.layers.softmax
    :noindex:

+.. _api_fluid_layers_pool2d:
+
 pool2d
 ------

 ..  autofunction:: paddle.fluid.layers.pool2d
    :noindex:

+.. _api_fluid_layers_pool3d:
+
 pool3d
 ------

 ..  autofunction:: paddle.fluid.layers.pool3d
    :noindex:

+.. _api_fluid_layers_batch_norm:
+
 batch_norm
 ----------

 ..  autofunction:: paddle.fluid.layers.batch_norm
    :noindex:

+.. _api_fluid_layers_beam_search_decode:
+
 beam_search_decode
 ------------------

 ..  autofunction:: paddle.fluid.layers.beam_search_decode
    :noindex:

+.. _api_fluid_layers_conv2d_transpose:
+
 conv2d_transpose
 ----------------

 ..  autofunction:: paddle.fluid.layers.conv2d_transpose
    :noindex:

+.. _api_fluid_layers_conv3d_transpose:
+
 conv3d_transpose
 ----------------

 ..  autofunction:: paddle.fluid.layers.conv3d_transpose
    :noindex:

+.. _api_fluid_layers_sequence_expand:
+
 sequence_expand
 ---------------

 ..  autofunction:: paddle.fluid.layers.sequence_expand
    :noindex:

+.. _api_fluid_layers_lstm_unit:
+
 lstm_unit
 ---------

 ..  autofunction:: paddle.fluid.layers.lstm_unit
    :noindex:

+.. _api_fluid_layers_reduce_sum:
+
 reduce_sum
 ----------

 ..  autofunction:: paddle.fluid.layers.reduce_sum
    :noindex:

+.. _api_fluid_layers_reduce_mean:
+
 reduce_mean
 -----------

 ..  autofunction:: paddle.fluid.layers.reduce_mean
    :noindex:

+.. _api_fluid_layers_reduce_max:
+
 reduce_max
 ----------

 ..  autofunction:: paddle.fluid.layers.reduce_max
    :noindex:

+.. _api_fluid_layers_reduce_min:
+
 reduce_min
 ----------

 ..  autofunction:: paddle.fluid.layers.reduce_min
    :noindex:

+.. _api_fluid_layers_reduce_prod:
+
 reduce_prod
 -----------

 ..  autofunction:: paddle.fluid.layers.reduce_prod
    :noindex:

+.. _api_fluid_layers_sequence_first_step:
+
 sequence_first_step
 -------------------

 ..  autofunction:: paddle.fluid.layers.sequence_first_step
    :noindex:

+.. _api_fluid_layers_sequence_last_step:
+
 sequence_last_step
 ------------------

 ..  autofunction:: paddle.fluid.layers.sequence_last_step
    :noindex:

+.. _api_fluid_layers_dropout:
+
 dropout
 -------

 ..  autofunction:: paddle.fluid.layers.dropout
    :noindex:

+.. _api_fluid_layers_split:
+
 split
 -----

 ..  autofunction:: paddle.fluid.layers.split
    :noindex:

+.. _api_fluid_layers_ctc_greedy_decoder:
+
 ctc_greedy_decoder
 ------------------

 ..  autofunction:: paddle.fluid.layers.ctc_greedy_decoder
    :noindex:

+.. _api_fluid_layers_edit_distance:
+
 edit_distance
 -------------

 ..  autofunction:: paddle.fluid.layers.edit_distance
    :noindex:

+.. _api_fluid_layers_l2_normalize:
+
 l2_normalize
 ------------

 ..  autofunction:: paddle.fluid.layers.l2_normalize
    :noindex:

+.. _api_fluid_layers_matmul:
+
 matmul
 ------

 ..  autofunction:: paddle.fluid.layers.matmul
    :noindex:

+.. _api_fluid_layers_topk:
+
 topk
 ----

 ..  autofunction:: paddle.fluid.layers.topk
    :noindex:

+.. _api_fluid_layers_warpctc:
+
 warpctc
 -------

 ..  autofunction:: paddle.fluid.layers.warpctc
    :noindex:

+.. _api_fluid_layers_sequence_reshape:
+
 sequence_reshape
 ----------------

 ..  autofunction:: paddle.fluid.layers.sequence_reshape
    :noindex:

+.. _api_fluid_layers_transpose:
+
 transpose
 ---------

 ..  autofunction:: paddle.fluid.layers.transpose
    :noindex:

+.. _api_fluid_layers_im2sequence:
+
 im2sequence
 -----------

 ..  autofunction:: paddle.fluid.layers.im2sequence
    :noindex:

+.. _api_fluid_layers_nce:
+
 nce
 ---

 ..  autofunction:: paddle.fluid.layers.nce
    :noindex:

+.. _api_fluid_layers_beam_search:
+
 beam_search
 -----------

 ..  autofunction:: paddle.fluid.layers.beam_search
    :noindex:

+.. _api_fluid_layers_row_conv:
+
 row_conv
 --------

 ..  autofunction:: paddle.fluid.layers.row_conv
    :noindex:

+.. _api_fluid_layers_multiplex:
+
 multiplex
 ---------

 ..  autofunction:: paddle.fluid.layers.multiplex
    :noindex:

+.. _api_fluid_layers_layer_norm:
+
 layer_norm
 ----------

 ..  autofunction:: paddle.fluid.layers.layer_norm
    :noindex:

+.. _api_fluid_layers_softmax_with_cross_entropy:
+
 softmax_with_cross_entropy
 --------------------------

 ..  autofunction:: paddle.fluid.layers.softmax_with_cross_entropy
    :noindex:

+.. _api_fluid_layers_smooth_l1:
+
 smooth_l1
 ---------

 ..  autofunction:: paddle.fluid.layers.smooth_l1
    :noindex:

+.. _api_fluid_layers_one_hot:
+
 one_hot
 -------

 ..  autofunction:: paddle.fluid.layers.one_hot
    :noindex:

+.. _api_fluid_layers_autoincreased_step_counter:
+
 autoincreased_step_counter
 --------------------------

 ..  autofunction:: paddle.fluid.layers.autoincreased_step_counter
    :noindex:

+.. _api_fluid_layers_reshape:
+
 reshape
 -------

 ..  autofunction:: paddle.fluid.layers.reshape
    :noindex:

+.. _api_fluid_layers_lod_reset:
+
 lod_reset
 ---------

 ..  autofunction:: paddle.fluid.layers.lod_reset
    :noindex:

+.. _api_fluid_layers_lrn:
+
 lrn
 ---

 ..  autofunction:: paddle.fluid.layers.lrn
    :noindex:

+.. _api_fluid_layers_pad:
+
 pad
 ---

 ..  autofunction:: paddle.fluid.layers.pad
    :noindex:

+.. _api_fluid_layers_label_smooth:
+
 label_smooth
 ------------

 ..  autofunction:: paddle.fluid.layers.label_smooth
    :noindex:

+.. _api_fluid_layers_roi_pool:
+
 roi_pool
 --------

 ..  autofunction:: paddle.fluid.layers.roi_pool
    :noindex:

+.. _api_fluid_layers_dice_loss:
+
 dice_loss
 ---------

 ..  autofunction:: paddle.fluid.layers.dice_loss
    :noindex:

+.. _api_fluid_layers_image_resize:
+
 image_resize
 ------------

 ..  autofunction:: paddle.fluid.layers.image_resize
    :noindex:

+.. _api_fluid_layers_image_resize_short:
+
 image_resize_short
 ------------------

 ..  autofunction:: paddle.fluid.layers.image_resize_short
    :noindex:

+.. _api_fluid_layers_resize_bilinear:
+
 resize_bilinear
 ---------------

 ..  autofunction:: paddle.fluid.layers.resize_bilinear
    :noindex:

+.. _api_fluid_layers_gather:
+
 gather
 ------

 ..  autofunction:: paddle.fluid.layers.gather
    :noindex:

+.. _api_fluid_layers_random_crop:
+
 random_crop
 -----------

 ..  autofunction:: paddle.fluid.layers.random_crop
    :noindex:

+.. _api_fluid_layers_mean_iou:
+
 mean_iou
 --------

 ..  autofunction:: paddle.fluid.layers.mean_iou
    :noindex:

+.. _api_fluid_layers_relu:
+
+relu
+----
+
+..  autofunction:: paddle.fluid.layers.relu
+    :noindex:
+
+.. _api_fluid_layers_log:
+
+log
+---
+
+..  autofunction:: paddle.fluid.layers.log
+    :noindex:
+
+.. _api_fluid_layers_crop:
+
+crop
+----
+
+..  autofunction:: paddle.fluid.layers.crop
+    :noindex:
+
 ops
 ===

+.. _api_fluid_layers_mean:
+
 mean
 ----

 ..  autofunction:: paddle.fluid.layers.mean
    :noindex:

+.. _api_fluid_layers_mul:
+
 mul
 ---

 ..  autofunction:: paddle.fluid.layers.mul
    :noindex:

+.. _api_fluid_layers_scale:
+
 scale
 -----

 ..  autofunction:: paddle.fluid.layers.scale
    :noindex:

+.. _api_fluid_layers_sigmoid_cross_entropy_with_logits:
+
 sigmoid_cross_entropy_with_logits
 ---------------------------------

 ..  autofunction:: paddle.fluid.layers.sigmoid_cross_entropy_with_logits
    :noindex:

+.. _api_fluid_layers_elementwise_add:
+
 elementwise_add
 ---------------

 ..  autofunction:: paddle.fluid.layers.elementwise_add
    :noindex:

+.. _api_fluid_layers_elementwise_div:
+
 elementwise_div
 ---------------

 ..  autofunction:: paddle.fluid.layers.elementwise_div
    :noindex:

+.. _api_fluid_layers_elementwise_sub:
+
 elementwise_sub
 ---------------

 ..  autofunction:: paddle.fluid.layers.elementwise_sub
    :noindex:

+.. _api_fluid_layers_elementwise_mul:
+
 elementwise_mul
 ---------------

 ..  autofunction:: paddle.fluid.layers.elementwise_mul
    :noindex:

+.. _api_fluid_layers_elementwise_max:
+
 elementwise_max
 ---------------

 ..  autofunction:: paddle.fluid.layers.elementwise_max
    :noindex:

+.. _api_fluid_layers_elementwise_min:
+
 elementwise_min
 ---------------

 ..  autofunction:: paddle.fluid.layers.elementwise_min
    :noindex:

+.. _api_fluid_layers_elementwise_pow:
+
 elementwise_pow
 ---------------

 ..  autofunction:: paddle.fluid.layers.elementwise_pow
    :noindex:

+.. _api_fluid_layers_clip:
+
 clip
 ----

 ..  autofunction:: paddle.fluid.layers.clip
    :noindex:

+.. _api_fluid_layers_clip_by_norm:
+
 clip_by_norm
 ------------

 ..  autofunction:: paddle.fluid.layers.clip_by_norm
    :noindex:

+.. _api_fluid_layers_logical_and:
+
 logical_and
 -----------

 ..  autofunction:: paddle.fluid.layers.logical_and
    :noindex:

+.. _api_fluid_layers_logical_or:
+
 logical_or
 ----------

 ..  autofunction:: paddle.fluid.layers.logical_or
    :noindex:

+.. _api_fluid_layers_logical_xor:
+
 logical_xor
 -----------

 ..  autofunction:: paddle.fluid.layers.logical_xor
    :noindex:

+.. _api_fluid_layers_logical_not:
+
 logical_not
 -----------

 ..  autofunction:: paddle.fluid.layers.logical_not
    :noindex:

+.. _api_fluid_layers_uniform_random_batch_size_like:
+
 uniform_random_batch_size_like
 ------------------------------

 ..  autofunction:: paddle.fluid.layers.uniform_random_batch_size_like
    :noindex:

+.. _api_fluid_layers_gaussian_random:
+
 gaussian_random
 ---------------

 ..  autofunction:: paddle.fluid.layers.gaussian_random
    :noindex:

+.. _api_fluid_layers_gaussian_random_batch_size_like:
+
 gaussian_random_batch_size_like
 -------------------------------

 ..  autofunction:: paddle.fluid.layers.gaussian_random_batch_size_like
    :noindex:

+.. _api_fluid_layers_scatter:
+
 scatter
 -------

 ..  autofunction:: paddle.fluid.layers.scatter
    :noindex:

+.. _api_fluid_layers_sum:
+
 sum
 ---

 ..  autofunction:: paddle.fluid.layers.sum
    :noindex:

+.. _api_fluid_layers_slice:
+
 slice
 -----

 ..  autofunction:: paddle.fluid.layers.slice
    :noindex:

+.. _api_fluid_layers_polygon_box_transform:
+
 polygon_box_transform
 ---------------------

 ..  autofunction:: paddle.fluid.layers.polygon_box_transform
    :noindex:

+.. _api_fluid_layers_shape:
+
 shape
 -----

 ..  autofunction:: paddle.fluid.layers.shape
    :noindex:

+.. _api_fluid_layers_iou_similarity:
+
+iou_similarity
+--------------
+
+..  autofunction:: paddle.fluid.layers.iou_similarity
+    :noindex:
+
+.. _api_fluid_layers_maxout:
+
 maxout
 ------

 ..  autofunction:: paddle.fluid.layers.maxout
    :noindex:

+.. _api_fluid_layers_sigmoid:
+
 sigmoid
 -------

 ..  autofunction:: paddle.fluid.layers.sigmoid
    :noindex:

+.. _api_fluid_layers_logsigmoid:
+
 logsigmoid
 ----------

 ..  autofunction:: paddle.fluid.layers.logsigmoid
    :noindex:

+.. _api_fluid_layers_exp:
+
 exp
 ---

 ..  autofunction:: paddle.fluid.layers.exp
    :noindex:

-relu
----
-
-..  autofunction:: paddle.fluid.layers.relu
-    :noindex:
+.. _api_fluid_layers_tanh:

 tanh
 ----
@@ -874,71 +1169,87 @@ tanh
 ..  autofunction:: paddle.fluid.layers.tanh
    :noindex:

+.. _api_fluid_layers_tanh_shrink:
+
 tanh_shrink
 -----------

 ..  autofunction:: paddle.fluid.layers.tanh_shrink
    :noindex:

+.. _api_fluid_layers_softshrink:
+
 softshrink
 ----------

 ..  autofunction:: paddle.fluid.layers.softshrink
    :noindex:

+.. _api_fluid_layers_sqrt:
+
 sqrt
 ----

 ..  autofunction:: paddle.fluid.layers.sqrt
    :noindex:

+.. _api_fluid_layers_abs:
+
 abs
 ---

 ..  autofunction:: paddle.fluid.layers.abs
    :noindex:

+.. _api_fluid_layers_ceil:
+
 ceil
 ----

 ..  autofunction:: paddle.fluid.layers.ceil
    :noindex:

+.. _api_fluid_layers_floor:
+
 floor
 -----

 ..  autofunction:: paddle.fluid.layers.floor
    :noindex:

+.. _api_fluid_layers_cos:
+
 cos
 ---

 ..  autofunction:: paddle.fluid.layers.cos
    :noindex:

+.. _api_fluid_layers_sin:
+
 sin
 ---

 ..  autofunction:: paddle.fluid.layers.sin
    :noindex:

+.. _api_fluid_layers_round:
+
 round
 -----

 ..  autofunction:: paddle.fluid.layers.round
    :noindex:

+.. _api_fluid_layers_reciprocal:
+
 reciprocal
 ----------

 ..  autofunction:: paddle.fluid.layers.reciprocal
    :noindex:

-log
---
-
-..  autofunction:: paddle.fluid.layers.log
-    :noindex:
+.. _api_fluid_layers_square:

 square
 ------
@@ -946,90 +1257,120 @@ square
 ..  autofunction:: paddle.fluid.layers.square
    :noindex:

+.. _api_fluid_layers_softplus:
+
 softplus
 --------

 ..  autofunction:: paddle.fluid.layers.softplus
    :noindex:

+.. _api_fluid_layers_softsign:
+
 softsign
 --------

 ..  autofunction:: paddle.fluid.layers.softsign
    :noindex:

+.. _api_fluid_layers_brelu:
+
 brelu
 -----

 ..  autofunction:: paddle.fluid.layers.brelu
    :noindex:

+.. _api_fluid_layers_leaky_relu:
+
 leaky_relu
 ----------

 ..  autofunction:: paddle.fluid.layers.leaky_relu
    :noindex:

+.. _api_fluid_layers_soft_relu:
+
 soft_relu
 ---------

 ..  autofunction:: paddle.fluid.layers.soft_relu
    :noindex:

+.. _api_fluid_layers_elu:
+
 elu
 ---

 ..  autofunction:: paddle.fluid.layers.elu
    :noindex:

+.. _api_fluid_layers_relu6:
+
 relu6
 -----

 ..  autofunction:: paddle.fluid.layers.relu6
    :noindex:

+.. _api_fluid_layers_pow:
+
 pow
 ---

 ..  autofunction:: paddle.fluid.layers.pow
    :noindex:

+.. _api_fluid_layers_stanh:
+
 stanh
 -----

 ..  autofunction:: paddle.fluid.layers.stanh
    :noindex:

+.. _api_fluid_layers_hard_sigmoid:
+
 hard_sigmoid
 ------------

 ..  autofunction:: paddle.fluid.layers.hard_sigmoid
    :noindex:

+.. _api_fluid_layers_swish:
+
 swish
 -----

 ..  autofunction:: paddle.fluid.layers.swish
    :noindex:

+.. _api_fluid_layers_uniform_random:
+
 uniform_random
 --------------

 ..  autofunction:: paddle.fluid.layers.uniform_random
    :noindex:

+.. _api_fluid_layers_hard_shrink:
+
 hard_shrink
 -----------

 ..  autofunction:: paddle.fluid.layers.hard_shrink
    :noindex:

+.. _api_fluid_layers_cumsum:
+
 cumsum
 ------

 ..  autofunction:: paddle.fluid.layers.cumsum
    :noindex:

+.. _api_fluid_layers_thresholded_relu:
+
 thresholded_relu
 ----------------

@@ -1039,192 +1380,383 @@ thresholded_relu
 tensor
 ======

+.. _api_fluid_layers_create_tensor:
+
 create_tensor
 -------------

 ..  autofunction:: paddle.fluid.layers.create_tensor
    :noindex:

+.. _api_fluid_layers_create_parameter:
+
 create_parameter
 ----------------

 ..  autofunction:: paddle.fluid.layers.create_parameter
    :noindex:

+.. _api_fluid_layers_create_global_var:
+
 create_global_var
 -----------------

 ..  autofunction:: paddle.fluid.layers.create_global_var
    :noindex:

+.. _api_fluid_layers_cast:
+
 cast
 ----

 ..  autofunction:: paddle.fluid.layers.cast
    :noindex:

+.. _api_fluid_layers_concat:
+
 concat
 ------

 ..  autofunction:: paddle.fluid.layers.concat
    :noindex:

+.. _api_fluid_layers_sums:
+
 sums
 ----

 ..  autofunction:: paddle.fluid.layers.sums
    :noindex:

+.. _api_fluid_layers_assign:
+
 assign
 ------

 ..  autofunction:: paddle.fluid.layers.assign
    :noindex:

+.. _api_fluid_layers_fill_constant_batch_size_like:
+
 fill_constant_batch_size_like
 -----------------------------

 ..  autofunction:: paddle.fluid.layers.fill_constant_batch_size_like
    :noindex:

+.. _api_fluid_layers_fill_constant:
+
 fill_constant
 -------------

 ..  autofunction:: paddle.fluid.layers.fill_constant
    :noindex:

+.. _api_fluid_layers_argmin:
+
 argmin
 ------

 ..  autofunction:: paddle.fluid.layers.argmin
    :noindex:

+.. _api_fluid_layers_argmax:
+
 argmax
 ------

 ..  autofunction:: paddle.fluid.layers.argmax
    :noindex:

+.. _api_fluid_layers_ones:
+
 ones
 ----

 ..  autofunction:: paddle.fluid.layers.ones
    :noindex:

+.. _api_fluid_layers_zeros:
+
 zeros
 -----

 ..  autofunction:: paddle.fluid.layers.zeros
    :noindex:

+.. _api_fluid_layers_reverse:
+
+reverse
+-------
+
+..  autofunction:: paddle.fluid.layers.reverse
+    :noindex:
+
+learning_rate_scheduler
+=======================
+
+.. _api_fluid_layers_exponential_decay:
+
+exponential_decay
+-----------------
+
+..  autofunction:: paddle.fluid.layers.exponential_decay
+    :noindex:
+
+.. _api_fluid_layers_natural_exp_decay:
+
+natural_exp_decay
+-----------------
+
+..  autofunction:: paddle.fluid.layers.natural_exp_decay
+    :noindex:
+
+.. _api_fluid_layers_inverse_time_decay:
+
+inverse_time_decay
+------------------
+
+..  autofunction:: paddle.fluid.layers.inverse_time_decay
+    :noindex:
+
+.. _api_fluid_layers_polynomial_decay:
+
+polynomial_decay
+----------------
+
+..  autofunction:: paddle.fluid.layers.polynomial_decay
+    :noindex:
+
+.. _api_fluid_layers_piecewise_decay:
+
+piecewise_decay
+---------------
+
+..  autofunction:: paddle.fluid.layers.piecewise_decay
+    :noindex:
+
+.. _api_fluid_layers_noam_decay:
+
+noam_decay
+----------
+
+..  autofunction:: paddle.fluid.layers.noam_decay
+    :noindex:
+
+.. _api_fluid_layers_append_LARS:
+
+append_LARS
+-----------
+
+..  autofunction:: paddle.fluid.layers.append_LARS
+    :noindex:
+
 detection
 =========

+.. _api_fluid_layers_prior_box:
+
 prior_box
 ---------

 ..  autofunction:: paddle.fluid.layers.prior_box
    :noindex:

+.. _api_fluid_layers_multi_box_head:
+
 multi_box_head
 --------------

 ..  autofunction:: paddle.fluid.layers.multi_box_head
    :noindex:

+.. _api_fluid_layers_bipartite_match:
+
 bipartite_match
 ---------------

 ..  autofunction:: paddle.fluid.layers.bipartite_match
    :noindex:

+.. _api_fluid_layers_target_assign:
+
 target_assign
 -------------

 ..  autofunction:: paddle.fluid.layers.target_assign
    :noindex:

+.. _api_fluid_layers_detection_output:
+
 detection_output
 ----------------

 ..  autofunction:: paddle.fluid.layers.detection_output
    :noindex:

+.. _api_fluid_layers_ssd_loss:
+
 ssd_loss
 --------

 ..  autofunction:: paddle.fluid.layers.ssd_loss
    :noindex:

+.. _api_fluid_layers_detection_map:
+
 detection_map
 -------------

 ..  autofunction:: paddle.fluid.layers.detection_map
    :noindex:

+.. _api_fluid_layers_iou_similarity:
+
 iou_similarity
 --------------

 ..  autofunction:: paddle.fluid.layers.iou_similarity
    :noindex:

+.. _api_fluid_layers_box_coder:
+
 box_coder
 ---------

 ..  autofunction:: paddle.fluid.layers.box_coder
    :noindex:

-learning_rate_scheduler
-=======================
+metric_op
+=========

-exponential_decay
-----------------
+.. _api_fluid_layers_accuracy:

-..  autofunction:: paddle.fluid.layers.exponential_decay
+accuracy
+--------
+
+..  autofunction:: paddle.fluid.layers.accuracy
    :noindex:

-natural_exp_decay
-----------------
+.. _api_fluid_layers_auc:

-..  autofunction:: paddle.fluid.layers.natural_exp_decay
+auc
+---
+
+..  autofunction:: paddle.fluid.layers.auc
    :noindex:

-inverse_time_decay
------------------
+tensor
+======

-..  autofunction:: paddle.fluid.layers.inverse_time_decay
+.. _api_fluid_layers_create_tensor:
+
+create_tensor
+-------------
+
+..  autofunction:: paddle.fluid.layers.create_tensor
    :noindex:

-polynomial_decay
+.. _api_fluid_layers_create_parameter:
+
+create_parameter
 ----------------

-..  autofunction:: paddle.fluid.layers.polynomial_decay
+..  autofunction:: paddle.fluid.layers.create_parameter
    :noindex:

-piecewise_decay
---------------
+.. _api_fluid_layers_create_global_var:

-..  autofunction:: paddle.fluid.layers.piecewise_decay
+create_global_var
+-----------------
+
+..  autofunction:: paddle.fluid.layers.create_global_var
    :noindex:

-noam_decay
----------
+.. _api_fluid_layers_cast:

-..  autofunction:: paddle.fluid.layers.noam_decay
+cast
+----
+
+..  autofunction:: paddle.fluid.layers.cast
    :noindex:

-metric
-======
+.. _api_fluid_layers_concat:

-accuracy
--------
+concat
+------

-..  autofunction:: paddle.fluid.layers.accuracy
+..  autofunction:: paddle.fluid.layers.concat
    :noindex:

-auc
---
+.. _api_fluid_layers_sums:

-..  autofunction:: paddle.fluid.layers.auc
+sums
+----
+
+..  autofunction:: paddle.fluid.layers.sums
+    :noindex:
+
+.. _api_fluid_layers_assign:
+
+assign
+------
+
+..  autofunction:: paddle.fluid.layers.assign
+    :noindex:
+
+.. _api_fluid_layers_fill_constant_batch_size_like:
+
+fill_constant_batch_size_like
+-----------------------------
+
+..  autofunction:: paddle.fluid.layers.fill_constant_batch_size_like
+    :noindex:
+
+.. _api_fluid_layers_fill_constant:
+
+fill_constant
+-------------
+
+..  autofunction:: paddle.fluid.layers.fill_constant
+    :noindex:
+
+.. _api_fluid_layers_argmin:
+
+argmin
+------
+
+..  autofunction:: paddle.fluid.layers.argmin
+    :noindex:
+
+.. _api_fluid_layers_argmax:
+
+argmax
+------
+
+..  autofunction:: paddle.fluid.layers.argmax
+    :noindex:
+
+.. _api_fluid_layers_ones:
+
+ones
+----
+
+..  autofunction:: paddle.fluid.layers.ones
+    :noindex:
+
+.. _api_fluid_layers_zeros:
+
+zeros
+-----
+
+..  autofunction:: paddle.fluid.layers.zeros
+    :noindex:
+
+.. _api_fluid_layers_reverse:
+
+reverse
+-------
+
+..  autofunction:: paddle.fluid.layers.reverse
    :noindex:

--- a/doc/fluid/api/metrics.rst
+++ b/doc/fluid/api/metrics.rst
 ..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

-=======
-metrics
-=======
+=============
+fluid.metrics
+=============
+
+.. _api_fluid_metrics_MetricBase:

 MetricBase
 ----------
@@ -12,6 +14,8 @@ MetricBase
    :members:
    :noindex:

+.. _api_fluid_metrics_CompositeMetric:
+
 CompositeMetric
 ---------------

@@ -19,6 +23,26 @@ CompositeMetric
    :members:
    :noindex:

+.. _api_fluid_metrics_Precision:
+
+Precision
+---------
+
+..  autoclass:: paddle.fluid.metrics.Precision
+    :members:
+    :noindex:
+
+.. _api_fluid_metrics_Recall:
+
+Recall
+------
+
+..  autoclass:: paddle.fluid.metrics.Recall
+    :members:
+    :noindex:
+
+.. _api_fluid_metrics_Accuracy:
+
 Accuracy
 --------

@@ -26,6 +50,8 @@ Accuracy
    :members:
    :noindex:

+.. _api_fluid_metrics_ChunkEvaluator:
+
 ChunkEvaluator
 --------------

@@ -33,6 +59,8 @@ ChunkEvaluator
    :members:
    :noindex:

+.. _api_fluid_metrics_EditDistance:
+
 EditDistance
 ------------

@@ -40,6 +68,8 @@ EditDistance
    :members:
    :noindex:

+.. _api_fluid_metrics_DetectionMAP:
+
 DetectionMAP
 ------------

@@ -47,6 +77,8 @@ DetectionMAP
    :members:
    :noindex:

+.. _api_fluid_metrics_Auc:
+
 Auc
 ---


--- a/doc/fluid/api/nets.rst
+++ b/doc/fluid/api/nets.rst
 ..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

-====
-nets
-====
+==========
+fluid.nets
+==========
+
+.. _api_fluid_nets_simple_img_conv_pool:

 simple_img_conv_pool
 --------------------
@@ -11,18 +13,24 @@ simple_img_conv_pool
 ..  autofunction:: paddle.fluid.nets.simple_img_conv_pool
    :noindex:

+.. _api_fluid_nets_sequence_conv_pool:
+
 sequence_conv_pool
 ------------------

 ..  autofunction:: paddle.fluid.nets.sequence_conv_pool
    :noindex:

+.. _api_fluid_nets_glu:
+
 glu
 ---

 ..  autofunction:: paddle.fluid.nets.glu
    :noindex:

+.. _api_fluid_nets_scaled_dot_product_attention:
+
 scaled_dot_product_attention
 ----------------------------


--- a/doc/fluid/api/optimizer.rst
+++ b/doc/fluid/api/optimizer.rst
 ..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

-=========
-optimizer
-=========
+===============
+fluid.optimizer
+===============
+
+.. _api_fluid_optimizer_SGD:

 SGD
 ---
@@ -12,6 +14,8 @@ SGD
    :members:
    :noindex:

+.. _api_fluid_optimizer_Momentum:
+
 Momentum
 --------

@@ -19,6 +23,8 @@ Momentum
    :members:
    :noindex:

+.. _api_fluid_optimizer_Adagrad:
+
 Adagrad
 -------

@@ -26,6 +32,8 @@ Adagrad
    :members:
    :noindex:

+.. _api_fluid_optimizer_Adam:
+
 Adam
 ----

@@ -33,6 +41,8 @@ Adam
    :members:
    :noindex:

+.. _api_fluid_optimizer_Adamax:
+
 Adamax
 ------

@@ -40,6 +50,8 @@ Adamax
    :members:
    :noindex:

+.. _api_fluid_optimizer_DecayedAdagrad:
+
 DecayedAdagrad
 --------------

@@ -47,6 +59,17 @@ DecayedAdagrad
    :members:
    :noindex:

+.. _api_fluid_optimizer_Ftrl:
+
+Ftrl
+----
+
+..  autoclass:: paddle.fluid.optimizer.Ftrl
+    :members:
+    :noindex:
+
+.. _api_fluid_optimizer_SGDOptimizer:
+
 SGDOptimizer
 ------------

@@ -54,6 +77,8 @@ SGDOptimizer
    :members:
    :noindex:

+.. _api_fluid_optimizer_MomentumOptimizer:
+
 MomentumOptimizer
 -----------------

@@ -61,6 +86,8 @@ MomentumOptimizer
    :members:
    :noindex:

+.. _api_fluid_optimizer_AdagradOptimizer:
+
 AdagradOptimizer
 ----------------

@@ -68,6 +95,8 @@ AdagradOptimizer
    :members:
    :noindex:

+.. _api_fluid_optimizer_AdamOptimizer:
+
 AdamOptimizer
 -------------

@@ -75,6 +104,8 @@ AdamOptimizer
    :members:
    :noindex:

+.. _api_fluid_optimizer_AdamaxOptimizer:
+
 AdamaxOptimizer
 ---------------

@@ -82,6 +113,8 @@ AdamaxOptimizer
    :members:
    :noindex:

+.. _api_fluid_optimizer_DecayedAdagradOptimizer:
+
 DecayedAdagradOptimizer
 -----------------------

@@ -89,6 +122,8 @@ DecayedAdagradOptimizer
    :members:
    :noindex:

+.. _api_fluid_optimizer_RMSPropOptimizer:
+
 RMSPropOptimizer
 ----------------

@@ -96,6 +131,17 @@ RMSPropOptimizer
    :members:
    :noindex:

+.. _api_fluid_optimizer_FtrlOptimizer:
+
+FtrlOptimizer
+-------------
+
+..  autoclass:: paddle.fluid.optimizer.FtrlOptimizer
+    :members:
+    :noindex:
+
+.. _api_fluid_optimizer_Adadelta:
+
 Adadelta
 --------

@@ -103,6 +149,8 @@ Adadelta
    :members:
    :noindex:

+.. _api_fluid_optimizer_ModelAverage:
+
 ModelAverage
 ------------

@@ -110,6 +158,8 @@ ModelAverage
    :members:
    :noindex:

+.. _api_fluid_optimizer_Optimizer:
+
 Optimizer
 ---------

@@ -117,3 +167,12 @@ Optimizer
    :members:
    :noindex:

+.. _api_fluid_optimizer_RMSPropOptimizer:
+
+RMSPropOptimizer
+----------------
+
+..  autoclass:: paddle.fluid.optimizer.RMSPropOptimizer
+    :members:
+    :noindex:
+
--- a/doc/fluid/api/param_attr.rst
+++ b/doc/fluid/api/param_attr.rst
 ..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

-==========
-param_attr
-==========
+================
+fluid.param_attr
+================
+
+.. _api_fluid_param_attr_ParamAttr:

 ParamAttr
 ---------
@@ -12,6 +14,8 @@ ParamAttr
    :members:
    :noindex:

+.. _api_fluid_param_attr_WeightNormParamAttr:
+
 WeightNormParamAttr
 -------------------


--- a/doc/fluid/api/profiler.rst
+++ b/doc/fluid/api/profiler.rst
 ..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

-========
-profiler
-========
+==============
+fluid.profiler
+==============
+
+.. _api_fluid_profiler_cuda_profiler:

 cuda_profiler
 -------------
@@ -11,24 +13,32 @@ cuda_profiler
 ..  autofunction:: paddle.fluid.profiler.cuda_profiler
    :noindex:

+.. _api_fluid_profiler_reset_profiler:
+
 reset_profiler
 --------------

 ..  autofunction:: paddle.fluid.profiler.reset_profiler
    :noindex:

+.. _api_fluid_profiler_profiler:
+
 profiler
 --------

 ..  autofunction:: paddle.fluid.profiler.profiler
    :noindex:

+.. _api_fluid_profiler_start_profiler:
+
 start_profiler
 --------------

 ..  autofunction:: paddle.fluid.profiler.start_profiler
    :noindex:

+.. _api_fluid_profiler_stop_profiler:
+
 stop_profiler
 -------------


--- a/doc/fluid/api/recordio_writer.rst
+++ b/doc/fluid/api/recordio_writer.rst
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+=====================
+fluid.recordio_writer
+=====================
+
+.. _api_fluid_recordio_writer_convert_reader_to_recordio_file:
+
+convert_reader_to_recordio_file
+-------------------------------
+
+..  autofunction:: paddle.fluid.recordio_writer.convert_reader_to_recordio_file
+    :noindex:
+
+.. _api_fluid_recordio_writer_convert_reader_to_recordio_files:
+
+convert_reader_to_recordio_files
+--------------------------------
+
+..  autofunction:: paddle.fluid.recordio_writer.convert_reader_to_recordio_files
+    :noindex:
+
--- a/doc/fluid/api/regularizer.rst
+++ b/doc/fluid/api/regularizer.rst
 ..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

-===========
-regularizer
-===========
+=================
+fluid.regularizer
+=================
+
+.. _api_fluid_regularizer_append_regularization_ops:

 append_regularization_ops
 -------------------------
@@ -11,12 +13,7 @@ append_regularization_ops
 ..  autofunction:: paddle.fluid.regularizer.append_regularization_ops
    :noindex:

-WeightDecayRegularizer
----------------------
-
-..  autoclass:: paddle.fluid.regularizer.WeightDecayRegularizer
-    :members:
-    :noindex:
+.. _api_fluid_regularizer_L1Decay:

 L1Decay
 -------
@@ -25,6 +22,8 @@ L1Decay
    :members:
    :noindex:

+.. _api_fluid_regularizer_L2Decay:
+
 L2Decay
 -------

@@ -32,6 +31,8 @@ L2Decay
    :members:
    :noindex:

+.. _api_fluid_regularizer_L1DecayRegularizer:
+
 L1DecayRegularizer
 ------------------

@@ -39,6 +40,8 @@ L1DecayRegularizer
    :members:
    :noindex:

+.. _api_fluid_regularizer_L2DecayRegularizer:
+
 L2DecayRegularizer
 ------------------


--- a/doc/fluid/api/transpiler.rst
+++ b/doc/fluid/api/transpiler.rst
 ..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

-==========
-transpiler
-==========
+================
+fluid.transpiler
+================
+
+.. _api_fluid_transpiler_DistributeTranspiler:

 DistributeTranspiler
 --------------------
@@ -12,12 +14,7 @@ DistributeTranspiler
    :members:
    :noindex:

-InferenceTranspiler
-------------------
-
-..  autoclass:: paddle.fluid.transpiler.InferenceTranspiler
-    :members:
-    :noindex:
+.. _api_fluid_transpiler_memory_optimize:

 memory_optimize
 ---------------
@@ -25,12 +22,16 @@ memory_optimize
 ..  autofunction:: paddle.fluid.transpiler.memory_optimize
    :noindex:

+.. _api_fluid_transpiler_release_memory:
+
 release_memory
 --------------

 ..  autofunction:: paddle.fluid.transpiler.release_memory
    :noindex:

+.. _api_fluid_transpiler_HashName:
+
 HashName
 --------

@@ -38,9 +39,12 @@ HashName
    :members:
    :noindex:

+.. _api_fluid_transpiler_RoundRobin:
+
 RoundRobin
 ----------

 ..  autoclass:: paddle.fluid.transpiler.RoundRobin
    :members:
    :noindex:
+
--- a/doc/v2/faq/build_and_install/index_cn.rst
+++ b/doc/v2/faq/build_and_install/index_cn.rst
@@ -213,3 +213,12 @@ virtualenv本身也是Python的一个包，可以用pip进行安装：
 保存并关闭文件。

 这样，每次打开终端时就会自动启动名为‘paddle’的Python环境了。
+
+10. 通过pip安装的PaddlePaddle在  :code:`import paddle.fluid` 报找不到 :code:`libmkldnn.so` 或 :code:`libmklml_intel.so`
+------------------------------------------------------------------------------------------------------------------------
+出现这种问题的原因是在导入 :code:`paddle.fluid` 时需要加载 :code:`libmkldnn.so` 和 :code:`libmklml_intel.so`，
+但是系统没有找到该文件。一般通过pip安装PaddlePaddle时会将 :code:`libmkldnn.so` 和 :code:`libmklml_intel.so`
+拷贝到 :code:`/usr/local/lib` 路径下，所以解决办法是将该路径加到 :code:`LD_LIBRARY_PATH` 环境变量下，
+即： :code:`export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH` 。
+
+**注意**：如果是在虚拟环境中安装PaddlePaddle， :code:`libmkldnn.so` 和 :code:`libmklml_intel.so` 可能不在 :code:`/usr/local/lib` 路径下。
\ No newline at end of file
--- a/paddle/fluid/framework/data_layout_transform.cc
+++ b/paddle/fluid/framework/data_layout_transform.cc
@@ -147,10 +147,9 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
                 "Input tensor type is not supported: ", in.type().name());
  memory::data_type out_type = in_type;

-  memory::format in_format =
-      in_tz.size() == 2 ? memory::format::nc : in.format();
-  memory::format out_format =
-      out_tz.size() == 2 ? memory::format::nc : ToMKLDNNFormat(out_layout);
+  auto in_format = MKLDNNFormatForSize(in_tz.size(), in.format());
+  auto out_format =
+      MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout));

  void* in_data = GetDataFromTensor(in, in_type);


--- a/paddle/fluid/framework/data_layout_transform.h
+++ b/paddle/fluid/framework/data_layout_transform.h
@@ -61,6 +61,13 @@ inline MKLDNNDataType ToMKLDNNDataType(const std::type_index type) {
  if (iter != dict.end()) return iter->second;
  return MKLDNNDataType::data_undef;
 }
+
+inline MKLDNNFormat MKLDNNFormatForSize(size_t dims_size,
+                                        MKLDNNFormat default_format) {
+  return (dims_size == 1
+              ? mkldnn::memory::format::x
+              : dims_size == 2 ? mkldnn::memory::format::nc : default_format);
+}
 #endif

 void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,

--- a/paddle/fluid/framework/data_transform.cc
+++ b/paddle/fluid/framework/data_transform.cc
@@ -47,9 +47,13 @@ void DataTransform(const OpKernelType& expected_kernel_type,
 #ifdef PADDLE_WITH_MKLDNN
        // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel
        // Just set layout/format. No real transform occur
+
+        auto out_format =
+            MKLDNNFormatForSize(in.dims().size(), ToMKLDNNFormat(lin));
+
        out.ShareDataWith(input_tensor);
        out.set_layout(DataLayout::kMKLDNN);
-        out.set_format(ToMKLDNNFormat(lin));
+        out.set_format(out_format);
 #endif
      } else {
        // Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel

--- a/paddle/fluid/framework/details/broadcast_op_handle.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle.cc
@@ -103,13 +103,6 @@ void BroadcastOpHandle::RunImpl() {
          });
    }

-    // FIXME(zcd): a temporary fix for some language model that has sparse
-    // parameter.
-    bool use_mutex = true;
-    if (in_var->IsType<paddle::framework::SelectedRows>()) {
-      use_mutex = false;
-    }
-    if (use_mutex) {
    this->RunAndRecordEvent([&] {
      {
        platform::NCCLGroupGuard guard;
@@ -127,26 +120,6 @@ void BroadcastOpHandle::RunImpl() {
            &VariableVisitor::GetMutableTensor(out_var));
      }
    });
-    } else {
-      this->RunAndRecordEventNoMutex([&] {
-        {
-          platform::NCCLGroupGuard guard;
-          for (auto &call : broadcast_calls) {
-            call();
-          }
-        }
-
-        if (!out_handle->IsTheSameVar(*in_var_handle)) {
-          auto out_var = var_scopes.at(in_var_handle->scope_idx_)
-                             ->FindVar(out_var_handles[0]->name_);
-          paddle::framework::TensorCopy(
-              in_tensor, in_var_handle->place_,
-              *(dev_ctxes_.at(in_var_handle->place_)),
-              &VariableVisitor::GetMutableTensor(out_var));
-        }
-      });
-    }
-
 #else
    PADDLE_THROW("CUDA is not enabled.");
 #endif

--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -470,7 +470,7 @@ void MultiDevSSAGraphBuilder::ConnectOp(SSAGraph *result, OpHandleBase *op,
 void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result,
                                                const OpDesc &op) const {
  int op_dev_id = -1;
-  if (op.Type() == "split_byref") {
+  if (op.Type() == "split_byref" || op.Type() == "split_selected_rows") {
    op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]);
    if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
      op_dev_id = GetAppropriateDeviceID(op.InputArgumentNames());

--- a/paddle/fluid/framework/details/multi_devices_graph_builder.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h
@@ -47,7 +47,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
 #endif

  std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const override;
-  int GetVarDeviceID(const std::string &varname) const;
+  int GetVarDeviceID(const std::string &varname) const override;

 private:
  void CreateOpHandleIOs(SSAGraph *result, const OpDesc &op,

--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -11,8 +11,8 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-
 #include "paddle/fluid/framework/details/op_handle_base.h"
+#include <map>

 namespace paddle {
 namespace framework {
@@ -122,34 +122,16 @@ void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) {
 #ifdef PADDLE_WITH_CUDA
  if (!events_.empty()) {  // Use event
    std::function<void()> method = callback;
-
+    // NOTE(zcd): the device contexts must be ordered here because RecordEvent
+    // uses a mutex to ensure thread safety.
+    std::map<platform::DeviceContext *, platform::Place> ordered_ctxes;
    for (auto &p : dev_ctxes_) {
-      method = [method, p, this]() {
-        static_cast<platform::CUDADeviceContext *>(p.second)->RecordEvent(
-            events_.at(boost::get<platform::CUDAPlace>(p.first).device),
-            method);
-      };
-    }
-    method();
-  } else {
-#endif
-    callback();
-#ifdef PADDLE_WITH_CUDA
+      ordered_ctxes.emplace(p.second, p.first);
    }
-#endif
-}
-
-void OpHandleBase::RunAndRecordEventNoMutex(
-    const std::function<void()> &callback) {
-#ifdef PADDLE_WITH_CUDA
-  if (!events_.empty()) {  // Use event
-    std::function<void()> method = callback;
-
-    for (auto &p : dev_ctxes_) {
+    for (auto &p : ordered_ctxes) {
      method = [method, p, this]() {
-        static_cast<platform::CUDADeviceContext *>(p.second)
-            ->RecordEventNoMutex(
-                events_.at(boost::get<platform::CUDAPlace>(p.first).device),
+        static_cast<platform::CUDADeviceContext *>(p.first)->RecordEvent(
+            events_.at(boost::get<platform::CUDAPlace>(p.second).device),
            method);
      };
    }

--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -85,10 +85,6 @@ class OpHandleBase {
 protected:
  void RunAndRecordEvent(const std::function<void()> &callback);

-  // FIXME(zcd): A temporary fix for some language model that has sparse
-  // parameter.
-  void RunAndRecordEventNoMutex(const std::function<void()> &callback);
-
  void RunAndRecordEvent(platform::Place p,
                         const std::function<void()> &callback);


--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@@ -80,9 +80,7 @@ void ReduceOpHandle::RunImpl() {
  }

  if (pre_in_var->IsType<framework::SelectedRows>()) {
-    // FIXME(zcd): A temporary fix for some language model that has sparse
-    // parameter.
-    this->RunAndRecordEventNoMutex([&] {
+    this->RunAndRecordEvent([&] {
      std::vector<const SelectedRows *> in_selected_rows =
          GetInputValues<SelectedRows>(in_var_handles, var_scopes);
      GatherSelectedRows(in_selected_rows, in_places, dev_ctxes_, t_out_p,

--- a/paddle/fluid/framework/framework.proto
+++ b/paddle/fluid/framework/framework.proto
@@ -27,6 +27,7 @@ enum AttrType {
  BOOLEANS = 7;
  BLOCK = 8;
  LONG = 9;
+  BLOCKS = 10;
 }

 // OpDesc describes an instance of a C++ framework::OperatorBase
@@ -46,6 +47,7 @@ message OpDesc {
    repeated bool bools = 11;
    optional int32 block_idx = 12;
    optional int64 l = 13;
+    repeated int32 blocks_idx = 14;
  };

  message Var {

--- a/paddle/fluid/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
@@ -51,8 +51,6 @@ std::ostream &operator<<(std::ostream &os, const LoD &lod) {
 }

 std::ostream &operator<<(std::ostream &os, const LoDTensor &t) {
-  PADDLE_ENFORCE(t.type().hash_code() == typeid(float).hash_code());
-
  if (!platform::is_cpu_place(t.place())) {
    LoDTensor tt;
    framework::TensorCopy(t, platform::CPUPlace(), &tt);
@@ -70,7 +68,13 @@ std::ostream &operator<<(std::ostream &os, const LoDTensor &t) {
  // only print first ten elements
  int64_t size = t.numel() < 10 ? t.numel() : 10;
  for (int64_t i = 0; i < size; ++i) {
+    if (t.type().hash_code() == typeid(float).hash_code()) {
      os << t.data<float>()[i] << " ";
+    } else if (t.type().hash_code() == typeid(int64_t).hash_code()) {
+      os << t.data<int64_t>()[i] << " ";
+    } else {
+      PADDLE_THROW("LoDTensor data type not in [float, int64_t]");
+    }
  }

  return os;

--- a/paddle/fluid/framework/lod_tensor_test.cc
+++ b/paddle/fluid/framework/lod_tensor_test.cc
@@ -26,6 +26,20 @@
 namespace paddle {
 namespace framework {

+TEST(LoD, PrintLoDTensor) {
+  LoDTensor tensor1;
+  tensor1.mutable_data<float>(platform::CPUPlace());
+  tensor1.data<float>()[0] = 0.2;
+  tensor1.data<float>()[1] = 0.5;
+  LOG(INFO) << tensor1;
+
+  LoDTensor tensor2;
+  tensor2.mutable_data<int64_t>(platform::CPUPlace());
+  tensor2.data<int64_t>()[0] = 1;
+  tensor2.data<int64_t>()[1] = 2;
+  LOG(INFO) << tensor2;
+}
+
 TEST(LoD, data) {
  LoD lod{{0, 1, 2}};
  lod.push_back({0, 2, 4, 5});
@@ -37,7 +51,7 @@ TEST(LoD, data) {
  }
 }

-TEST(LodExpand, test) {
+TEST(LoD, ExpandLoD) {
  LoD lod{{0, 2}};
  LoDTensor tensor;
  tensor.set_lod(lod);

--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -211,6 +211,12 @@ void OpDesc::SetBlockAttr(const std::string &name, BlockDesc *block) {
  need_update_ = true;
 }

+void OpDesc::SetBlocksAttr(const std::string &name,
+                           std::vector<BlockDesc *> blocks) {
+  this->attrs_[name] = blocks;
+  need_update_ = true;
+}
+
 void OpDesc::SetAttrMap(
    const std::unordered_map<std::string, Attribute> &attr_map) {
  attrs_ = attr_map;
@@ -305,6 +311,13 @@ struct SetAttrDescVisitor : public boost::static_visitor<void> {
  void operator()(const std::vector<bool> &v) const {
    VectorToRepeated(v, attr_->mutable_bools());
  }
+  void operator()(const std::vector<BlockDesc *> &v) const {
+    std::vector<int> blocks_idx;
+    for (auto blk : v) {
+      blocks_idx.push_back(blk->ID());
+    }
+    VectorToRepeated(blocks_idx, attr_->mutable_blocks_idx());
+  }
  void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->ID()); }
  void operator()(int64_t v) const { attr_->set_l(v); }
  void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); }

--- a/paddle/fluid/framework/op_desc.h
+++ b/paddle/fluid/framework/op_desc.h
@@ -77,6 +77,8 @@ class OpDesc {

  void SetBlockAttr(const std::string &name, BlockDesc *block);

+  void SetBlocksAttr(const std::string &name, std::vector<BlockDesc *> blocks);
+
  Attribute GetAttr(const std::string &name) const;

  Attribute GetNullableAttr(const std::string &name) const;

--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -121,7 +121,7 @@ ParallelExecutor::ParallelExecutor(
 #endif
  }

-  builder_ = std::move(builder_factory.Create());
+  builder_ = builder_factory.Create();
  member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
      exec_strategy, member_->local_scopes_, places,
      builder_->Build(main_program)));

--- a/paddle/fluid/framework/type_defs.h
+++ b/paddle/fluid/framework/type_defs.h
@@ -35,7 +35,8 @@ using VariableNameMap = std::map<std::string, std::vector<std::string>>;
 using Attribute =
    boost::variant<boost::blank, int, float, std::string, std::vector<int>,
                   std::vector<float>, std::vector<std::string>, bool,
-                   std::vector<bool>, BlockDesc*, int64_t>;
+                   std::vector<bool>, BlockDesc*, int64_t,
+                   std::vector<BlockDesc*>>;

 using AttributeMap = std::unordered_map<std::string, Attribute>;


--- a/paddle/fluid/operators/assign_value_op.cc
+++ b/paddle/fluid/operators/assign_value_op.cc
@@ -70,6 +70,7 @@ $$Out = values$$

 namespace ops = paddle::operators;

-REGISTER_OPERATOR(assign_value, ops::AssignValueOp, ops::AssignValueOpMaker);
+REGISTER_OPERATOR(assign_value, ops::AssignValueOp, ops::AssignValueOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
 REGISTER_OP_CPU_KERNEL(assign_value, ops::AssignValueKernel<int>,
                       ops::AssignValueKernel<float>);
--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
@@ -18,6 +18,7 @@ limitations under the License. */

 #include <limits>

+#include "glog/logging.h"  // For VLOG
 #include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/operators/distributed/request_handler.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -75,6 +76,9 @@ bool GRPCClient::AsyncSendVar(const std::string& ep,
    var_h.scope = p_scope;
    var_h.name = var_name_val;
    var_h.ctx = p_ctx;
+    var_h.method = "Send";
+
+    VLOG(3) << var_h.String() << " begin";

    // stub context
    SendProcessor* s = new SendProcessor(ch);
@@ -129,6 +133,9 @@ bool GRPCClient::AsyncGetVar(const std::string& ep,
    var_h.scope = p_scope;
    var_h.name = var_name_val;
    var_h.ctx = p_ctx;
+    var_h.method = "Get";
+
+    VLOG(3) << var_h.String() << " begin";

    // stub context
    GetProcessor* s = new GetProcessor(ch);
@@ -172,6 +179,9 @@ bool GRPCClient::AsyncPrefetchVar(const std::string& ep,
    var_h.scope = p_scope;
    var_h.name = out_var_name_val;
    var_h.ctx = p_ctx;
+    var_h.method = "Prefetch";
+
+    VLOG(3) << var_h.String() << " begin";

    // stub context
    GetProcessor* s = new GetProcessor(ch);
@@ -243,10 +253,11 @@ void GRPCClient::Proceed() {
    GPR_ASSERT(ok);
    PADDLE_ENFORCE(c);
    if (c->status_.ok()) {
+      VLOG(3) << c->var_h_.String() << " process";
      c->Process();
    } else {
-      LOG(FATAL) << "var: " << c->var_h_.String()
-                 << " grpc error:" << c->status_.error_message();
+      LOG(FATAL) << c->var_h_.String()
+                 << " meets grpc error:" << c->status_.error_message();
    }
    delete c;
    {
@@ -258,14 +269,15 @@ void GRPCClient::Proceed() {
 }

 std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) {
-  // TODO(Yancey1989): make grpc client completely thread-safe
  std::lock_guard<std::mutex> guard(chan_mutex_);
  auto it = channels_.find(ep);
  if (it != channels_.end()) {
    return it->second;
  }

+  // Channel configurations:
  grpc::ChannelArguments args;
+  args.SetInt(GRPC_ARG_MAX_RECONNECT_BACKOFF_MS, 2000);
  args.SetCompressionAlgorithm(GRPC_COMPRESS_NONE);
  args.SetMaxSendMessageSize(std::numeric_limits<int>::max());
  args.SetMaxReceiveMessageSize(std::numeric_limits<int>::max());

--- a/paddle/fluid/operators/distributed/grpc_client.h
+++ b/paddle/fluid/operators/distributed/grpc_client.h
@@ -47,14 +47,18 @@ namespace operators {
 namespace distributed {

 struct VarHandle {
+  // RPC endpoint.
  std::string ep;
  const platform::DeviceContext* ctx;
  const framework::Scope* scope;
+  // Variable name.
  std::string name;
+  // RPC method name.
+  std::string method;

  std::string String() const {
    std::ostringstream s;
-    s << "name:[" << name << "] ep:[" << ep << "]";
+    s << method << " name:[" << name << "], ep:[" << ep << "]";
    return s.str();
  }
 };
@@ -72,6 +76,7 @@ class BaseProcessor {
  virtual void Prepare(const VarHandle& var_info, int64_t time_out) {
    context_.reset(new grpc::ClientContext());
    var_h_ = var_info;
+    context_->set_wait_for_ready(true);

    std::chrono::system_clock::time_point deadline =
        std::chrono::system_clock::now() + std::chrono::milliseconds(time_out);
@@ -81,6 +86,7 @@ class BaseProcessor {

  virtual void Prepare(int64_t time_out) {
    context_.reset(new grpc::ClientContext());
+    context_->set_wait_for_ready(true);

    std::chrono::system_clock::time_point deadline =
        std::chrono::system_clock::now() + std::chrono::milliseconds(time_out);
@@ -172,26 +178,24 @@ class GRPCClient : public RPCClient {

  bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx,
                    const framework::Scope& scope, const std::string& var_name,
-                    int64_t time_out = RPCClient::rpc_time_out) override;
+                    int64_t time_out = FLAGS_grpc_deadline) override;

  bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx,
                   const framework::Scope& scope, const std::string& var_name,
-                   int64_t time_out = RPCClient::rpc_time_out) override;
+                   int64_t time_out = FLAGS_grpc_deadline) override;

  bool AsyncPrefetchVar(const std::string& ep,
                        const platform::DeviceContext& ctx,
                        const framework::Scope& scope,
                        const std::string& in_var_name,
                        const std::string& out_var_name,
-                        int64_t time_out = RPCClient::rpc_time_out) override;
+                        int64_t time_out = FLAGS_grpc_deadline) override;

-  void AsyncSendBatchBarrier(
-      const std::string& ep,
-      int64_t time_out = RPCClient::rpc_time_out) override;
+  void AsyncSendBatchBarrier(const std::string& ep,
+                             int64_t time_out = FLAGS_grpc_deadline) override;

-  void AsyncSendFetchBarrier(
-      const std::string& ep,
-      int64_t time_out = RPCClient::rpc_time_out) override;
+  void AsyncSendFetchBarrier(const std::string& ep,
+                             int64_t time_out = FLAGS_grpc_deadline) override;

  void Wait() override;

@@ -207,7 +211,7 @@ class GRPCClient : public RPCClient {
  void Proceed();

  void AsyncSendComplete(const std::string& ep,
-                         int64_t time_out = RPCClient::rpc_time_out);
+                         int64_t time_out = FLAGS_grpc_deadline);

  std::shared_ptr<grpc::Channel> GetChannel(const std::string& ep);


--- a/paddle/fluid/operators/distributed/grpc_server.cc
+++ b/paddle/fluid/operators/distributed/grpc_server.cc
@@ -41,6 +41,19 @@ class RequestBase {
  virtual ~RequestBase() {}
  virtual void Process() = 0;

+  std::string Status2String(const std::string& method) {
+    std::string status = "Process";
+    if (status_ == FINISH) {
+      status = "Finish";
+    }
+
+    std::ostringstream s;
+    s << method << " name:[" << GetReqName() << "]"
+      << ", ep:[" << ctx_.peer() << "]"
+      << " " << status << " using req_id:" << req_id_;
+    return s.str();
+  }
+
  CallStatus Status() const {
    std::lock_guard<std::mutex> l(status_mu_);
    return status_;
@@ -84,7 +97,7 @@ class RequestSend final : public RequestBase {

  void Process() override {
    std::string varname = GetReqName();
-    VLOG(3) << "RequestSend var_name:" << varname;
+    VLOG(4) << "RequestSend var_name:" << varname;

    auto scope = request_->GetMutableLocalScope();
    auto invar = request_->GetVar();
@@ -119,7 +132,7 @@ class RequestGet final : public RequestBase {
  void Process() override {
    // proc request.
    std::string varname = request_.varname();
-    VLOG(3) << "RequestGet " << varname;
+    VLOG(4) << "RequestGet " << varname;

    auto scope = request_handler_->scope();
    auto invar = scope->FindVar(varname);
@@ -165,7 +178,7 @@ class RequestPrefetch final : public RequestBase {
    // prefetch process...
    std::string in_var_name = request_->Varname();
    std::string out_var_name = request_->OutVarname();
-    VLOG(3) << "RequestPrefetch, in_var_name: " << in_var_name
+    VLOG(4) << "RequestPrefetch, in_var_name: " << in_var_name
            << " out_var_name: " << out_var_name;

    auto scope = request_->GetMutableLocalScope();
@@ -188,10 +201,10 @@ class RequestPrefetch final : public RequestBase {
 };

 void AsyncGRPCServer::WaitServerReady() {
-  VLOG(3) << "AsyncGRPCServer is wait server ready";
+  VLOG(4) << "AsyncGRPCServer is wait server ready";
  std::unique_lock<std::mutex> lock(this->mutex_ready_);
  condition_ready_.wait(lock, [=] { return this->ready_ == 1; });
-  VLOG(3) << "AsyncGRPCServer WaitSeverReady";
+  VLOG(4) << "AsyncGRPCServer WaitSeverReady";
 }

 void AsyncGRPCServer::StartServer() {
@@ -230,7 +243,7 @@ void AsyncGRPCServer::StartServer() {
    for (int i = 0; i < threadnum; i++) {
      rpc_threads_[rpc_name].emplace_back(new std::thread(std::bind(
          &AsyncGRPCServer::HandleRequest, this, cq.get(), rpc_name, f)));
-      VLOG(3) << t.first << " creates threads!";
+      VLOG(4) << t.first << " creates threads!";
    }
  }

@@ -247,7 +260,7 @@ void AsyncGRPCServer::StartServer() {
    auto& threads = t.second;
    for (size_t i = 0; i < threads.size(); ++i) {
      threads[i]->join();
-      VLOG(3) << t.first << " threads ends!";
+      VLOG(4) << t.first << " threads ends!";
    }
  }
 }
@@ -255,7 +268,7 @@ void AsyncGRPCServer::StartServer() {
 void AsyncGRPCServer::ShutdownQueue() {
  for (auto& t : rpc_cq_) {
    t.second->Shutdown();
-    VLOG(3) << t.first << " shutdown!";
+    VLOG(4) << t.first << " queue shutdown!";
  }
 }

@@ -264,7 +277,7 @@ void AsyncGRPCServer::ShutDownImpl() {
  is_shut_down_ = true;
  ShutdownQueue();

-  VLOG(3) << "server_ shutdown!";
+  VLOG(4) << "server_ shutdown!";
  server_->Shutdown();
 }

@@ -272,7 +285,7 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name,
                                          int req_id) {
  std::unique_lock<std::mutex> lock(cq_mutex_);
  if (is_shut_down_) {
-    VLOG(3) << "shutdown, do not TryToRegisterNewSendOne";
+    VLOG(4) << "shutdown, do not TryToRegisterNewSendOne";
    return;
  }

@@ -306,14 +319,14 @@ void AsyncGRPCServer::HandleRequest(
  bool ok = false;

  while (true) {
-    VLOG(3) << "HandleRequest " << rpc_name << " wait next";
+    VLOG(4) << "HandleRequest " << rpc_name << " wait next";
    if (!cq->Next(&tag, &ok)) {
      LOG(INFO) << "CompletionQueue " << rpc_name << " shutdown!";
      break;
    }

    int req_id = static_cast<int>(reinterpret_cast<intptr_t>(tag));
-    VLOG(3) << "HandleRequest " << rpc_name << ", req_id:" << req_id
+    VLOG(4) << "HandleRequest " << rpc_name << ", req_id:" << req_id
            << " get next";

    auto& reqs = rpc_reqs_[rpc_name];
@@ -324,22 +337,21 @@ void AsyncGRPCServer::HandleRequest(
      base = reqs[req_id];
    }

+    VLOG(3) << base->Status2String(rpc_name);
+
    // reference:
    // https://github.com/tensorflow/tensorflow/issues/5596
    // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM
    // https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I
    if (!ok) {
      LOG(WARNING) << "completion queue:" << rpc_name
-                   << " recv no regular event:argument name["
-                   << base->GetReqName() << "]";
+                   << " recv no regular event"
+                   << " context:" << base->Status2String(rpc_name);
      TryToRegisterNewOne(rpc_name, req_id);
      delete base;
      continue;
    }

-    VLOG(3) << "queue id:" << rpc_name << ", req_id:" << req_id
-            << ", status:" << base->Status();
-
    switch (base->Status()) {
      case PROCESS: {
        base->Process();

--- a/paddle/fluid/operators/distributed/rpc_client.cc
+++ b/paddle/fluid/operators/distributed/rpc_client.cc
@@ -13,6 +13,10 @@
 // limitations under the License.

 #include "paddle/fluid/operators/distributed/rpc_client.h"
+#include "gflags/gflags.h"
+
+// default to 3min to avoid temporary network failures.
+DEFINE_int32(grpc_deadline, 180000, "deadline timeouts for grpc");

 namespace paddle {
 namespace operators {

--- a/paddle/fluid/operators/distributed/rpc_client.h
+++ b/paddle/fluid/operators/distributed/rpc_client.h
@@ -15,11 +15,14 @@
 #pragma once

 #include <string>
+#include "gflags/gflags.h"

 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"

+DECLARE_int32(grpc_deadline);
+
 namespace paddle {
 namespace operators {
 namespace distributed {
@@ -32,26 +35,26 @@ class RPCClient {
                            const platform::DeviceContext& ctx,
                            const framework::Scope& scope,
                            const std::string& var_name,
-                            int64_t time_out = rpc_time_out) = 0;
+                            int64_t time_out = FLAGS_grpc_deadline) = 0;

  virtual bool AsyncGetVar(const std::string& ep,
                           const platform::DeviceContext& ctx,
                           const framework::Scope& scope,
                           const std::string& var_name,
-                           int64_t time_out = rpc_time_out) = 0;
+                           int64_t time_out = FLAGS_grpc_deadline) = 0;

  virtual bool AsyncPrefetchVar(const std::string& ep,
                                const platform::DeviceContext& ctx,
                                const framework::Scope& scope,
                                const std::string& in_var_name,
                                const std::string& out_var_name,
-                                int64_t time_out = rpc_time_out) = 0;
+                                int64_t time_out = FLAGS_grpc_deadline) = 0;

-  virtual void AsyncSendBatchBarrier(const std::string& ep,
-                                     int64_t time_out = rpc_time_out) = 0;
+  virtual void AsyncSendBatchBarrier(
+      const std::string& ep, int64_t time_out = FLAGS_grpc_deadline) = 0;

-  virtual void AsyncSendFetchBarrier(const std::string& ep,
-                                     int64_t time_out = rpc_time_out) = 0;
+  virtual void AsyncSendFetchBarrier(
+      const std::string& ep, int64_t time_out = FLAGS_grpc_deadline) = 0;

  // SendComplete tells all the server that current trainer have no more data
  // to train, so that the pserver can reduce it's barrier count, and continue
@@ -60,8 +63,6 @@ class RPCClient {

  virtual void Wait() = 0;

-  static constexpr int64_t rpc_time_out = 120 * 1000;
-
  template <typename T>
  static RPCClient* GetInstance() {
    std::call_once(init_flag_, &RPCClient::Init<T>);

--- a/paddle/fluid/operators/distributed/rpc_server.cc
+++ b/paddle/fluid/operators/distributed/rpc_server.cc
@@ -47,11 +47,12 @@ void RPCServer::WaitBarrier(const std::string& rpc_name) {
    return (barrier_counter_[rpc_name] >= client_num_ || exit_flag_.load());
  });

-  VLOG(3) << "batch_barrier_:" << barrier_counter_[rpc_name];
+  VLOG(3) << "batch_barrier_: " << rpc_name << " "
+          << barrier_counter_[rpc_name];
 }

 void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) {
-  VLOG(3) << "RPCServer begin IncreaseBatchBarrier " << rpc_name;
+  VLOG(4) << "RPCServer begin IncreaseBatchBarrier " << rpc_name;
  int b = 0;
  std::unique_lock<std::mutex> lock(mutex_);
  b = ++barrier_counter_[rpc_name];
@@ -100,7 +101,7 @@ void RPCServer::SetCond(const std::string& rpc_name) {
 }

 void RPCServer::WaitCond(const std::string& rpc_name) {
-  VLOG(3) << "RPCServer WaitCond " << rpc_name;
+  VLOG(4) << "RPCServer WaitCond " << rpc_name;
  int cond = 0;
  {
    std::unique_lock<std::mutex> lock(mutex_);

--- a/paddle/fluid/operators/distributed/variable_response.cc
+++ b/paddle/fluid/operators/distributed/variable_response.cc
@@ -76,6 +76,8 @@ bool ReadRaw(::google::protobuf::io::CodedInputStream* input,
      if (total_written + size_to_write > length) {
        size_to_write = length - total_written;
      }
+      // This log is useful to see how large an internal block of an rpc is.
+      VLOG(7) << "copy " << size_to_write << " data to CUDAPlace";
      memory::Copy(boost::get<platform::CUDAPlace>(place),
                   reinterpret_cast<void*>(p), cpu, data, size_to_write,
                   gpu_dev_ctx.stream());
@@ -103,6 +105,8 @@ bool ReadRaw(::google::protobuf::io::CodedInputStream* input,
    }
    // TODO(gongwb): can we avoid copy?
    platform::CPUPlace cpu;
+    // This log is useful to see how large an internal block of an rpc is.
+    VLOG(7) << "copy " << size_to_write << " data to CPUPlace";
    memory::Copy(cpu, reinterpret_cast<void*>(p), cpu, data, size_to_write);

    p += size_to_write;

--- a/paddle/fluid/operators/elementwise_add_mkldnn_op.cc
+++ b/paddle/fluid/operators/elementwise_add_mkldnn_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/fluid/operators/elementwise_add_op.h"
+#include "paddle/fluid/operators/elementwise_op_function.h"
+
+#include "paddle/fluid/platform/mkldnn_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::DataLayout;
+using framework::Tensor;
+using mkldnn::memory;
+using mkldnn::reorder;
+using mkldnn::primitive;
+using mkldnn::stream;
+using mkldnn::sum;
+
+// Forward kernel of elementwise_add for the MKLDNN library type.
+//
+// Reads inputs "X" and "Y" and the "axis" attribute, and writes "Out".
+//  - When X and Y have different dims, the result is computed by the generic
+//    CPU TransformFunctor (row-wise or mid-wise broadcast of Y onto X).
+//  - When the dims are equal, the result is computed by the mkldnn::sum
+//    primitive with both scales set to 1.0.
+// In both cases "Out" is tagged with the kMKLDNN layout. Its format is copied
+// from X in the broadcast path and taken from the sum primitive's destination
+// descriptor otherwise.
+template <typename T>
+class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& dev_ctx =
+        ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
+    const auto& mkldnn_engine = dev_ctx.GetEngine();
+
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* z = ctx.Output<Tensor>("Out");
+    const T* x_data = x->data<T>();
+    const T* y_data = y->data<T>();
+    T* z_data = z->mutable_data<T>(ctx.GetPlace());
+
+    int axis = ctx.Attr<int>("axis");
+
+    auto x_dims = x->dims();
+    auto y_dims = y->dims();
+    auto z_dims = z->dims();
+
+    // Execute the default elementwise_add operator when
+    // broadcast operations need to be performed.
+    if (x_dims != y_dims) {
+      auto sum_func = [](T a, T b) -> T { return a + b; };
+
+      TransformFunctor<decltype(sum_func), T,
+                       paddle::platform::CPUDeviceContext, T>
+          functor(
+              x, y, z,
+              ctx.template device_context<paddle::platform::CPUDeviceContext>(),
+              sum_func);
+
+      // axis == -1 means "align Y with the trailing dimensions of X".
+      axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
+      PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
+                     "Axis should be in range [0, x_dims)");
+
+      trim_trailing_singular_dims(&y_dims);
+      axis = (y_dims.size() == 0) ? x_dims.size() : axis;
+
+      int pre, n, post;
+      get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post);
+
+      if (post == 1) {
+        functor.RunRowWise(n, pre);
+      } else {
+        functor.RunMidWise(n, pre, post);
+      }
+      z->set_layout(DataLayout::kMKLDNN);
+      z->set_format(x->format());
+    } else {
+      // Both inputs must already carry a concrete MKLDNN format, because the
+      // memory descriptors below are built directly from it.
+      PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN &&
+                         x->format() != memory::format::format_undef,
+                     "Wrong layout/format set for X tensor");
+      PADDLE_ENFORCE(y->layout() == DataLayout::kMKLDNN &&
+                         y->format() != memory::format::format_undef,
+                     "Wrong layout/format set for Y tensor");
+
+      std::vector<int> src_x_tz = framework::vectorize2int(x_dims);
+      std::vector<int> src_y_tz = framework::vectorize2int(y_dims);
+      std::vector<int> dst_tz = framework::vectorize2int(z_dims);
+
+      std::vector<memory::primitive_desc> srcs_pd;
+      std::vector<memory> srcs;
+      std::vector<float> scales = {1.0f, 1.0f};
+
+      // NOTE: the descriptors hard-code f32 regardless of T.
+      auto src_x_pd = memory::primitive_desc(
+          {{src_x_tz}, memory::data_type::f32, x->format()}, mkldnn_engine);
+      auto src_y_pd = memory::primitive_desc(
+          {{src_y_tz}, memory::data_type::f32, y->format()}, mkldnn_engine);
+      auto src_x_memory =
+          memory(src_x_pd, paddle::platform::to_void_cast(x_data));
+      auto src_y_memory =
+          memory(src_y_pd, paddle::platform::to_void_cast(y_data));
+
+      srcs_pd.push_back(src_x_pd);
+      srcs_pd.push_back(src_y_pd);
+      srcs.push_back(src_x_memory);
+      srcs.push_back(src_y_memory);
+
+      // format::any lets the sum primitive choose the destination format.
+      auto dst_md =
+          memory::desc({dst_tz}, memory::data_type::f32, memory::format::any);
+
+      // create primitive descriptor for sum
+      auto sum_pd = sum::primitive_desc(dst_md, scales, srcs_pd);
+
+      // create mkldnn memory for dst
+      memory dst_memory = memory(sum_pd.dst_primitive_desc(), z_data);
+
+      std::vector<primitive::at> inputs;
+      inputs.push_back(srcs[0]);
+      inputs.push_back(srcs[1]);
+
+      // create sum primitive
+      auto sum_prim = sum(sum_pd, inputs, dst_memory);
+
+      std::vector<primitive> pipeline;
+      pipeline.push_back(sum_prim);
+      stream(stream::kind::eager).submit(pipeline).wait();
+
+      // Record the format the primitive actually chose for the destination.
+      z->set_layout(DataLayout::kMKLDNN);
+      z->set_format(
+          (memory::format)dst_memory.get_primitive_desc().desc().data.format);
+    }
+  }
+};
+
+// Backward kernel of elementwise_add for the MKLDNN library type.
+//
+// Reads "X", "Y", "Out", the gradient of "Out" and the "axis" attribute, and
+// writes the gradients of "X" and "Y" (either output may be absent).
+template <typename T>
+class EltwiseAddMKLDNNGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    using Tensor = framework::Tensor;
+
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* out = ctx.Input<Tensor>("Out");
+    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    int axis = ctx.Attr<int>("axis");
+
+    // Shapes differ: broadcasting is involved, so defer to the default
+    // gradient computation.
+    if (x->dims() != y->dims()) {
+      ElemwiseGradCompute<paddle::platform::CPUDeviceContext, T,
+                          IdentityGrad<T>, IdentityGrad<T>>(
+          ctx, *x, *y, *out, *dout, axis, dx, dy, IdentityGrad<T>(),
+          IdentityGrad<T>());
+      return;
+    }
+
+    // Shapes match: each requested gradient is a plain copy of dOut, tagged
+    // with the MKLDNN layout and dOut's format. dX is handled before dY.
+    auto blas = math::GetBlas<paddle::platform::CPUDeviceContext, T>(ctx);
+    for (auto* grad : {dx, dy}) {
+      if (!grad) {
+        continue;
+      }
+      blas.VCOPY(dout->numel(), dout->data<T>(),
+                 grad->mutable_data<T>(ctx.GetPlace()));
+      grad->set_layout(DataLayout::kMKLDNN);
+      grad->set_format(dout->format());
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_KERNEL(elementwise_add, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::EltwiseAddMKLDNNKernel<float>)
+
+REGISTER_OP_KERNEL(elementwise_add_grad, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::EltwiseAddMKLDNNGradKernel<float>)
--- a/paddle/fluid/operators/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise_op.h
@@ -14,8 +14,12 @@ limitations under the License. */

 #pragma once
 #include <string>
+#include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif

 namespace paddle {
 namespace operators {
@@ -40,6 +44,21 @@ class ElementwiseOp : public framework::OperatorWithKernel {
    ctx->SetOutputDim("Out", x_dim);
    ctx->ShareLoD("X", /*->*/ "Out");
  }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    auto input_data_type =
+        framework::ToDataType(ctx.Input<Tensor>("X")->type());
+
+#ifdef PADDLE_WITH_MKLDNN
+    if (platform::CanMKLDNNBeUsed(ctx)) {
+      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
+                                     framework::DataLayout::kMKLDNN,
+                                     framework::LibraryType::kMKLDNN);
+    }
+#endif
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+  }
 };

 class ElementwiseOpInferVarType : public framework::VarTypeInference {
@@ -65,6 +84,8 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
                 "for broadcasting Y onto X.")
        .SetDefault(-1)
        .EqualGreaterThan(-1);
+    AddAttr<bool>("use_mkldnn", "(bool, default false). Used by MKLDNN.")
+        .SetDefault(false);
    AddComment(string::Sprintf(R"DOC(
 Limited Elementwise %s Operator

@@ -138,6 +159,21 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
      ctx->SetOutputDim(y_grad_name, y_dims);
    }
  }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    auto input_data_type =
+        framework::ToDataType(ctx.Input<Tensor>("X")->type());
+    // Keyed on X's data type; the MKLDNN branch exists only in MKLDNN builds.
+#ifdef PADDLE_WITH_MKLDNN
+    if (platform::CanMKLDNNBeUsed(ctx)) {
+      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
+                                     framework::DataLayout::kMKLDNN,
+                                     framework::LibraryType::kMKLDNN);
+    }
+#endif  // PADDLE_WITH_MKLDNN
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+  }
 };
 }  // namespace operators
 }  // namespace paddle

--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -101,17 +101,16 @@ void ListenAndServOp::RunSyncLoop(
    framework::Scope *recv_scope,
    const std::vector<int> &prefetch_block_id_list) const {
  size_t num_blocks = program->Size();
+  auto optimize_blocks =
+      Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks);
  PADDLE_ENFORCE_GE(num_blocks, 2,
                    "server program should have at least 2 blocks");

-  std::vector<int> optimize_block_id_list;
-  for (int blkid = 1; blkid < num_blocks; ++blkid) {
-    if (std::find(prefetch_block_id_list.begin(), prefetch_block_id_list.end(),
-                  blkid) == prefetch_block_id_list.end()) {
-      optimize_block_id_list.push_back(blkid);
+  std::vector<int> optimize_blocks_idx;
+  for (auto blk : optimize_blocks) {
+    optimize_blocks_idx.push_back(blk->ID());
  }
-  }
-  auto optimize_prepared = executor->Prepare(*program, optimize_block_id_list);
+  auto optimize_prepared = executor->Prepare(*program, optimize_blocks_idx);
  // Insert placeholder for block0 which holds current op itself.
  optimize_prepared.insert(
      optimize_prepared.begin(),
@@ -134,14 +133,14 @@ void ListenAndServOp::RunSyncLoop(
    // and this will still work.
    // The optimize blocks which have the same parent ID would run parallel
    // TODO(Yancey1989): need to use ParallelExecutor for future
-    int32_t last_parent_blkid = program->Block(1).Parent();
+    int32_t last_parent_blkid = optimize_blocks[0]->Parent();
    std::vector<size_t> parallel_blkids;
-    parallel_blkids.push_back(1);
+    parallel_blkids.push_back(optimize_blocks[0]->ID());
    double ts = GetTimestamp();
-    for (size_t i = 1; i < optimize_block_id_list.size(); ++i) {
+    for (size_t i = 1; i < optimize_blocks.size(); ++i) {
      // skip the first optimize block because it is already in the
      // parallel_blkids.
-      int blkid = optimize_block_id_list[i];
+      int blkid = optimize_blocks[i]->ID();
      if (program->Block(blkid).Parent() != last_parent_blkid) {
        ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared,
                              program, recv_scope);
@@ -164,8 +163,8 @@ void ListenAndServOp::RunSyncLoop(
 }

 void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
-                                   framework::ProgramDesc *program) const {
-  VLOG(3) << "RunAsyncLoop in";
+                                   framework::ProgramDesc *program,
+                                   framework::Scope *recv_scope) const {
  // grad name to block id
  std::unordered_map<std::string, int32_t> grad_to_block_id;
  std::unordered_map<int32_t, std::string> id_to_grad;
@@ -192,6 +191,10 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
    block_list.push_back(blkid);
  }
  auto optimize_prepared = executor->Prepare(*program, block_list);
+  // execute global block if needed
+  if (block_list[0] == 1 && id_to_grad.count(1) == 0) {
+    executor->RunPreparedContext(optimize_prepared[0].get(), recv_scope);
+  }
  std::unordered_map<std::string,
                     std::shared_ptr<framework::ExecutorPrepareContext>>
      grad_to_prepared_ctx;
@@ -203,7 +206,6 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
  request_get_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
  request_prefetch_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);

-  VLOG(3) << "RunAsyncLoop into while";
  while (true) {
    if (rpc_service_->IsExit()) {
      LOG(INFO) << "get exit!rpc_processor break!";
@@ -261,8 +263,11 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
  rpc_service_->RegisterRPC(distributed::kRequestPrefetch,
                            request_prefetch_handler_.get());

-  auto *optimize_block = Attr<framework::BlockDesc *>(kOptimizeBlock);
-  auto *program = optimize_block->Program();
+  auto optimize_blocks =
+      Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks);
+  PADDLE_ENFORCE(optimize_blocks.size() >= 1,
+                 "optimize blocks should be 1 at least on the pserver side.");
+  auto *program = optimize_blocks[0]->Program();
  framework::Executor executor(dev_place);

  // prepare for prefetch
@@ -317,7 +322,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
  if (sync_mode) {
    RunSyncLoop(&executor, program, &recv_scope, prefetch_block_id_list);
  } else {
-    RunAsyncLoop(&executor, program);
+    RunAsyncLoop(&executor, program, &recv_scope);
  }
 }

@@ -339,8 +344,9 @@ class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
        "a map from grad name to it's optimize block id")
        .SetDefault({});
    AddAttr<bool>("sync_mode", "if works at sync_mode or not").SetDefault(true);
-    AddAttr<framework::BlockDesc *>(kOptimizeBlock,
-                                    "BlockID to run on server side.");
+    AddAttr<std::vector<framework::BlockDesc *>>(
+        kOptimizeBlocks, "Optimize blocks to run on server side.")
+        .SetDefault({});
    AddAttr<std::vector<std::string>>(kPrefetchVarNameToBlockId,
                                      "prefetch blocks to run on server side.")
        .SetDefault({});

--- a/paddle/fluid/operators/listen_and_serv_op.h
+++ b/paddle/fluid/operators/listen_and_serv_op.h
@@ -30,7 +30,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-constexpr char kOptimizeBlock[] = "OptimizeBlock";
+constexpr char kOptimizeBlocks[] = "optimize_blocks";
 constexpr char kPrefetchVarNameToBlockId[] = "prefetch_var_name_to_block_id";

 void RunServer(std::shared_ptr<distributed::RPCServer> service);
@@ -50,7 +50,8 @@ class ListenAndServOp : public framework::OperatorBase {
                   const std::vector<int>& prefetch_block_id_list) const;

  void RunAsyncLoop(framework::Executor* executor,
-                    framework::ProgramDesc* program) const;
+                    framework::ProgramDesc* program,
+                    framework::Scope* recv_scope) const;

  void SavePort() const;


--- a/paddle/fluid/operators/parallel_do_op.cc
+++ b/paddle/fluid/operators/parallel_do_op.cc
@@ -295,7 +295,7 @@ class ParallelDoGradOp : public framework::OperatorBase {

        auto sum_op = framework::OpRegistry::CreateOp(
            "sum", {{"X", {s, tmp_name}}}, {{"Out", {s}}},
-            framework::AttributeMap{});
+            framework::AttributeMap{{"use_mkldnn", {false}}});
        VLOG(10) << sum_op->DebugStringEx(sub_scopes[0]);
        sum_op->Run(*sub_scopes[0], places[0]);
        WaitOnPlace(places[0]);

--- a/paddle/fluid/operators/random_crop_op.cc
+++ b/paddle/fluid/operators/random_crop_op.cc
@@ -37,6 +37,11 @@ class RandomCropOpMaker : public framework::OpProtoAndCheckerMaker {
    AddOutput("SeedOut", "The random seed after random cropping.")
        .AsIntermediate();
    AddAttr<std::vector<int>>("shape", "The shape of a cropped instance.");
+    AddAttr<int>("startup_seed",
+                 "If the input 'Seed' is not initialized, the 'startup_seed' "
+                 "will be used to replace it. Even so, the seed after random "
+                 "crop will also be outputed to the 'SeedOut'.")
+        .SetDefault(0);
    AddComment(R"DOC(
      This operator takes a batch of instance, and do random cropping on each instance.
      It means that cropping positions differs on each instance, which is determined
@@ -49,8 +54,6 @@ class RandomCropOpMaker : public framework::OpProtoAndCheckerMaker {
 class RandomCropOpInferShape : public framework::InferShapeBase {
 public:
  void operator()(framework::InferShapeContext* ctx) const override {
-    auto seed_dim = ctx->GetInputDim("Seed");
-    PADDLE_ENFORCE(seed_dim.size() == 1 && seed_dim[0] == 1);
    auto shape = ctx->Attrs().Get<std::vector<int>>("shape");
    auto x_dim = ctx->GetInputDim("X");
    PADDLE_ENFORCE_GT(x_dim.size(), static_cast<int64_t>(shape.size()));
@@ -62,7 +65,6 @@ class RandomCropOpInferShape : public framework::InferShapeBase {
      out_dim[x_i] = shape[shape_i];
    }
    ctx->SetOutputDim("Out", framework::make_ddim(out_dim));
-    ctx->SetOutputDim("SeedOut", framework::make_ddim({1}));
  }
 };


--- a/paddle/fluid/operators/random_crop_op.h
+++ b/paddle/fluid/operators/random_crop_op.h
@@ -142,8 +142,9 @@ template <typename DeviceContext, typename T>
 class RandomCropKernel : public framework::OpKernel<T> {
 public:
  virtual void Compute(const framework::ExecutionContext& ctx) const {
-    auto& seed_tensor = detail::Ref(ctx.Input<framework::LoDTensor>("Seed"));
    int64_t seed = 0;
+    auto& seed_tensor = detail::Ref(ctx.Input<framework::LoDTensor>("Seed"));
+    if (seed_tensor.IsInitialized()) {
      if (platform::is_cpu_place(seed_tensor.place())) {
        seed = *seed_tensor.data<int64_t>();
      } else {
@@ -153,6 +154,11 @@ class RandomCropKernel : public framework::OpKernel<T> {
        framework::TensorCopySync(seed_tensor, platform::CPUPlace(), &cpu_seed);
        seed = *cpu_seed.data<int64_t>();
      }
+    } else {
+      VLOG(5) << "WARNING: The input 'Seed' is not initialized, use attribute "
+                 "'startup_seed' instead.";
+      seed = ctx.Attr<int>("startup_seed");
+    }
    auto shape = ctx.Attr<std::vector<int>>("shape");
    auto& x = detail::Ref(ctx.Input<framework::LoDTensor>("X"));
    auto& out = detail::Ref(ctx.Output<framework::LoDTensor>("Out"));
@@ -171,7 +177,7 @@ class RandomCropKernel : public framework::OpKernel<T> {
    engine.discard(functor.prod_batchsize_dims_ *
                   (functor.rank_ - functor.num_batchsize_dims_));
    *ctx.Output<framework::LoDTensor>("SeedOut")->mutable_data<int64_t>(
-        platform::CPUPlace()) = engine();
+        framework::make_ddim({1}), platform::CPUPlace()) = engine();
  }
 };


--- a/paddle/fluid/operators/reader/create_custom_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_custom_reader_op.cc
@@ -39,6 +39,7 @@ class CustomReader : public framework::DecoratedReader {
  const framework::ProgramDesc program_;
  int sub_block_id_;
  framework::Executor exe_;
+  framework::Scope scope_;

  std::vector<std::string> source_var_names_;
  std::vector<std::string> sink_var_names_;
@@ -158,23 +159,24 @@ void CustomReader::ReadNext(std::vector<framework::LoDTensor>* out) {
  // The scope for CustomReader's sub-block should be independent and shouldn't
  // be any other computation scope's child. Otherwise, data preprocessing and
  // compution cannot be concurrent.
-  framework::Scope scope;
+  framework::Scope* exe_scope = &scope_.NewScope();
  // 1. Copy LoDTensors from underlying reader's output to source variables.
  for (size_t i = 0; i < source_var_names_.size(); ++i) {
-    framework::Variable* var = scope.Var(source_var_names_[i]);
+    framework::Variable* var = exe_scope->Var(source_var_names_[i]);
    framework::LoDTensor* tensor = var->GetMutable<framework::LoDTensor>();
    tensor->ShareDataWith(underlying_outs[i]);
    tensor->set_lod(underlying_outs[i].lod());
  }
  // 2. Run the sub-block.
-  exe_.Run(program_, &scope, sub_block_id_, false, true);
+  exe_.Run(program_, exe_scope, sub_block_id_, false, true);
  // 3. Copy LoDTensors from sink variables to out.
  out->resize(sink_var_names_.size());
  for (size_t i = 0; i < sink_var_names_.size(); ++i) {
-    const auto& tensor = detail::Ref(scope.FindVar(sink_var_names_[i]))
+    const auto& tensor = detail::Ref(exe_scope->FindVar(sink_var_names_[i]))
                             .Get<framework::LoDTensor>();
    framework::TensorCopySync(tensor, platform::CPUPlace(), &(*out)[i]);
  }
+  scope_.DeleteScope(exe_scope);
 }

 }  // namespace reader

--- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
@@ -23,13 +23,13 @@ namespace reader {

 // 'Double buffer' means we shall maintain two batches of input data at the same
 // time. So the kCacheSize shoul be at least 2.
-static constexpr size_t kCacheSize = 3;
+static constexpr size_t kCacheSize = 5;
 // There will be two bacthes out of the channel during training:
 // 1. the one waiting to be sent to the channel
 // 2. the one just be received from the channel, which is also being used by
 // subsequent operators.
 // So the channel size should be kChacheSize - 2
-static constexpr size_t kChannelSize = 1;  // kCacheSize - 2
+static constexpr size_t kChannelSize = 3;  // kCacheSize - 2

 class DoubleBufferReader : public framework::DecoratedReader {
 public:

--- a/paddle/fluid/operators/recurrent_op.cc
+++ b/paddle/fluid/operators/recurrent_op.cc
@@ -429,7 +429,8 @@ class RecurrentGradOp : public RecurrentBase {

          auto sum_op = framework::OpRegistry::CreateOp(
              "sum", {{"X", {pg_names[param_id], new_inside_name}}},
-              {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{});
+              {{"Out", {pg_names[param_id]}}},
+              framework::AttributeMap{{"use_mkldnn", {false}}});
          sum_op->Run(cur_scope, place);

          cur_scope.Rename(new_inside_name, inside_grad_name);

--- a/paddle/fluid/operators/send_recv_op_test.cc
+++ b/paddle/fluid/operators/send_recv_op_test.cc
@@ -129,7 +129,10 @@ void StartServerNet(bool is_sparse, std::atomic<bool> *initialized) {
  // sub program run in listen_and_serv_op, for simple test we use sum
  f::ProgramDesc program;
  const auto &root_block = program.Block(0);
+  std::vector<framework::BlockDesc *> optimize_blocks;
  auto *optimize_block = program.AppendBlock(root_block);
+  optimize_blocks.push_back(optimize_block);
+
  auto *prefetch_block = program.AppendBlock(root_block);
  // X for server side tensors, RX for received tensors, must be of same shape.
  AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, optimize_block,
@@ -139,7 +142,7 @@ void StartServerNet(bool is_sparse, std::atomic<bool> *initialized) {
  attrs.insert({"Fanin", 1});
  attrs.insert({"ParamList", std::vector<std::string>({"Out"})});
  attrs.insert({"GradList", std::vector<std::string>({"x1"})});
-  attrs.insert({"OptimizeBlock", optimize_block});
+  attrs.insert({"optimize_blocks", optimize_blocks});
  attrs.insert({"PrefetchBlock", prefetch_block});
  attrs.insert({"grad_to_block_id", std::vector<std::string>({""})});
  attrs.insert({"sync_mode", true});

--- a/paddle/fluid/operators/softmax_mkldnn_op.cc
+++ b/paddle/fluid/operators/softmax_mkldnn_op.cc
@@ -27,8 +27,81 @@ using paddle::platform::MKLDNNMemDesc;
 using mkldnn::memory;  // Note: paddle has also "memory" namespace
 using mkldnn::primitive;
 using mkldnn::softmax_forward;
+using mkldnn::softmax_backward;
 using mkldnn::prop_kind;
 using mkldnn::stream;
+using platform::to_void_cast;
+// Handler that creates, caches and reuses MKLDNN softmax primitives by key.
+class SoftmaxMKLDNNHandler : public platform::MKLDNNHandler {
+ public:
+  SoftmaxMKLDNNHandler(
+      std::shared_ptr<mkldnn::softmax_forward::primitive_desc> softmax_pd,
+      const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
+      const std::string& base_key)
+      : platform::MKLDNNHandler(dev_ctx, engine, base_key),
+        softmax_pd_(softmax_pd) {}
+  // Grad-op constructor: additionally stores the backward primitive desc.
+  SoftmaxMKLDNNHandler(
+      std::shared_ptr<mkldnn::softmax_forward::primitive_desc> softmax_pd,
+      std::shared_ptr<mkldnn::softmax_backward::primitive_desc> softmax_bwd_pd,
+      const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
+      const std::string& base_key)
+      : platform::MKLDNNHandler(dev_ctx, engine, base_key),
+        softmax_pd_(softmax_pd),
+        softmax_bwd_pd_(softmax_bwd_pd) {
+    // If we are in the Grad operator then update the key with a BWD suffix to
+    // distinguish it from the keys of FWD memory primitives
+    key_ += "-BWD";
+  }
+  // Returns the cached forward softmax primitive, creating it on first use.
+  std::shared_ptr<mkldnn::softmax_forward> AcquireSoftmax(
+      std::shared_ptr<mkldnn::memory> dst_memory_p,
+      std::shared_ptr<mkldnn::memory> src_memory_p) {
+    // Key under which the forward primitive is cached in the device context
+    auto prim_key = key_ + "@softmax_p";
+    // A cache miss is only legal while is_reusing_ is still false.
+    auto softmax_p = std::static_pointer_cast<mkldnn::softmax_forward>(
+        dev_ctx_.GetBlob(prim_key));
+    PADDLE_ENFORCE((softmax_p != nullptr) || (is_reusing_ == false),
+                   "Fail to find softmax primitive in device context");
+    if (softmax_p == nullptr) {
+      softmax_p = std::make_shared<mkldnn::softmax_forward>(
+          *(softmax_pd_.get()),
+          *(static_cast<mkldnn::memory*>(src_memory_p.get())),
+          *(static_cast<mkldnn::memory*>(dst_memory_p.get())));
+      dev_ctx_.SetBlob(prim_key, softmax_p);
+    } else {
+      is_reusing_ = true;
+    }
+
+    return softmax_p;
+  }
+  // Returns the cached backward softmax primitive, creating it on first use.
+  std::shared_ptr<mkldnn::softmax_backward> AcquireSoftmaxBackward(
+      std::shared_ptr<mkldnn::memory> dst_memory_p,
+      std::shared_ptr<mkldnn::memory> diff_dst_memory_p,
+      std::shared_ptr<mkldnn::memory> diff_src_memory_p) {
+    auto prim_key = key_ + "@softmax_bwd_p";
+    auto softmax_bwd_p = std::static_pointer_cast<mkldnn::softmax_backward>(
+        dev_ctx_.GetBlob(prim_key));
+    PADDLE_ENFORCE((softmax_bwd_p != nullptr) || (is_reusing_ == false),
+                   "Fail to find softmax backward primitive in device context");
+    if (softmax_bwd_p == nullptr) {
+      softmax_bwd_p = std::make_shared<mkldnn::softmax_backward>(
+          *softmax_bwd_pd_, *(dst_memory_p.get()), *(diff_dst_memory_p.get()),
+          *(diff_src_memory_p.get()));
+      dev_ctx_.SetBlob(prim_key, softmax_bwd_p);
+    } else {
+      is_reusing_ = true;
+    }
+
+    return softmax_bwd_p;
+  }
+
+ private:
+  std::shared_ptr<mkldnn::softmax_forward::primitive_desc> softmax_pd_;
+  std::shared_ptr<mkldnn::softmax_backward::primitive_desc> softmax_bwd_pd_;
+};

 template <typename T>
 class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> {
@@ -54,56 +127,27 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> {
    // Same memory descriptor to be used for input and output
    memory::dims softmax_tz = {src_tz[0], src_tz[1]};
    // Generate keys for storing/retriving primitives for this operator
-    // TODO(jczaja): Each MKLDNN operator may have diffrent hashing function
-    auto gethash = [](memory::dims& operand_dims) {
-      return std::string(std::to_string(operand_dims[0]) + "-" +
-                         std::to_string(operand_dims[1]));
-    };
-    const std::string key = gethash(softmax_tz);
-    const std::string key_softmax_p = key + "@softmax_p";
-    const std::string key_softmax_src_mem_p = key + "@softmax_src_mem_p";
-    const std::string key_softmax_dst_mem_p = key + "@softmax_dst_mem_p";
-
-    std::shared_ptr<void> softmax_p = dev_ctx.GetBlob(key_softmax_p);
-    if (softmax_p == nullptr) {
+    const std::string key =
+        platform::MKLDNNHandler::GetHash(softmax_tz, ctx.op().Output("Out"));
+    const std::string key_softmax_pd = key + "@softmax_pd";
+
    // Currently only NC data format is supported
-      auto softmax_md =
-          MKLDNNMemDesc({softmax_tz}, memory::f32, memory::format::nc);
+    auto softmax_md = MKLDNNMemDesc(
+        {softmax_tz}, platform::MKLDNNGetDataType<T>(), memory::format::nc);
    // Normalization is made after innermost dimension eg. C out of NC
    auto softmax_desc = softmax_forward::desc(prop_kind::forward_scoring,
                                              softmax_md, 1 /*dim: C*/);
-      // create memory primitives
-      auto softmax_src_memory_p = std::make_shared<memory>(
-          memory::primitive_desc{softmax_md, mkldnn_engine},
-          static_cast<void*>(const_cast<T*>(input_data)));
-      dev_ctx.SetBlob(key_softmax_src_mem_p, softmax_src_memory_p);
-      auto softmax_dst_memory_p = std::make_shared<memory>(
-          memory::primitive_desc{softmax_md, mkldnn_engine},
-          static_cast<void*>(output_data));
-      dev_ctx.SetBlob(key_softmax_dst_mem_p, softmax_dst_memory_p);
-
-      auto softmax_forward_pd =
-          std::make_shared<softmax_forward::primitive_desc>(softmax_desc,
-                                                            mkldnn_engine);
-      softmax_p = std::make_shared<softmax_forward>(
-          *(softmax_forward_pd.get()),
-          *(static_cast<memory*>(softmax_src_memory_p.get())),
-          *(static_cast<memory*>(softmax_dst_memory_p.get())));
-      dev_ctx.SetBlob(key_softmax_p, softmax_p);
-    } else {
-      // Primitives already exist
-      auto src_memory_p = std::static_pointer_cast<memory>(
-          dev_ctx.GetBlob(key_softmax_src_mem_p));
-      PADDLE_ENFORCE(src_memory_p != nullptr,
-                     "Fail to find softmax src mem_p in device context");
-      auto dst_memory_p = std::static_pointer_cast<memory>(
-          dev_ctx.GetBlob(key_softmax_dst_mem_p));
-      PADDLE_ENFORCE(dst_memory_p != nullptr,
-                     "Fail to find softmax dst mem_p in device context");
-      src_memory_p->set_data_handle(
-          reinterpret_cast<void*>(const_cast<T*>(input_data)));
-      dst_memory_p->set_data_handle(output_data);
-    }
+    auto softmax_pd = std::make_shared<mkldnn::softmax_forward::primitive_desc>(
+        softmax_desc, mkldnn_engine);
+    dev_ctx.SetBlob(key_softmax_pd, softmax_pd);
+
+    SoftmaxMKLDNNHandler handler(softmax_pd, dev_ctx, mkldnn_engine, key);
+    auto softmax_src_memory_p =
+        handler.AcquireSrcMemory(softmax_md, to_void_cast<T>(input_data));
+    auto softmax_dst_memory_p =
+        handler.AcquireDstMemory(softmax_md, to_void_cast<T>(output_data));
+    auto softmax_p =
+        handler.AcquireSoftmax(softmax_dst_memory_p, softmax_src_memory_p);

    std::vector<primitive> pipeline{
        *(static_cast<softmax_forward::primitive*>(softmax_p.get()))};
@@ -120,6 +164,77 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> {
  }
 };

+template <typename T>
+class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
+                   "It must use CPUPlace.");
+    // Computes dX of a 2-D softmax via an MKLDNN softmax_backward primitive.
+    auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
+    auto mkldnn_engine = dev_ctx.GetEngine();
+    const Tensor* output = ctx.Input<Tensor>("Out");
+    const T* dst_data = output->data<T>();
+
+    auto* dout = ctx.template Input<Tensor>(framework::GradVarName("Out"));
+    const auto* diff_dst_ptr = dout->template data<T>();
+
+    auto* dx =
+        ctx.template Output<framework::Tensor>(framework::GradVarName("X"));
+    T* diff_src_ptr = dx->template mutable_data<T>(ctx.GetPlace());
+
+    std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
+    std::vector<int> src_tz(dst_tz);
+    PADDLE_ENFORCE(output->dims().size() == 2UL,
+                   "The input of softmax op must be a 2D matrix.");
+    // MKL-DNN softmax takes an axis; for a 2D tensor we normalize over axis 1.
+    // NOTE(review): src_tz is copied from dst_tz, so this check always passes.
+    PADDLE_ENFORCE(((src_tz[0] == dst_tz[0]) && (src_tz[1] == dst_tz[1])),
+                   "Softmax input and output dimensions should match");
+    // Same memory descriptor to be used for input and output
+    memory::dims softmax_tz = {src_tz[0], src_tz[1]};
+    // Currently only supports NC data format.
+    // Look up the forward softmax primitive desc cached in the device context.
+    const std::string key =
+        platform::MKLDNNHandler::GetHash(softmax_tz, ctx.op().Input("Out"));
+    const std::string key_softmax_pd = key + "@softmax_pd";
+
+    auto softmax_pd =
+        std::static_pointer_cast<mkldnn::softmax_forward::primitive_desc>(
+            dev_ctx.GetBlob(key_softmax_pd));
+    PADDLE_ENFORCE(softmax_pd != nullptr,
+                   "Fail to find softmax_pd in device context");
+
+    // TODO(jczaja): Add layouts support when there is a need to do so
+    // Two dimensional softmax does support NC format
+    auto data_softmax_md = MKLDNNMemDesc(
+        {softmax_tz}, platform::MKLDNNGetDataType<T>(), memory::format::nc);
+    auto diff_softmax_md = MKLDNNMemDesc(
+        {softmax_tz}, platform::MKLDNNGetDataType<T>(), memory::format::nc);
+    // Normalization is made after innermost dimension eg. C out of NC
+    auto softmax_bwd_desc =
+        softmax_backward::desc(diff_softmax_md, data_softmax_md, 1 /* dim: C*/);
+    auto softmax_bwd_pd =
+        std::make_shared<mkldnn::softmax_backward::primitive_desc>(
+            softmax_bwd_desc, mkldnn_engine, *softmax_pd);
+
+    SoftmaxMKLDNNHandler handler(softmax_pd, softmax_bwd_pd, dev_ctx,
+                                 mkldnn_engine, key);
+    auto dst_memory_p =
+        handler.AcquireDstMemory(data_softmax_md, to_void_cast<T>(dst_data));
+    auto diff_dst_memory_p = handler.AcquireDiffDstMemory(
+        diff_softmax_md, to_void_cast<T>(diff_dst_ptr));
+    auto diff_src_memory_p = handler.AcquireDiffSrcMemory(
+        diff_softmax_md, to_void_cast<T>(diff_src_ptr));
+
+    // Get the backward primitive (cached or newly created) from the handler
+    auto softmax_bwd_p = handler.AcquireSoftmaxBackward(
+        dst_memory_p, diff_dst_memory_p, diff_src_memory_p);
+
+    std::vector<primitive> pipeline{*softmax_bwd_p};
+    stream(stream::kind::eager).submit(pipeline).wait();
+  }
+};
 }  // namespace operators
 }  // namespace paddle

@@ -127,3 +242,5 @@ namespace ops = paddle::operators;

 REGISTER_OP_KERNEL(softmax, MKLDNN, ::paddle::platform::CPUPlace,
                   ops::SoftmaxMKLDNNKernel<float>);
+REGISTER_OP_KERNEL(softmax_grad, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::SoftmaxMKLDNNGradKernel<float>);
--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
@@ -145,16 +145,30 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
      const framework::ExecutionContext& ctx) const override {
    // choose cudnn kernel if the runtime supported.
    framework::LibraryType library_{framework::LibraryType::kPlain};
+    std::string data_format = ctx.Attr<std::string>("data_format");
+    framework::DataLayout layout_ = framework::StringToDataLayout(data_format);

 #ifdef PADDLE_WITH_CUDA
    if (platform::CanCUDNNBeUsed(ctx)) {
      library_ = framework::LibraryType::kCUDNN;
    }
 #endif
-    std::string data_format = ctx.Attr<std::string>("data_format");
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
-        framework::StringToDataLayout(data_format), library_);
+#ifdef PADDLE_WITH_MKLDNN
+    if (library_ == framework::LibraryType::kPlain &&
+        platform::CanMKLDNNBeUsed(ctx)) {
+      library_ = framework::LibraryType::kMKLDNN;
+      layout_ = framework::DataLayout::kMKLDNN;
+    }
+#endif
+    auto input_data_type =
+        framework::ToDataType(ctx.Input<Tensor>("X")->type());
+    if (input_data_type == framework::proto::VarType::FP16) {
+      PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                     "float16 can only be used on GPU place");
+    }
+
+    return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_,
+                                   library_);
  }
 };


--- a/paddle/fluid/operators/sum_mkldnn_op.cc
+++ b/paddle/fluid/operators/sum_mkldnn_op.cc
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/*Licensed under the Apache License, Version 2.0(the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License. */
+
+#include "mkldnn.hpp"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
+#include "paddle/fluid/operators/sum_op.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using paddle::framework::Tensor;
+using paddle::platform::MKLDNNDeviceContext;
+using paddle::platform::CPUDeviceContext;
+using framework::DataLayout;
+using mkldnn::memory;
+using mkldnn::primitive;
+using mkldnn::stream;
+using mkldnn::sum;
+using mkldnn::reorder;
+using platform::to_void_cast;
+
+// MKLDNN kernel for the `sum` operator.
+//
+// Compute() dispatches on the runtime type of the "Out" variable:
+//   * LoDTensor      - the inputs are added with an mkldnn::sum primitive.
+//   * SelectedRows   - no MKLDNN path yet (see TODO); rows are merged with
+//                      math::SelectedRowsAddTo on the CPU device context.
+//   * LoDTensorArray - no MKLDNN path yet (see TODO); arrays are added
+//                      element by element with TensorCopy / Eigen.
+// Any other output type raises via PADDLE_THROW.
+//
+// "In place" below means that the output variable is the very same variable
+// as input 0.
+template <typename T>
+class SumMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
+                   "It must use CPUPlace.");
+    auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
+    const auto& mkldnn_engine = dev_ctx.GetEngine();
+    auto in_vars = ctx.MultiInputVar("X");
+
+    const int N = in_vars.size();
+    auto out_var = ctx.OutputVar("Out");
+    bool in_place = out_var == in_vars[0];
+
+    if (out_var->IsType<framework::LoDTensor>()) {
+      LoDTensor* output = ctx.Output<LoDTensor>("Out");
+      T* output_data = output->mutable_data<T>(ctx.GetPlace());
+
+      std::vector<int> dst_tz = framework::vectorize2int(output->dims());
+      auto src_tz = dst_tz;
+      memory::format output_format{memory::format::format_undef};
+      std::vector<float> scales;
+      std::vector<memory::primitive_desc> srcs_mpd;
+      std::vector<mkldnn::memory> srcs_mem;
+
+      PADDLE_ENFORCE(in_vars[0]->IsType<LoDTensor>(),
+                     "Input[0] must be LoDTensors");
+      auto& input0 = in_vars[0]->Get<LoDTensor>();
+      PADDLE_ENFORCE(input0.layout() == DataLayout::kMKLDNN &&
+                         input0.format() != memory::format::format_undef,
+                     "Wrong layout/format for inputs[0]");
+
+      // The format of input 0 is used to describe every source tensor.
+      memory::format input_format = input0.format();
+
+      // A 4-D format tag cannot describe a 1-D or 2-D tensor, so fall back to
+      // the matching low-rank format.
+      if (src_tz.size() == 1 && (input_format == memory::format::nchw ||
+                                 input_format == memory::format::nhwc)) {
+        input_format = memory::format::x;
+      }
+      if (src_tz.size() == 2 && (input_format == memory::format::nchw ||
+                                 input_format == memory::format::nhwc)) {
+        input_format = memory::format::nc;
+      }
+
+      // Every input is a source of the sum, including input 0 when running in
+      // place: the sum is written to a scratch buffer first and only then
+      // reordered over the output, so input 0 is still intact while it is
+      // read. Starting at index 1 in that case would drop its contribution.
+      for (int i = 0; i < N; i++) {
+        PADDLE_ENFORCE(in_vars[i]->IsType<LoDTensor>(),
+                       "all inputs must be all LoDTensors");
+        auto& input = in_vars[i]->Get<LoDTensor>();
+        PADDLE_ENFORCE(input.layout() == DataLayout::kMKLDNN &&
+                           input.format() != memory::format::format_undef,
+                       "Wrong layout/format for inputs");
+
+        // Empty tensors contribute nothing to the sum.
+        if (input.numel() == 0) {
+          continue;
+        }
+
+        const T* input_data = input.data<T>();
+
+        auto src_md =
+            memory::desc(src_tz, memory::data_type::f32, input_format);
+        auto src_mpd = memory::primitive_desc(src_md, mkldnn_engine);
+        auto src_mem = memory(src_mpd, to_void_cast(input_data));
+        srcs_mpd.push_back(src_mpd);
+        srcs_mem.push_back(src_mem);
+        scales.push_back(1.0);
+      }
+
+      // format::any leaves the choice of destination format to MKLDNN.
+      auto dst_md =
+          memory::desc(dst_tz, memory::data_type::f32, memory::format::any);
+
+      auto sum_pd = sum::primitive_desc(dst_md, scales, srcs_mpd);
+
+      std::shared_ptr<memory> dst_mem;
+      if (in_place) {
+        // No user pointer is passed: the sum goes to a separate buffer and is
+        // copied into the output afterwards by the reorder below.
+        dst_mem.reset(new memory(sum_pd.dst_primitive_desc()));
+      } else {
+        dst_mem.reset(new memory(sum_pd.dst_primitive_desc(), output_data));
+      }
+      std::vector<mkldnn::primitive::at> inputs;
+      for (size_t i = 0; i < srcs_mem.size(); ++i) {
+        inputs.push_back(srcs_mem[i]);
+      }
+
+      auto sum_prim = mkldnn::sum(sum_pd, inputs, *dst_mem);
+      output_format = platform::GetMKLDNNFormat(sum_pd);
+
+      primitive reorder_prim;
+      std::shared_ptr<memory> target_mem;
+      if (in_place) {
+        // The output keeps the input format; reorder the scratch result into
+        // the output buffer.
+        output_format = input_format;
+        target_mem.reset(new memory(
+            {{{src_tz}, memory::data_type::f32, output_format}, mkldnn_engine},
+            output_data));
+        reorder_prim = reorder(*dst_mem, *target_mem);
+      }
+
+      std::vector<primitive> pipeline;
+      pipeline.push_back(sum_prim);
+      if (in_place) pipeline.push_back(reorder_prim);
+      stream(stream::kind::eager).submit(pipeline).wait();
+
+      output->set_layout(DataLayout::kMKLDNN);
+      output->set_format(output_format);
+    } else if (out_var->IsType<framework::SelectedRows>()) {
+      // TODO(@mozga-intel) Add MKLDNN SelectedRows support
+      std::unique_ptr<framework::SelectedRows> in0;
+      if (in_place) {
+        // If is in_place, we store the input[0] to in0: the output rows are
+        // cleared below, so input 0 is captured first (sharing its value
+        // data).
+        auto& in_sel0 = in_vars[0]->Get<SelectedRows>();
+        auto& rows = in_sel0.rows();
+        in0.reset(new framework::SelectedRows(rows, in_sel0.height()));
+        in0->mutable_value()->ShareDataWith(in_sel0.value());
+      }
+
+      // Returns the saved copy for input 0 when running in place, otherwise
+      // the i-th input itself.
+      auto get_selected_row = [&](size_t i) -> const SelectedRows& {
+        if (i == 0 && in0) {
+          return *in0.get();
+        } else {
+          return in_vars[i]->Get<SelectedRows>();
+        }
+      };
+      auto* out = ctx.Output<SelectedRows>("Out");
+      out->mutable_rows()->clear();
+      auto* out_value = out->mutable_value();
+
+      // Runtime InferShape: the output holds the rows of all inputs.
+      size_t first_dim = 0;
+      for (int i = 0; i < N; i++) {
+        auto& sel_row = get_selected_row(i);
+        first_dim += sel_row.rows().size();
+      }
+      auto in_dim =
+          framework::vectorize(get_selected_row(N - 1).value().dims());
+      in_dim[0] = static_cast<int64_t>(first_dim);
+
+      out_value->Resize(framework::make_ddim(in_dim));
+
+      // if all the input sparse vars are empty, no need to
+      // merge these vars.
+      if (first_dim == 0UL) {
+        return;
+      }
+      out_value->mutable_data<T>(ctx.GetPlace());
+      math::SelectedRowsAddTo<CPUDeviceContext, T> functor;
+      // Running offset passed to the functor for each non-empty input.
+      int64_t offset = 0;
+      for (int i = 0; i < N; i++) {
+        auto& sel_row = get_selected_row(i);
+        if (sel_row.rows().size() == 0) {
+          continue;
+        }
+        PADDLE_ENFORCE_EQ(out->height(), sel_row.height());
+        functor(ctx.template device_context<CPUDeviceContext>(), sel_row,
+                offset, out);
+        offset += sel_row.value().numel();
+      }
+    } else if (out_var->IsType<framework::LoDTensorArray>()) {
+      // TODO(@mozga-intel) Add MKLDNN LoDTensorArray support
+      auto& out_array = *out_var->GetMutable<framework::LoDTensorArray>();
+      // When in place, input 0 already is the output array, so accumulation
+      // starts from input 1.
+      for (size_t i = in_place ? 1 : 0; i < in_vars.size(); ++i) {
+        PADDLE_ENFORCE(in_vars[i]->IsType<framework::LoDTensorArray>(),
+                       "Only support all inputs are TensorArray");
+        auto& in_array = in_vars[i]->Get<framework::LoDTensorArray>();
+
+        // `j` walks the elements of one input array; it is deliberately
+        // distinct from the outer input index `i`.
+        for (size_t j = 0; j < in_array.size(); ++j) {
+          if (in_array[j].numel() != 0) {
+            if (j >= out_array.size()) {
+              out_array.resize(j + 1);
+            }
+            if (out_array[j].numel() == 0) {
+              // First contribution to this slot: copy the tensor and its LoD.
+              framework::TensorCopy(in_array[j], in_array[j].place(),
+                                    ctx.device_context(), &out_array[j]);
+              out_array[j].set_lod(in_array[j].lod());
+            } else {
+              // Later contributions are added element-wise.
+              PADDLE_ENFORCE(out_array[j].lod() == in_array[j].lod());
+              auto in = EigenVector<T>::Flatten(in_array[j]);
+              auto result = EigenVector<T>::Flatten(out_array[j]);
+              result.device(*ctx.template device_context<MKLDNNDeviceContext>()
+                                 .eigen_device()) = result + in;
+            }
+          }
+        }
+      }
+    } else {
+      PADDLE_THROW("Unexpected branch, output variable type is %s",
+                   out_var->Type().name());
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+// Register the MKLDNN kernel of the `sum` operator for CPUPlace. Only the
+// float instantiation of SumMKLDNNOpKernel is registered.
+REGISTER_OP_KERNEL(sum, MKLDNN, ::paddle::platform::CPUPlace,
+                   paddle::operators::SumMKLDNNOpKernel<float>);
--- a/paddle/fluid/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
@@ -18,6 +18,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/var_type_inference.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"

+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
+
 namespace paddle {
 namespace operators {
 using framework::Tensor;
@@ -63,6 +67,18 @@ class SumOp : public framework::OperatorWithKernel {
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    auto x_vars = ctx.MultiInputVar("X");
+
+    framework::LibraryType library{framework::LibraryType::kPlain};
+    framework::DataLayout layout{framework::DataLayout::kAnyLayout};
+
+#ifdef PADDLE_WITH_MKLDNN
+    if (library == framework::LibraryType::kPlain &&
+        platform::CanMKLDNNBeUsed(ctx)) {
+      library = framework::LibraryType::kMKLDNN;
+      layout = framework::DataLayout::kMKLDNN;
+    }
+#endif
+
    if (x_vars[0]->IsType<framework::LoDTensor>()) {
      int dtype = -1;
      for (auto& x_var : x_vars) {
@@ -80,26 +96,27 @@ class SumOp : public framework::OperatorWithKernel {
                        "Sum operator should have at least one tensor");

      return framework::OpKernelType(
-          static_cast<framework::proto::VarType::Type>(dtype),
-          ctx.device_context());
+          static_cast<framework::proto::VarType::Type>(dtype), ctx.GetPlace(),
+          layout, library);
    } else if (x_vars[0]->IsType<framework::SelectedRows>()) {
      for (auto& var : x_vars) {
        auto& value = var->Get<framework::SelectedRows>().value();
        if (value.IsInitialized()) {
          return framework::OpKernelType(framework::ToDataType(value.type()),
-                                         ctx.device_context());
+                                         ctx.device_context(), layout, library);
        }
      }
      // if input sparse vars are not initialized, use an default kernel type.
      return framework::OpKernelType(framework::proto::VarType::FP32,
-                                     ctx.device_context());
+                                     ctx.device_context(), layout, library);
    } else if (x_vars[0]->IsType<framework::LoDTensorArray>()) {
      for (auto& x_var : x_vars) {
        auto& array = x_var->Get<framework::LoDTensorArray>();
        for (auto& each : array) {
          if (each.numel() != 0) {
            return framework::OpKernelType(framework::ToDataType(each.type()),
-                                           ctx.device_context());
+                                           ctx.device_context(), layout,
+                                           library);
          }
        }
      }
@@ -116,6 +133,9 @@ class SumOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("X", "(vector<Tensor>) The input tensors of sum operator.")
        .AsDuplicable();
    AddOutput("Out", "(Tensor) The output tensor of sum operator.").Reuse("X");
+    AddAttr<bool>("use_mkldnn",
+                  "(bool, default false) Only used in mkldnn kernel")
+        .SetDefault(false);
    AddComment(R"DOC(
 Sum operator.

@@ -132,7 +152,6 @@ class SumOpVarTypeInference : public framework::VarTypeInference {
                  framework::BlockDesc* block) const override {
    auto& inputs = op_desc.Input("X");
    auto var_type = framework::proto::VarType::SELECTED_ROWS;
-
    for (auto& name : op_desc.Input("X")) {
      VLOG(10) << name << " "
               << block->FindRecursiveOrCreateVar(name).GetType();
@@ -206,6 +225,7 @@ namespace ops = paddle::operators;

 REGISTER_OPERATOR(sum, ops::SumOp, ops::SumOpMaker, ops::SumGradMaker,
                  ops::SumOpVarTypeInference);
+
 REGISTER_OP_CPU_KERNEL(
    sum, ops::SumKernel<paddle::platform::CPUDeviceContext, float>,
    ops::SumKernel<paddle::platform::CPUDeviceContext, double>,

--- a/paddle/fluid/operators/while_op.cc
+++ b/paddle/fluid/operators/while_op.cc
@@ -203,11 +203,11 @@ class WhileGradOp : public framework::OperatorBase {
                ->set_lod(inside_tensor.lod());
          }
        }
-
        auto new_inside_name = cur_scope.Rename(inside_grad_name);
        auto sum_op = framework::OpRegistry::CreateOp(
            "sum", {{"X", {pg_names[param_id], new_inside_name}}},
-            {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{});
+            {{"Out", {pg_names[param_id]}}},
+            framework::AttributeMap{{"use_mkldnn", {false}}});
        sum_op->Run(cur_scope, dev_place);
        cur_scope.Rename(new_inside_name, inside_grad_name);
      }

--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -106,14 +106,6 @@ class CUDADeviceContext : public DeviceContext {
    PADDLE_ENFORCE(cudaEventRecord(ev, stream_));
  }

-  // FIXME(zcd): A temporary fix for some language model that has sparse
-  // parameter.
-  template <typename Callback>
-  void RecordEventNoMutex(cudaEvent_t ev, Callback callback) {
-    callback();
-    PADDLE_ENFORCE(cudaEventRecord(ev, stream_));
-  }
-
 private:
  CUDAPlace place_;


--- a/paddle/fluid/platform/dynload/CMakeLists.txt
+++ b/paddle/fluid/platform/dynload/CMakeLists.txt
 cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)

-list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc nccl.cc)
+list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc)
+
+# There is no macOS version of NCCL.
+if (NOT APPLE)
+  list(APPEND CUDA_SRCS nccl.cc)
+endif()
+
 if (TENSORRT_FOUND)
  list(APPEND CUDA_SRCS tensorrt.cc)
 endif()

-
 configure_file(cupti_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/cupti_lib_path.h)
 if (CUPTI_FOUND)
    list(APPEND CUDA_SRCS cupti.cc)

--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -44,8 +44,10 @@ limitations under the License. */
 #include "paddle/fluid/platform/dynload/cublas.h"
 #include "paddle/fluid/platform/dynload/cudnn.h"
 #include "paddle/fluid/platform/dynload/curand.h"
+#ifndef __APPLE__
 #include "paddle/fluid/platform/dynload/nccl.h"
-#endif
+#endif  // __APPLE__
+#endif  // PADDLE_WITH_CUDA

 namespace paddle {
 namespace platform {
@@ -174,6 +176,7 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
  throw std::runtime_error(err + string::Sprintf(args...));
 }

+#ifndef __APPLE__
 template <typename... Args>
 inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
    ncclResult_t stat, const Args&... args) {
@@ -184,7 +187,7 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
                             string::Sprintf(args...));
  }
 }
-
+#endif  // __APPLE__
 #endif  // PADDLE_WITH_CUDA

 template <typename T>

--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
@@ -99,5 +99,143 @@ inline mkldnn::memory::format GetMKLDNNFormat(const mkldnn::memory memory) {
      memory.get_primitive_desc().desc().data.format);
 }

+// Reads the memory format recorded in the destination primitive descriptor of
+// an MKLDNN sum primitive descriptor.
+inline mkldnn::memory::format GetMKLDNNFormat(
+    const mkldnn::sum::primitive_desc& sum_pd) {
+  const auto dst_format = sum_pd.dst_primitive_desc().desc().data.format;
+  return static_cast<mkldnn::memory::format>(dst_format);
+}
+
+// Helper that caches MKLDNN memory objects and reorder primitives as blobs in
+// an MKLDNNDeviceContext, so that they can be fetched again instead of being
+// re-created.
+//
+// Every blob is stored under `base_key + suffix`. The handler tracks whether
+// any object was found in the cache (`is_reusing_`); once that happened, a
+// later cache miss is reported as an error, because all objects of one
+// operator instance are expected to be either all cached or all new.
+class MKLDNNHandler {
+ public:
+  // dev_ctx:  device context whose blob storage is used as the cache.
+  // engine:   engine used when building memory primitive descriptors.
+  // base_key: prefix of every cache key generated by this handler.
+  MKLDNNHandler(const MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
+                const std::string& base_key)
+      : dev_ctx_(dev_ctx),
+        engine_(engine),
+        key_(base_key),
+        is_reusing_(false) {}
+
+  // The Acquire*Memory wrappers below differ only in the key suffix under
+  // which the memory object is cached.
+  std::shared_ptr<mkldnn::memory> AcquireSrcMemory(
+      const mkldnn::memory::desc& md, void* ptr) {
+    return this->AcquireMemory(md, ptr, "@user_src_mem_p");
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireWeightsMemory(
+      const mkldnn::memory::desc& md, void* ptr) {
+    return this->AcquireMemory(md, ptr, "@user_weights_mem_p");
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireDstMemory(
+      const mkldnn::memory::desc& md, void* ptr) {
+    return this->AcquireMemory(md, ptr, "@user_dst_mem_p");
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireDiffDstMemory(
+      const mkldnn::memory::desc& md, void* ptr) {
+    return this->AcquireMemory(md, ptr, "@user_diff_dst_mem_p");
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireDiffSrcMemory(
+      const mkldnn::memory::desc& md, void* ptr) {
+    return this->AcquireMemory(md, ptr, "@user_diff_src_mem_p");
+  }
+
+  // Returns the memory object cached under `key_ + suffix`, pointing it at
+  // `ptr`. On a cache miss a new memory object is created from the given
+  // memory primitive descriptor and stored in the device context.
+  std::shared_ptr<mkldnn::memory> AcquireMemoryFromPrimitive(
+      mkldnn::memory::primitive_desc mdp, void* ptr,
+      const std::string& suffix) {
+    auto local_key = key_ + suffix;
+    auto mem_p =
+        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
+    // A miss is only acceptable while nothing has been reused yet.
+    PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false),
+                   "Fail to find mem primitive in device context");
+    if (mem_p == nullptr) {
+      mem_p = std::make_shared<mkldnn::memory>(mdp, ptr);
+      dev_ctx_.SetBlob(local_key, mem_p);
+    } else {
+      mem_p->set_data_handle(ptr);
+      // Mark that reusing happened. All primitives from operator instance
+      // should be reused or none of them. So we check consistency
+      is_reusing_ = true;
+    }
+    return mem_p;
+  }
+
+  // Same as AcquireMemoryFromPrimitive, but the memory primitive descriptor
+  // is built from a memory descriptor and this handler's engine (only on a
+  // cache miss).
+  std::shared_ptr<mkldnn::memory> AcquireMemory(const mkldnn::memory::desc& md,
+                                                void* ptr,
+                                                const std::string& suffix) {
+    /*Generate key*/
+    auto local_key = key_ + suffix;
+    auto mem_p =
+        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
+    // A miss is only acceptable while nothing has been reused yet.
+    PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false),
+                   "Fail to find mem primitive in device context");
+    if (mem_p == nullptr) {
+      mem_p = std::make_shared<mkldnn::memory>(
+          mkldnn::memory::primitive_desc{md, engine_}, ptr);
+      dev_ctx_.SetBlob(local_key, mem_p);
+    } else {
+      mem_p->set_data_handle(ptr);
+      // Mark that reusing happened. All primitives from operator instance
+      // should be reused or none of them. So we check consistency
+      is_reusing_ = true;
+    }
+    return mem_p;
+  }
+
+  // Returns the memory object a primitive should read for the given user
+  // memory, appending a reorder to `pipeline` when one is needed.
+  //
+  // mpd:           memory primitive descriptor expected by the primitive.
+  // user_mpd:      memory primitive descriptor of the user's memory.
+  // user_memory_p: the user's memory object.
+  // suffix:        key suffix for the cached target memory; the reorder is
+  //                cached under the same key with "reorder_p" appended.
+  // pipeline:      receives the reorder primitive, if any.
+  //
+  // On a cache miss: when `mpd` differs from `user_mpd`, a new target memory
+  // and a reorder from the user memory into it are created and cached;
+  // otherwise the user memory itself becomes the target. On a cache hit the
+  // cached target is returned and the cached reorder (if one exists) is
+  // appended to `pipeline`.
+  std::shared_ptr<mkldnn::memory> AcquireMemory(
+      mkldnn::memory::primitive_desc& mpd,
+      mkldnn::memory::primitive_desc& user_mpd,
+      const std::shared_ptr<mkldnn::memory> user_memory_p,
+      const std::string& suffix, std::vector<mkldnn::primitive>& pipeline) {
+    // create reorder primitive if the input format is not the preferred one
+    auto local_key = key_ + suffix;
+    auto key_reorder_p = key_ + suffix + "reorder_p";
+
+    auto target_memory_p =
+        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
+    // A miss is only acceptable while nothing has been reused yet.
+    PADDLE_ENFORCE((target_memory_p != nullptr) || (is_reusing_ == false),
+                   "Fail to find mem primitive in device context");
+    if (target_memory_p == nullptr) {
+      target_memory_p = user_memory_p;
+      if (mpd != user_mpd) {
+        target_memory_p = std::make_shared<mkldnn::memory>(mpd);
+
+        auto reorder_p =
+            std::make_shared<mkldnn::reorder>(*user_memory_p, *target_memory_p);
+        dev_ctx_.SetBlob(key_reorder_p, reorder_p);
+        pipeline.push_back(*reorder_p);
+      }
+      dev_ctx_.SetBlob(local_key, target_memory_p);
+    } else {
+      // Make reorder if needed
+      auto reorder_p = std::static_pointer_cast<mkldnn::reorder>(
+          dev_ctx_.GetBlob(key_reorder_p));
+      if (reorder_p != nullptr) {
+        pipeline.push_back(*reorder_p);
+      }
+      is_reusing_ = true;
+    }
+    return target_memory_p;
+  }
+
+  // Builds a key string from tensor dimensions: every dimension is written
+  // followed by "-", then `suffix` is appended (e.g. {2, 3} and "x" give
+  // "2-3-x"). The dims are only read, hence the const reference.
+  static std::string GetHash(const mkldnn::memory::dims& operand_dims,
+                             const std::string& suffix) {
+    auto dims2str = [](const mkldnn::memory::dims& dims) {
+      std::string dstr = "";
+      for (size_t i = 0; i < dims.size(); ++i) {
+        dstr += std::to_string(dims[i]) + "-";
+      }
+      return dstr;
+    };
+    return dims2str(operand_dims) + suffix;
+  }
+
+ protected:
+  const MKLDNNDeviceContext& dev_ctx_;  // blob cache lives here
+  mkldnn::engine engine_;
+  std::string key_;  // prefix of every cache key
+  bool is_reusing_;  // true once any cached object has been reused
+};
+
 }  // namespace platform
 }  // namespace paddle
--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
@@ -268,7 +268,8 @@ void BindOpDesc(pybind11::module *m) {
      .value("STRINGS", pd::proto::AttrType::STRINGS)
      .value("BOOL", pd::proto::AttrType::BOOLEAN)
      .value("BOOLS", pd::proto::AttrType::BOOLEANS)
-      .value("BLOCK", pd::proto::AttrType::BLOCK);
+      .value("BLOCK", pd::proto::AttrType::BLOCK)
+      .value("BLOCKS", pd::proto::AttrType::BLOCKS);

  pybind11::class_<pd::OpDesc> op_desc(*m, "OpDesc", "");
  op_desc
@@ -293,6 +294,7 @@ void BindOpDesc(pybind11::module *m) {
      .def("set_attr", &pd::OpDesc::SetAttr)
      .def("attr", &pd::OpDesc::GetAttr)
      .def("set_block_attr", &pd::OpDesc::SetBlockAttr)
+      .def("set_blocks_attr", &pd::OpDesc::SetBlocksAttr)
      .def("set_serialized_attr",
           [](pd::OpDesc &self, const std::string &name,
              const pybind11::bytes &seriralized) {

--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -167,9 +167,6 @@ PYBIND11_PLUGIN(core) {
      .def("set_lod",
           [](LoDTensor &self, const std::vector<std::vector<size_t>> &lod) {
             // the input lod is offset-based level-of-detail info
-             LOG(WARNING)
-                 << "set_lod is deprecated and will be removed by 9.2018, "
-                    "please switch to set_recursive_sequence_lengths.";
             LoD new_lod;
             new_lod.reserve(lod.size());
             std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
@@ -196,8 +193,6 @@ PYBIND11_PLUGIN(core) {
      .def("lod",
           [](LoDTensor &self) -> std::vector<std::vector<size_t>> {
             // output the offset-based lod info
-             LOG(WARNING) << "lod is deprecated and will be removed by 9.2018, "
-                             "please switch to recursive_sequence_lengths.";
             LoD lod = self.lod();
             std::vector<std::vector<size_t>> new_lod;
             new_lod.reserve(lod.size());

--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -133,7 +133,7 @@ EOF
        -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \
        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
        -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \
-        -DWITH_ANAKIN=ON
+        -DWITH_ANAKIN=${WITH_ANAKIN:-ON}
 }

 function abort(){

--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -132,9 +132,9 @@ def _addup_repetitive_outputs_(op_descs):
    for idx, op_desc in enumerate(op_descs):
        for var_name in op_desc.input_arg_names():
            if len(renamed_vars[var_name]) > 1:
-                pending_sum_ops.append(
-                    (_create_op_desc_("sum", {"X": renamed_vars[var_name]},
-                                      {"Out": [var_name]}, {}), idx))
+                pending_sum_ops.append((_create_op_desc_(
+                    "sum", {"X": renamed_vars[var_name]}, {"Out": [var_name]},
+                    {"use_mkldnn": False}), idx))
                renamed_vars[var_name] = [var_name]
        for var_name in op_desc.output_arg_names():
            if var_name == core.empty_var_name(
@@ -161,8 +161,9 @@ def _addup_repetitive_outputs_(op_descs):
                renamed_vars[var_name].append(new_name)
    for var_name, inputs in renamed_vars.iteritems():
        if len(inputs) > 1:
-            pending_sum_ops.append((_create_op_desc_(
-                "sum", {"X": inputs}, {"Out": [var_name]}, {}), len(op_descs)))
+            pending_sum_ops.append(
+                (_create_op_desc_("sum", {"X": inputs}, {"Out": [var_name]},
+                                  {"use_mkldnn": False}), len(op_descs)))
    # sum_op descs are sorted according to their insert position
    for p in reversed(pending_sum_ops):
        op_descs.insert(p[1], p[0])

--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -78,6 +78,8 @@ def as_numpy(tensor):
    Returns:
        numpy.ndarray
    """
+    if isinstance(tensor, core.LoDTensorArray):
+        return [as_numpy(t) for t in tensor]
    if isinstance(tensor, list):
        return [as_numpy(t) for t in tensor]
    assert isinstance(tensor, core.LoDTensor)

--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -559,15 +559,9 @@ class Operator(object):
                if (attr_name not in self.attrs) or (
                        self.attrs[attr_name] is None):
                    continue
-                if isinstance(self.attrs[attr_name], Block):
-                    self.desc.set_block_attr(attr_name,
-                                             self.attrs[attr_name].desc)
-                elif isinstance(self.attrs[attr_name], core.BlockDesc) or \
-                        isinstance(self.attrs[attr_name], core.ProgramDesc):
-                    self.desc.set_serialized_attr(
-                        attr_name, self.attrs[attr_name].serialize_to_string())
-                else:
-                    self.desc.set_attr(attr_name, self.attrs[attr_name])
+                attr_val = self.attrs[attr_name]
+                self._update_desc_attr(attr_name, attr_val)
+
        self.desc.check_attrs()
        if self.has_kernel(type):
            self.desc.infer_var_type(self.block.desc)
@@ -714,8 +708,24 @@ class Operator(object):
            ValueError: If the type of value doesn't match with desc.attr_type(name).
        """
        self.attrs[name] = val
+        self._update_desc_attr(name, val)
+
+    def _update_desc_attr(self, name, val):
+        """
+        Update the value of desc's attribute by attribute's name.
+
+        Args:
+            name(str): the attribute name.
+            val(bool|int|str|float|list): the value of the attribute.
+
+        Raises:
+            ValueError: If the type of value doesn't match with desc.attr_type(name).
+        """
        if isinstance(val, Block):
            self.desc.set_block_attr(name, val.desc)
+        elif isinstance(val, list) and val and all(
+                isinstance(v, Block) for v in val):
+            self.desc.set_blocks_attr(name, [v.desc for v in val])
        elif isinstance(val, core.BlockDesc) or \
                isinstance(val, core.ProgramDesc):
            self.desc.set_serialized_attr(name, val.serialize_to_string())
@@ -1388,7 +1398,11 @@ class Program(object):
        * Set for_test to True when we want to clone the program for testing.

        Notes: This API DOES NOT prune any operator. Use
-        :code:`clone(for_test=True)` before backward and optimization please.
+        :code:`clone(for_test=True)` before backward and optimization please. e.g.
+
+            >>> test_program = fluid.default_main_program().clone(for_test=True)
+            >>> optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
+            >>> optimizer.minimize()

        Args:
            for_test(bool): True if change the :code:`is_test` attribute of

--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -186,7 +186,6 @@ class ListenAndServ(object):
        main_program = self.helper.main_program
        current_block = main_program.current_block()
        parent_block = self.parent_block()
-        empty_block = Program().global_block()

        parent_block.append_op(
            type='listen_and_serv',
@@ -195,8 +194,9 @@ class ListenAndServ(object):
            attrs={
                'endpoint': self.endpoint,
                'Fanin': self.fan_in,
-                'OptimizeBlock': current_block,
-                'PrefetchBlock': empty_block,
+                'optimize_blocks': [
+                    current_block
+                ],  # did not support multiple optimize blocks in layers
                'sync_mode': True,  # did not support async now in layers
                'grad_to_block_id': [""]
            })
@@ -469,10 +469,13 @@ def open_files(filenames,
       lod_levels(list): List of ints which declaring data lod_level.
       dtypes(list): List of strs which declaring data type.
       thread_num(int): The maximal concurrent prefetch thread number.
-       buffer_size(int): The size of prefetch buffer.
+       buffer_size(int|None): The size of prefetch buffer. If it is set to None,
+            buffer size will be thread_num * 3.
+            Default: None
       pass_num(int): Number of passes to run.
       for_parallel(Bool): Set it as True if you are going to run 
            subsequent operators in parallel.
+            Default: True

    Returns:
       Variable: A Reader Variable via which we can get file data.
@@ -492,7 +495,7 @@ def open_files(filenames,
         image, label = fluid.layers.io.read_file(reader)
    """
    if buffer_size is None:
-        buffer_size = thread_num
+        buffer_size = thread_num * 3
    if isinstance(filenames, basestring):
        filenames = [filenames]
    dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -23,6 +23,7 @@ from layer_function_generator import autodoc, templatedoc
 from tensor import concat
 import utils
 import random
+from .. import unique_name

 __all__ = [
    'fc',
@@ -198,7 +199,10 @@ def fc(input,
    else:
        pre_bias = helper.create_tmp_variable(dtype)
        helper.append_op(
-            type="sum", inputs={"X": mul_results}, outputs={"Out": pre_bias})
+            type="sum",
+            inputs={"X": mul_results},
+            outputs={"Out": pre_bias},
+            attrs={"use_mkldnn": use_mkldnn})
    # add bias
    pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)
    # add activation
@@ -4263,14 +4267,18 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
                                say :attr:`actual_shape` has a higher priority
                                than :attr:`shape`.
        act (str): The non-linear activation to be applied to output variable.
-        inplace(bool): If this flag is set true, a new output tensor is created
-                       whose data is copied from input x, otherwise the output
-                       shares data with input without copying.
+        inplace(bool): If this flag is set true, the output
+                       shares data with input without copying, otherwise
+                       a new output tensor is created
+                       whose data is copied from input x.
        name (str): The name of this layer. It is optional.

    Returns:
        Variable: The output tensor.

+    Raises:
+        TypeError: if actual_shape is neither Variable nor None.
+
    Examples:
        .. code-block:: python

@@ -4282,6 +4290,11 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):

    if not (isinstance(shape, list) or isinstance(shape, tuple)):
        raise ValueError("Input shape must be a python lsit or tuple.")
+    inputs = {"X": x}
+    if isinstance(actual_shape, Variable):
+        inputs["Shape"] = actual_shape
+    elif actual_shape is not None:
+        raise TypeError("actual_shape should either be Variable or None")

    # Validate the shape
    unk_dim_idx = -1
@@ -4302,9 +4315,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
    reshaped = helper.create_tmp_variable(dtype=x.dtype)
    helper.append_op(
        type="reshape",
-        inputs={"X": x,
-                "Shape": actual_shape}
-        if isinstance(actual_shape, Variable) else {"X": x},
+        inputs=inputs,
        attrs={"shape": shape,
               "inplace": inplace},
        outputs={"Out": reshaped})
@@ -4886,34 +4897,26 @@ def random_crop(x, shape, seed=None):
        >>> cropped_img = fluid.layers.random_crop(img, shape=[3, 224, 224])
    """
    helper = LayerHelper("random_crop", **locals())
-    dtype = helper.input_dtype()
+    dtype = x.dtype
    out = helper.create_tmp_variable(dtype)
    if seed is None:
        seed = random.randint(-65536, 65535)
-
+    op_attrs = {"shape": shape}
    if isinstance(seed, int):
-        seed_value = seed
-        seed = helper.create_tmp_variable(dtype="int64")
-        helper.append_op(
-            type="fill_constant",
-            inputs={},
-            outputs={"Out": seed},
-            attrs={
-                "dtype": seed.dtype,
-                "shape": [1],
-                "value": float(seed_value),
-                "force_cpu": True
-            })
+        op_attrs["startup_seed"] = seed
+        seed = helper.create_variable(
+            name=unique_name.generate("random_crop_seed"),
+            dtype="int64",
+            persistable=True)
    elif not isinstance(seed, Variable):
        raise ValueError("'seed' must be a Variable or an int.")
-    seed_out = helper.create_tmp_variable(dtype="int64")
    helper.append_op(
        type="random_crop",
        inputs={"X": x,
                "Seed": seed},
        outputs={"Out": out,
-                 "SeedOut": seed_out},
-        attrs={"shape": shape})
+                 "SeedOut": seed},
+        attrs=op_attrs)
    return out



--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -230,11 +230,15 @@ def sums(input, out=None):
    helper = LayerHelper('sum', **locals())
    if out is None:
        out = helper.create_tmp_variable(dtype=helper.input_dtype())
-    helper.append_op(type='sum', inputs={'X': input}, outputs={'Out': out})
+    helper.append_op(
+        type='sum',
+        inputs={'X': input},
+        outputs={'Out': out},
+        attrs={'use_mkldnn': False})
    return out


-def assign(input, output):
+def assign(input, output=None):
    """
    **Assign**

@@ -242,7 +246,7 @@ def assign(input, output):

    Args:
        input(Variable|numpy.ndarray): The source variable
-        output(Variable): The destination variable
+        output(Variable|None): The destination variable

    Returns:
        Variable: The destination variable that was supplied as the *output*.
@@ -255,6 +259,8 @@ def assign(input, output):
          fluid.layers.assign(hidden, out)
    """
    helper = LayerHelper('assign', **locals())
+    if output is None:
+        output = helper.create_tmp_variable(dtype=input.dtype)
    if isinstance(input, Variable):
        helper.append_op(
            type='assign', inputs={'X': [input]}, outputs={'Out': [output]})

--- a/python/paddle/fluid/metrics.py
+++ b/python/paddle/fluid/metrics.py
@@ -596,12 +596,12 @@ class Auc(MetricBase):
            tp, fn, tn, fp = 0, 0, 0, 0
            for i, lbl in enumerate(labels):
                if lbl:
-                    if predictions[i, 1] >= thresh:
+                    if preds[i, 1] >= thresh:
                        tp += 1
                    else:
                        fn += 1
                else:
-                    if predictions[i, 1] >= thresh:
+                    if preds[i, 1] >= thresh:
                        fp += 1
                    else:
                        tn += 1

--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -160,7 +160,7 @@ class ParallelExecutor(object):
            build_strategy, num_trainers, trainer_id)
        self.scope = scope

-    def run(self, fetch_list, feed=None, feed_dict=None):
+    def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=False):
        """
        Run a parallel executor with fetch_list.

@@ -196,6 +196,8 @@ class ParallelExecutor(object):
                to each device. Default None.
            feed_dict: Alias for feed parameter, for backward compatibility.
                This parameter has been deprecated. Default None.
+            return_numpy(bool): Whether to convert the fetched tensor to numpy.
+                Default: False.

        Returns:
            List: The fetched result list.
@@ -270,6 +272,9 @@ class ParallelExecutor(object):
        if self.is_dist:
            self.bcast_params()

+        if return_numpy:
+            return executor.as_numpy(arr)
+
        return [arr[i] for i in range(len(arr))]

    def bcast_params(self):

--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -43,8 +43,6 @@ list(REMOVE_ITEM TEST_OPS test_warpctc_op)
 list(REMOVE_ITEM TEST_OPS test_dist_train)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed)
-# TODO(wuyi): this test hungs on CI, will add it back later
-list(REMOVE_ITEM TEST_OPS test_listen_and_serv_op)
 foreach(TEST_OP ${TEST_OPS})
    py_test_modules(${TEST_OP} MODULES ${TEST_OP})
 endforeach(TEST_OP)
@@ -52,3 +50,4 @@ py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=$
 py_test_modules(test_dist_train MODULES test_dist_train SERIAL)
 py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
 py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
+set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20)
--- a/python/paddle/fluid/tests/unittests/test_elementwise_add_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_mkldnn_op.py
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+from op_test import OpTest
+from test_elementwise_add_op import *
+'''
+Some tests differ from the tests defined in test_elementwise_add_op.py
+because MKLDNN does not support 3-dimensional tensors.
+Tensors of that rank cause exceptions in the MKLDNN reorder primitive.
+'''
+
+
+class TestMKLDNNElementwiseAddOp(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype)
+        self.y = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype)
+        self.out = np.add(self.x, self.y)
+
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TestMKLDNNElementwiseAddOp_scalar(TestElementwiseAddOp_scalar):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
+        self.y = np.random.rand(1).astype(self.dtype)
+        self.out = self.x + self.y
+
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TestMKLDNNElementwiseAddOp_scalar2(TestElementwiseAddOp_scalar2):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
+        self.y = np.random.rand(1, 1).astype(self.dtype)
+        self.out = self.x + self.y
+
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TestMKLDNNElementwiseAddOp_Vector(TestElementwiseAddOp_Vector):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TesMKLDNNtElementwiseAddOp_broadcast_0(TestElementwiseAddOp_broadcast_0):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
+        self.y = np.random.rand(2).astype(self.dtype)
+        self.out = self.x + self.y.reshape(2, 1, 1, 1)
+
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TestMKLDNNElementwiseAddOp_broadcast_1(TestElementwiseAddOp_broadcast_1):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
+        self.y = np.random.rand(3).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 3, 1, 1)
+
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TestMKLDNNElementwiseAddOp_broadcast_2(TestElementwiseAddOp_broadcast_2):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(4).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 1, 1, 4)
+
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TestMKLDNNElementwiseAddOp_broadcast_3(TestElementwiseAddOp_broadcast_3):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TestMKLDNNElementwiseAddOp_broadcast_4(TestElementwiseAddOp_broadcast_4):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TestMKLDNNElementwiseAddOp_rowwise_add_0(
+        TestElementwiseAddOp_rowwise_add_0):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
+        self.y = np.random.rand(3, 4).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 3, 4, 1)
+
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TestMKLDNNElementwiseAddOp_rowwise_add_1(
+        TestElementwiseAddOp_rowwise_add_1):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TestMKLDNNElementwiseAddOp_channelwise_add(
+        TestElementwiseAddOp_channelwise_add):
+    def init_input_output(self):
+        self.x = np.random.rand(3, 5, 20, 20).astype(self.dtype)
+        self.y = np.random.rand(3, 1, 1, 1).astype(self.dtype)
+        self.out = self.x + self.y
+
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
@@ -18,19 +18,23 @@ from op_test import OpTest


 class TestElementwiseAddOp(OpTest):
+    def init_kernel_type(self):
+        self.use_mkldnn = False
+
    def setUp(self):
        self.op_type = "elementwise_add"
        self.dtype = np.float32
        self.axis = -1
        self.init_dtype()
        self.init_input_output()
+        self.init_kernel_type()
        self.init_axis()

        self.inputs = {
            'X': OpTest.np_dtype_to_fluid_dtype(self.x),
            'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
        }
-        self.attrs = {'axis': self.axis}
+        self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn}
        self.outputs = {'Out': self.out}

    def test_check_output(self):

--- a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
@@ -94,7 +94,7 @@ class TestListenAndServOp(OpTest):
        self._wait_ps_ready(p1.pid)

        # raise SIGTERM to pserver
-        os.kill(p1.pid, signal.SIGKILL)
+        os.kill(p1.pid, signal.SIGINT)
        p1.join()

        # run pserver on CPU in async mode
@@ -102,7 +102,7 @@ class TestListenAndServOp(OpTest):
        self._wait_ps_ready(p2.pid)

        # raise SIGTERM to pserver
-        os.kill(p2.pid, signal.SIGKILL)
+        os.kill(p2.pid, signal.SIGTERM)
        p2.join()



--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
@@ -75,7 +75,9 @@ class TestFetchOp(unittest.TestCase):
                    fetch_list.append(k)

            for data in train_inputs:
-                ret = pe.run(fetch_list, feed=feeder.feed(data))
+                ret = pe.run(fetch_list,
+                             feed=feeder.feed(data),
+                             return_numpy=True)
                for i in range(len(fetch_list)):
                    assert not math.isnan(np.sum(ret[i])) and \
                           not math.isinf(np.sum(ret[i]))

--- a/python/paddle/fluid/tests/unittests/test_sum_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sum_mkldnn_op.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from test_sum_op import TestSumOp
+
+
+class TestMKLDNN(TestSumOp):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_sum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sum_op.py
@@ -20,12 +20,15 @@ from op_test import OpTest
 class TestSumOp(OpTest):
    def setUp(self):
        self.op_type = "sum"
+        self.use_mkldnn = False
+        self.init_kernel_type()
        x0 = np.random.random((3, 4)).astype('float32')
        x1 = np.random.random((3, 4)).astype('float32')
        x2 = np.random.random((3, 4)).astype('float32')
        self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]}
        y = x0 + x1 + x2
        self.outputs = {'Out': y}
+        self.attrs = {'use_mkldnn': self.use_mkldnn}

    def test_check_output(self):
        self.check_output()
@@ -33,6 +36,9 @@ class TestSumOp(OpTest):
    def test_check_grad(self):
        self.check_grad(['x0'], 'Out')

+    def init_kernel_type(self):
+        pass
+

 if __name__ == "__main__":
    unittest.main()
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -396,7 +396,7 @@ class DistributeTranspiler(object):
                    return varname
            return ""

-        def __clone_lr_op_sub_block__(op, program, new_block):
+        def __clone_lr_op_sub_block__(op, program, lr_block):
            if not op.has_attr('sub_block'):
                return

@@ -405,36 +405,41 @@ class DistributeTranspiler(object):
            assert isinstance(origin_block, Block)
            # we put the new sub block to new block to follow the block
            # hierarchy of the original blocks
-            new_sub_block = program.create_block(new_block.idx)
+            new_sub_block = program.create_block(lr_block.idx)

            # clone vars
            for var in origin_block.vars:
                new_sub_block.clone_variable(var)

            # clone ops
-            for op in origin_block.ops:
-                self._clone_lr_op(program, new_sub_block, op)
+            for origin_op in origin_block.ops:
+                cloned_op = self._clone_lr_op(program, new_sub_block, origin_op)
                # clone sub_block of op
-                __clone_lr_op_sub_block__(op, program, new_sub_block)
+                __clone_lr_op_sub_block__(cloned_op, program, new_sub_block)

            # reset the block of op
            op.set_attr('sub_block', new_sub_block)

        # append lr decay ops to the child block if exists
        lr_ops = self._get_lr_ops()
+        # record optimize blocks so that the pserver can run them in parallel
+        optimize_blocks = []
        if len(lr_ops) > 0:
            lr_decay_block = pserver_program.create_block(
                pserver_program.num_blocks - 1)
+            optimize_blocks.append(lr_decay_block)
            for _, op in enumerate(lr_ops):
-                self._append_pserver_non_opt_ops(lr_decay_block, op)
+                cloned_op = self._append_pserver_non_opt_ops(lr_decay_block, op)
                # append sub blocks to pserver_program in lr_decay_op
-                __clone_lr_op_sub_block__(op, pserver_program, lr_decay_block)
+                __clone_lr_op_sub_block__(cloned_op, pserver_program,
+                                          lr_decay_block)

        # append op to the current block
        grad_to_block_id = []
        pre_block_idx = pserver_program.num_blocks - 1
        for idx, opt_op in enumerate(opt_op_on_pserver):
            per_opt_block = pserver_program.create_block(pre_block_idx)
+            optimize_blocks.append(per_opt_block)
            # append grad merging ops before clip and weight decay
            for _, op in enumerate(self.optimize_ops):
                # find the origin @GRAD var before clipping
@@ -453,6 +458,7 @@ class DistributeTranspiler(object):
        if global_ops:
            opt_state_block = pserver_program.create_block(
                pserver_program.num_blocks - 1)
+            optimize_blocks.append(opt_state_block)
            for glb_op in global_ops:
                __append_optimize_op__(glb_op, opt_state_block,
                                       grad_to_block_id, None)
@@ -474,11 +480,11 @@ class DistributeTranspiler(object):
            assert len(prefetch_var_name_to_block_id) == 0

        attrs = {
-            "OptimizeBlock": pserver_program.block(1),
+            "optimize_blocks": optimize_blocks,
            "endpoint": endpoint,
            "Fanin": self.trainer_num,
            "sync_mode": self.sync_mode,
-            "grad_to_block_id": grad_to_block_id
+            "grad_to_block_id": grad_to_block_id,
        }
        if len(prefetch_var_name_to_block_id) > 0:
            attrs['prefetch_var_name_to_block_id'] \
@@ -872,7 +878,8 @@ class DistributeTranspiler(object):
            table_opt_block.append_op(
                type="sum",
                inputs={"X": pserver_side_table_grad_list},
-                outputs={"Out": [grad_var]})
+                outputs={"Out": [grad_var]},
+                attrs={"use_mkldnn": False})
        else:
            # in async_mode, for table gradient, it also need to be splited to each parameter server
            origin_grad_name = grad_var.name
@@ -1104,7 +1111,8 @@ class DistributeTranspiler(object):
            optimize_block.append_op(
                type="sum",
                inputs={"X": vars2merge},
-                outputs={"Out": merged_var})
+                outputs={"Out": merged_var},
+                attrs={"use_mkldnn": False})
            # TODO(panyx0718): What if it's SELECTED_ROWS.
            if not merged_var.type == core.VarDesc.VarType.SELECTED_ROWS:
                optimize_block.append_op(
@@ -1209,7 +1217,7 @@ class DistributeTranspiler(object):
                if var not in program.global_block().vars:
                    block.clone_variable(var)

-        block.append_op(
+        return block.append_op(
            type=op.type, inputs=inputs, outputs=outputs, attrs=op.attrs)

    def _append_pserver_non_opt_ops(self, optimize_block, opt_op):
@@ -1247,7 +1255,7 @@ class DistributeTranspiler(object):
                elif not program.global_block().vars.has_key(var.name):
                    program.global_block().clone_variable(var)

-        optimize_block.append_op(
+        return optimize_block.append_op(
            type=opt_op.type,
            inputs=inputs,
            outputs=outputs,
@@ -1291,16 +1299,6 @@ class DistributeTranspiler(object):
                    ufind.union(op1, op2)
        return ufind

-    def _is_opt_role_op(self, op):
-        # NOTE: depend on oprole to find out whether this op is for
-        # optimize
-        op_maker = core.op_proto_and_checker_maker
-        optimize_role = core.op_proto_and_checker_maker.OpRole.Optimize
-        if op_maker.kOpRoleAttrName() in op.attrs and \
-            int(op.attrs[op_maker.kOpRoleAttrName()]) == int(optimize_role):
-            return True
-        return False
-
    def _is_optimizer_op(self, op):
        if "Param" in op.input_names and \
            "LearningRate" in op.input_names:
@@ -1391,7 +1389,10 @@ class DistributeTranspiler(object):
        params_grads = []
        origin_var_dict = self.origin_program.global_block().vars
        for op in block.ops:
-            if self._is_opt_role_op(op):
+            # NOTE(Yancey1989): we cannot use the op role to tell whether an op is
+            # an optimizer op, because every op in the optimizer sub-graph
+            # is assigned the optimizer op role
+            if self._is_optimizer_op(op):
                opt_ops.append(op)
                # HACK(wuyi): if we find grad vars from input of optimize
                # ops, we may get the output of clip op. Use syntax "@GRAD"

--- a/python/paddle/reader/decorator.py
+++ b/python/paddle/reader/decorator.py
--- a/python/paddle/v2/dataset/cifar.py
+++ b/python/paddle/v2/dataset/cifar.py
@@ -43,7 +43,7 @@ CIFAR100_URL = URL_PREFIX + 'cifar-100-python.tar.gz'
 CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85'


-def reader_creator(filename, sub_name):
+def reader_creator(filename, sub_name, cycle=False):
    def read_batch(batch):
        data = batch['data']
        labels = batch.get('labels', batch.get('fine_labels', None))
@@ -56,10 +56,13 @@ def reader_creator(filename, sub_name):
            names = (each_item.name for each_item in f
                     if sub_name in each_item.name)

+            while True:
                for name in names:
                    batch = cPickle.load(f.extractfile(name))
                    for item in read_batch(batch):
                        yield item
+                if not cycle:
+                    break

    return reader

@@ -94,34 +97,40 @@ def test100():
        'test')


-def train10():
+def train10(cycle=False):
    """
    CIFAR-10 training set creator.

    It returns a reader creator, each sample in the reader is image pixels in
    [0, 1] and label in [0, 9].

+    :param cycle: whether to cycle through the dataset
+    :type cycle: bool
    :return: Training reader creator
    :rtype: callable
    """
    return reader_creator(
        paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
-        'data_batch')
+        'data_batch',
+        cycle=cycle)


-def test10():
+def test10(cycle=False):
    """
    CIFAR-10 test set creator.

    It returns a reader creator, each sample in the reader is image pixels in
    [0, 1] and label in [0, 9].

+    :param cycle: whether to cycle through the dataset
+    :type cycle: bool
    :return: Test reader creator.
    :rtype: callable
    """
    return reader_creator(
        paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
-        'test_batch')
+        'test_batch',
+        cycle=cycle)


 def fetch():

--- a/python/paddle/v2/dataset/flowers.py
+++ b/python/paddle/v2/dataset/flowers.py
@@ -76,7 +76,8 @@ def reader_creator(data_file,
                   dataset_name,
                   mapper,
                   buffered_size=1024,
-                   use_xmap=True):
+                   use_xmap=True,
+                   cycle=False):
    '''
    1. read images from tar file and
        merge images into batch files in 102flowers.tgz_batch/
@@ -96,6 +97,8 @@ def reader_creator(data_file,
    :type mapper: callable
    :param buffered_size: the size of buffer used to process images
    :type buffered_size: int
+    :param cycle: whether to cycle through the dataset
+    :type cycle: bool
    :return: data reader
    :rtype: callable
    '''
@@ -108,6 +111,7 @@ def reader_creator(data_file,
    file_list = batch_images_from_tar(data_file, dataset_name, img2label)

    def reader():
+        while True:
            for file in open(file_list):
                file = file.strip()
                batch = None
@@ -117,6 +121,8 @@ def reader_creator(data_file,
                labels = batch['label']
                for sample, label in itertools.izip(data, batch['label']):
                    yield sample, int(label) - 1
+            if not cycle:
+                break

    if use_xmap:
        cpu_num = int(os.environ.get('CPU_NUM', cpu_count()))
@@ -125,7 +131,7 @@ def reader_creator(data_file,
        return map_readers(mapper, reader)


-def train(mapper=train_mapper, buffered_size=1024, use_xmap=True):
+def train(mapper=train_mapper, buffered_size=1024, use_xmap=True, cycle=False):
    '''
    Create flowers training set reader.
    It returns a reader, each sample in the reader is
@@ -138,17 +144,23 @@ def train(mapper=train_mapper, buffered_size=1024, use_xmap=True):
    :type mapper: callable
    :param buffered_size: the size of buffer used to process images
    :type buffered_size: int
+    :param cycle: whether to cycle through the dataset
+    :type cycle: bool
    :return: train data reader
    :rtype: callable
    '''
    return reader_creator(
        download(DATA_URL, 'flowers', DATA_MD5),
        download(LABEL_URL, 'flowers', LABEL_MD5),
-        download(SETID_URL, 'flowers', SETID_MD5), TRAIN_FLAG, mapper,
-        buffered_size, use_xmap)
+        download(SETID_URL, 'flowers', SETID_MD5),
+        TRAIN_FLAG,
+        mapper,
+        buffered_size,
+        use_xmap,
+        cycle=cycle)


-def test(mapper=test_mapper, buffered_size=1024, use_xmap=True):
+def test(mapper=test_mapper, buffered_size=1024, use_xmap=True, cycle=False):
    '''
    Create flowers test set reader.
    It returns a reader, each sample in the reader is
@@ -161,14 +173,20 @@ def test(mapper=test_mapper, buffered_size=1024, use_xmap=True):
    :type mapper: callable
    :param buffered_size: the size of buffer used to process images
    :type buffered_size: int
+    :param cycle: whether to cycle through the dataset
+    :type cycle: bool
    :return: test data reader
    :rtype: callable
    '''
    return reader_creator(
        download(DATA_URL, 'flowers', DATA_MD5),
        download(LABEL_URL, 'flowers', LABEL_MD5),
-        download(SETID_URL, 'flowers', SETID_MD5), TEST_FLAG, mapper,
-        buffered_size, use_xmap)
+        download(SETID_URL, 'flowers', SETID_MD5),
+        TEST_FLAG,
+        mapper,
+        buffered_size,
+        use_xmap,
+        cycle=cycle)


 def valid(mapper=test_mapper, buffered_size=1024, use_xmap=True):