Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into expose_Parameter_2

d0a8eea2 · fengjiayi · e9ed62bf · a64844ad · d0a8eea2 · d0a8eea2
92 changed file
--- a/Dockerfile
+++ b/Dockerfile
@@ -76,7 +76,8 @@ RUN easy_install -U pip && \
    pip install sphinx-rtd-theme==0.1.9 recommonmark

 RUN pip install pre-commit 'ipython==5.3.0' && \
-    pip install 'ipykernel==4.6.0' 'jupyter==1.0.0'
+    pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+    pip install opencv-python

 #For docstring checker
 RUN pip install pylint pytest astroid isort

--- a/cmake/external/grpc.cmake
+++ b/cmake/external/grpc.cmake
@@ -40,12 +40,12 @@ ExternalProject_Add(
    # NOTE(wuyi):
    # this package is generated by following steps:
    # 1. git clone -b v1.8.x https://github.com/grpc/grpc.git
-    # 2. submodule update --init
+    # 2. git submodule update --init
    # 3. keep only zlib, cares, protobuf, boringssl under "third_party",
    #    checkout and clean other dirs under third_party
    # 4. remove .git, and package the directory.
-    URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.8.x.tar.gz"
-    URL_MD5  "c9c58ee7d0e8929a63155af6a2ecdbd0"
+    URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.10.x.tar.gz"
+    URL_MD5  "1f268a2aff6759839dccd256adcc91cf"
    PREFIX          ${GRPC_SOURCES_DIR}
    UPDATE_COMMAND  ""
    CONFIGURE_COMMAND ""

--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -54,7 +54,7 @@ ExternalProject_Add(
    ${EXTERNAL_PROJECT_LOG_ARGS}
    DEPENDS             ${MKLDNN_DEPENDS}
    GIT_REPOSITORY      "https://github.com/01org/mkl-dnn.git"
-    GIT_TAG             "db3424ad44901513c03a1ea31ccaacdf633fbe9f"
+    GIT_TAG             "a29d8487a63afca3d5b8c5bbdbb473cf8ccc6e51"
    PREFIX              ${MKLDNN_SOURCES_DIR}
    UPDATE_COMMAND      ""
    CMAKE_ARGS          -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}

--- a/doc/fluid/api/evaluator.rst
+++ b/doc/fluid/api/evaluator.rst
 ..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

-=========
-evaluator
-=========
+=============
+fluid.average
+=============
+
+.. _api_fluid_average_WeightedAverage:
+
+WeightedAverage
+---------------
+
+..  autoclass:: paddle.fluid.average.WeightedAverage
+    :members:
+    :noindex:

--- a/doc/fluid/api/backward.rst
+++ b/doc/fluid/api/backward.rst
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+==============
+fluid.backward
+==============
+
+.. _api_fluid_backward_append_backward:
+
+append_backward
+---------------
+
+..  autofunction:: paddle.fluid.backward.append_backward
+    :noindex:
+
+.. _api_fluid_backward_calc_gradient:
+
+calc_gradient
+-------------
+
+..  autofunction:: paddle.fluid.backward.calc_gradient
+    :noindex:
+
--- a/doc/fluid/api/clip.rst
+++ b/doc/fluid/api/clip.rst
 ..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

-====
-clip
-====
+==========
+fluid.clip
+==========
+
+.. _api_fluid_clip_ErrorClipByValue:

 ErrorClipByValue
 ----------------
@@ -12,6 +14,8 @@ ErrorClipByValue
    :members:
    :noindex:

+.. _api_fluid_clip_GradientClipByValue:
+
 GradientClipByValue
 -------------------

@@ -19,6 +23,8 @@ GradientClipByValue
    :members:
    :noindex:

+.. _api_fluid_clip_GradientClipByNorm:
+
 GradientClipByNorm
 ------------------

@@ -26,6 +32,8 @@ GradientClipByNorm
    :members:
    :noindex:

+.. _api_fluid_clip_GradientClipByGlobalNorm:
+
 GradientClipByGlobalNorm
 ------------------------

@@ -33,15 +41,3 @@ GradientClipByGlobalNorm
    :members:
    :noindex:

-append_gradient_clip_ops
------------------------
-
-..  autofunction:: paddle.fluid.clip.append_gradient_clip_ops
-    :noindex:
-
-error_clip_callback
-------------------
-
-..  autofunction:: paddle.fluid.clip.error_clip_callback
-    :noindex:
-
--- a/doc/fluid/api/data.rst
+++ b/doc/fluid/api/data.rst
-==================================
-Data Reader Interface and DataSets
-==================================
-
-..  toctree::
-    :maxdepth: 1
-
-    data/data_reader.rst
-    data/image.rst
-    data/dataset.rst
--- a/doc/fluid/api/data_feeder.rst
+++ b/doc/fluid/api/data_feeder.rst
 ..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

-===========
-data_feeder
-===========
+=================
+fluid.data_feeder
+=================
+
+.. _api_fluid_data_feeder_DataFeeder:

 DataFeeder
 ----------

--- a/doc/fluid/api/detection.rst
+++ b/doc/fluid/api/detection.rst
--- a/doc/fluid/api/executor.rst
+++ b/doc/fluid/api/executor.rst
 ..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

-========
-executor
-========
+==============
+fluid.executor
+==============
+
+.. _api_fluid_executor_Executor:

 Executor
 --------
@@ -12,24 +14,32 @@ Executor
    :members:
    :noindex:

+.. _api_fluid_executor_global_scope:
+
 global_scope
 ------------

 ..  autofunction:: paddle.fluid.executor.global_scope
    :noindex:

+.. _api_fluid_executor_scope_guard:
+
 scope_guard
 -----------

 ..  autofunction:: paddle.fluid.executor.scope_guard
    :noindex:

-switch_scope
------------
+.. _api_fluid_executor__switch_scope:
+
+_switch_scope
+-------------

-..  autofunction:: paddle.fluid.executor.switch_scope
+..  autofunction:: paddle.fluid.executor._switch_scope
    :noindex:

+.. _api_fluid_executor_fetch_var:
+
 fetch_var
 ---------


--- a/doc/fluid/api/fluid.rst
+++ b/doc/fluid/api/fluid.rst
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+=====
+fluid
+=====
+
+.. _api_fluid_Block:
+
+Block
+-----
+
+..  autoclass:: paddle.fluid.Block
+    :members:
+    :noindex:
+
+.. _api_fluid_Variable:
+
+Variable
+--------
+
+..  autoclass:: paddle.fluid.Variable
+    :members:
+    :noindex:
+
+.. _api_fluid_Program:
+
+Program
+-------
+
+..  autoclass:: paddle.fluid.Program
+    :members:
+    :noindex:
+
+.. _api_fluid_Operator:
+
+Operator
+--------
+
+..  autoclass:: paddle.fluid.Operator
+    :members:
+    :noindex:
+
+.. _api_fluid_default_startup_program:
+
+default_startup_program
+-----------------------
+
+..  autofunction:: paddle.fluid.default_startup_program
+    :noindex:
+
+.. _api_fluid_default_main_program:
+
+default_main_program
+--------------------
+
+..  autofunction:: paddle.fluid.default_main_program
+    :noindex:
+
+.. _api_fluid_program_guard:
+
+program_guard
+-------------
+
+..  autofunction:: paddle.fluid.program_guard
+    :noindex:
+
+.. _api_fluid_get_var:
+
+get_var
+-------
+
+..  autofunction:: paddle.fluid.get_var
+    :noindex:
+
+.. _api_fluid_Executor:
+
+Executor
+--------
+
+..  autoclass:: paddle.fluid.Executor
+    :members:
+    :noindex:
+
+.. _api_fluid_global_scope:
+
+global_scope
+------------
+
+..  autofunction:: paddle.fluid.global_scope
+    :noindex:
+
+.. _api_fluid_scope_guard:
+
+scope_guard
+-----------
+
+..  autofunction:: paddle.fluid.scope_guard
+    :noindex:
+
+.. _api_fluid__switch_scope:
+
+_switch_scope
+-------------
+
+..  autofunction:: paddle.fluid._switch_scope
+    :noindex:
+
+.. _api_fluid_fetch_var:
+
+fetch_var
+---------
+
+..  autofunction:: paddle.fluid.fetch_var
+    :noindex:
+
+.. _api_fluid_Go:
+
+Go
+--
+
+..  autoclass:: paddle.fluid.Go
+    :members:
+    :noindex:
+
+.. _api_fluid_make_channel:
+
+make_channel
+------------
+
+..  autofunction:: paddle.fluid.make_channel
+    :noindex:
+
+.. _api_fluid_channel_send:
+
+channel_send
+------------
+
+..  autofunction:: paddle.fluid.channel_send
+    :noindex:
+
+.. _api_fluid_channel_recv:
+
+channel_recv
+------------
+
+..  autofunction:: paddle.fluid.channel_recv
+    :noindex:
+
+.. _api_fluid_channel_close:
+
+channel_close
+-------------
+
+..  autofunction:: paddle.fluid.channel_close
+    :noindex:
+
+.. _api_fluid_Select:
+
+Select
+------
+
+..  autoclass:: paddle.fluid.Select
+    :members:
+    :noindex:
+
+.. _api_fluid_Trainer:
+
+Trainer
+-------
+
+..  autoclass:: paddle.fluid.Trainer
+    :members:
+    :noindex:
+
+.. _api_fluid_BeginEpochEvent:
+
+BeginEpochEvent
+---------------
+
+..  autoclass:: paddle.fluid.BeginEpochEvent
+    :members:
+    :noindex:
+
+.. _api_fluid_EndEpochEvent:
+
+EndEpochEvent
+-------------
+
+..  autoclass:: paddle.fluid.EndEpochEvent
+    :members:
+    :noindex:
+
+.. _api_fluid_BeginStepEvent:
+
+BeginStepEvent
+--------------
+
+..  autoclass:: paddle.fluid.BeginStepEvent
+    :members:
+    :noindex:
+
+.. _api_fluid_EndStepEvent:
+
+EndStepEvent
+------------
+
+..  autoclass:: paddle.fluid.EndStepEvent
+    :members:
+    :noindex:
+
+.. _api_fluid_CheckpointConfig:
+
+CheckpointConfig
+----------------
+
+..  autoclass:: paddle.fluid.CheckpointConfig
+    :members:
+    :noindex:
+
+.. _api_fluid_Inferencer:
+
+Inferencer
+----------
+
+..  autoclass:: paddle.fluid.Inferencer
+    :members:
+    :noindex:
+
+.. _api_fluid_DistributeTranspiler:
+
+DistributeTranspiler
+--------------------
+
+..  autoclass:: paddle.fluid.DistributeTranspiler
+    :members:
+    :noindex:
+
+.. _api_fluid_memory_optimize:
+
+memory_optimize
+---------------
+
+..  autofunction:: paddle.fluid.memory_optimize
+    :noindex:
+
+.. _api_fluid_release_memory:
+
+release_memory
+--------------
+
+..  autofunction:: paddle.fluid.release_memory
+    :noindex:
+
+.. _api_fluid_ParallelExecutor:
+
+ParallelExecutor
+----------------
+
+..  autoclass:: paddle.fluid.ParallelExecutor
+    :members:
+    :noindex:
+
+.. _api_fluid_ExecutionStrategy:
+
+ExecutionStrategy
+-----------------
+
+..  autoclass:: paddle.fluid.ExecutionStrategy
+    :members:
+    :noindex:
+
+.. _api_fluid_BuildStrategy:
+
+BuildStrategy
+-------------
+
+..  autoclass:: paddle.fluid.BuildStrategy
+    :members:
+    :noindex:
+
+.. _api_fluid_create_lod_tensor:
+
+create_lod_tensor
+-----------------
+
+..  autofunction:: paddle.fluid.create_lod_tensor
+    :noindex:
+
+.. _api_fluid_create_random_int_lodtensor:
+
+create_random_int_lodtensor
+---------------------------
+
+..  autofunction:: paddle.fluid.create_random_int_lodtensor
+    :noindex:
+
+.. _api_fluid_LoDTensor:
+
+LoDTensor
+---------
+
+..  autoclass:: paddle.fluid.LoDTensor
+    :members:
+    :noindex:
+
+.. _api_fluid_CPUPlace:
+
+CPUPlace
+--------
+
+..  autoclass:: paddle.fluid.CPUPlace
+    :members:
+    :noindex:
+
+.. _api_fluid_CUDAPlace:
+
+CUDAPlace
+---------
+
+..  autoclass:: paddle.fluid.CUDAPlace
+    :members:
+    :noindex:
+
+.. _api_fluid_CUDAPinnedPlace:
+
+CUDAPinnedPlace
+---------------
+
+..  autoclass:: paddle.fluid.CUDAPinnedPlace
+    :members:
+    :noindex:
+
+.. _api_fluid_Tensor:
+
+Tensor
+------
+
+..  autoclass:: paddle.fluid.Tensor
+    :members:
+    :noindex:
+
+.. _api_fluid_ParamAttr:
+
+ParamAttr
+---------
+
+..  autoclass:: paddle.fluid.ParamAttr
+    :members:
+    :noindex:
+
+.. _api_fluid_WeightNormParamAttr:
+
+WeightNormParamAttr
+-------------------
+
+..  autoclass:: paddle.fluid.WeightNormParamAttr
+    :members:
+    :noindex:
+
+.. _api_fluid_DataFeeder:
+
+DataFeeder
+----------
+
+..  autoclass:: paddle.fluid.DataFeeder
+    :members:
+    :noindex:
+
+.. _api_fluid_Scope:
+
+Scope
+-----
+
+..  autoclass:: paddle.fluid.Scope
+    :members:
+    :noindex:
+
--- a/doc/fluid/api/gen_doc.py
+++ b/doc/fluid/api/gen_doc.py
@@ -29,19 +29,27 @@ def parse_arg():


 class DocGenerator(object):
-    def __init__(self, module_name, stream=sys.stdout):
+    def __init__(self, module_name=None, stream=sys.stdout):
+        if module_name == "":
+            module_name = None
        self.stream = stream
-        self.module_name = module_name
-        if not hasattr(fluid, module_name):
-            raise ValueError("Cannot find fluid.{0}".format(module_name))
+        if module_name is None:
+            self.module_name = "fluid"
        else:
-            self.module = getattr(fluid, module_name)
+            self.module_name = "fluid." + module_name
+        if module_name is None:
+            self.module = fluid
+        else:
+            if not hasattr(fluid, module_name):
+                raise ValueError("Cannot find fluid.{0}".format(module_name))
+            else:
+                self.module = getattr(fluid, module_name)
        self.stream.write('''..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

 ''')

-        self._print_header_(module_name, dot='=', is_title=True)
+        self._print_header_(self.module_name, dot='=', is_title=True)

    def print_submodule(self, submodule_name):
        submodule = getattr(self.module, submodule_name)
@@ -60,25 +68,29 @@ class DocGenerator(object):
        self._print_header_(name, dot='=', is_title=False)

    def print_item(self, name):
-        item = getattr(self.module, name)
+        item = getattr(self.module, name, None)
+        if item is None:
+            return
        if isinstance(item, types.TypeType):
            self.print_class(name)
        elif isinstance(item, types.FunctionType):
            self.print_method(name)
        else:
-            raise RuntimeError("Unsupported item {0}".format(name))
+            pass

    def print_class(self, name):
+        self._print_ref_(name)
        self._print_header_(name, dot='-', is_title=False)
-        self.stream.write('''..  autoclass:: paddle.fluid.{0}.{1}
+        self.stream.write('''..  autoclass:: paddle.{0}.{1}
    :members:
    :noindex:

 '''.format(self.module_name, name))

    def print_method(self, name):
+        self._print_ref_(name)
        self._print_header_(name, dot='-', is_title=False)
-        self.stream.write('''..  autofunction:: paddle.fluid.{0}.{1}
+        self.stream.write('''..  autofunction:: paddle.{0}.{1}
    :noindex:

 '''.format(self.module_name, name))
@@ -94,6 +106,10 @@ class DocGenerator(object):
        self.stream.write('\n')
        self.stream.write('\n')

+    def _print_ref_(self, name):
+        self.stream.write(".. _api_{0}_{1}:\n\n".format("_".join(
+            self.module_name.split(".")), name))
+

 def main():
    args = parse_arg()

--- a/doc/fluid/api/gen_doc.sh
+++ b/doc/fluid/api/gen_doc.sh
 #!/bin/bash
-python gen_doc.py layers --submodules control_flow device io nn ops tensor detection learning_rate_scheduler metric > layers.rst
+python gen_doc.py layers --submodules control_flow device io nn ops tensor learning_rate_scheduler detection metric_op tensor > layers.rst

-for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer transpiler
+for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer transpiler recordio_writer backward average profiler
 do
  python gen_doc.py ${module} > ${module}.rst
 done
+
+python gen_doc.py "" > fluid.rst
--- a/doc/fluid/api/index_en.rst
+++ b/doc/fluid/api/index_en.rst
-======================
-Fluid
-======================
+=============
+API Reference
+=============

 ..  toctree::
    :maxdepth: 1

+    fluid.rst
    layers.rst
    data_feeder.rst
    executor.rst
@@ -18,3 +19,8 @@ Fluid
    regularizer.rst
    io.rst
    data.rst
+    transpiler.rst
+    recordio_writer.rst
+    backward.rst
+    average.rst
+    profiler.rst
--- a/doc/fluid/api/initializer.rst
+++ b/doc/fluid/api/initializer.rst
 ..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

-===========
-initializer
-===========
+=================
+fluid.initializer
+=================
+
+.. _api_fluid_initializer_Constant:

 Constant
 --------
@@ -12,6 +14,8 @@ Constant
    :members:
    :noindex:

+.. _api_fluid_initializer_Uniform:
+
 Uniform
 -------

@@ -19,6 +23,8 @@ Uniform
    :members:
    :noindex:

+.. _api_fluid_initializer_Normal:
+
 Normal
 ------

@@ -26,6 +32,8 @@ Normal
    :members:
    :noindex:

+.. _api_fluid_initializer_Xavier:
+
 Xavier
 ------

@@ -33,6 +41,8 @@ Xavier
    :members:
    :noindex:

+.. _api_fluid_initializer_Bilinear:
+
 Bilinear
 --------

@@ -40,18 +50,33 @@ Bilinear
    :members:
    :noindex:

+.. _api_fluid_initializer_MSRA:
+
+MSRA
+----
+
+..  autoclass:: paddle.fluid.initializer.MSRA
+    :members:
+    :noindex:
+
+.. _api_fluid_initializer_force_init_on_cpu:
+
 force_init_on_cpu
 -----------------

 ..  autofunction:: paddle.fluid.initializer.force_init_on_cpu
    :noindex:

+.. _api_fluid_initializer_init_on_cpu:
+
 init_on_cpu
 -----------

 ..  autofunction:: paddle.fluid.initializer.init_on_cpu
    :noindex:

+.. _api_fluid_initializer_ConstantInitializer:
+
 ConstantInitializer
 -------------------

@@ -59,6 +84,8 @@ ConstantInitializer
    :members:
    :noindex:

+.. _api_fluid_initializer_UniformInitializer:
+
 UniformInitializer
 ------------------

@@ -66,6 +93,8 @@ UniformInitializer
    :members:
    :noindex:

+.. _api_fluid_initializer_NormalInitializer:
+
 NormalInitializer
 -----------------

@@ -73,6 +102,8 @@ NormalInitializer
    :members:
    :noindex:

+.. _api_fluid_initializer_XavierInitializer:
+
 XavierInitializer
 -----------------

@@ -80,6 +111,8 @@ XavierInitializer
    :members:
    :noindex:

+.. _api_fluid_initializer_BilinearInitializer:
+
 BilinearInitializer
 -------------------

@@ -87,3 +120,12 @@ BilinearInitializer
    :members:
    :noindex:

+.. _api_fluid_initializer_MSRAInitializer:
+
+MSRAInitializer
+---------------
+
+..  autoclass:: paddle.fluid.initializer.MSRAInitializer
+    :members:
+    :noindex:
+
--- a/doc/fluid/api/io.rst
+++ b/doc/fluid/api/io.rst
 ..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

-==
-io
-==
+========
+fluid.io
+========
+
+.. _api_fluid_io_save_vars:

 save_vars
 ---------
@@ -11,84 +13,112 @@ save_vars
 ..  autofunction:: paddle.fluid.io.save_vars
    :noindex:

+.. _api_fluid_io_save_params:
+
 save_params
 -----------

 ..  autofunction:: paddle.fluid.io.save_params
    :noindex:

+.. _api_fluid_io_save_persistables:
+
 save_persistables
 -----------------

 ..  autofunction:: paddle.fluid.io.save_persistables
    :noindex:

+.. _api_fluid_io_load_vars:
+
 load_vars
 ---------

 ..  autofunction:: paddle.fluid.io.load_vars
    :noindex:

+.. _api_fluid_io_load_params:
+
 load_params
 -----------

 ..  autofunction:: paddle.fluid.io.load_params
    :noindex:

+.. _api_fluid_io_load_persistables:
+
 load_persistables
 -----------------

 ..  autofunction:: paddle.fluid.io.load_persistables
    :noindex:

+.. _api_fluid_io_save_inference_model:
+
 save_inference_model
 --------------------

 ..  autofunction:: paddle.fluid.io.save_inference_model
    :noindex:

+.. _api_fluid_io_load_inference_model:
+
 load_inference_model
 --------------------

 ..  autofunction:: paddle.fluid.io.load_inference_model
    :noindex:

+.. _api_fluid_io_get_inference_program:
+
 get_inference_program
 ---------------------

 ..  autofunction:: paddle.fluid.io.get_inference_program
    :noindex:

+.. _api_fluid_io_save_checkpoint:
+
 save_checkpoint
 ---------------

 ..  autofunction:: paddle.fluid.io.save_checkpoint
    :noindex:

+.. _api_fluid_io_load_checkpoint:
+
 load_checkpoint
 ---------------

 ..  autofunction:: paddle.fluid.io.load_checkpoint
    :noindex:

+.. _api_fluid_io_clean_checkpoint:
+
 clean_checkpoint
 ----------------

 ..  autofunction:: paddle.fluid.io.clean_checkpoint
    :noindex:

+.. _api_fluid_io_load_persist_vars_without_grad:
+
 load_persist_vars_without_grad
 ------------------------------

 ..  autofunction:: paddle.fluid.io.load_persist_vars_without_grad
    :noindex:

+.. _api_fluid_io_save_persist_vars_without_grad:
+
 save_persist_vars_without_grad
 ------------------------------

 ..  autofunction:: paddle.fluid.io.save_persist_vars_without_grad
    :noindex:

+.. _api_fluid_io_get_latest_checkpoint_serial:
+
 get_latest_checkpoint_serial
 ----------------------------


--- a/doc/fluid/api/layers.rst
+++ b/doc/fluid/api/layers.rst
--- a/doc/fluid/api/metrics.rst
+++ b/doc/fluid/api/metrics.rst
 ..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

-=======
-metrics
-=======
+=============
+fluid.metrics
+=============
+
+.. _api_fluid_metrics_MetricBase:

 MetricBase
 ----------
@@ -12,6 +14,8 @@ MetricBase
    :members:
    :noindex:

+.. _api_fluid_metrics_CompositeMetric:
+
 CompositeMetric
 ---------------

@@ -19,6 +23,26 @@ CompositeMetric
    :members:
    :noindex:

+.. _api_fluid_metrics_Precision:
+
+Precision
+---------
+
+..  autoclass:: paddle.fluid.metrics.Precision
+    :members:
+    :noindex:
+
+.. _api_fluid_metrics_Recall:
+
+Recall
+------
+
+..  autoclass:: paddle.fluid.metrics.Recall
+    :members:
+    :noindex:
+
+.. _api_fluid_metrics_Accuracy:
+
 Accuracy
 --------

@@ -26,6 +50,8 @@ Accuracy
    :members:
    :noindex:

+.. _api_fluid_metrics_ChunkEvaluator:
+
 ChunkEvaluator
 --------------

@@ -33,6 +59,8 @@ ChunkEvaluator
    :members:
    :noindex:

+.. _api_fluid_metrics_EditDistance:
+
 EditDistance
 ------------

@@ -40,6 +68,8 @@ EditDistance
    :members:
    :noindex:

+.. _api_fluid_metrics_DetectionMAP:
+
 DetectionMAP
 ------------

@@ -47,6 +77,8 @@ DetectionMAP
    :members:
    :noindex:

+.. _api_fluid_metrics_Auc:
+
 Auc
 ---


--- a/doc/fluid/api/nets.rst
+++ b/doc/fluid/api/nets.rst
 ..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

-====
-nets
-====
+==========
+fluid.nets
+==========
+
+.. _api_fluid_nets_simple_img_conv_pool:

 simple_img_conv_pool
 --------------------
@@ -11,18 +13,24 @@ simple_img_conv_pool
 ..  autofunction:: paddle.fluid.nets.simple_img_conv_pool
    :noindex:

+.. _api_fluid_nets_sequence_conv_pool:
+
 sequence_conv_pool
 ------------------

 ..  autofunction:: paddle.fluid.nets.sequence_conv_pool
    :noindex:

+.. _api_fluid_nets_glu:
+
 glu
 ---

 ..  autofunction:: paddle.fluid.nets.glu
    :noindex:

+.. _api_fluid_nets_scaled_dot_product_attention:
+
 scaled_dot_product_attention
 ----------------------------


--- a/doc/fluid/api/optimizer.rst
+++ b/doc/fluid/api/optimizer.rst
 ..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

-=========
-optimizer
-=========
+===============
+fluid.optimizer
+===============
+
+.. _api_fluid_optimizer_SGD:

 SGD
 ---
@@ -12,6 +14,8 @@ SGD
    :members:
    :noindex:

+.. _api_fluid_optimizer_Momentum:
+
 Momentum
 --------

@@ -19,6 +23,8 @@ Momentum
    :members:
    :noindex:

+.. _api_fluid_optimizer_Adagrad:
+
 Adagrad
 -------

@@ -26,6 +32,8 @@ Adagrad
    :members:
    :noindex:

+.. _api_fluid_optimizer_Adam:
+
 Adam
 ----

@@ -33,6 +41,8 @@ Adam
    :members:
    :noindex:

+.. _api_fluid_optimizer_Adamax:
+
 Adamax
 ------

@@ -40,6 +50,8 @@ Adamax
    :members:
    :noindex:

+.. _api_fluid_optimizer_DecayedAdagrad:
+
 DecayedAdagrad
 --------------

@@ -47,6 +59,17 @@ DecayedAdagrad
    :members:
    :noindex:

+.. _api_fluid_optimizer_Ftrl:
+
+Ftrl
+----
+
+..  autoclass:: paddle.fluid.optimizer.Ftrl
+    :members:
+    :noindex:
+
+.. _api_fluid_optimizer_SGDOptimizer:
+
 SGDOptimizer
 ------------

@@ -54,6 +77,8 @@ SGDOptimizer
    :members:
    :noindex:

+.. _api_fluid_optimizer_MomentumOptimizer:
+
 MomentumOptimizer
 -----------------

@@ -61,6 +86,8 @@ MomentumOptimizer
    :members:
    :noindex:

+.. _api_fluid_optimizer_AdagradOptimizer:
+
 AdagradOptimizer
 ----------------

@@ -68,6 +95,8 @@ AdagradOptimizer
    :members:
    :noindex:

+.. _api_fluid_optimizer_AdamOptimizer:
+
 AdamOptimizer
 -------------

@@ -75,6 +104,8 @@ AdamOptimizer
    :members:
    :noindex:

+.. _api_fluid_optimizer_AdamaxOptimizer:
+
 AdamaxOptimizer
 ---------------

@@ -82,6 +113,8 @@ AdamaxOptimizer
    :members:
    :noindex:

+.. _api_fluid_optimizer_DecayedAdagradOptimizer:
+
 DecayedAdagradOptimizer
 -----------------------

@@ -89,6 +122,8 @@ DecayedAdagradOptimizer
    :members:
    :noindex:

+.. _api_fluid_optimizer_RMSPropOptimizer:
+
 RMSPropOptimizer
 ----------------

@@ -96,6 +131,17 @@ RMSPropOptimizer
    :members:
    :noindex:

+.. _api_fluid_optimizer_FtrlOptimizer:
+
+FtrlOptimizer
+-------------
+
+..  autoclass:: paddle.fluid.optimizer.FtrlOptimizer
+    :members:
+    :noindex:
+
+.. _api_fluid_optimizer_Adadelta:
+
 Adadelta
 --------

@@ -103,6 +149,8 @@ Adadelta
    :members:
    :noindex:

+.. _api_fluid_optimizer_ModelAverage:
+
 ModelAverage
 ------------

@@ -110,6 +158,8 @@ ModelAverage
    :members:
    :noindex:

+.. _api_fluid_optimizer_Optimizer:
+
 Optimizer
 ---------

@@ -117,3 +167,12 @@ Optimizer
    :members:
    :noindex:

+.. _api_fluid_optimizer_RMSPropOptimizer:
+
+RMSPropOptimizer
+----------------
+
+..  autoclass:: paddle.fluid.optimizer.RMSPropOptimizer
+    :members:
+    :noindex:
+
--- a/doc/fluid/api/param_attr.rst
+++ b/doc/fluid/api/param_attr.rst
 ..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

-==========
-param_attr
-==========
+================
+fluid.param_attr
+================
+
+.. _api_fluid_param_attr_ParamAttr:

 ParamAttr
 ---------
@@ -12,6 +14,8 @@ ParamAttr
    :members:
    :noindex:

+.. _api_fluid_param_attr_WeightNormParamAttr:
+
 WeightNormParamAttr
 -------------------


--- a/doc/fluid/api/profiler.rst
+++ b/doc/fluid/api/profiler.rst
 ..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

-========
-profiler
-========
+==============
+fluid.profiler
+==============
+
+.. _api_fluid_profiler_cuda_profiler:

 cuda_profiler
 -------------
@@ -11,24 +13,32 @@ cuda_profiler
 ..  autofunction:: paddle.fluid.profiler.cuda_profiler
    :noindex:

+.. _api_fluid_profiler_reset_profiler:
+
 reset_profiler
 --------------

 ..  autofunction:: paddle.fluid.profiler.reset_profiler
    :noindex:

+.. _api_fluid_profiler_profiler:
+
 profiler
 --------

 ..  autofunction:: paddle.fluid.profiler.profiler
    :noindex:

+.. _api_fluid_profiler_start_profiler:
+
 start_profiler
 --------------

 ..  autofunction:: paddle.fluid.profiler.start_profiler
    :noindex:

+.. _api_fluid_profiler_stop_profiler:
+
 stop_profiler
 -------------


--- a/doc/fluid/api/recordio_writer.rst
+++ b/doc/fluid/api/recordio_writer.rst
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+=====================
+fluid.recordio_writer
+=====================
+
+.. _api_fluid_recordio_writer_convert_reader_to_recordio_file:
+
+convert_reader_to_recordio_file
+-------------------------------
+
+..  autofunction:: paddle.fluid.recordio_writer.convert_reader_to_recordio_file
+    :noindex:
+
+.. _api_fluid_recordio_writer_convert_reader_to_recordio_files:
+
+convert_reader_to_recordio_files
+--------------------------------
+
+..  autofunction:: paddle.fluid.recordio_writer.convert_reader_to_recordio_files
+    :noindex:
+
--- a/doc/fluid/api/regularizer.rst
+++ b/doc/fluid/api/regularizer.rst
 ..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

-===========
-regularizer
-===========
+=================
+fluid.regularizer
+=================
+
+.. _api_fluid_regularizer_append_regularization_ops:

 append_regularization_ops
 -------------------------
@@ -11,12 +13,7 @@ append_regularization_ops
 ..  autofunction:: paddle.fluid.regularizer.append_regularization_ops
    :noindex:

-WeightDecayRegularizer
----------------------
-
-..  autoclass:: paddle.fluid.regularizer.WeightDecayRegularizer
-    :members:
-    :noindex:
+.. _api_fluid_regularizer_L1Decay:

 L1Decay
 -------
@@ -25,6 +22,8 @@ L1Decay
    :members:
    :noindex:

+.. _api_fluid_regularizer_L2Decay:
+
 L2Decay
 -------

@@ -32,6 +31,8 @@ L2Decay
    :members:
    :noindex:

+.. _api_fluid_regularizer_L1DecayRegularizer:
+
 L1DecayRegularizer
 ------------------

@@ -39,6 +40,8 @@ L1DecayRegularizer
    :members:
    :noindex:

+.. _api_fluid_regularizer_L2DecayRegularizer:
+
 L2DecayRegularizer
 ------------------


--- a/doc/fluid/api/transpiler.rst
+++ b/doc/fluid/api/transpiler.rst
 ..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

-==========
-transpiler
-==========
+================
+fluid.transpiler
+================
+
+.. _api_fluid_transpiler_DistributeTranspiler:

 DistributeTranspiler
 --------------------
@@ -12,12 +14,7 @@ DistributeTranspiler
    :members:
    :noindex:

-InferenceTranspiler
-------------------
-
-..  autoclass:: paddle.fluid.transpiler.InferenceTranspiler
-    :members:
-    :noindex:
+.. _api_fluid_transpiler_memory_optimize:

 memory_optimize
 ---------------
@@ -25,12 +22,16 @@ memory_optimize
 ..  autofunction:: paddle.fluid.transpiler.memory_optimize
    :noindex:

+.. _api_fluid_transpiler_release_memory:
+
 release_memory
 --------------

 ..  autofunction:: paddle.fluid.transpiler.release_memory
    :noindex:

+.. _api_fluid_transpiler_HashName:
+
 HashName
 --------

@@ -38,9 +39,12 @@ HashName
    :members:
    :noindex:

+.. _api_fluid_transpiler_RoundRobin:
+
 RoundRobin
 ----------

 ..  autoclass:: paddle.fluid.transpiler.RoundRobin
    :members:
    :noindex:
+
--- a/doc/v2/faq/build_and_install/index_cn.rst
+++ b/doc/v2/faq/build_and_install/index_cn.rst
@@ -213,3 +213,12 @@ virtualenv本身也是Python的一个包，可以用pip进行安装：
 保存并关闭文件。

 这样，每次打开终端时就会自动启动名为‘paddle’的Python环境了。
+
+10. 通过pip安装的PaddlePaddle在  :code:`import paddle.fluid` 报找不到 :code:`libmkldnn.so` 或 :code:`libmklml_intel.so`
+------------------------------------------------------------------------------------------
+出现这种问题的原因是在导入 :code:`paddle.fluid` 时需要加载 :code:`libmkldnn.so` 和 :code:`libmklml_intel.so`，
+但是系统没有找到该文件。一般通过pip安装PaddlePaddle时会将 :code:`libmkldnn.so` 和 :code:`libmklml_intel.so`
+拷贝到 :code:`/usr/local/lib` 路径下，所以解决办法是将该路径加到 :code:`LD_LIBRARY_PATH` 环境变量下，
+即： :code:`export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH` 。
+
+**注意**：如果是在虚拟环境中安装PaddlePaddle， :code:`libmkldnn.so` 和 :code:`libmklml_intel.so` 可能不在 :code:`/usr/local/lib` 路径下。
\ No newline at end of file
--- a/paddle/fluid/framework/data_layout_transform.cc
+++ b/paddle/fluid/framework/data_layout_transform.cc
@@ -147,10 +147,9 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
                 "Input tensor type is not supported: ", in.type().name());
  memory::data_type out_type = in_type;

-  memory::format in_format =
-      in_tz.size() == 2 ? memory::format::nc : in.format();
-  memory::format out_format =
-      out_tz.size() == 2 ? memory::format::nc : ToMKLDNNFormat(out_layout);
+  auto in_format = MKLDNNFormatForSize(in_tz.size(), in.format());
+  auto out_format =
+      MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout));

  void* in_data = GetDataFromTensor(in, in_type);


--- a/paddle/fluid/framework/data_layout_transform.h
+++ b/paddle/fluid/framework/data_layout_transform.h
@@ -61,6 +61,13 @@ inline MKLDNNDataType ToMKLDNNDataType(const std::type_index type) {
  if (iter != dict.end()) return iter->second;
  return MKLDNNDataType::data_undef;
 }
+
+inline MKLDNNFormat MKLDNNFormatForSize(size_t dims_size,
+                                        MKLDNNFormat default_format) {
+  return (dims_size == 1
+              ? mkldnn::memory::format::x
+              : dims_size == 2 ? mkldnn::memory::format::nc : default_format);
+}
 #endif

 void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,

--- a/paddle/fluid/framework/data_transform.cc
+++ b/paddle/fluid/framework/data_transform.cc
@@ -47,9 +47,13 @@ void DataTransform(const OpKernelType& expected_kernel_type,
 #ifdef PADDLE_WITH_MKLDNN
        // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel
        // Just set layout/format. No real transform occur
+
+        auto out_format =
+            MKLDNNFormatForSize(in.dims().size(), ToMKLDNNFormat(lin));
+
        out.ShareDataWith(input_tensor);
        out.set_layout(DataLayout::kMKLDNN);
-        out.set_format(ToMKLDNNFormat(lin));
+        out.set_format(out_format);
 #endif
      } else {
        // Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel

--- a/paddle/fluid/framework/details/broadcast_op_handle.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle.cc
@@ -103,50 +103,23 @@ void BroadcastOpHandle::RunImpl() {
          });
    }

-    // FIXME(zcd): a temporary fix for some language model that has sparse
-    // parameter.
-    bool use_mutex = true;
-    if (in_var->IsType<paddle::framework::SelectedRows>()) {
-      use_mutex = false;
-    }
-    if (use_mutex) {
-      this->RunAndRecordEvent([&] {
-        {
-          platform::NCCLGroupGuard guard;
-          for (auto &call : broadcast_calls) {
-            call();
-          }
-        }
-
-        if (!out_handle->IsTheSameVar(*in_var_handle)) {
-          auto out_var = var_scopes.at(in_var_handle->scope_idx_)
-                             ->FindVar(out_var_handles[0]->name_);
-          paddle::framework::TensorCopy(
-              in_tensor, in_var_handle->place_,
-              *(dev_ctxes_.at(in_var_handle->place_)),
-              &VariableVisitor::GetMutableTensor(out_var));
-        }
-      });
-    } else {
-      this->RunAndRecordEventNoMutex([&] {
-        {
-          platform::NCCLGroupGuard guard;
-          for (auto &call : broadcast_calls) {
-            call();
-          }
-        }
-
-        if (!out_handle->IsTheSameVar(*in_var_handle)) {
-          auto out_var = var_scopes.at(in_var_handle->scope_idx_)
-                             ->FindVar(out_var_handles[0]->name_);
-          paddle::framework::TensorCopy(
-              in_tensor, in_var_handle->place_,
-              *(dev_ctxes_.at(in_var_handle->place_)),
-              &VariableVisitor::GetMutableTensor(out_var));
+    this->RunAndRecordEvent([&] {
+      {
+        platform::NCCLGroupGuard guard;
+        for (auto &call : broadcast_calls) {
+          call();
        }
-      });
-    }
+      }

+      if (!out_handle->IsTheSameVar(*in_var_handle)) {
+        auto out_var = var_scopes.at(in_var_handle->scope_idx_)
+                           ->FindVar(out_var_handles[0]->name_);
+        paddle::framework::TensorCopy(
+            in_tensor, in_var_handle->place_,
+            *(dev_ctxes_.at(in_var_handle->place_)),
+            &VariableVisitor::GetMutableTensor(out_var));
+      }
+    });
 #else
    PADDLE_THROW("CUDA is not enabled.");
 #endif

--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -470,7 +470,7 @@ void MultiDevSSAGraphBuilder::ConnectOp(SSAGraph *result, OpHandleBase *op,
 void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result,
                                                const OpDesc &op) const {
  int op_dev_id = -1;
-  if (op.Type() == "split_byref") {
+  if (op.Type() == "split_byref" || op.Type() == "split_selected_rows") {
    op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]);
    if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
      op_dev_id = GetAppropriateDeviceID(op.InputArgumentNames());

--- a/paddle/fluid/framework/details/multi_devices_graph_builder.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h
@@ -47,7 +47,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
 #endif

  std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const override;
-  int GetVarDeviceID(const std::string &varname) const;
+  int GetVarDeviceID(const std::string &varname) const override;

 private:
  void CreateOpHandleIOs(SSAGraph *result, const OpDesc &op,

--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -11,8 +11,8 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-
 #include "paddle/fluid/framework/details/op_handle_base.h"
+#include <map>

 namespace paddle {
 namespace framework {
@@ -122,35 +122,17 @@ void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) {
 #ifdef PADDLE_WITH_CUDA
  if (!events_.empty()) {  // Use event
    std::function<void()> method = callback;
-
+    // NOTE(zcd): device context must be ordered here because RecordEvent
+    // will use a mutex to ensure the safe of multi-threads.
+    std::map<platform::DeviceContext *, platform::Place> ordered_ctxes;
    for (auto &p : dev_ctxes_) {
-      method = [method, p, this]() {
-        static_cast<platform::CUDADeviceContext *>(p.second)->RecordEvent(
-            events_.at(boost::get<platform::CUDAPlace>(p.first).device),
-            method);
-      };
+      ordered_ctxes.emplace(p.second, p.first);
    }
-    method();
-  } else {
-#endif
-    callback();
-#ifdef PADDLE_WITH_CUDA
-  }
-#endif
-}
-
-void OpHandleBase::RunAndRecordEventNoMutex(
-    const std::function<void()> &callback) {
-#ifdef PADDLE_WITH_CUDA
-  if (!events_.empty()) {  // Use event
-    std::function<void()> method = callback;
-
-    for (auto &p : dev_ctxes_) {
+    for (auto &p : ordered_ctxes) {
      method = [method, p, this]() {
-        static_cast<platform::CUDADeviceContext *>(p.second)
-            ->RecordEventNoMutex(
-                events_.at(boost::get<platform::CUDAPlace>(p.first).device),
-                method);
+        static_cast<platform::CUDADeviceContext *>(p.first)->RecordEvent(
+            events_.at(boost::get<platform::CUDAPlace>(p.second).device),
+            method);
      };
    }
    method();

--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -85,10 +85,6 @@ class OpHandleBase {
 protected:
  void RunAndRecordEvent(const std::function<void()> &callback);

-  // FIXME(zcd): A temporary fix for some language model that has sparse
-  // parameter.
-  void RunAndRecordEventNoMutex(const std::function<void()> &callback);
-
  void RunAndRecordEvent(platform::Place p,
                         const std::function<void()> &callback);


--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@@ -80,9 +80,7 @@ void ReduceOpHandle::RunImpl() {
  }

  if (pre_in_var->IsType<framework::SelectedRows>()) {
-    // FIXME(zcd): A temporary fix for some language model that has sparse
-    // parameter.
-    this->RunAndRecordEventNoMutex([&] {
+    this->RunAndRecordEvent([&] {
      std::vector<const SelectedRows *> in_selected_rows =
          GetInputValues<SelectedRows>(in_var_handles, var_scopes);
      GatherSelectedRows(in_selected_rows, in_places, dev_ctxes_, t_out_p,

--- a/paddle/fluid/framework/framework.proto
+++ b/paddle/fluid/framework/framework.proto
@@ -27,6 +27,7 @@ enum AttrType {
  BOOLEANS = 7;
  BLOCK = 8;
  LONG = 9;
+  BLOCKS = 10;
 }

 // OpDesc describes an instance of a C++ framework::OperatorBase
@@ -46,6 +47,7 @@ message OpDesc {
    repeated bool bools = 11;
    optional int32 block_idx = 12;
    optional int64 l = 13;
+    repeated int32 blocks_idx = 14;
  };

  message Var {

--- a/paddle/fluid/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
@@ -51,8 +51,6 @@ std::ostream &operator<<(std::ostream &os, const LoD &lod) {
 }

 std::ostream &operator<<(std::ostream &os, const LoDTensor &t) {
-  PADDLE_ENFORCE(t.type().hash_code() == typeid(float).hash_code());
-
  if (!platform::is_cpu_place(t.place())) {
    LoDTensor tt;
    framework::TensorCopy(t, platform::CPUPlace(), &tt);
@@ -70,7 +68,13 @@ std::ostream &operator<<(std::ostream &os, const LoDTensor &t) {
  // only print first ten elements
  int64_t size = t.numel() < 10 ? t.numel() : 10;
  for (int64_t i = 0; i < size; ++i) {
-    os << t.data<float>()[i] << " ";
+    if (t.type().hash_code() == typeid(float).hash_code()) {
+      os << t.data<float>()[i] << " ";
+    } else if (t.type().hash_code() == typeid(int64_t).hash_code()) {
+      os << t.data<int64_t>()[i] << " ";
+    } else {
+      PADDLE_THROW("LoDTensor data type not in [float, int64_t]");
+    }
  }

  return os;

--- a/paddle/fluid/framework/lod_tensor_test.cc
+++ b/paddle/fluid/framework/lod_tensor_test.cc
@@ -26,6 +26,20 @@
 namespace paddle {
 namespace framework {

+TEST(LoD, PrintLoDTensor) {
+  LoDTensor tensor1;
+  tensor1.mutable_data<float>(platform::CPUPlace());
+  tensor1.data<float>()[0] = 0.2;
+  tensor1.data<float>()[1] = 0.5;
+  LOG(INFO) << tensor1;
+
+  LoDTensor tensor2;
+  tensor2.mutable_data<int64_t>(platform::CPUPlace());
+  tensor2.data<int64_t>()[0] = 1;
+  tensor2.data<int64_t>()[1] = 2;
+  LOG(INFO) << tensor2;
+}
+
 TEST(LoD, data) {
  LoD lod{{0, 1, 2}};
  lod.push_back({0, 2, 4, 5});
@@ -37,7 +51,7 @@ TEST(LoD, data) {
  }
 }

-TEST(LodExpand, test) {
+TEST(LoD, ExpandLoD) {
  LoD lod{{0, 2}};
  LoDTensor tensor;
  tensor.set_lod(lod);

--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -211,6 +211,12 @@ void OpDesc::SetBlockAttr(const std::string &name, BlockDesc *block) {
  need_update_ = true;
 }

+void OpDesc::SetBlocksAttr(const std::string &name,
+                           std::vector<BlockDesc *> blocks) {
+  this->attrs_[name] = blocks;
+  need_update_ = true;
+}
+
 void OpDesc::SetAttrMap(
    const std::unordered_map<std::string, Attribute> &attr_map) {
  attrs_ = attr_map;
@@ -305,6 +311,13 @@ struct SetAttrDescVisitor : public boost::static_visitor<void> {
  void operator()(const std::vector<bool> &v) const {
    VectorToRepeated(v, attr_->mutable_bools());
  }
+  void operator()(const std::vector<BlockDesc *> &v) const {
+    std::vector<int> blocks_idx;
+    for (auto blk : v) {
+      blocks_idx.push_back(blk->ID());
+    }
+    VectorToRepeated(blocks_idx, attr_->mutable_blocks_idx());
+  }
  void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->ID()); }
  void operator()(int64_t v) const { attr_->set_l(v); }
  void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); }

--- a/paddle/fluid/framework/op_desc.h
+++ b/paddle/fluid/framework/op_desc.h
@@ -77,6 +77,8 @@ class OpDesc {

  void SetBlockAttr(const std::string &name, BlockDesc *block);

+  void SetBlocksAttr(const std::string &name, std::vector<BlockDesc *> blocks);
+
  Attribute GetAttr(const std::string &name) const;

  Attribute GetNullableAttr(const std::string &name) const;

--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -121,7 +121,7 @@ ParallelExecutor::ParallelExecutor(
 #endif
  }

-  builder_ = std::move(builder_factory.Create());
+  builder_ = builder_factory.Create();
  member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
      exec_strategy, member_->local_scopes_, places,
      builder_->Build(main_program)));

--- a/paddle/fluid/framework/type_defs.h
+++ b/paddle/fluid/framework/type_defs.h
@@ -35,7 +35,8 @@ using VariableNameMap = std::map<std::string, std::vector<std::string>>;
 using Attribute =
    boost::variant<boost::blank, int, float, std::string, std::vector<int>,
                   std::vector<float>, std::vector<std::string>, bool,
-                   std::vector<bool>, BlockDesc*, int64_t>;
+                   std::vector<bool>, BlockDesc*, int64_t,
+                   std::vector<BlockDesc*>>;

 using AttributeMap = std::unordered_map<std::string, Attribute>;


--- a/paddle/fluid/operators/assign_value_op.cc
+++ b/paddle/fluid/operators/assign_value_op.cc
@@ -70,6 +70,7 @@ $$Out = values$$

 namespace ops = paddle::operators;

-REGISTER_OPERATOR(assign_value, ops::AssignValueOp, ops::AssignValueOpMaker);
+REGISTER_OPERATOR(assign_value, ops::AssignValueOp, ops::AssignValueOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
 REGISTER_OP_CPU_KERNEL(assign_value, ops::AssignValueKernel<int>,
                       ops::AssignValueKernel<float>);
--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
@@ -18,6 +18,7 @@ limitations under the License. */

 #include <limits>

+#include "glog/logging.h"  // For VLOG
 #include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/operators/distributed/request_handler.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -75,6 +76,9 @@ bool GRPCClient::AsyncSendVar(const std::string& ep,
    var_h.scope = p_scope;
    var_h.name = var_name_val;
    var_h.ctx = p_ctx;
+    var_h.method = "Send";
+
+    VLOG(3) << var_h.String() << " begin";

    // stub context
    SendProcessor* s = new SendProcessor(ch);
@@ -129,6 +133,9 @@ bool GRPCClient::AsyncGetVar(const std::string& ep,
    var_h.scope = p_scope;
    var_h.name = var_name_val;
    var_h.ctx = p_ctx;
+    var_h.method = "Get";
+
+    VLOG(3) << var_h.String() << " begin";

    // stub context
    GetProcessor* s = new GetProcessor(ch);
@@ -172,6 +179,9 @@ bool GRPCClient::AsyncPrefetchVar(const std::string& ep,
    var_h.scope = p_scope;
    var_h.name = out_var_name_val;
    var_h.ctx = p_ctx;
+    var_h.method = "Prefetch";
+
+    VLOG(3) << var_h.String() << " begin";

    // stub context
    GetProcessor* s = new GetProcessor(ch);
@@ -243,10 +253,11 @@ void GRPCClient::Proceed() {
    GPR_ASSERT(ok);
    PADDLE_ENFORCE(c);
    if (c->status_.ok()) {
+      VLOG(3) << c->var_h_.String() << " process";
      c->Process();
    } else {
-      LOG(FATAL) << "var: " << c->var_h_.String()
-                 << " grpc error:" << c->status_.error_message();
+      LOG(FATAL) << c->var_h_.String()
+                 << " meets grpc error:" << c->status_.error_message();
    }
    delete c;
    {
@@ -258,14 +269,15 @@ void GRPCClient::Proceed() {
 }

 std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) {
-  // TODO(Yancey1989): make grpc client completely thread-safe
  std::lock_guard<std::mutex> guard(chan_mutex_);
  auto it = channels_.find(ep);
  if (it != channels_.end()) {
    return it->second;
  }

+  // Channel configurations:
  grpc::ChannelArguments args;
+  args.SetInt(GRPC_ARG_MAX_RECONNECT_BACKOFF_MS, 2000);
  args.SetCompressionAlgorithm(GRPC_COMPRESS_NONE);
  args.SetMaxSendMessageSize(std::numeric_limits<int>::max());
  args.SetMaxReceiveMessageSize(std::numeric_limits<int>::max());

--- a/paddle/fluid/operators/distributed/grpc_client.h
+++ b/paddle/fluid/operators/distributed/grpc_client.h
@@ -47,14 +47,18 @@ namespace operators {
 namespace distributed {

 struct VarHandle {
+  // RPC endpoint.
  std::string ep;
  const platform::DeviceContext* ctx;
  const framework::Scope* scope;
+  // Variable name.
  std::string name;
+  // RPC method name.
+  std::string method;

  std::string String() const {
    std::ostringstream s;
-    s << "name:[" << name << "] ep:[" << ep << "]";
+    s << method << " name:[" << name << "], ep:[" << ep << "]";
    return s.str();
  }
 };
@@ -72,6 +76,7 @@ class BaseProcessor {
  virtual void Prepare(const VarHandle& var_info, int64_t time_out) {
    context_.reset(new grpc::ClientContext());
    var_h_ = var_info;
+    context_->set_wait_for_ready(true);

    std::chrono::system_clock::time_point deadline =
        std::chrono::system_clock::now() + std::chrono::milliseconds(time_out);
@@ -81,6 +86,7 @@ class BaseProcessor {

  virtual void Prepare(int64_t time_out) {
    context_.reset(new grpc::ClientContext());
+    context_->set_wait_for_ready(true);

    std::chrono::system_clock::time_point deadline =
        std::chrono::system_clock::now() + std::chrono::milliseconds(time_out);
@@ -172,26 +178,24 @@ class GRPCClient : public RPCClient {

  bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx,
                    const framework::Scope& scope, const std::string& var_name,
-                    int64_t time_out = RPCClient::rpc_time_out) override;
+                    int64_t time_out = FLAGS_grpc_deadline) override;

  bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx,
                   const framework::Scope& scope, const std::string& var_name,
-                   int64_t time_out = RPCClient::rpc_time_out) override;
+                   int64_t time_out = FLAGS_grpc_deadline) override;

  bool AsyncPrefetchVar(const std::string& ep,
                        const platform::DeviceContext& ctx,
                        const framework::Scope& scope,
                        const std::string& in_var_name,
                        const std::string& out_var_name,
-                        int64_t time_out = RPCClient::rpc_time_out) override;
+                        int64_t time_out = FLAGS_grpc_deadline) override;

-  void AsyncSendBatchBarrier(
-      const std::string& ep,
-      int64_t time_out = RPCClient::rpc_time_out) override;
+  void AsyncSendBatchBarrier(const std::string& ep,
+                             int64_t time_out = FLAGS_grpc_deadline) override;

-  void AsyncSendFetchBarrier(
-      const std::string& ep,
-      int64_t time_out = RPCClient::rpc_time_out) override;
+  void AsyncSendFetchBarrier(const std::string& ep,
+                             int64_t time_out = FLAGS_grpc_deadline) override;

  void Wait() override;

@@ -207,7 +211,7 @@ class GRPCClient : public RPCClient {
  void Proceed();

  void AsyncSendComplete(const std::string& ep,
-                         int64_t time_out = RPCClient::rpc_time_out);
+                         int64_t time_out = FLAGS_grpc_deadline);

  std::shared_ptr<grpc::Channel> GetChannel(const std::string& ep);


--- a/paddle/fluid/operators/distributed/grpc_server.cc
+++ b/paddle/fluid/operators/distributed/grpc_server.cc
@@ -41,6 +41,19 @@ class RequestBase {
  virtual ~RequestBase() {}
  virtual void Process() = 0;

+  std::string Status2String(const std::string& method) {
+    std::string status = "Process";
+    if (status_ == FINISH) {
+      status = "Finish";
+    }
+
+    std::ostringstream s;
+    s << method << " name:[" << GetReqName() << "]"
+      << ", ep:[" << ctx_.peer() << "]"
+      << " " << status << " using req_id:" << req_id_;
+    return s.str();
+  }
+
  CallStatus Status() const {
    std::lock_guard<std::mutex> l(status_mu_);
    return status_;
@@ -84,7 +97,7 @@ class RequestSend final : public RequestBase {

  void Process() override {
    std::string varname = GetReqName();
-    VLOG(3) << "RequestSend var_name:" << varname;
+    VLOG(4) << "RequestSend var_name:" << varname;

    auto scope = request_->GetMutableLocalScope();
    auto invar = request_->GetVar();
@@ -119,7 +132,7 @@ class RequestGet final : public RequestBase {
  void Process() override {
    // proc request.
    std::string varname = request_.varname();
-    VLOG(3) << "RequestGet " << varname;
+    VLOG(4) << "RequestGet " << varname;

    auto scope = request_handler_->scope();
    auto invar = scope->FindVar(varname);
@@ -165,7 +178,7 @@ class RequestPrefetch final : public RequestBase {
    // prefetch process...
    std::string in_var_name = request_->Varname();
    std::string out_var_name = request_->OutVarname();
-    VLOG(3) << "RequestPrefetch, in_var_name: " << in_var_name
+    VLOG(4) << "RequestPrefetch, in_var_name: " << in_var_name
            << " out_var_name: " << out_var_name;

    auto scope = request_->GetMutableLocalScope();
@@ -188,10 +201,10 @@ class RequestPrefetch final : public RequestBase {
 };

 void AsyncGRPCServer::WaitServerReady() {
-  VLOG(3) << "AsyncGRPCServer is wait server ready";
+  VLOG(4) << "AsyncGRPCServer is wait server ready";
  std::unique_lock<std::mutex> lock(this->mutex_ready_);
  condition_ready_.wait(lock, [=] { return this->ready_ == 1; });
-  VLOG(3) << "AsyncGRPCServer WaitSeverReady";
+  VLOG(4) << "AsyncGRPCServer WaitSeverReady";
 }

 void AsyncGRPCServer::StartServer() {
@@ -230,7 +243,7 @@ void AsyncGRPCServer::StartServer() {
    for (int i = 0; i < threadnum; i++) {
      rpc_threads_[rpc_name].emplace_back(new std::thread(std::bind(
          &AsyncGRPCServer::HandleRequest, this, cq.get(), rpc_name, f)));
-      VLOG(3) << t.first << " creates threads!";
+      VLOG(4) << t.first << " creates threads!";
    }
  }

@@ -247,7 +260,7 @@ void AsyncGRPCServer::StartServer() {
    auto& threads = t.second;
    for (size_t i = 0; i < threads.size(); ++i) {
      threads[i]->join();
-      VLOG(3) << t.first << " threads ends!";
+      VLOG(4) << t.first << " threads ends!";
    }
  }
 }
@@ -255,7 +268,7 @@ void AsyncGRPCServer::StartServer() {
 void AsyncGRPCServer::ShutdownQueue() {
  for (auto& t : rpc_cq_) {
    t.second->Shutdown();
-    VLOG(3) << t.first << " shutdown!";
+    VLOG(4) << t.first << " queue shutdown!";
  }
 }

@@ -264,7 +277,7 @@ void AsyncGRPCServer::ShutDownImpl() {
  is_shut_down_ = true;
  ShutdownQueue();

-  VLOG(3) << "server_ shutdown!";
+  VLOG(4) << "server_ shutdown!";
  server_->Shutdown();
 }

@@ -272,7 +285,7 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name,
                                          int req_id) {
  std::unique_lock<std::mutex> lock(cq_mutex_);
  if (is_shut_down_) {
-    VLOG(3) << "shutdown, do not TryToRegisterNewSendOne";
+    VLOG(4) << "shutdown, do not TryToRegisterNewSendOne";
    return;
  }

@@ -306,14 +319,14 @@ void AsyncGRPCServer::HandleRequest(
  bool ok = false;

  while (true) {
-    VLOG(3) << "HandleRequest " << rpc_name << " wait next";
+    VLOG(4) << "HandleRequest " << rpc_name << " wait next";
    if (!cq->Next(&tag, &ok)) {
      LOG(INFO) << "CompletionQueue " << rpc_name << " shutdown!";
      break;
    }

    int req_id = static_cast<int>(reinterpret_cast<intptr_t>(tag));
-    VLOG(3) << "HandleRequest " << rpc_name << ", req_id:" << req_id
+    VLOG(4) << "HandleRequest " << rpc_name << ", req_id:" << req_id
            << " get next";

    auto& reqs = rpc_reqs_[rpc_name];
@@ -324,22 +337,21 @@ void AsyncGRPCServer::HandleRequest(
      base = reqs[req_id];
    }

+    VLOG(3) << base->Status2String(rpc_name);
+
    // reference:
    // https://github.com/tensorflow/tensorflow/issues/5596
    // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM
    // https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I
    if (!ok) {
      LOG(WARNING) << "completion queue:" << rpc_name
-                   << " recv no regular event:argument name["
-                   << base->GetReqName() << "]";
+                   << " recv no regular event"
+                   << " context:" << base->Status2String(rpc_name);
      TryToRegisterNewOne(rpc_name, req_id);
      delete base;
      continue;
    }

-    VLOG(3) << "queue id:" << rpc_name << ", req_id:" << req_id
-            << ", status:" << base->Status();
-
    switch (base->Status()) {
      case PROCESS: {
        base->Process();

--- a/paddle/fluid/operators/distributed/rpc_client.cc
+++ b/paddle/fluid/operators/distributed/rpc_client.cc
@@ -13,6 +13,10 @@
 // limitations under the License.

 #include "paddle/fluid/operators/distributed/rpc_client.h"
+#include "gflags/gflags.h"
+
+// default to 3min to avoid temprary network failures.
+DEFINE_int32(grpc_deadline, 180000, "deadline timeouts for grpc");

 namespace paddle {
 namespace operators {

--- a/paddle/fluid/operators/distributed/rpc_client.h
+++ b/paddle/fluid/operators/distributed/rpc_client.h
@@ -15,11 +15,14 @@
 #pragma once

 #include <string>
+#include "gflags/gflags.h"

 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"

+DECLARE_int32(grpc_deadline);
+
 namespace paddle {
 namespace operators {
 namespace distributed {
@@ -32,26 +35,26 @@ class RPCClient {
                            const platform::DeviceContext& ctx,
                            const framework::Scope& scope,
                            const std::string& var_name,
-                            int64_t time_out = rpc_time_out) = 0;
+                            int64_t time_out = FLAGS_grpc_deadline) = 0;

  virtual bool AsyncGetVar(const std::string& ep,
                           const platform::DeviceContext& ctx,
                           const framework::Scope& scope,
                           const std::string& var_name,
-                           int64_t time_out = rpc_time_out) = 0;
+                           int64_t time_out = FLAGS_grpc_deadline) = 0;

  virtual bool AsyncPrefetchVar(const std::string& ep,
                                const platform::DeviceContext& ctx,
                                const framework::Scope& scope,
                                const std::string& in_var_name,
                                const std::string& out_var_name,
-                                int64_t time_out = rpc_time_out) = 0;
+                                int64_t time_out = FLAGS_grpc_deadline) = 0;

-  virtual void AsyncSendBatchBarrier(const std::string& ep,
-                                     int64_t time_out = rpc_time_out) = 0;
+  virtual void AsyncSendBatchBarrier(
+      const std::string& ep, int64_t time_out = FLAGS_grpc_deadline) = 0;

-  virtual void AsyncSendFetchBarrier(const std::string& ep,
-                                     int64_t time_out = rpc_time_out) = 0;
+  virtual void AsyncSendFetchBarrier(
+      const std::string& ep, int64_t time_out = FLAGS_grpc_deadline) = 0;

  // SendComplete tells all the server that current trainer have no more data
  // to train, so that the pserver can reduce it's barrier count, and continue
@@ -60,8 +63,6 @@ class RPCClient {

  virtual void Wait() = 0;

-  static constexpr int64_t rpc_time_out = 120 * 1000;
-
  template <typename T>
  static RPCClient* GetInstance() {
    std::call_once(init_flag_, &RPCClient::Init<T>);

--- a/paddle/fluid/operators/distributed/rpc_server.cc
+++ b/paddle/fluid/operators/distributed/rpc_server.cc
@@ -47,11 +47,12 @@ void RPCServer::WaitBarrier(const std::string& rpc_name) {
    return (barrier_counter_[rpc_name] >= client_num_ || exit_flag_.load());
  });

-  VLOG(3) << "batch_barrier_:" << barrier_counter_[rpc_name];
+  VLOG(3) << "batch_barrier_: " << rpc_name << " "
+          << barrier_counter_[rpc_name];
 }

 void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) {
-  VLOG(3) << "RPCServer begin IncreaseBatchBarrier " << rpc_name;
+  VLOG(4) << "RPCServer begin IncreaseBatchBarrier " << rpc_name;
  int b = 0;
  std::unique_lock<std::mutex> lock(mutex_);
  b = ++barrier_counter_[rpc_name];
@@ -100,7 +101,7 @@ void RPCServer::SetCond(const std::string& rpc_name) {
 }

 void RPCServer::WaitCond(const std::string& rpc_name) {
-  VLOG(3) << "RPCServer WaitCond " << rpc_name;
+  VLOG(4) << "RPCServer WaitCond " << rpc_name;
  int cond = 0;
  {
    std::unique_lock<std::mutex> lock(mutex_);

--- a/paddle/fluid/operators/distributed/variable_response.cc
+++ b/paddle/fluid/operators/distributed/variable_response.cc
@@ -76,6 +76,8 @@ bool ReadRaw(::google::protobuf::io::CodedInputStream* input,
      if (total_written + size_to_write > length) {
        size_to_write = length - total_written;
      }
+      // This log is useful to see how long a internal block size is of rpc.
+      VLOG(7) << "copy " << size_to_write << " data to CUDAPlace";
      memory::Copy(boost::get<platform::CUDAPlace>(place),
                   reinterpret_cast<void*>(p), cpu, data, size_to_write,
                   gpu_dev_ctx.stream());
@@ -103,6 +105,8 @@ bool ReadRaw(::google::protobuf::io::CodedInputStream* input,
    }
    // TODO(gongwb): can we avoid copy?
    platform::CPUPlace cpu;
+    // This log is useful to see how long a internal block size is of rpc.
+    VLOG(7) << "copy " << size_to_write << " data to CPUPlace";
    memory::Copy(cpu, reinterpret_cast<void*>(p), cpu, data, size_to_write);

    p += size_to_write;

--- a/paddle/fluid/operators/elementwise_add_mkldnn_op.cc
+++ b/paddle/fluid/operators/elementwise_add_mkldnn_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/fluid/operators/elementwise_add_op.h"
+#include "paddle/fluid/operators/elementwise_op_function.h"
+
+#include "paddle/fluid/platform/mkldnn_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::DataLayout;
+using framework::Tensor;
+using mkldnn::memory;
+using mkldnn::reorder;
+using mkldnn::primitive;
+using mkldnn::stream;
+using mkldnn::sum;
+
+template <typename T>
+class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& dev_ctx =
+        ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
+    const auto& mkldnn_engine = dev_ctx.GetEngine();
+
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* z = ctx.Output<Tensor>("Out");
+    const T* x_data = x->data<T>();
+    const T* y_data = y->data<T>();
+    T* z_data = z->mutable_data<T>(ctx.GetPlace());
+
+    int axis = ctx.Attr<int>("axis");
+
+    auto x_dims = x->dims();
+    auto y_dims = y->dims();
+    auto z_dims = z->dims();
+
+    // Execute default elementwise_add operator when
+    // broadcast operations need to performed.
+    if (x_dims != y_dims) {
+      auto sum_func = [](T a, T b) -> T { return a + b; };
+
+      TransformFunctor<decltype(sum_func), T,
+                       paddle::platform::CPUDeviceContext, T>
+          functor(
+              x, y, z,
+              ctx.template device_context<paddle::platform::CPUDeviceContext>(),
+              sum_func);
+
+      axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
+      PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
+                     "Axis should be in range [0, x_dims)");
+
+      trim_trailing_singular_dims(&y_dims);
+      axis = (y_dims.size() == 0) ? x_dims.size() : axis;
+
+      int pre, n, post;
+      get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post);
+
+      if (post == 1) {
+        functor.RunRowWise(n, pre);
+      } else {
+        functor.RunMidWise(n, pre, post);
+      }
+      z->set_layout(DataLayout::kMKLDNN);
+      z->set_format(x->format());
+    } else {
+      PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN &&
+                         x->format() != memory::format::format_undef,
+                     "Wrong layout/format set for X tensor");
+      PADDLE_ENFORCE(y->layout() == DataLayout::kMKLDNN &&
+                         y->format() != memory::format::format_undef,
+                     "Wrong layout/format set for X tensor");
+
+      std::vector<int> src_x_tz = framework::vectorize2int(x_dims);
+      std::vector<int> src_y_tz = framework::vectorize2int(y_dims);
+      std::vector<int> dst_tz = framework::vectorize2int(z_dims);
+
+      std::vector<memory::primitive_desc> srcs_pd;
+      std::vector<memory> srcs;
+      std::vector<float> scales = {1.0f, 1.0f};
+
+      auto src_x_pd = memory::primitive_desc(
+          {{src_x_tz}, memory::data_type::f32, x->format()}, mkldnn_engine);
+      auto src_y_pd = memory::primitive_desc(
+          {{src_y_tz}, memory::data_type::f32, y->format()}, mkldnn_engine);
+      auto src_x_memory =
+          memory(src_x_pd, paddle::platform::to_void_cast(x_data));
+      auto src_y_memory =
+          memory(src_y_pd, paddle::platform::to_void_cast(y_data));
+
+      srcs_pd.push_back(src_x_pd);
+      srcs_pd.push_back(src_y_pd);
+      srcs.push_back(src_x_memory);
+      srcs.push_back(src_y_memory);
+
+      auto dst_md =
+          memory::desc({dst_tz}, memory::data_type::f32, memory::format::any);
+
+      // create primitive descriptor for sum
+      auto sum_pd = sum::primitive_desc(dst_md, scales, srcs_pd);
+
+      // create mkldnn memory for dst
+      memory dst_memory = memory(sum_pd.dst_primitive_desc(), z_data);
+
+      std::vector<primitive::at> inputs;
+      inputs.push_back(srcs[0]);
+      inputs.push_back(srcs[1]);
+
+      // create sum primitive
+      auto sum_prim = sum(sum_pd, inputs, dst_memory);
+
+      std::vector<primitive> pipeline;
+      pipeline.push_back(sum_prim);
+      stream(stream::kind::eager).submit(pipeline).wait();
+
+      z->set_layout(DataLayout::kMKLDNN);
+      z->set_format(
+          (memory::format)dst_memory.get_primitive_desc().desc().data.format);
+    }
+  }
+};
+
+template <typename T>
+class EltwiseAddMKLDNNGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    using Tensor = framework::Tensor;
+
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* out = ctx.Input<Tensor>("Out");
+    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    int axis = ctx.Attr<int>("axis");
+
+    auto set_mkldnn_format = [](Tensor* in, const Tensor* out) {
+      in->set_layout(DataLayout::kMKLDNN);
+      in->set_format(out->format());
+    };
+
+    if (x->dims() == y->dims()) {
+      auto blas = math::GetBlas<paddle::platform::CPUDeviceContext, T>(ctx);
+      if (dx) {
+        blas.VCOPY(dout->numel(), dout->data<T>(),
+                   dx->mutable_data<T>(ctx.GetPlace()));
+        set_mkldnn_format(dx, dout);
+      }
+
+      if (dy) {
+        blas.VCOPY(dout->numel(), dout->data<T>(),
+                   dy->mutable_data<T>(ctx.GetPlace()));
+        set_mkldnn_format(dy, dout);
+      }
+    } else {
+      // Execute default kernel when broadcast is needed
+      ElemwiseGradCompute<paddle::platform::CPUDeviceContext, T,
+                          IdentityGrad<T>, IdentityGrad<T>>(
+          ctx, *x, *y, *out, *dout, axis, dx, dy, IdentityGrad<T>(),
+          IdentityGrad<T>());
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_KERNEL(elementwise_add, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::EltwiseAddMKLDNNKernel<float>)
+
+REGISTER_OP_KERNEL(elementwise_add_grad, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::EltwiseAddMKLDNNGradKernel<float>)
--- a/paddle/fluid/operators/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise_op.h
@@ -14,8 +14,12 @@ limitations under the License. */

 #pragma once
 #include <string>
+#include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif

 namespace paddle {
 namespace operators {
@@ -40,6 +44,21 @@ class ElementwiseOp : public framework::OperatorWithKernel {
    ctx->SetOutputDim("Out", x_dim);
    ctx->ShareLoD("X", /*->*/ "Out");
  }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    auto input_data_type =
+        framework::ToDataType(ctx.Input<Tensor>("X")->type());
+
+#ifdef PADDLE_WITH_MKLDNN
+    if (platform::CanMKLDNNBeUsed(ctx)) {
+      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
+                                     framework::DataLayout::kMKLDNN,
+                                     framework::LibraryType::kMKLDNN);
+    }
+#endif
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+  }
 };

 class ElementwiseOpInferVarType : public framework::VarTypeInference {
@@ -65,6 +84,8 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
                 "for broadcasting Y onto X.")
        .SetDefault(-1)
        .EqualGreaterThan(-1);
+    AddAttr<bool>("use_mkldnn", "(bool, default false). Used by MKLDNN.")
+        .SetDefault(false);
    AddComment(string::Sprintf(R"DOC(
 Limited Elementwise %s Operator

@@ -138,6 +159,21 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
      ctx->SetOutputDim(y_grad_name, y_dims);
    }
  }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    auto input_data_type =
+        framework::ToDataType(ctx.Input<Tensor>("X")->type());
+
+#ifdef PADDLE_WITH_MKLDNN
+    if (platform::CanMKLDNNBeUsed(ctx)) {
+      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
+                                     framework::DataLayout::kMKLDNN,
+                                     framework::LibraryType::kMKLDNN);
+    }
+#endif
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+  }
 };
 }  // namespace operators
 }  // namespace paddle

--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -101,17 +101,16 @@ void ListenAndServOp::RunSyncLoop(
    framework::Scope *recv_scope,
    const std::vector<int> &prefetch_block_id_list) const {
  size_t num_blocks = program->Size();
+  auto optimize_blocks =
+      Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks);
  PADDLE_ENFORCE_GE(num_blocks, 2,
                    "server program should have at least 2 blocks");

-  std::vector<int> optimize_block_id_list;
-  for (int blkid = 1; blkid < num_blocks; ++blkid) {
-    if (std::find(prefetch_block_id_list.begin(), prefetch_block_id_list.end(),
-                  blkid) == prefetch_block_id_list.end()) {
-      optimize_block_id_list.push_back(blkid);
-    }
+  std::vector<int> optimize_blocks_idx;
+  for (auto blk : optimize_blocks) {
+    optimize_blocks_idx.push_back(blk->ID());
  }
-  auto optimize_prepared = executor->Prepare(*program, optimize_block_id_list);
+  auto optimize_prepared = executor->Prepare(*program, optimize_blocks_idx);
  // Insert placeholder for block0 which holds current op itself.
  optimize_prepared.insert(
      optimize_prepared.begin(),
@@ -134,14 +133,14 @@ void ListenAndServOp::RunSyncLoop(
    // and this will still work.
    // The optimize blocks which have the same parent ID would run parallel
    // TODO(Yancey1989): need to use ParallelExecutor for future
-    int32_t last_parent_blkid = program->Block(1).Parent();
+    int32_t last_parent_blkid = optimize_blocks[0]->Parent();
    std::vector<size_t> parallel_blkids;
-    parallel_blkids.push_back(1);
+    parallel_blkids.push_back(optimize_blocks[0]->ID());
    double ts = GetTimestamp();
-    for (size_t i = 1; i < optimize_block_id_list.size(); ++i) {
+    for (size_t i = 1; i < optimize_blocks.size(); ++i) {
      // skip the first optimize block because it is already in the
      // parallel_blkids.
-      int blkid = optimize_block_id_list[i];
+      int blkid = optimize_blocks[i]->ID();
      if (program->Block(blkid).Parent() != last_parent_blkid) {
        ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared,
                              program, recv_scope);
@@ -164,8 +163,8 @@ void ListenAndServOp::RunSyncLoop(
 }

 void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
-                                   framework::ProgramDesc *program) const {
-  VLOG(3) << "RunAsyncLoop in";
+                                   framework::ProgramDesc *program,
+                                   framework::Scope *recv_scope) const {
  // grad name to block id
  std::unordered_map<std::string, int32_t> grad_to_block_id;
  std::unordered_map<int32_t, std::string> id_to_grad;
@@ -192,6 +191,10 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
    block_list.push_back(blkid);
  }
  auto optimize_prepared = executor->Prepare(*program, block_list);
+  // execute global block if needed
+  if (block_list[0] == 1 && id_to_grad.count(1) == 0) {
+    executor->RunPreparedContext(optimize_prepared[0].get(), recv_scope);
+  }
  std::unordered_map<std::string,
                     std::shared_ptr<framework::ExecutorPrepareContext>>
      grad_to_prepared_ctx;
@@ -203,7 +206,6 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
  request_get_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
  request_prefetch_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);

-  VLOG(3) << "RunAsyncLoop into while";
  while (true) {
    if (rpc_service_->IsExit()) {
      LOG(INFO) << "get exit!rpc_processor break!";
@@ -261,8 +263,11 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
  rpc_service_->RegisterRPC(distributed::kRequestPrefetch,
                            request_prefetch_handler_.get());

-  auto *optimize_block = Attr<framework::BlockDesc *>(kOptimizeBlock);
-  auto *program = optimize_block->Program();
+  auto optimize_blocks =
+      Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks);
+  PADDLE_ENFORCE(optimize_blocks.size() >= 1,
+                 "optimize blocks should be 1 at least on the pserver side.");
+  auto *program = optimize_blocks[0]->Program();
  framework::Executor executor(dev_place);

  // prepare for prefetch
@@ -317,7 +322,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
  if (sync_mode) {
    RunSyncLoop(&executor, program, &recv_scope, prefetch_block_id_list);
  } else {
-    RunAsyncLoop(&executor, program);
+    RunAsyncLoop(&executor, program, &recv_scope);
  }
 }

@@ -339,8 +344,9 @@ class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
        "a map from grad name to it's optimize block id")
        .SetDefault({});
    AddAttr<bool>("sync_mode", "if works at sync_mode or not").SetDefault(true);
-    AddAttr<framework::BlockDesc *>(kOptimizeBlock,
-                                    "BlockID to run on server side.");
+    AddAttr<std::vector<framework::BlockDesc *>>(
+        kOptimizeBlocks, "Optimize blocks to run on server side.")
+        .SetDefault({});
    AddAttr<std::vector<std::string>>(kPrefetchVarNameToBlockId,
                                      "prefetch blocks to run on server side.")
        .SetDefault({});

--- a/paddle/fluid/operators/listen_and_serv_op.h
+++ b/paddle/fluid/operators/listen_and_serv_op.h
@@ -30,7 +30,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-constexpr char kOptimizeBlock[] = "OptimizeBlock";
+constexpr char kOptimizeBlocks[] = "optimize_blocks";
 constexpr char kPrefetchVarNameToBlockId[] = "prefetch_var_name_to_block_id";

 void RunServer(std::shared_ptr<distributed::RPCServer> service);
@@ -50,7 +50,8 @@ class ListenAndServOp : public framework::OperatorBase {
                   const std::vector<int>& prefetch_block_id_list) const;

  void RunAsyncLoop(framework::Executor* executor,
-                    framework::ProgramDesc* program) const;
+                    framework::ProgramDesc* program,
+                    framework::Scope* recv_scope) const;

  void SavePort() const;


--- a/paddle/fluid/operators/parallel_do_op.cc
+++ b/paddle/fluid/operators/parallel_do_op.cc
@@ -295,7 +295,7 @@ class ParallelDoGradOp : public framework::OperatorBase {

        auto sum_op = framework::OpRegistry::CreateOp(
            "sum", {{"X", {s, tmp_name}}}, {{"Out", {s}}},
-            framework::AttributeMap{});
+            framework::AttributeMap{{"use_mkldnn", {false}}});
        VLOG(10) << sum_op->DebugStringEx(sub_scopes[0]);
        sum_op->Run(*sub_scopes[0], places[0]);
        WaitOnPlace(places[0]);

--- a/paddle/fluid/operators/random_crop_op.cc
+++ b/paddle/fluid/operators/random_crop_op.cc
@@ -37,6 +37,11 @@ class RandomCropOpMaker : public framework::OpProtoAndCheckerMaker {
    AddOutput("SeedOut", "The random seed after random cropping.")
        .AsIntermediate();
    AddAttr<std::vector<int>>("shape", "The shape of a cropped instance.");
+    AddAttr<int>("startup_seed",
+                 "If the input 'Seed' is not initialized, the 'startup_seed' "
+                 "will be used to replace it. Even so, the seed after random "
+                 "crop will also be outputed to the 'SeedOut'.")
+        .SetDefault(0);
    AddComment(R"DOC(
      This operator takes a batch of instance, and do random cropping on each instance.
      It means that cropping positions differs on each instance, which is determined
@@ -49,8 +54,6 @@ class RandomCropOpMaker : public framework::OpProtoAndCheckerMaker {
 class RandomCropOpInferShape : public framework::InferShapeBase {
 public:
  void operator()(framework::InferShapeContext* ctx) const override {
-    auto seed_dim = ctx->GetInputDim("Seed");
-    PADDLE_ENFORCE(seed_dim.size() == 1 && seed_dim[0] == 1);
    auto shape = ctx->Attrs().Get<std::vector<int>>("shape");
    auto x_dim = ctx->GetInputDim("X");
    PADDLE_ENFORCE_GT(x_dim.size(), static_cast<int64_t>(shape.size()));
@@ -62,7 +65,6 @@ class RandomCropOpInferShape : public framework::InferShapeBase {
      out_dim[x_i] = shape[shape_i];
    }
    ctx->SetOutputDim("Out", framework::make_ddim(out_dim));
-    ctx->SetOutputDim("SeedOut", framework::make_ddim({1}));
  }
 };


--- a/paddle/fluid/operators/random_crop_op.h
+++ b/paddle/fluid/operators/random_crop_op.h
@@ -142,16 +142,22 @@ template <typename DeviceContext, typename T>
 class RandomCropKernel : public framework::OpKernel<T> {
 public:
  virtual void Compute(const framework::ExecutionContext& ctx) const {
-    auto& seed_tensor = detail::Ref(ctx.Input<framework::LoDTensor>("Seed"));
    int64_t seed = 0;
-    if (platform::is_cpu_place(seed_tensor.place())) {
-      seed = *seed_tensor.data<int64_t>();
+    auto& seed_tensor = detail::Ref(ctx.Input<framework::LoDTensor>("Seed"));
+    if (seed_tensor.IsInitialized()) {
+      if (platform::is_cpu_place(seed_tensor.place())) {
+        seed = *seed_tensor.data<int64_t>();
+      } else {
+        LOG(WARNING) << "It is slow to place seed in GPU memory. Please verify "
+                        "your program";
+        framework::LoDTensor cpu_seed;
+        framework::TensorCopySync(seed_tensor, platform::CPUPlace(), &cpu_seed);
+        seed = *cpu_seed.data<int64_t>();
+      }
    } else {
-      LOG(WARNING) << "It is slow to place seed in GPU memory. Please verify "
-                      "your program";
-      framework::LoDTensor cpu_seed;
-      framework::TensorCopySync(seed_tensor, platform::CPUPlace(), &cpu_seed);
-      seed = *cpu_seed.data<int64_t>();
+      VLOG(5) << "WARNING: The input 'Seed' is not initialized, use attribute "
+                 "'startup_seed' instead.";
+      seed = ctx.Attr<int>("startup_seed");
    }
    auto shape = ctx.Attr<std::vector<int>>("shape");
    auto& x = detail::Ref(ctx.Input<framework::LoDTensor>("X"));
@@ -171,7 +177,7 @@ class RandomCropKernel : public framework::OpKernel<T> {
    engine.discard(functor.prod_batchsize_dims_ *
                   (functor.rank_ - functor.num_batchsize_dims_));
    *ctx.Output<framework::LoDTensor>("SeedOut")->mutable_data<int64_t>(
-        platform::CPUPlace()) = engine();
+        framework::make_ddim({1}), platform::CPUPlace()) = engine();
  }
 };


--- a/paddle/fluid/operators/reader/create_custom_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_custom_reader_op.cc
@@ -39,6 +39,7 @@ class CustomReader : public framework::DecoratedReader {
  const framework::ProgramDesc program_;
  int sub_block_id_;
  framework::Executor exe_;
+  framework::Scope scope_;

  std::vector<std::string> source_var_names_;
  std::vector<std::string> sink_var_names_;
@@ -158,23 +159,24 @@ void CustomReader::ReadNext(std::vector<framework::LoDTensor>* out) {
  // The scope for CustomReader's sub-block should be independent and shouldn't
  // be any other computation scope's child. Otherwise, data preprocessing and
  // compution cannot be concurrent.
-  framework::Scope scope;
+  framework::Scope* exe_scope = &scope_.NewScope();
  // 1. Copy LoDTensors from underlying reader's output to source variables.
  for (size_t i = 0; i < source_var_names_.size(); ++i) {
-    framework::Variable* var = scope.Var(source_var_names_[i]);
+    framework::Variable* var = exe_scope->Var(source_var_names_[i]);
    framework::LoDTensor* tensor = var->GetMutable<framework::LoDTensor>();
    tensor->ShareDataWith(underlying_outs[i]);
    tensor->set_lod(underlying_outs[i].lod());
  }
  // 2. Run the sub-block.
-  exe_.Run(program_, &scope, sub_block_id_, false, true);
+  exe_.Run(program_, exe_scope, sub_block_id_, false, true);
  // 3. Copy LoDTensors from sink variables to out.
  out->resize(sink_var_names_.size());
  for (size_t i = 0; i < sink_var_names_.size(); ++i) {
-    const auto& tensor = detail::Ref(scope.FindVar(sink_var_names_[i]))
+    const auto& tensor = detail::Ref(exe_scope->FindVar(sink_var_names_[i]))
                             .Get<framework::LoDTensor>();
    framework::TensorCopySync(tensor, platform::CPUPlace(), &(*out)[i]);
  }
+  scope_.DeleteScope(exe_scope);
 }

 }  // namespace reader

--- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
@@ -23,13 +23,13 @@ namespace reader {

 // 'Double buffer' means we shall maintain two batches of input data at the same
 // time. So the kCacheSize shoul be at least 2.
-static constexpr size_t kCacheSize = 3;
+static constexpr size_t kCacheSize = 5;
 // There will be two bacthes out of the channel during training:
 // 1. the one waiting to be sent to the channel
 // 2. the one just be received from the channel, which is also being used by
 // subsequent operators.
 // So the channel size should be kChacheSize - 2
-static constexpr size_t kChannelSize = 1;  // kCacheSize - 2
+static constexpr size_t kChannelSize = 3;  // kCacheSize - 2

 class DoubleBufferReader : public framework::DecoratedReader {
 public:

--- a/paddle/fluid/operators/recurrent_op.cc
+++ b/paddle/fluid/operators/recurrent_op.cc
@@ -429,7 +429,8 @@ class RecurrentGradOp : public RecurrentBase {

          auto sum_op = framework::OpRegistry::CreateOp(
              "sum", {{"X", {pg_names[param_id], new_inside_name}}},
-              {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{});
+              {{"Out", {pg_names[param_id]}}},
+              framework::AttributeMap{{"use_mkldnn", {false}}});
          sum_op->Run(cur_scope, place);

          cur_scope.Rename(new_inside_name, inside_grad_name);

--- a/paddle/fluid/operators/send_recv_op_test.cc
+++ b/paddle/fluid/operators/send_recv_op_test.cc
@@ -129,7 +129,10 @@ void StartServerNet(bool is_sparse, std::atomic<bool> *initialized) {
  // sub program run in listen_and_serv_op, for simple test we use sum
  f::ProgramDesc program;
  const auto &root_block = program.Block(0);
+  std::vector<framework::BlockDesc *> optimize_blocks;
  auto *optimize_block = program.AppendBlock(root_block);
+  optimize_blocks.push_back(optimize_block);
+
  auto *prefetch_block = program.AppendBlock(root_block);
  // X for server side tensors, RX for received tensors, must be of same shape.
  AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, optimize_block,
@@ -139,7 +142,7 @@ void StartServerNet(bool is_sparse, std::atomic<bool> *initialized) {
  attrs.insert({"Fanin", 1});
  attrs.insert({"ParamList", std::vector<std::string>({"Out"})});
  attrs.insert({"GradList", std::vector<std::string>({"x1"})});
-  attrs.insert({"OptimizeBlock", optimize_block});
+  attrs.insert({"optimize_blocks", optimize_blocks});
  attrs.insert({"PrefetchBlock", prefetch_block});
  attrs.insert({"grad_to_block_id", std::vector<std::string>({""})});
  attrs.insert({"sync_mode", true});

--- a/paddle/fluid/operators/softmax_mkldnn_op.cc
+++ b/paddle/fluid/operators/softmax_mkldnn_op.cc
@@ -27,8 +27,81 @@ using paddle::platform::MKLDNNMemDesc;
 using mkldnn::memory;  // Note: paddle has also "memory" namespace
 using mkldnn::primitive;
 using mkldnn::softmax_forward;
+using mkldnn::softmax_backward;
 using mkldnn::prop_kind;
 using mkldnn::stream;
+using platform::to_void_cast;
+
+class SoftmaxMKLDNNHandler : public platform::MKLDNNHandler {
+ public:
+  SoftmaxMKLDNNHandler(
+      std::shared_ptr<mkldnn::softmax_forward::primitive_desc> softmax_pd,
+      const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
+      const std::string& base_key)
+      : platform::MKLDNNHandler(dev_ctx, engine, base_key),
+        softmax_pd_(softmax_pd) {}
+
+  SoftmaxMKLDNNHandler(
+      std::shared_ptr<mkldnn::softmax_forward::primitive_desc> softmax_pd,
+      std::shared_ptr<mkldnn::softmax_backward::primitive_desc> softmax_bwd_pd,
+      const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
+      const std::string& base_key)
+      : platform::MKLDNNHandler(dev_ctx, engine, base_key),
+        softmax_pd_(softmax_pd),
+        softmax_bwd_pd_(softmax_bwd_pd) {
+    // If we are in Grad operatgor then update a key with BWD suffix to
+    // distinguish from FWD memory primitives
+    key_ += "-BWD";
+  }
+
+  std::shared_ptr<mkldnn::softmax_forward> AcquireSoftmax(
+      std::shared_ptr<mkldnn::memory> dst_memory_p,
+      std::shared_ptr<mkldnn::memory> src_memory_p) {
+    /*Generate key*/
+    auto prim_key = key_ + "@softmax_p";
+
+    auto softmax_p = std::static_pointer_cast<mkldnn::softmax_forward>(
+        dev_ctx_.GetBlob(prim_key));
+    PADDLE_ENFORCE((softmax_p != nullptr) || (is_reusing_ == false),
+                   "Fail to find softmax primitive in device context");
+    if (softmax_p == nullptr) {
+      softmax_p = std::make_shared<mkldnn::softmax_forward>(
+          *(softmax_pd_.get()),
+          *(static_cast<mkldnn::memory*>(src_memory_p.get())),
+          *(static_cast<mkldnn::memory*>(dst_memory_p.get())));
+      dev_ctx_.SetBlob(prim_key, softmax_p);
+    } else {
+      is_reusing_ = true;
+    }
+
+    return softmax_p;
+  }
+
+  std::shared_ptr<mkldnn::softmax_backward> AcquireSoftmaxBackward(
+      std::shared_ptr<mkldnn::memory> dst_memory_p,
+      std::shared_ptr<mkldnn::memory> diff_dst_memory_p,
+      std::shared_ptr<mkldnn::memory> diff_src_memory_p) {
+    auto prim_key = key_ + "@softmax_bwd_p";
+    auto softmax_bwd_p = std::static_pointer_cast<mkldnn::softmax_backward>(
+        dev_ctx_.GetBlob(prim_key));
+    PADDLE_ENFORCE((softmax_bwd_p != nullptr) || (is_reusing_ == false),
+                   "Fail to find softmax backward primitive in device context");
+    if (softmax_bwd_p == nullptr) {
+      softmax_bwd_p = std::make_shared<mkldnn::softmax_backward>(
+          *softmax_bwd_pd_, *(dst_memory_p.get()), *(diff_dst_memory_p.get()),
+          *(diff_src_memory_p.get()));
+      dev_ctx_.SetBlob(prim_key, softmax_bwd_p);
+    } else {
+      is_reusing_ = true;
+    }
+
+    return softmax_bwd_p;
+  }
+
+ private:
+  std::shared_ptr<mkldnn::softmax_forward::primitive_desc> softmax_pd_;
+  std::shared_ptr<mkldnn::softmax_backward::primitive_desc> softmax_bwd_pd_;
+};

 template <typename T>
 class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> {
@@ -54,56 +127,27 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> {
    // Same memory descriptor to be used for input and output
    memory::dims softmax_tz = {src_tz[0], src_tz[1]};
    // Generate keys for storing/retriving primitives for this operator
-    // TODO(jczaja): Each MKLDNN operator may have diffrent hashing function
-    auto gethash = [](memory::dims& operand_dims) {
-      return std::string(std::to_string(operand_dims[0]) + "-" +
-                         std::to_string(operand_dims[1]));
-    };
-    const std::string key = gethash(softmax_tz);
-    const std::string key_softmax_p = key + "@softmax_p";
-    const std::string key_softmax_src_mem_p = key + "@softmax_src_mem_p";
-    const std::string key_softmax_dst_mem_p = key + "@softmax_dst_mem_p";
-
-    std::shared_ptr<void> softmax_p = dev_ctx.GetBlob(key_softmax_p);
-    if (softmax_p == nullptr) {
-      // Currently only NC data format is supported
-      auto softmax_md =
-          MKLDNNMemDesc({softmax_tz}, memory::f32, memory::format::nc);
-      // Normalization is made after innermost dimension eg. C out of NC
-      auto softmax_desc = softmax_forward::desc(prop_kind::forward_scoring,
-                                                softmax_md, 1 /*dim: C*/);
-      // create memory primitives
-      auto softmax_src_memory_p = std::make_shared<memory>(
-          memory::primitive_desc{softmax_md, mkldnn_engine},
-          static_cast<void*>(const_cast<T*>(input_data)));
-      dev_ctx.SetBlob(key_softmax_src_mem_p, softmax_src_memory_p);
-      auto softmax_dst_memory_p = std::make_shared<memory>(
-          memory::primitive_desc{softmax_md, mkldnn_engine},
-          static_cast<void*>(output_data));
-      dev_ctx.SetBlob(key_softmax_dst_mem_p, softmax_dst_memory_p);
-
-      auto softmax_forward_pd =
-          std::make_shared<softmax_forward::primitive_desc>(softmax_desc,
-                                                            mkldnn_engine);
-      softmax_p = std::make_shared<softmax_forward>(
-          *(softmax_forward_pd.get()),
-          *(static_cast<memory*>(softmax_src_memory_p.get())),
-          *(static_cast<memory*>(softmax_dst_memory_p.get())));
-      dev_ctx.SetBlob(key_softmax_p, softmax_p);
-    } else {
-      // Primitives already exist
-      auto src_memory_p = std::static_pointer_cast<memory>(
-          dev_ctx.GetBlob(key_softmax_src_mem_p));
-      PADDLE_ENFORCE(src_memory_p != nullptr,
-                     "Fail to find softmax src mem_p in device context");
-      auto dst_memory_p = std::static_pointer_cast<memory>(
-          dev_ctx.GetBlob(key_softmax_dst_mem_p));
-      PADDLE_ENFORCE(dst_memory_p != nullptr,
-                     "Fail to find softmax dst mem_p in device context");
-      src_memory_p->set_data_handle(
-          reinterpret_cast<void*>(const_cast<T*>(input_data)));
-      dst_memory_p->set_data_handle(output_data);
-    }
+    const std::string key =
+        platform::MKLDNNHandler::GetHash(softmax_tz, ctx.op().Output("Out"));
+    const std::string key_softmax_pd = key + "@softmax_pd";
+
+    // Currently only NC data format is supported
+    auto softmax_md = MKLDNNMemDesc(
+        {softmax_tz}, platform::MKLDNNGetDataType<T>(), memory::format::nc);
+    // Normalization is made after innermost dimension eg. C out of NC
+    auto softmax_desc = softmax_forward::desc(prop_kind::forward_scoring,
+                                              softmax_md, 1 /*dim: C*/);
+    auto softmax_pd = std::make_shared<mkldnn::softmax_forward::primitive_desc>(
+        softmax_desc, mkldnn_engine);
+    dev_ctx.SetBlob(key_softmax_pd, softmax_pd);
+
+    SoftmaxMKLDNNHandler handler(softmax_pd, dev_ctx, mkldnn_engine, key);
+    auto softmax_src_memory_p =
+        handler.AcquireSrcMemory(softmax_md, to_void_cast<T>(input_data));
+    auto softmax_dst_memory_p =
+        handler.AcquireDstMemory(softmax_md, to_void_cast<T>(output_data));
+    auto softmax_p =
+        handler.AcquireSoftmax(softmax_dst_memory_p, softmax_src_memory_p);

    std::vector<primitive> pipeline{
        *(static_cast<softmax_forward::primitive*>(softmax_p.get()))};
@@ -120,6 +164,77 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> {
  }
 };

+template <typename T>
+class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
+                   "It must use CPUPlace.");
+
+    auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
+    auto mkldnn_engine = dev_ctx.GetEngine();
+    const Tensor* output = ctx.Input<Tensor>("Out");
+    const T* dst_data = output->data<T>();
+
+    auto* dout = ctx.template Input<Tensor>(framework::GradVarName("Out"));
+    const auto* diff_dst_ptr = dout->template data<T>();
+
+    auto* dx =
+        ctx.template Output<framework::Tensor>(framework::GradVarName("X"));
+    T* diff_src_ptr = dx->template mutable_data<T>(ctx.GetPlace());
+
+    std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
+    std::vector<int> src_tz(dst_tz);
+    PADDLE_ENFORCE(output->dims().size() == 2UL,
+                   "The input of softmax op must be a 2D matrix.");
+    // MKL-DNN does support softmax over selected axis. Having 2D Tensor,
+    // we will make normalization after final eg. axis: 1
+    PADDLE_ENFORCE(((src_tz[0] == dst_tz[0]) && (src_tz[1] == dst_tz[1])),
+                   "Softmax input and output dimensions should match");
+    // Same memory descriptor to be used for input and output
+    memory::dims softmax_tz = {src_tz[0], src_tz[1]};
+    // Currently only supports NC data format
+    // retrieve eltwise primitive desc from device context
+    const std::string key =
+        platform::MKLDNNHandler::GetHash(softmax_tz, ctx.op().Input("Out"));
+    const std::string key_softmax_pd = key + "@softmax_pd";
+
+    auto softmax_pd =
+        std::static_pointer_cast<mkldnn::softmax_forward::primitive_desc>(
+            dev_ctx.GetBlob(key_softmax_pd));
+    PADDLE_ENFORCE(softmax_pd != nullptr,
+                   "Fail to find softmax_pd in device context");
+
+    // TODO(jczaja): Add layouts support when there is a need to do so
+    // Two dimensional softmax does support NC format
+    auto data_softmax_md = MKLDNNMemDesc(
+        {softmax_tz}, platform::MKLDNNGetDataType<T>(), memory::format::nc);
+    auto diff_softmax_md = MKLDNNMemDesc(
+        {softmax_tz}, platform::MKLDNNGetDataType<T>(), memory::format::nc);
+    // Normalization is made after innermost dimension eg. C out of NC
+    auto softmax_bwd_desc =
+        softmax_backward::desc(diff_softmax_md, data_softmax_md, 1 /* dim: C*/);
+    auto softmax_bwd_pd =
+        std::make_shared<mkldnn::softmax_backward::primitive_desc>(
+            softmax_bwd_desc, mkldnn_engine, *softmax_pd);
+
+    SoftmaxMKLDNNHandler handler(softmax_pd, softmax_bwd_pd, dev_ctx,
+                                 mkldnn_engine, key);
+    auto dst_memory_p =
+        handler.AcquireDstMemory(data_softmax_md, to_void_cast<T>(dst_data));
+    auto diff_dst_memory_p = handler.AcquireDiffDstMemory(
+        diff_softmax_md, to_void_cast<T>(diff_dst_ptr));
+    auto diff_src_memory_p = handler.AcquireDiffSrcMemory(
+        diff_softmax_md, to_void_cast<T>(diff_src_ptr));
+
+    // Get primitve from device context
+    auto softmax_bwd_p = handler.AcquireSoftmaxBackward(
+        dst_memory_p, diff_dst_memory_p, diff_src_memory_p);
+
+    std::vector<primitive> pipeline{*softmax_bwd_p};
+    stream(stream::kind::eager).submit(pipeline).wait();
+  }
+};
 }  // namespace operators
 }  // namespace paddle

@@ -127,3 +242,5 @@ namespace ops = paddle::operators;

 REGISTER_OP_KERNEL(softmax, MKLDNN, ::paddle::platform::CPUPlace,
                   ops::SoftmaxMKLDNNKernel<float>);
+REGISTER_OP_KERNEL(softmax_grad, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::SoftmaxMKLDNNGradKernel<float>);
--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
@@ -145,16 +145,30 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
      const framework::ExecutionContext& ctx) const override {
    // choose cudnn kernel if the runtime supported.
    framework::LibraryType library_{framework::LibraryType::kPlain};
+    std::string data_format = ctx.Attr<std::string>("data_format");
+    framework::DataLayout layout_ = framework::StringToDataLayout(data_format);

 #ifdef PADDLE_WITH_CUDA
    if (platform::CanCUDNNBeUsed(ctx)) {
      library_ = framework::LibraryType::kCUDNN;
    }
 #endif
-    std::string data_format = ctx.Attr<std::string>("data_format");
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
-        framework::StringToDataLayout(data_format), library_);
+#ifdef PADDLE_WITH_MKLDNN
+    if (library_ == framework::LibraryType::kPlain &&
+        platform::CanMKLDNNBeUsed(ctx)) {
+      library_ = framework::LibraryType::kMKLDNN;
+      layout_ = framework::DataLayout::kMKLDNN;
+    }
+#endif
+    auto input_data_type =
+        framework::ToDataType(ctx.Input<Tensor>("X")->type());
+    if (input_data_type == framework::proto::VarType::FP16) {
+      PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                     "float16 can only be used on GPU place");
+    }
+
+    return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_,
+                                   library_);
  }
 };


--- a/paddle/fluid/operators/sum_mkldnn_op.cc
+++ b/paddle/fluid/operators/sum_mkldnn_op.cc
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/*Licensed under the Apache License, Version 2.0(the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License. */
+
+#include "mkldnn.hpp"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
+#include "paddle/fluid/operators/sum_op.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using paddle::framework::Tensor;
+using paddle::platform::MKLDNNDeviceContext;
+using paddle::platform::CPUDeviceContext;
+using framework::DataLayout;
+using mkldnn::memory;
+using mkldnn::primitive;
+using mkldnn::stream;
+using mkldnn::sum;
+using mkldnn::reorder;
+using platform::to_void_cast;
+
+template <typename T>
+class SumMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
+                   "It must use CPUPlace.");
+    auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
+    const auto& mkldnn_engine = dev_ctx.GetEngine();
+    auto in_vars = ctx.MultiInputVar("X");
+
+    const int N = in_vars.size();
+    auto out_var = ctx.OutputVar("Out");
+    bool in_place = out_var == in_vars[0];
+
+    if (out_var->IsType<framework::LoDTensor>()) {
+      LoDTensor* output = ctx.Output<LoDTensor>("Out");
+      T* output_data = output->mutable_data<T>(ctx.GetPlace());
+
+      std::vector<int> dst_tz = framework::vectorize2int(output->dims());
+      auto src_tz = dst_tz;
+      memory::format output_format{memory::format::format_undef};
+      std::vector<float> scales;
+      std::vector<memory::primitive_desc> srcs_mpd;
+      std::vector<mkldnn::memory> srcs_mem;
+
+      PADDLE_ENFORCE(in_vars[0]->IsType<LoDTensor>(),
+                     "Input[0] must be LoDTensors");
+      auto& input0 = in_vars[0]->Get<LoDTensor>();
+      PADDLE_ENFORCE(input0.layout() == DataLayout::kMKLDNN &&
+                         input0.format() != memory::format::format_undef,
+                     "Wrong layout/format for inputs[0]");
+
+      memory::format input_format = input0.format();
+
+      if (src_tz.size() == 1 && (input_format == memory::format::nchw ||
+                                 input_format == memory::format::nhwc)) {
+        input_format = memory::format::x;
+      }
+      if (src_tz.size() == 2 && (input_format == memory::format::nchw ||
+                                 input_format == memory::format::nhwc)) {
+        input_format = memory::format::nc;
+      }
+
+      for (int i = in_place ? 1 : 0; i < N; i++) {
+        PADDLE_ENFORCE(in_vars[i]->IsType<LoDTensor>(),
+                       "all inputs must be all LoDTensors");
+        auto& input = in_vars[i]->Get<LoDTensor>();
+        PADDLE_ENFORCE(input.layout() == DataLayout::kMKLDNN &&
+                           input.format() != memory::format::format_undef,
+                       "Wrong layout/format for inputs");
+
+        if (input.numel() == 0) {
+          continue;
+        }
+
+        const T* input_data = input.data<T>();
+
+        auto src_md =
+            memory::desc(src_tz, memory::data_type::f32, input_format);
+        auto src_mpd = memory::primitive_desc(src_md, mkldnn_engine);
+        auto src_mem = memory(src_mpd, to_void_cast(input_data));
+        srcs_mpd.push_back(src_mpd);
+        srcs_mem.push_back(src_mem);
+        scales.push_back(1.0);
+      }
+
+      auto dst_md =
+          memory::desc(dst_tz, memory::data_type::f32, memory::format::any);
+
+      auto sum_pd = sum::primitive_desc(dst_md, scales, srcs_mpd);
+
+      std::shared_ptr<memory> dst_mem;
+      if (in_place) {
+        dst_mem.reset(new memory(sum_pd.dst_primitive_desc()));
+      } else {
+        dst_mem.reset(new memory(sum_pd.dst_primitive_desc(), output_data));
+      }
+      std::vector<mkldnn::primitive::at> inputs;
+      for (size_t i = 0; i < srcs_mem.size(); ++i) {
+        inputs.push_back(srcs_mem[i]);
+      }
+
+      auto sum_prim = mkldnn::sum(sum_pd, inputs, *dst_mem);
+      output_format = (memory::format)platform::GetMKLDNNFormat(sum_pd);
+
+      primitive reorder_prim;
+      std::shared_ptr<memory> target_mem;
+      if (in_place) {
+        output_format = input_format;
+        target_mem.reset(new memory(
+            {{{src_tz}, memory::data_type::f32, output_format}, mkldnn_engine},
+            output_data));
+        reorder_prim = reorder(*dst_mem, *target_mem);
+      }
+
+      std::vector<primitive> pipeline;
+      pipeline.push_back(sum_prim);
+      if (in_place) pipeline.push_back(reorder_prim);
+      stream(stream::kind::eager).submit(pipeline).wait();
+
+      output->set_layout(DataLayout::kMKLDNN);
+      output->set_format(output_format);
+    } else if (out_var->IsType<framework::SelectedRows>()) {
+      // TODO(@mozga-intel) Add MKLDNN SelectedRows support
+      std::unique_ptr<framework::SelectedRows> in0;
+      if (in_place) {
+        // If is in_place, we store the input[0] to in0
+        auto& in_sel0 = in_vars[0]->Get<SelectedRows>();
+        auto& rows = in_sel0.rows();
+        in0.reset(new framework::SelectedRows(rows, in_sel0.height()));
+        in0->mutable_value()->ShareDataWith(in_sel0.value());
+      }
+
+      auto get_selected_row = [&](size_t i) -> const SelectedRows& {
+        if (i == 0 && in0) {
+          return *in0.get();
+        } else {
+          return in_vars[i]->Get<SelectedRows>();
+        }
+      };
+      auto* out = ctx.Output<SelectedRows>("Out");
+      out->mutable_rows()->clear();
+      auto* out_value = out->mutable_value();
+
+      // Runtime InferShape
+      size_t first_dim = 0;
+      for (int i = 0; i < N; i++) {
+        auto& sel_row = get_selected_row(i);
+        first_dim += sel_row.rows().size();
+      }
+      auto in_dim =
+          framework::vectorize(get_selected_row(N - 1).value().dims());
+      in_dim[0] = static_cast<int64_t>(first_dim);
+
+      out_value->Resize(framework::make_ddim(in_dim));
+
+      // if all the input sparse vars are empty, no need to
+      // merge these vars.
+      if (first_dim == 0UL) {
+        return;
+      }
+      out_value->mutable_data<T>(ctx.GetPlace());
+      math::SelectedRowsAddTo<CPUDeviceContext, T> functor;
+      int64_t offset = 0;
+      for (int i = 0; i < N; i++) {
+        auto& sel_row = get_selected_row(i);
+        if (sel_row.rows().size() == 0) {
+          continue;
+        }
+        PADDLE_ENFORCE_EQ(out->height(), sel_row.height());
+        functor(ctx.template device_context<CPUDeviceContext>(), sel_row,
+                offset, out);
+        offset += sel_row.value().numel();
+      }
+    } else if (out_var->IsType<framework::LoDTensorArray>()) {
+      // TODO(@mozga-intel) Add MKLDNN LoDTensorArray support
+      auto& out_array = *out_var->GetMutable<framework::LoDTensorArray>();
+      for (size_t i = in_place ? 1 : 0; i < in_vars.size(); ++i) {
+        PADDLE_ENFORCE(in_vars[i]->IsType<framework::LoDTensorArray>(),
+                       "Only support all inputs are TensorArray");
+        auto& in_array = in_vars[i]->Get<framework::LoDTensorArray>();
+
+        for (size_t i = 0; i < in_array.size(); ++i) {
+          if (in_array[i].numel() != 0) {
+            if (i >= out_array.size()) {
+              out_array.resize(i + 1);
+            }
+            if (out_array[i].numel() == 0) {
+              framework::TensorCopy(in_array[i], in_array[i].place(),
+                                    ctx.device_context(), &out_array[i]);
+              out_array[i].set_lod(in_array[i].lod());
+            } else {
+              PADDLE_ENFORCE(out_array[i].lod() == in_array[i].lod());
+              auto in = EigenVector<T>::Flatten(in_array[i]);
+              auto result = EigenVector<T>::Flatten(out_array[i]);
+              result.device(*ctx.template device_context<MKLDNNDeviceContext>()
+                                 .eigen_device()) = result + in;
+            }
+          }
+        }
+      }
+    } else {
+      PADDLE_THROW("Unexpected branch, output variable type is %s",
+                   out_var->Type().name());
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP_KERNEL(sum, MKLDNN, ::paddle::platform::CPUPlace,
+                   paddle::operators::SumMKLDNNOpKernel<float>);
--- a/paddle/fluid/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
--- a/paddle/fluid/operators/while_op.cc
+++ b/paddle/fluid/operators/while_op.cc
@@ -203,11 +203,11 @@ class WhileGradOp : public framework::OperatorBase {
                ->set_lod(inside_tensor.lod());
          }
        }
-
        auto new_inside_name = cur_scope.Rename(inside_grad_name);
        auto sum_op = framework::OpRegistry::CreateOp(
            "sum", {{"X", {pg_names[param_id], new_inside_name}}},
-            {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{});
+            {{"Out", {pg_names[param_id]}}},
+            framework::AttributeMap{{"use_mkldnn", {false}}});
        sum_op->Run(cur_scope, dev_place);
        cur_scope.Rename(new_inside_name, inside_grad_name);
      }

--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -106,14 +106,6 @@ class CUDADeviceContext : public DeviceContext {
    PADDLE_ENFORCE(cudaEventRecord(ev, stream_));
  }

-  // FIXME(zcd): A temporary fix for some language model that has sparse
-  // parameter.
-  template <typename Callback>
-  void RecordEventNoMutex(cudaEvent_t ev, Callback callback) {
-    callback();
-    PADDLE_ENFORCE(cudaEventRecord(ev, stream_));
-  }
-
 private:
  CUDAPlace place_;


--- a/paddle/fluid/platform/dynload/CMakeLists.txt
+++ b/paddle/fluid/platform/dynload/CMakeLists.txt
 cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)

-list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc nccl.cc)
+list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc)
+
+# There is no macOS version of NCCL.
+if (NOT APPLE)
+  list(APPEND CUDA_SRCS nccl.cc)
+endif()
+
 if (TENSORRT_FOUND)
  list(APPEND CUDA_SRCS tensorrt.cc)
 endif()

-
 configure_file(cupti_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/cupti_lib_path.h)
 if (CUPTI_FOUND)
    list(APPEND CUDA_SRCS cupti.cc)

--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
@@ -268,7 +268,8 @@ void BindOpDesc(pybind11::module *m) {
      .value("STRINGS", pd::proto::AttrType::STRINGS)
      .value("BOOL", pd::proto::AttrType::BOOLEAN)
      .value("BOOLS", pd::proto::AttrType::BOOLEANS)
-      .value("BLOCK", pd::proto::AttrType::BLOCK);
+      .value("BLOCK", pd::proto::AttrType::BLOCK)
+      .value("BLOCKS", pd::proto::AttrType::BLOCKS);

  pybind11::class_<pd::OpDesc> op_desc(*m, "OpDesc", "");
  op_desc
@@ -293,6 +294,7 @@ void BindOpDesc(pybind11::module *m) {
      .def("set_attr", &pd::OpDesc::SetAttr)
      .def("attr", &pd::OpDesc::GetAttr)
      .def("set_block_attr", &pd::OpDesc::SetBlockAttr)
+      .def("set_blocks_attr", &pd::OpDesc::SetBlocksAttr)
      .def("set_serialized_attr",
           [](pd::OpDesc &self, const std::string &name,
              const pybind11::bytes &seriralized) {

--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
--- a/python/paddle/fluid/metrics.py
+++ b/python/paddle/fluid/metrics.py
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
--- a/python/paddle/fluid/tests/unittests/test_elementwise_add_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_mkldnn_op.py
--- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
--- a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
--- a/python/paddle/fluid/tests/unittests/test_sum_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sum_mkldnn_op.py
--- a/python/paddle/fluid/tests/unittests/test_sum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sum_op.py
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
--- a/python/paddle/reader/decorator.py
+++ b/python/paddle/reader/decorator.py
--- a/python/paddle/v2/dataset/cifar.py
+++ b/python/paddle/v2/dataset/cifar.py
--- a/python/paddle/v2/dataset/flowers.py
+++ b/python/paddle/v2/dataset/flowers.py