Commit b6e63554 authored by: T tangwei12

Merge branch 'develop' of github.com:PaddlePaddle/Paddle into ckpt_m2

...@@ -40,12 +40,12 @@ ExternalProject_Add(
# NOTE(wuyi):
# this package is generated by following steps:
# 1. git clone -b v1.8.x https://github.com/grpc/grpc.git
-# 2. submodule update --init
+# 2. git submodule update --init
# 3. keep only zlib, cares, protobuf, boringssl under "third_party",
# checkout and clean other dirs under third_party
# 4. remove .git, and package the directory.
-URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.8.x.tar.gz"
+URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.10.x.tar.gz"
-URL_MD5 "c9c58ee7d0e8929a63155af6a2ecdbd0"
+URL_MD5 "1f268a2aff6759839dccd256adcc91cf"
PREFIX ${GRPC_SOURCES_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
......
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
-=========
-evaluator
-=========
+=============
+fluid.average
+=============
.. _api_fluid_average_WeightedAverage:
WeightedAverage
---------------
.. autoclass:: paddle.fluid.average.WeightedAverage
:members:
:noindex:
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
==============
fluid.backward
==============
.. _api_fluid_backward_append_backward:
append_backward
---------------
.. autofunction:: paddle.fluid.backward.append_backward
:noindex:
.. _api_fluid_backward_calc_gradient:
calc_gradient
-------------
.. autofunction:: paddle.fluid.backward.calc_gradient
:noindex:
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
-====
-clip
-====
+==========
+fluid.clip
+==========
.. _api_fluid_clip_ErrorClipByValue:
ErrorClipByValue
----------------
...@@ -12,6 +14,8 @@ ErrorClipByValue
:members:
:noindex:
.. _api_fluid_clip_GradientClipByValue:
GradientClipByValue
-------------------
...@@ -19,6 +23,8 @@ GradientClipByValue
:members:
:noindex:
.. _api_fluid_clip_GradientClipByNorm:
GradientClipByNorm
------------------
...@@ -26,6 +32,8 @@ GradientClipByNorm
:members:
:noindex:
.. _api_fluid_clip_GradientClipByGlobalNorm:
GradientClipByGlobalNorm
------------------------
...@@ -33,15 +41,3 @@ GradientClipByGlobalNorm
:members:
:noindex:
append_gradient_clip_ops
------------------------
.. autofunction:: paddle.fluid.clip.append_gradient_clip_ops
:noindex:
error_clip_callback
-------------------
.. autofunction:: paddle.fluid.clip.error_clip_callback
:noindex:
==================================
Data Reader Interface and DataSets
==================================
.. toctree::
:maxdepth: 1
data/data_reader.rst
data/image.rst
data/dataset.rst
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
-===========
-data_feeder
-===========
+=================
+fluid.data_feeder
+=================
.. _api_fluid_data_feeder_DataFeeder:
DataFeeder
----------
......
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
-========
-executor
-========
+==============
+fluid.executor
+==============
.. _api_fluid_executor_Executor:
Executor
--------
...@@ -12,24 +14,32 @@ Executor
:members:
:noindex:
.. _api_fluid_executor_global_scope:
global_scope
------------
.. autofunction:: paddle.fluid.executor.global_scope
:noindex:
.. _api_fluid_executor_scope_guard:
scope_guard
-----------
.. autofunction:: paddle.fluid.executor.scope_guard
:noindex:
-switch_scope
-------------
-.. autofunction:: paddle.fluid.executor.switch_scope
+.. _api_fluid_executor__switch_scope:
+_switch_scope
+-------------
+.. autofunction:: paddle.fluid.executor._switch_scope
:noindex:
.. _api_fluid_executor_fetch_var:
fetch_var
---------
......
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
=====
fluid
=====
.. _api_fluid_Block:
Block
-----
.. autoclass:: paddle.fluid.Block
:members:
:noindex:
.. _api_fluid_Variable:
Variable
--------
.. autoclass:: paddle.fluid.Variable
:members:
:noindex:
.. _api_fluid_Program:
Program
-------
.. autoclass:: paddle.fluid.Program
:members:
:noindex:
.. _api_fluid_Operator:
Operator
--------
.. autoclass:: paddle.fluid.Operator
:members:
:noindex:
.. _api_fluid_default_startup_program:
default_startup_program
-----------------------
.. autofunction:: paddle.fluid.default_startup_program
:noindex:
.. _api_fluid_default_main_program:
default_main_program
--------------------
.. autofunction:: paddle.fluid.default_main_program
:noindex:
.. _api_fluid_program_guard:
program_guard
-------------
.. autofunction:: paddle.fluid.program_guard
:noindex:
.. _api_fluid_get_var:
get_var
-------
.. autofunction:: paddle.fluid.get_var
:noindex:
.. _api_fluid_Executor:
Executor
--------
.. autoclass:: paddle.fluid.Executor
:members:
:noindex:
.. _api_fluid_global_scope:
global_scope
------------
.. autofunction:: paddle.fluid.global_scope
:noindex:
.. _api_fluid_scope_guard:
scope_guard
-----------
.. autofunction:: paddle.fluid.scope_guard
:noindex:
.. _api_fluid__switch_scope:
_switch_scope
-------------
.. autofunction:: paddle.fluid._switch_scope
:noindex:
.. _api_fluid_fetch_var:
fetch_var
---------
.. autofunction:: paddle.fluid.fetch_var
:noindex:
.. _api_fluid_Go:
Go
--
.. autoclass:: paddle.fluid.Go
:members:
:noindex:
.. _api_fluid_make_channel:
make_channel
------------
.. autofunction:: paddle.fluid.make_channel
:noindex:
.. _api_fluid_channel_send:
channel_send
------------
.. autofunction:: paddle.fluid.channel_send
:noindex:
.. _api_fluid_channel_recv:
channel_recv
------------
.. autofunction:: paddle.fluid.channel_recv
:noindex:
.. _api_fluid_channel_close:
channel_close
-------------
.. autofunction:: paddle.fluid.channel_close
:noindex:
.. _api_fluid_Select:
Select
------
.. autoclass:: paddle.fluid.Select
:members:
:noindex:
.. _api_fluid_Trainer:
Trainer
-------
.. autoclass:: paddle.fluid.Trainer
:members:
:noindex:
.. _api_fluid_BeginEpochEvent:
BeginEpochEvent
---------------
.. autoclass:: paddle.fluid.BeginEpochEvent
:members:
:noindex:
.. _api_fluid_EndEpochEvent:
EndEpochEvent
-------------
.. autoclass:: paddle.fluid.EndEpochEvent
:members:
:noindex:
.. _api_fluid_BeginStepEvent:
BeginStepEvent
--------------
.. autoclass:: paddle.fluid.BeginStepEvent
:members:
:noindex:
.. _api_fluid_EndStepEvent:
EndStepEvent
------------
.. autoclass:: paddle.fluid.EndStepEvent
:members:
:noindex:
.. _api_fluid_CheckpointConfig:
CheckpointConfig
----------------
.. autoclass:: paddle.fluid.CheckpointConfig
:members:
:noindex:
.. _api_fluid_Inferencer:
Inferencer
----------
.. autoclass:: paddle.fluid.Inferencer
:members:
:noindex:
.. _api_fluid_DistributeTranspiler:
DistributeTranspiler
--------------------
.. autoclass:: paddle.fluid.DistributeTranspiler
:members:
:noindex:
.. _api_fluid_memory_optimize:
memory_optimize
---------------
.. autofunction:: paddle.fluid.memory_optimize
:noindex:
.. _api_fluid_release_memory:
release_memory
--------------
.. autofunction:: paddle.fluid.release_memory
:noindex:
.. _api_fluid_ParallelExecutor:
ParallelExecutor
----------------
.. autoclass:: paddle.fluid.ParallelExecutor
:members:
:noindex:
.. _api_fluid_ExecutionStrategy:
ExecutionStrategy
-----------------
.. autoclass:: paddle.fluid.ExecutionStrategy
:members:
:noindex:
.. _api_fluid_BuildStrategy:
BuildStrategy
-------------
.. autoclass:: paddle.fluid.BuildStrategy
:members:
:noindex:
.. _api_fluid_create_lod_tensor:
create_lod_tensor
-----------------
.. autofunction:: paddle.fluid.create_lod_tensor
:noindex:
.. _api_fluid_create_random_int_lodtensor:
create_random_int_lodtensor
---------------------------
.. autofunction:: paddle.fluid.create_random_int_lodtensor
:noindex:
.. _api_fluid_LoDTensor:
LoDTensor
---------
.. autoclass:: paddle.fluid.LoDTensor
:members:
:noindex:
.. _api_fluid_CPUPlace:
CPUPlace
--------
.. autoclass:: paddle.fluid.CPUPlace
:members:
:noindex:
.. _api_fluid_CUDAPlace:
CUDAPlace
---------
.. autoclass:: paddle.fluid.CUDAPlace
:members:
:noindex:
.. _api_fluid_CUDAPinnedPlace:
CUDAPinnedPlace
---------------
.. autoclass:: paddle.fluid.CUDAPinnedPlace
:members:
:noindex:
.. _api_fluid_Tensor:
Tensor
------
.. autoclass:: paddle.fluid.Tensor
:members:
:noindex:
.. _api_fluid_ParamAttr:
ParamAttr
---------
.. autoclass:: paddle.fluid.ParamAttr
:members:
:noindex:
.. _api_fluid_WeightNormParamAttr:
WeightNormParamAttr
-------------------
.. autoclass:: paddle.fluid.WeightNormParamAttr
:members:
:noindex:
.. _api_fluid_DataFeeder:
DataFeeder
----------
.. autoclass:: paddle.fluid.DataFeeder
:members:
:noindex:
.. _api_fluid_Scope:
Scope
-----
.. autoclass:: paddle.fluid.Scope
:members:
:noindex:
...@@ -29,9 +29,17 @@ def parse_arg():
class DocGenerator(object):
-def __init__(self, module_name, stream=sys.stdout):
+def __init__(self, module_name=None, stream=sys.stdout):
if module_name == "":
module_name = None
self.stream = stream
-self.module_name = module_name
+if module_name is None:
self.module_name = "fluid"
else:
self.module_name = "fluid." + module_name
if module_name is None:
self.module = fluid
else:
if not hasattr(fluid, module_name):
raise ValueError("Cannot find fluid.{0}".format(module_name))
else:
...@@ -41,7 +49,7 @@ class DocGenerator(object):
''')
-self._print_header_(module_name, dot='=', is_title=True)
+self._print_header_(self.module_name, dot='=', is_title=True)
def print_submodule(self, submodule_name):
submodule = getattr(self.module, submodule_name)
...@@ -60,25 +68,29 @@ class DocGenerator(object):
self._print_header_(name, dot='=', is_title=False)
def print_item(self, name):
-item = getattr(self.module, name)
+item = getattr(self.module, name, None)
if item is None:
return
if isinstance(item, types.TypeType):
self.print_class(name)
elif isinstance(item, types.FunctionType):
self.print_method(name)
else:
-raise RuntimeError("Unsupported item {0}".format(name))
+pass
def print_class(self, name):
self._print_ref_(name)
self._print_header_(name, dot='-', is_title=False)
-self.stream.write('''.. autoclass:: paddle.fluid.{0}.{1}
+self.stream.write('''.. autoclass:: paddle.{0}.{1}
:members:
:noindex:
'''.format(self.module_name, name))
def print_method(self, name):
self._print_ref_(name)
self._print_header_(name, dot='-', is_title=False)
-self.stream.write('''.. autofunction:: paddle.fluid.{0}.{1}
+self.stream.write('''.. autofunction:: paddle.{0}.{1}
:noindex:
'''.format(self.module_name, name))
...@@ -94,6 +106,10 @@ class DocGenerator(object):
self.stream.write('\n')
self.stream.write('\n')
def _print_ref_(self, name):
self.stream.write(".. _api_{0}_{1}:\n\n".format("_".join(
self.module_name.split(".")), name))
def main():
args = parse_arg()
......
#!/bin/bash
-python gen_doc.py layers --submodules control_flow device io nn ops tensor detection learning_rate_scheduler metric > layers.rst
+python gen_doc.py layers --submodules control_flow device io nn ops tensor learning_rate_scheduler detection metric_op tensor > layers.rst
-for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer transpiler
+for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer transpiler recordio_writer backward average profiler
do
python gen_doc.py ${module} > ${module}.rst
done
python gen_doc.py "" > fluid.rst
-======================
-Fluid
-======================
+=============
+API Reference
+=============
.. toctree::
:maxdepth: 1
fluid.rst
layers.rst
data_feeder.rst
executor.rst
...@@ -18,3 +19,8 @@ Fluid
regularizer.rst
io.rst
data.rst
transpiler.rst
recordio_writer.rst
backward.rst
average.rst
profiler.rst
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
-===========
-initializer
-===========
+=================
+fluid.initializer
+=================
.. _api_fluid_initializer_Constant:
Constant
--------
...@@ -12,6 +14,8 @@ Constant
:members:
:noindex:
.. _api_fluid_initializer_Uniform:
Uniform
-------
...@@ -19,6 +23,8 @@ Uniform
:members:
:noindex:
.. _api_fluid_initializer_Normal:
Normal
------
...@@ -26,6 +32,8 @@ Normal
:members:
:noindex:
.. _api_fluid_initializer_Xavier:
Xavier
------
...@@ -33,6 +41,8 @@ Xavier
:members:
:noindex:
.. _api_fluid_initializer_Bilinear:
Bilinear
--------
...@@ -40,18 +50,33 @@ Bilinear
:members:
:noindex:
.. _api_fluid_initializer_MSRA:
MSRA
----
.. autoclass:: paddle.fluid.initializer.MSRA
:members:
:noindex:
.. _api_fluid_initializer_force_init_on_cpu:
force_init_on_cpu
-----------------
.. autofunction:: paddle.fluid.initializer.force_init_on_cpu
:noindex:
.. _api_fluid_initializer_init_on_cpu:
init_on_cpu
-----------
.. autofunction:: paddle.fluid.initializer.init_on_cpu
:noindex:
.. _api_fluid_initializer_ConstantInitializer:
ConstantInitializer
-------------------
...@@ -59,6 +84,8 @@ ConstantInitializer
:members:
:noindex:
.. _api_fluid_initializer_UniformInitializer:
UniformInitializer
------------------
...@@ -66,6 +93,8 @@ UniformInitializer
:members:
:noindex:
.. _api_fluid_initializer_NormalInitializer:
NormalInitializer
-----------------
...@@ -73,6 +102,8 @@ NormalInitializer
:members:
:noindex:
.. _api_fluid_initializer_XavierInitializer:
XavierInitializer
-----------------
...@@ -80,6 +111,8 @@ XavierInitializer
:members:
:noindex:
.. _api_fluid_initializer_BilinearInitializer:
BilinearInitializer
-------------------
...@@ -87,3 +120,12 @@ BilinearInitializer
:members:
:noindex:
.. _api_fluid_initializer_MSRAInitializer:
MSRAInitializer
---------------
.. autoclass:: paddle.fluid.initializer.MSRAInitializer
:members:
:noindex:
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
-==
-io
-==
+========
+fluid.io
+========
.. _api_fluid_io_save_vars:
save_vars
---------
...@@ -11,84 +13,112 @@ save_vars
.. autofunction:: paddle.fluid.io.save_vars
:noindex:
.. _api_fluid_io_save_params:
save_params
-----------
.. autofunction:: paddle.fluid.io.save_params
:noindex:
.. _api_fluid_io_save_persistables:
save_persistables
-----------------
.. autofunction:: paddle.fluid.io.save_persistables
:noindex:
.. _api_fluid_io_load_vars:
load_vars
---------
.. autofunction:: paddle.fluid.io.load_vars
:noindex:
.. _api_fluid_io_load_params:
load_params
-----------
.. autofunction:: paddle.fluid.io.load_params
:noindex:
.. _api_fluid_io_load_persistables:
load_persistables
-----------------
.. autofunction:: paddle.fluid.io.load_persistables
:noindex:
.. _api_fluid_io_save_inference_model:
save_inference_model
--------------------
.. autofunction:: paddle.fluid.io.save_inference_model
:noindex:
.. _api_fluid_io_load_inference_model:
load_inference_model
--------------------
.. autofunction:: paddle.fluid.io.load_inference_model
:noindex:
.. _api_fluid_io_get_inference_program:
get_inference_program
---------------------
.. autofunction:: paddle.fluid.io.get_inference_program
:noindex:
.. _api_fluid_io_save_checkpoint:
save_checkpoint
---------------
.. autofunction:: paddle.fluid.io.save_checkpoint
:noindex:
.. _api_fluid_io_load_checkpoint:
load_checkpoint
---------------
.. autofunction:: paddle.fluid.io.load_checkpoint
:noindex:
.. _api_fluid_io_clean_checkpoint:
clean_checkpoint
----------------
.. autofunction:: paddle.fluid.io.clean_checkpoint
:noindex:
.. _api_fluid_io_load_persist_vars_without_grad:
load_persist_vars_without_grad
------------------------------
.. autofunction:: paddle.fluid.io.load_persist_vars_without_grad
:noindex:
.. _api_fluid_io_save_persist_vars_without_grad:
save_persist_vars_without_grad
------------------------------
.. autofunction:: paddle.fluid.io.save_persist_vars_without_grad
:noindex:
.. _api_fluid_io_get_latest_checkpoint_serial:
get_latest_checkpoint_serial
----------------------------
......
This diff is collapsed.
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
-=======
-metrics
-=======
+=============
+fluid.metrics
+=============
.. _api_fluid_metrics_MetricBase:
MetricBase
----------
...@@ -12,6 +14,8 @@ MetricBase
:members:
:noindex:
.. _api_fluid_metrics_CompositeMetric:
CompositeMetric
---------------
...@@ -19,6 +23,26 @@ CompositeMetric
:members:
:noindex:
.. _api_fluid_metrics_Precision:
Precision
---------
.. autoclass:: paddle.fluid.metrics.Precision
:members:
:noindex:
.. _api_fluid_metrics_Recall:
Recall
------
.. autoclass:: paddle.fluid.metrics.Recall
:members:
:noindex:
.. _api_fluid_metrics_Accuracy:
Accuracy
--------
...@@ -26,6 +50,8 @@ Accuracy
:members:
:noindex:
.. _api_fluid_metrics_ChunkEvaluator:
ChunkEvaluator
--------------
...@@ -33,6 +59,8 @@ ChunkEvaluator
:members:
:noindex:
.. _api_fluid_metrics_EditDistance:
EditDistance
------------
...@@ -40,6 +68,8 @@ EditDistance
:members:
:noindex:
.. _api_fluid_metrics_DetectionMAP:
DetectionMAP
------------
...@@ -47,6 +77,8 @@ DetectionMAP
:members:
:noindex:
.. _api_fluid_metrics_Auc:
Auc
---
......
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
-====
-nets
-====
+==========
+fluid.nets
+==========
.. _api_fluid_nets_simple_img_conv_pool:
simple_img_conv_pool
--------------------
...@@ -11,18 +13,24 @@ simple_img_conv_pool
.. autofunction:: paddle.fluid.nets.simple_img_conv_pool
:noindex:
.. _api_fluid_nets_sequence_conv_pool:
sequence_conv_pool
------------------
.. autofunction:: paddle.fluid.nets.sequence_conv_pool
:noindex:
.. _api_fluid_nets_glu:
glu
---
.. autofunction:: paddle.fluid.nets.glu
:noindex:
.. _api_fluid_nets_scaled_dot_product_attention:
scaled_dot_product_attention
----------------------------
......
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
-=========
-optimizer
-=========
+===============
+fluid.optimizer
+===============
.. _api_fluid_optimizer_SGD:
SGD
---
...@@ -12,6 +14,8 @@ SGD
:members:
:noindex:
.. _api_fluid_optimizer_Momentum:
Momentum
--------
...@@ -19,6 +23,8 @@ Momentum
:members:
:noindex:
.. _api_fluid_optimizer_Adagrad:
Adagrad
-------
...@@ -26,6 +32,8 @@ Adagrad
:members:
:noindex:
.. _api_fluid_optimizer_Adam:
Adam
----
...@@ -33,6 +41,8 @@ Adam
:members:
:noindex:
.. _api_fluid_optimizer_Adamax:
Adamax
------
...@@ -40,6 +50,8 @@ Adamax
:members:
:noindex:
.. _api_fluid_optimizer_DecayedAdagrad:
DecayedAdagrad
--------------
...@@ -47,6 +59,17 @@ DecayedAdagrad
:members:
:noindex:
.. _api_fluid_optimizer_Ftrl:
Ftrl
----
.. autoclass:: paddle.fluid.optimizer.Ftrl
:members:
:noindex:
.. _api_fluid_optimizer_SGDOptimizer:
SGDOptimizer
------------
...@@ -54,6 +77,8 @@ SGDOptimizer
:members:
:noindex:
.. _api_fluid_optimizer_MomentumOptimizer:
MomentumOptimizer
-----------------
...@@ -61,6 +86,8 @@ MomentumOptimizer
:members:
:noindex:
.. _api_fluid_optimizer_AdagradOptimizer:
AdagradOptimizer
----------------
...@@ -68,6 +95,8 @@ AdagradOptimizer
:members:
:noindex:
.. _api_fluid_optimizer_AdamOptimizer:
AdamOptimizer
-------------
...@@ -75,6 +104,8 @@ AdamOptimizer
:members:
:noindex:
.. _api_fluid_optimizer_AdamaxOptimizer:
AdamaxOptimizer
---------------
...@@ -82,6 +113,8 @@ AdamaxOptimizer
:members:
:noindex:
.. _api_fluid_optimizer_DecayedAdagradOptimizer:
DecayedAdagradOptimizer
-----------------------
...@@ -89,6 +122,8 @@ DecayedAdagradOptimizer
:members:
:noindex:
.. _api_fluid_optimizer_RMSPropOptimizer:
RMSPropOptimizer
----------------
...@@ -96,6 +131,17 @@ RMSPropOptimizer
:members:
:noindex:
.. _api_fluid_optimizer_FtrlOptimizer:
FtrlOptimizer
-------------
.. autoclass:: paddle.fluid.optimizer.FtrlOptimizer
:members:
:noindex:
.. _api_fluid_optimizer_Adadelta:
Adadelta
--------
...@@ -103,6 +149,8 @@ Adadelta
:members:
:noindex:
.. _api_fluid_optimizer_ModelAverage:
ModelAverage
------------
...@@ -110,6 +158,8 @@ ModelAverage
:members:
:noindex:
.. _api_fluid_optimizer_Optimizer:
Optimizer
---------
...@@ -117,3 +167,12 @@ Optimizer
:members:
:noindex:
.. _api_fluid_optimizer_RMSPropOptimizer:
RMSPropOptimizer
----------------
.. autoclass:: paddle.fluid.optimizer.RMSPropOptimizer
:members:
:noindex:
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
-==========
-param_attr
-==========
+================
+fluid.param_attr
+================
.. _api_fluid_param_attr_ParamAttr:
ParamAttr
---------
...@@ -12,6 +14,8 @@ ParamAttr
:members:
:noindex:
.. _api_fluid_param_attr_WeightNormParamAttr:
WeightNormParamAttr
-------------------
......
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
-========
-profiler
-========
+==============
+fluid.profiler
+==============
.. _api_fluid_profiler_cuda_profiler:
cuda_profiler
-------------
...@@ -11,24 +13,32 @@ cuda_profiler
.. autofunction:: paddle.fluid.profiler.cuda_profiler
:noindex:
.. _api_fluid_profiler_reset_profiler:
reset_profiler
--------------
.. autofunction:: paddle.fluid.profiler.reset_profiler
:noindex:
.. _api_fluid_profiler_profiler:
profiler
--------
.. autofunction:: paddle.fluid.profiler.profiler
:noindex:
.. _api_fluid_profiler_start_profiler:
start_profiler
--------------
.. autofunction:: paddle.fluid.profiler.start_profiler
:noindex:
.. _api_fluid_profiler_stop_profiler:
stop_profiler
-------------
......
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
=====================
fluid.recordio_writer
=====================
.. _api_fluid_recordio_writer_convert_reader_to_recordio_file:
convert_reader_to_recordio_file
-------------------------------
.. autofunction:: paddle.fluid.recordio_writer.convert_reader_to_recordio_file
:noindex:
.. _api_fluid_recordio_writer_convert_reader_to_recordio_files:
convert_reader_to_recordio_files
--------------------------------
.. autofunction:: paddle.fluid.recordio_writer.convert_reader_to_recordio_files
:noindex:
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
-===========
-regularizer
-===========
+=================
+fluid.regularizer
+=================
.. _api_fluid_regularizer_append_regularization_ops:
append_regularization_ops
-------------------------
...@@ -11,12 +13,7 @@ append_regularization_ops
.. autofunction:: paddle.fluid.regularizer.append_regularization_ops
:noindex:
-WeightDecayRegularizer
-----------------------
-.. autoclass:: paddle.fluid.regularizer.WeightDecayRegularizer
-:members:
-:noindex:
+.. _api_fluid_regularizer_L1Decay:
L1Decay
-------
...@@ -25,6 +22,8 @@ L1Decay
:members:
:noindex:
.. _api_fluid_regularizer_L2Decay:
L2Decay
-------
...@@ -32,6 +31,8 @@ L2Decay
:members:
:noindex:
.. _api_fluid_regularizer_L1DecayRegularizer:
L1DecayRegularizer
------------------
...@@ -39,6 +40,8 @@ L1DecayRegularizer
:members:
:noindex:
.. _api_fluid_regularizer_L2DecayRegularizer:
L2DecayRegularizer
------------------
......
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
-==========
-transpiler
-==========
+================
+fluid.transpiler
+================
.. _api_fluid_transpiler_DistributeTranspiler:
DistributeTranspiler
--------------------
...@@ -12,12 +14,7 @@ DistributeTranspiler
:members:
:noindex:
-InferenceTranspiler
--------------------
-.. autoclass:: paddle.fluid.transpiler.InferenceTranspiler
-:members:
-:noindex:
+.. _api_fluid_transpiler_memory_optimize:
memory_optimize
---------------
...@@ -25,12 +22,16 @@ memory_optimize
.. autofunction:: paddle.fluid.transpiler.memory_optimize
:noindex:
.. _api_fluid_transpiler_release_memory:
release_memory
--------------
.. autofunction:: paddle.fluid.transpiler.release_memory
:noindex:
.. _api_fluid_transpiler_HashName:
HashName
--------
...@@ -38,9 +39,12 @@ HashName
:members:
:noindex:
.. _api_fluid_transpiler_RoundRobin:
RoundRobin
----------
.. autoclass:: paddle.fluid.transpiler.RoundRobin
:members:
:noindex:
...@@ -147,10 +147,9 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
"Input tensor type is not supported: ", in.type().name());
memory::data_type out_type = in_type;
-memory::format in_format =
-    in_tz.size() == 2 ? memory::format::nc : in.format();
-memory::format out_format =
-    out_tz.size() == 2 ? memory::format::nc : ToMKLDNNFormat(out_layout);
+auto in_format = MKLDNNFormatForSize(in_tz.size(), in.format());
+auto out_format =
+    MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout));
void* in_data = GetDataFromTensor(in, in_type);
......
...@@ -61,6 +61,13 @@ inline MKLDNNDataType ToMKLDNNDataType(const std::type_index type) {
if (iter != dict.end()) return iter->second;
return MKLDNNDataType::data_undef;
}
inline MKLDNNFormat MKLDNNFormatForSize(size_t dims_size,
MKLDNNFormat default_format) {
return (dims_size == 1
? mkldnn::memory::format::x
: dims_size == 2 ? mkldnn::memory::format::nc : default_format);
}
#endif
void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
......
...@@ -47,9 +47,13 @@ void DataTransform(const OpKernelType& expected_kernel_type,
#ifdef PADDLE_WITH_MKLDNN
// Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel
// Just set layout/format. No real transform occur
auto out_format =
MKLDNNFormatForSize(in.dims().size(), ToMKLDNNFormat(lin));
out.ShareDataWith(input_tensor);
out.set_layout(DataLayout::kMKLDNN);
-out.set_format(ToMKLDNNFormat(lin));
+out.set_format(out_format);
#endif
} else {
// Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel
......
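The new MKLDNNFormatForSize helper centralizes the rank-based choice of MKL-DNN memory format that both hunks above now rely on: rank-1 tensors map to format x, rank-2 to nc, and everything else keeps the caller-supplied default. A minimal standalone sketch of that selection rule (the enum and names below are illustrative stand-ins, not the real mkldnn/Paddle types):

```cpp
#include <cstddef>
#include <iostream>

// Stand-in for mkldnn::memory::format; the real code uses the MKL-DNN enum.
enum class Format { x, nc, nchw, undef };

// Same rule as MKLDNNFormatForSize: pick a format from the tensor rank,
// falling back to the caller-provided default for ranks above 2.
Format FormatForSize(size_t dims_size, Format default_format) {
  if (dims_size == 1) return Format::x;
  if (dims_size == 2) return Format::nc;
  return default_format;
}

int main() {
  // A 2-D tensor (e.g. a fully-connected output) becomes nc, while a 4-D
  // tensor keeps whatever layout the op asked for (nchw here).
  std::cout << (FormatForSize(2, Format::nchw) == Format::nc) << "\n";    // 1
  std::cout << (FormatForSize(4, Format::nchw) == Format::nchw) << "\n";  // 1
  return 0;
}
```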
...@@ -470,7 +470,7 @@ void MultiDevSSAGraphBuilder::ConnectOp(SSAGraph *result, OpHandleBase *op,
void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result,
const OpDesc &op) const {
int op_dev_id = -1;
-if (op.Type() == "split_byref") {
+if (op.Type() == "split_byref" || op.Type() == "split_selected_rows") {
op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]);
if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
op_dev_id = GetAppropriateDeviceID(op.InputArgumentNames());
......
...@@ -70,6 +70,7 @@ $$Out = values$$
namespace ops = paddle::operators;
-REGISTER_OPERATOR(assign_value, ops::AssignValueOp, ops::AssignValueOpMaker);
+REGISTER_OPERATOR(assign_value, ops::AssignValueOp, ops::AssignValueOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(assign_value, ops::AssignValueKernel<int>,
ops::AssignValueKernel<float>);
...@@ -286,14 +286,15 @@ void GRPCClient::Proceed() {
}
std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) {
// TODO(Yancey1989): make grpc client completely thread-safe
std::lock_guard<std::mutex> guard(chan_mutex_);
auto it = channels_.find(ep);
if (it != channels_.end()) {
return it->second;
}
// Channel configurations:
grpc::ChannelArguments args;
args.SetInt(GRPC_ARG_MAX_RECONNECT_BACKOFF_MS, 2000);
args.SetCompressionAlgorithm(GRPC_COMPRESS_NONE);
args.SetMaxSendMessageSize(std::numeric_limits<int>::max());
args.SetMaxReceiveMessageSize(std::numeric_limits<int>::max());
......
...@@ -76,6 +76,7 @@ class BaseProcessor {
virtual void Prepare(const VarHandle& var_info, int64_t time_out) {
context_.reset(new grpc::ClientContext());
var_h_ = var_info;
context_->set_wait_for_ready(true);
std::chrono::system_clock::time_point deadline =
std::chrono::system_clock::now() + std::chrono::milliseconds(time_out);
...@@ -85,6 +86,7 @@ class BaseProcessor {
virtual void Prepare(int64_t time_out) {
context_.reset(new grpc::ClientContext());
context_->set_wait_for_ready(true);
std::chrono::system_clock::time_point deadline =
std::chrono::system_clock::now() + std::chrono::milliseconds(time_out);
...@@ -190,26 +192,24 @@ class GRPCClient : public RPCClient {
bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx,
const framework::Scope& scope, const std::string& var_name,
-int64_t time_out = RPCClient::rpc_time_out) override;
+int64_t time_out = FLAGS_grpc_deadline) override;
bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx,
const framework::Scope& scope, const std::string& var_name,
-int64_t time_out = RPCClient::rpc_time_out) override;
+int64_t time_out = FLAGS_grpc_deadline) override;
bool AsyncPrefetchVar(const std::string& ep,
const platform::DeviceContext& ctx,
const framework::Scope& scope,
const std::string& in_var_name,
const std::string& out_var_name,
-int64_t time_out = RPCClient::rpc_time_out) override;
+int64_t time_out = FLAGS_grpc_deadline) override;
-void AsyncSendBatchBarrier(
-const std::string& ep,
-int64_t time_out = RPCClient::rpc_time_out) override;
+void AsyncSendBatchBarrier(const std::string& ep,
+int64_t time_out = FLAGS_grpc_deadline) override;
-void AsyncSendFetchBarrier(
-const std::string& ep,
-int64_t time_out = RPCClient::rpc_time_out) override;
+void AsyncSendFetchBarrier(const std::string& ep,
+int64_t time_out = FLAGS_grpc_deadline) override;
void AsyncCheckpointNotify(
const std::string& ep, const std::string& dir,
...@@ -229,7 +229,7 @@ class GRPCClient : public RPCClient {
void Proceed();
void AsyncSendComplete(const std::string& ep,
-int64_t time_out = RPCClient::rpc_time_out);
+int64_t time_out = FLAGS_grpc_deadline);
std::shared_ptr<grpc::Channel> GetChannel(const std::string& ep);
......
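Both Prepare() overloads now flag the call as wait-for-ready, and the default timeout of every Async* method switches from the fixed RPCClient::rpc_time_out to FLAGS_grpc_deadline. A minimal sketch (not Paddle's code) of configuring a grpc::ClientContext this way; the header path varies with the gRPC version (older installs use <grpc++/grpc++.h>):

```cpp
#include <grpcpp/grpcpp.h>
#include <chrono>
#include <memory>

// Sketch of the client-side setup used by the Prepare() overloads above.
// With wait_for_ready the RPC queues until the channel is connected instead
// of failing fast, and the deadline bounds how long it may queue or run.
std::unique_ptr<grpc::ClientContext> MakeContext(int64_t time_out_ms) {
  std::unique_ptr<grpc::ClientContext> ctx(new grpc::ClientContext());
  ctx->set_wait_for_ready(true);
  ctx->set_deadline(std::chrono::system_clock::now() +
                    std::chrono::milliseconds(time_out_ms));
  return ctx;
}
```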
...@@ -97,7 +97,7 @@ class RequestSend final : public RequestBase {
void Process() override {
std::string varname = GetReqName();
-VLOG(3) << "RequestSend var_name:" << varname;
+VLOG(4) << "RequestSend var_name:" << varname;
auto scope = request_->GetMutableLocalScope();
auto invar = request_->GetVar();
...@@ -132,7 +132,7 @@ class RequestGet final : public RequestBase {
void Process() override {
// proc request.
std::string varname = request_.varname();
-VLOG(3) << "RequestGet " << varname;
+VLOG(4) << "RequestGet " << varname;
auto scope = request_handler_->scope();
auto invar = scope->FindVar(varname);
...@@ -178,7 +178,7 @@ class RequestPrefetch final : public RequestBase {
// prefetch process...
std::string in_var_name = request_->Varname();
std::string out_var_name = request_->OutVarname();
-VLOG(3) << "RequestPrefetch, in_var_name: " << in_var_name
+VLOG(4) << "RequestPrefetch, in_var_name: " << in_var_name
<< " out_var_name: " << out_var_name;
auto scope = request_->GetMutableLocalScope();
...@@ -240,10 +240,10 @@ class RequestCheckpointNotify final : public RequestBase {
};
void AsyncGRPCServer::WaitServerReady() {
-VLOG(3) << "AsyncGRPCServer is wait server ready";
+VLOG(4) << "AsyncGRPCServer is wait server ready";
std::unique_lock<std::mutex> lock(this->mutex_ready_);
condition_ready_.wait(lock, [=] { return this->ready_ == 1; });
-VLOG(3) << "AsyncGRPCServer WaitSeverReady";
+VLOG(4) << "AsyncGRPCServer WaitSeverReady";
}
void AsyncGRPCServer::StartServer() {
...@@ -283,7 +283,7 @@ void AsyncGRPCServer::StartServer() {
for (int i = 0; i < threadnum; i++) {
rpc_threads_[rpc_name].emplace_back(new std::thread(std::bind(
&AsyncGRPCServer::HandleRequest, this, cq.get(), rpc_name, f)));
-VLOG(3) << t.first << " creates threads!";
+VLOG(4) << t.first << " creates threads!";
}
}
...@@ -300,7 +300,7 @@ void AsyncGRPCServer::StartServer() {
auto& threads = t.second;
for (size_t i = 0; i < threads.size(); ++i) {
threads[i]->join();
-VLOG(3) << t.first << " threads ends!";
+VLOG(4) << t.first << " threads ends!";
}
}
}
...@@ -308,7 +308,7 @@ void AsyncGRPCServer::StartServer() {
void AsyncGRPCServer::ShutdownQueue() {
for (auto& t : rpc_cq_) {
t.second->Shutdown();
-VLOG(3) << t.first << " shutdown!";
+VLOG(4) << t.first << " queue shutdown!";
}
}
...@@ -317,7 +317,7 @@ void AsyncGRPCServer::ShutDownImpl() {
is_shut_down_ = true;
ShutdownQueue();
-VLOG(3) << "server_ shutdown!";
+VLOG(4) << "server_ shutdown!";
server_->Shutdown();
}
...@@ -325,7 +325,7 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name,
int req_id) {
std::unique_lock<std::mutex> lock(cq_mutex_);
if (is_shut_down_) {
-LOG(WARNING) << "shutdown, do not TryToRegisterNewSendOne";
+VLOG(4) << "shutdown, do not TryToRegisterNewSendOne";
return;
}
......
...@@ -13,6 +13,10 @@
// limitations under the License.
#include "paddle/fluid/operators/distributed/rpc_client.h"
#include "gflags/gflags.h"
// default to 3min to avoid temprary network failures.
DEFINE_int32(grpc_deadline, 180000, "deadline timeouts for grpc");
namespace paddle {
namespace operators {
......
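The fixed rpc_time_out constant is replaced by a gflags flag, so the RPC deadline can be changed at run time (e.g. --grpc_deadline=300000) instead of at compile time. A hedged sketch of the DEFINE/DECLARE pattern used here; the helper names in the comments are illustrative only:

```cpp
// one_translation_unit.cc -- the flag is defined exactly once.
#include "gflags/gflags.h"
DEFINE_int32(grpc_deadline, 180000, "deadline timeouts for grpc");  // ms

// any_header.h -- other files only declare the flag and may then use it,
// e.g. as a default argument, which is what rpc_client.h does below:
//   DECLARE_int32(grpc_deadline);
//   virtual bool AsyncSendVar(..., int64_t time_out = FLAGS_grpc_deadline);

int main(int argc, char* argv[]) {
  // Parsing argv lets users override the default: --grpc_deadline=300000
  gflags::ParseCommandLineFlags(&argc, &argv, true);
  return FLAGS_grpc_deadline > 0 ? 0 : 1;
}
```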
...@@ -15,11 +15,14 @@
#pragma once
#include <string>
#include "gflags/gflags.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
DECLARE_int32(grpc_deadline);
namespace paddle {
namespace operators {
namespace distributed {
...@@ -32,26 +35,26 @@ class RPCClient {
const platform::DeviceContext& ctx,
const framework::Scope& scope,
const std::string& var_name,
-int64_t time_out = rpc_time_out) = 0;
+int64_t time_out = FLAGS_grpc_deadline) = 0;
virtual bool AsyncGetVar(const std::string& ep,
const platform::DeviceContext& ctx,
const framework::Scope& scope,
const std::string& var_name,
-int64_t time_out = rpc_time_out) = 0;
+int64_t time_out = FLAGS_grpc_deadline) = 0;
virtual bool AsyncPrefetchVar(const std::string& ep,
const platform::DeviceContext& ctx,
const framework::Scope& scope,
const std::string& in_var_name,
const std::string& out_var_name,
-int64_t time_out = rpc_time_out) = 0;
+int64_t time_out = FLAGS_grpc_deadline) = 0;
-virtual void AsyncSendBatchBarrier(const std::string& ep,
-int64_t time_out = rpc_time_out) = 0;
+virtual void AsyncSendBatchBarrier(
+const std::string& ep, int64_t time_out = FLAGS_grpc_deadline) = 0;
-virtual void AsyncSendFetchBarrier(const std::string& ep,
-int64_t time_out = rpc_time_out) = 0;
+virtual void AsyncSendFetchBarrier(
+const std::string& ep, int64_t time_out = FLAGS_grpc_deadline) = 0;
virtual void AsyncCheckpointNotify(const std::string& ep,
const std::string& dir,
...@@ -64,8 +67,6 @@ class RPCClient {
virtual void Wait() = 0;
-static constexpr int64_t rpc_time_out = 120 * 1000;
template <typename T>
static RPCClient* GetInstance() {
std::call_once(init_flag_, &RPCClient::Init<T>);
......
...@@ -47,11 +47,12 @@ void RPCServer::WaitBarrier(const std::string& rpc_name) {
return (barrier_counter_[rpc_name] >= client_num_ || exit_flag_.load());
});
-VLOG(3) << "batch_barrier_:" << barrier_counter_[rpc_name];
+VLOG(3) << "batch_barrier_: " << rpc_name << " "
+        << barrier_counter_[rpc_name];
}
void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) {
-VLOG(3) << "RPCServer begin IncreaseBatchBarrier " << rpc_name;
+VLOG(4) << "RPCServer begin IncreaseBatchBarrier " << rpc_name;
int b = 0;
std::unique_lock<std::mutex> lock(mutex_);
b = ++barrier_counter_[rpc_name];
...@@ -100,7 +101,7 @@ void RPCServer::SetCond(const std::string& rpc_name) {
}
void RPCServer::WaitCond(const std::string& rpc_name) {
-VLOG(3) << "RPCServer WaitCond " << rpc_name;
+VLOG(4) << "RPCServer WaitCond " << rpc_name;
int cond = 0;
{
std::unique_lock<std::mutex> lock(mutex_);
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/elementwise_add_op.h"
#include "paddle/fluid/operators/elementwise_op_function.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
namespace paddle {
namespace operators {
using framework::DataLayout;
using framework::Tensor;
using mkldnn::memory;
using mkldnn::reorder;
using mkldnn::primitive;
using mkldnn::stream;
using mkldnn::sum;
template <typename T>
class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx =
ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* z = ctx.Output<Tensor>("Out");
const T* x_data = x->data<T>();
const T* y_data = y->data<T>();
T* z_data = z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
auto x_dims = x->dims();
auto y_dims = y->dims();
auto z_dims = z->dims();
// Execute default elementwise_add operator when
// broadcast operations need to performed.
if (x_dims != y_dims) {
auto sum_func = [](T a, T b) -> T { return a + b; };
TransformFunctor<decltype(sum_func), T,
paddle::platform::CPUDeviceContext, T>
functor(
x, y, z,
ctx.template device_context<paddle::platform::CPUDeviceContext>(),
sum_func);
axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
"Axis should be in range [0, x_dims)");
trim_trailing_singular_dims(&y_dims);
axis = (y_dims.size() == 0) ? x_dims.size() : axis;
int pre, n, post;
get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post);
if (post == 1) {
functor.RunRowWise(n, pre);
} else {
functor.RunMidWise(n, pre, post);
}
z->set_layout(DataLayout::kMKLDNN);
z->set_format(x->format());
} else {
PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN &&
x->format() != memory::format::format_undef,
"Wrong layout/format set for X tensor");
PADDLE_ENFORCE(y->layout() == DataLayout::kMKLDNN &&
y->format() != memory::format::format_undef,
"Wrong layout/format set for X tensor");
std::vector<int> src_x_tz = framework::vectorize2int(x_dims);
std::vector<int> src_y_tz = framework::vectorize2int(y_dims);
std::vector<int> dst_tz = framework::vectorize2int(z_dims);
std::vector<memory::primitive_desc> srcs_pd;
std::vector<memory> srcs;
std::vector<float> scales = {1.0f, 1.0f};
auto src_x_pd = memory::primitive_desc(
{{src_x_tz}, memory::data_type::f32, x->format()}, mkldnn_engine);
auto src_y_pd = memory::primitive_desc(
{{src_y_tz}, memory::data_type::f32, y->format()}, mkldnn_engine);
auto src_x_memory =
memory(src_x_pd, paddle::platform::to_void_cast(x_data));
auto src_y_memory =
memory(src_y_pd, paddle::platform::to_void_cast(y_data));
srcs_pd.push_back(src_x_pd);
srcs_pd.push_back(src_y_pd);
srcs.push_back(src_x_memory);
srcs.push_back(src_y_memory);
auto dst_md =
memory::desc({dst_tz}, memory::data_type::f32, memory::format::any);
// create primitive descriptor for sum
auto sum_pd = sum::primitive_desc(dst_md, scales, srcs_pd);
// create mkldnn memory for dst
memory dst_memory = memory(sum_pd.dst_primitive_desc(), z_data);
std::vector<primitive::at> inputs;
inputs.push_back(srcs[0]);
inputs.push_back(srcs[1]);
// create sum primitive
auto sum_prim = sum(sum_pd, inputs, dst_memory);
std::vector<primitive> pipeline;
pipeline.push_back(sum_prim);
stream(stream::kind::eager).submit(pipeline).wait();
z->set_layout(DataLayout::kMKLDNN);
z->set_format(
(memory::format)dst_memory.get_primitive_desc().desc().data.format);
}
}
};
template <typename T>
class EltwiseAddMKLDNNGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* out = ctx.Input<Tensor>("Out");
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
auto set_mkldnn_format = [](Tensor* in, const Tensor* out) {
in->set_layout(DataLayout::kMKLDNN);
in->set_format(out->format());
};
if (x->dims() == y->dims()) {
auto blas = math::GetBlas<paddle::platform::CPUDeviceContext, T>(ctx);
if (dx) {
blas.VCOPY(dout->numel(), dout->data<T>(),
dx->mutable_data<T>(ctx.GetPlace()));
set_mkldnn_format(dx, dout);
}
if (dy) {
blas.VCOPY(dout->numel(), dout->data<T>(),
dy->mutable_data<T>(ctx.GetPlace()));
set_mkldnn_format(dy, dout);
}
} else {
// Execute default kernel when broadcast is needed
ElemwiseGradCompute<paddle::platform::CPUDeviceContext, T,
IdentityGrad<T>, IdentityGrad<T>>(
ctx, *x, *y, *out, *dout, axis, dx, dy, IdentityGrad<T>(),
IdentityGrad<T>());
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_KERNEL(elementwise_add, MKLDNN, ::paddle::platform::CPUPlace,
ops::EltwiseAddMKLDNNKernel<float>)
REGISTER_OP_KERNEL(elementwise_add_grad, MKLDNN, ::paddle::platform::CPUPlace,
ops::EltwiseAddMKLDNNGradKernel<float>)
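The new EltwiseAddMKLDNNKernel only hands the addition to MKL-DNN's sum primitive when X and Y have identical shapes; any broadcast case falls back to the reference TransformFunctor path, and the gradient kernel makes the analogous choice between VCOPY and ElemwiseGradCompute. A minimal standalone sketch of that dispatch rule (names here are illustrative, not Paddle's):

```cpp
#include <iostream>
#include <vector>

// MKL-DNN's sum primitive has no broadcasting, so the fast path is only
// taken when both inputs have exactly the same dimensions.
bool CanUseMKLDNNSum(const std::vector<int>& x_dims,
                     const std::vector<int>& y_dims) {
  return x_dims == y_dims;  // broadcast cases fall back to the CPU kernel
}

int main() {
  std::cout << CanUseMKLDNNSum({8, 16}, {8, 16}) << "\n";  // 1 -> MKL-DNN sum
  std::cout << CanUseMKLDNNSum({8, 16}, {16}) << "\n";     // 0 -> fallback
  return 0;
}
```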
...@@ -14,8 +14,12 @@ limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle {
namespace operators {
...@@ -40,6 +44,21 @@ class ElementwiseOp : public framework::OperatorWithKernel {
ctx->SetOutputDim("Out", x_dim);
ctx->ShareLoD("X", /*->*/ "Out");
}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto input_data_type =
framework::ToDataType(ctx.Input<Tensor>("X")->type());
#ifdef PADDLE_WITH_MKLDNN
if (platform::CanMKLDNNBeUsed(ctx)) {
return framework::OpKernelType(input_data_type, ctx.GetPlace(),
framework::DataLayout::kMKLDNN,
framework::LibraryType::kMKLDNN);
}
#endif
return framework::OpKernelType(input_data_type, ctx.GetPlace());
}
}; };
class ElementwiseOpInferVarType : public framework::VarTypeInference { class ElementwiseOpInferVarType : public framework::VarTypeInference {
...@@ -65,6 +84,8 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -65,6 +84,8 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
"for broadcasting Y onto X.") "for broadcasting Y onto X.")
.SetDefault(-1) .SetDefault(-1)
.EqualGreaterThan(-1); .EqualGreaterThan(-1);
AddAttr<bool>("use_mkldnn", "(bool, default false). Used by MKLDNN.")
.SetDefault(false);
AddComment(string::Sprintf(R"DOC( AddComment(string::Sprintf(R"DOC(
Limited Elementwise %s Operator Limited Elementwise %s Operator
...@@ -138,6 +159,21 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { ...@@ -138,6 +159,21 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
ctx->SetOutputDim(y_grad_name, y_dims); ctx->SetOutputDim(y_grad_name, y_dims);
} }
} }
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto input_data_type =
framework::ToDataType(ctx.Input<Tensor>("X")->type());
#ifdef PADDLE_WITH_MKLDNN
if (platform::CanMKLDNNBeUsed(ctx)) {
return framework::OpKernelType(input_data_type, ctx.GetPlace(),
framework::DataLayout::kMKLDNN,
framework::LibraryType::kMKLDNN);
}
#endif
return framework::OpKernelType(input_data_type, ctx.GetPlace());
}
}; };
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
......
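The new GetExpectedKernelType overrides only select the MKLDNN kernel when the build defines PADDLE_WITH_MKLDNN and CanMKLDNNBeUsed(ctx) holds; otherwise the plain CPU kernel is used. Below is a sketch of exercising the new 'use_mkldnn' attribute from Python through the low-level Block.append_op plumbing; the variable names and the manual op construction are illustrative assumptions, since the fluid layer wrappers normally set this attribute for you.
import numpy as np
import paddle.fluid as fluid

main = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(main, startup):
    x = fluid.layers.data(name='x', shape=[3, 4, 5], dtype='float32')
    y = fluid.layers.data(name='y', shape=[3, 4, 5], dtype='float32')
    block = main.global_block()
    out = block.create_var(name='add_out', dtype='float32')
    # 'use_mkldnn' is the attribute introduced above; the MKLDNN kernel is
    # only chosen when Paddle was built with PADDLE_WITH_MKLDNN and
    # CanMKLDNNBeUsed() holds, otherwise the default CPU kernel runs.
    block.append_op(
        type='elementwise_add',
        inputs={'X': x, 'Y': y},
        outputs={'Out': out},
        attrs={'axis': -1, 'use_mkldnn': True})

exe = fluid.Executor(fluid.CPUPlace())
exe.run(startup)
x_np = np.random.rand(8, 3, 4, 5).astype('float32')
y_np = np.random.rand(8, 3, 4, 5).astype('float32')
out_np, = exe.run(main, feed={'x': x_np, 'y': y_np}, fetch_list=[out])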
...@@ -164,8 +164,8 @@ void ListenAndServOp::RunSyncLoop( ...@@ -164,8 +164,8 @@ void ListenAndServOp::RunSyncLoop(
} }
void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
framework::ProgramDesc *program) const { framework::ProgramDesc *program,
VLOG(3) << "RunAsyncLoop in"; framework::Scope *recv_scope) const {
// grad name to block id // grad name to block id
std::unordered_map<std::string, int32_t> grad_to_block_id; std::unordered_map<std::string, int32_t> grad_to_block_id;
std::unordered_map<int32_t, std::string> id_to_grad; std::unordered_map<int32_t, std::string> id_to_grad;
...@@ -192,6 +192,10 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, ...@@ -192,6 +192,10 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
block_list.push_back(blkid); block_list.push_back(blkid);
} }
auto optimize_prepared = executor->Prepare(*program, block_list); auto optimize_prepared = executor->Prepare(*program, block_list);
// execute global block if needed
if (block_list[0] == 1 && id_to_grad.count(1) == 0) {
executor->RunPreparedContext(optimize_prepared[0].get(), recv_scope);
}
std::unordered_map<std::string, std::unordered_map<std::string,
std::shared_ptr<framework::ExecutorPrepareContext>> std::shared_ptr<framework::ExecutorPrepareContext>>
grad_to_prepared_ctx; grad_to_prepared_ctx;
...@@ -203,7 +207,6 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, ...@@ -203,7 +207,6 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
request_get_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx); request_get_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
request_prefetch_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx); request_prefetch_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
VLOG(3) << "RunAsyncLoop into while";
while (true) { while (true) {
if (rpc_service_->IsExit()) { if (rpc_service_->IsExit()) {
VLOG(4) << "get exit!rpc_processor break!"; VLOG(4) << "get exit!rpc_processor break!";
...@@ -338,7 +341,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, ...@@ -338,7 +341,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
RunSyncLoop(&executor, program, &recv_scope, prefetch_block_id_list, RunSyncLoop(&executor, program, &recv_scope, prefetch_block_id_list,
checkpoint_block_id); checkpoint_block_id);
} else { } else {
RunAsyncLoop(&executor, program); RunAsyncLoop(&executor, program, &recv_scope);
} }
} }
......
...@@ -52,7 +52,8 @@ class ListenAndServOp : public framework::OperatorBase { ...@@ -52,7 +52,8 @@ class ListenAndServOp : public framework::OperatorBase {
const int checkpoint_point_block_id) const; const int checkpoint_point_block_id) const;
void RunAsyncLoop(framework::Executor* executor, void RunAsyncLoop(framework::Executor* executor,
framework::ProgramDesc* program) const; framework::ProgramDesc* program,
framework::Scope* recv_scope) const;
void SavePort() const; void SavePort() const;
......
...@@ -37,6 +37,11 @@ class RandomCropOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -37,6 +37,11 @@ class RandomCropOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("SeedOut", "The random seed after random cropping.") AddOutput("SeedOut", "The random seed after random cropping.")
.AsIntermediate(); .AsIntermediate();
AddAttr<std::vector<int>>("shape", "The shape of a cropped instance."); AddAttr<std::vector<int>>("shape", "The shape of a cropped instance.");
AddAttr<int>("startup_seed",
"If the input 'Seed' is not initialized, the 'startup_seed' "
"will be used to replace it. Even so, the seed after random "
"crop will also be outputed to the 'SeedOut'.")
.SetDefault(0);
AddComment(R"DOC( AddComment(R"DOC(
This operator takes a batch of instance, and do random cropping on each instance. This operator takes a batch of instance, and do random cropping on each instance.
It means that cropping positions differs on each instance, which is determined It means that cropping positions differs on each instance, which is determined
...@@ -49,8 +54,6 @@ class RandomCropOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -49,8 +54,6 @@ class RandomCropOpMaker : public framework::OpProtoAndCheckerMaker {
class RandomCropOpInferShape : public framework::InferShapeBase { class RandomCropOpInferShape : public framework::InferShapeBase {
public: public:
void operator()(framework::InferShapeContext* ctx) const override { void operator()(framework::InferShapeContext* ctx) const override {
auto seed_dim = ctx->GetInputDim("Seed");
PADDLE_ENFORCE(seed_dim.size() == 1 && seed_dim[0] == 1);
auto shape = ctx->Attrs().Get<std::vector<int>>("shape"); auto shape = ctx->Attrs().Get<std::vector<int>>("shape");
auto x_dim = ctx->GetInputDim("X"); auto x_dim = ctx->GetInputDim("X");
PADDLE_ENFORCE_GT(x_dim.size(), static_cast<int64_t>(shape.size())); PADDLE_ENFORCE_GT(x_dim.size(), static_cast<int64_t>(shape.size()));
...@@ -62,7 +65,6 @@ class RandomCropOpInferShape : public framework::InferShapeBase { ...@@ -62,7 +65,6 @@ class RandomCropOpInferShape : public framework::InferShapeBase {
out_dim[x_i] = shape[shape_i]; out_dim[x_i] = shape[shape_i];
} }
ctx->SetOutputDim("Out", framework::make_ddim(out_dim)); ctx->SetOutputDim("Out", framework::make_ddim(out_dim));
ctx->SetOutputDim("SeedOut", framework::make_ddim({1}));
} }
}; };
......
...@@ -142,8 +142,9 @@ template <typename DeviceContext, typename T> ...@@ -142,8 +142,9 @@ template <typename DeviceContext, typename T>
class RandomCropKernel : public framework::OpKernel<T> { class RandomCropKernel : public framework::OpKernel<T> {
public: public:
virtual void Compute(const framework::ExecutionContext& ctx) const { virtual void Compute(const framework::ExecutionContext& ctx) const {
auto& seed_tensor = detail::Ref(ctx.Input<framework::LoDTensor>("Seed"));
int64_t seed = 0; int64_t seed = 0;
auto& seed_tensor = detail::Ref(ctx.Input<framework::LoDTensor>("Seed"));
if (seed_tensor.IsInitialized()) {
if (platform::is_cpu_place(seed_tensor.place())) { if (platform::is_cpu_place(seed_tensor.place())) {
seed = *seed_tensor.data<int64_t>(); seed = *seed_tensor.data<int64_t>();
} else { } else {
...@@ -153,6 +154,11 @@ class RandomCropKernel : public framework::OpKernel<T> { ...@@ -153,6 +154,11 @@ class RandomCropKernel : public framework::OpKernel<T> {
framework::TensorCopySync(seed_tensor, platform::CPUPlace(), &cpu_seed); framework::TensorCopySync(seed_tensor, platform::CPUPlace(), &cpu_seed);
seed = *cpu_seed.data<int64_t>(); seed = *cpu_seed.data<int64_t>();
} }
} else {
VLOG(5) << "WARNING: The input 'Seed' is not initialized, use attribute "
"'startup_seed' instead.";
seed = ctx.Attr<int>("startup_seed");
}
auto shape = ctx.Attr<std::vector<int>>("shape"); auto shape = ctx.Attr<std::vector<int>>("shape");
auto& x = detail::Ref(ctx.Input<framework::LoDTensor>("X")); auto& x = detail::Ref(ctx.Input<framework::LoDTensor>("X"));
auto& out = detail::Ref(ctx.Output<framework::LoDTensor>("Out")); auto& out = detail::Ref(ctx.Output<framework::LoDTensor>("Out"));
...@@ -171,7 +177,7 @@ class RandomCropKernel : public framework::OpKernel<T> { ...@@ -171,7 +177,7 @@ class RandomCropKernel : public framework::OpKernel<T> {
engine.discard(functor.prod_batchsize_dims_ * engine.discard(functor.prod_batchsize_dims_ *
(functor.rank_ - functor.num_batchsize_dims_)); (functor.rank_ - functor.num_batchsize_dims_));
*ctx.Output<framework::LoDTensor>("SeedOut")->mutable_data<int64_t>( *ctx.Output<framework::LoDTensor>("SeedOut")->mutable_data<int64_t>(
platform::CPUPlace()) = engine(); framework::make_ddim({1}), platform::CPUPlace()) = engine();
} }
}; };
......
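From the Python side (see the fluid.layers.random_crop change further down), an integer seed now ends up in this 'startup_seed' attribute while a persistable seed variable serves as both the 'Seed' input and the 'SeedOut' output. A minimal usage sketch following the layer's docstring example; the tensor shapes are illustrative:
import paddle.fluid as fluid

img = fluid.layers.data(name='img', shape=[3, 256, 256], dtype='float32')
# An int seed is stored in the op's 'startup_seed' attribute; the updated
# seed is written back into the same persistable seed variable each step.
cropped = fluid.layers.random_crop(img, shape=[3, 224, 224], seed=42)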
...@@ -39,6 +39,7 @@ class CustomReader : public framework::DecoratedReader { ...@@ -39,6 +39,7 @@ class CustomReader : public framework::DecoratedReader {
const framework::ProgramDesc program_; const framework::ProgramDesc program_;
int sub_block_id_; int sub_block_id_;
framework::Executor exe_; framework::Executor exe_;
framework::Scope scope_;
std::vector<std::string> source_var_names_; std::vector<std::string> source_var_names_;
std::vector<std::string> sink_var_names_; std::vector<std::string> sink_var_names_;
...@@ -158,23 +159,24 @@ void CustomReader::ReadNext(std::vector<framework::LoDTensor>* out) { ...@@ -158,23 +159,24 @@ void CustomReader::ReadNext(std::vector<framework::LoDTensor>* out) {
// The scope for CustomReader's sub-block should be independent and shouldn't // The scope for CustomReader's sub-block should be independent and shouldn't
// be any other computation scope's child. Otherwise, data preprocessing and // be any other computation scope's child. Otherwise, data preprocessing and
// compution cannot be concurrent. // compution cannot be concurrent.
framework::Scope scope; framework::Scope* exe_scope = &scope_.NewScope();
// 1. Copy LoDTensors from underlying reader's output to source variables. // 1. Copy LoDTensors from underlying reader's output to source variables.
for (size_t i = 0; i < source_var_names_.size(); ++i) { for (size_t i = 0; i < source_var_names_.size(); ++i) {
framework::Variable* var = scope.Var(source_var_names_[i]); framework::Variable* var = exe_scope->Var(source_var_names_[i]);
framework::LoDTensor* tensor = var->GetMutable<framework::LoDTensor>(); framework::LoDTensor* tensor = var->GetMutable<framework::LoDTensor>();
tensor->ShareDataWith(underlying_outs[i]); tensor->ShareDataWith(underlying_outs[i]);
tensor->set_lod(underlying_outs[i].lod()); tensor->set_lod(underlying_outs[i].lod());
} }
// 2. Run the sub-block. // 2. Run the sub-block.
exe_.Run(program_, &scope, sub_block_id_, false, true); exe_.Run(program_, exe_scope, sub_block_id_, false, true);
// 3. Copy LoDTensors from sink variables to out. // 3. Copy LoDTensors from sink variables to out.
out->resize(sink_var_names_.size()); out->resize(sink_var_names_.size());
for (size_t i = 0; i < sink_var_names_.size(); ++i) { for (size_t i = 0; i < sink_var_names_.size(); ++i) {
const auto& tensor = detail::Ref(scope.FindVar(sink_var_names_[i])) const auto& tensor = detail::Ref(exe_scope->FindVar(sink_var_names_[i]))
.Get<framework::LoDTensor>(); .Get<framework::LoDTensor>();
framework::TensorCopySync(tensor, platform::CPUPlace(), &(*out)[i]); framework::TensorCopySync(tensor, platform::CPUPlace(), &(*out)[i]);
} }
scope_.DeleteScope(exe_scope);
} }
} // namespace reader } // namespace reader
......
...@@ -23,13 +23,13 @@ namespace reader { ...@@ -23,13 +23,13 @@ namespace reader {
// 'Double buffer' means we shall maintain two batches of input data at the same // 'Double buffer' means we shall maintain two batches of input data at the same
// time. So the kCacheSize shoul be at least 2. // time. So the kCacheSize shoul be at least 2.
static constexpr size_t kCacheSize = 3; static constexpr size_t kCacheSize = 5;
// There will be two bacthes out of the channel during training: // There will be two bacthes out of the channel during training:
// 1. the one waiting to be sent to the channel // 1. the one waiting to be sent to the channel
// 2. the one just be received from the channel, which is also being used by // 2. the one just be received from the channel, which is also being used by
// subsequent operators. // subsequent operators.
// So the channel size should be kChacheSize - 2 // So the channel size should be kChacheSize - 2
static constexpr size_t kChannelSize = 1; // kCacheSize - 2 static constexpr size_t kChannelSize = 3; // kCacheSize - 2
class DoubleBufferReader : public framework::DecoratedReader { class DoubleBufferReader : public framework::DecoratedReader {
public: public:
......
...@@ -559,19 +559,8 @@ class Operator(object): ...@@ -559,19 +559,8 @@ class Operator(object):
self.attrs[attr_name] is None): self.attrs[attr_name] is None):
continue continue
attr_val = self.attrs[attr_name] attr_val = self.attrs[attr_name]
if isinstance(attr_val, Block): self._update_desc_attr(attr_name, attr_val)
self.desc.set_block_attr(attr_name,
self.attrs[attr_name].desc)
elif isinstance(attr_val, list) and attr_val and \
all(isinstance(v, Block) for v in attr_val):
self.desc.set_blocks_attr(attr_name,
[v.desc for v in attr_val])
elif isinstance(attr_val, core.BlockDesc) or \
isinstance(attr_val, core.ProgramDesc):
self.desc.set_serialized_attr(
attr_name, attr_val.serialize_to_string())
else:
self.desc.set_attr(attr_name, attr_val)
self.desc.check_attrs() self.desc.check_attrs()
if self.has_kernel(type): if self.has_kernel(type):
self.desc.infer_var_type(self.block.desc) self.desc.infer_var_type(self.block.desc)
...@@ -718,6 +707,19 @@ class Operator(object): ...@@ -718,6 +707,19 @@ class Operator(object):
ValueError: If the type of value doesn't match with desc.attr_type(name). ValueError: If the type of value doesn't match with desc.attr_type(name).
""" """
self.attrs[name] = val self.attrs[name] = val
self._update_desc_attr(name, val)
def _update_desc_attr(self, name, val):
"""
Update the value of an attribute in desc, given the attribute's name.
Args:
name(str): the attribute name.
val(bool|int|str|float|list): the value of the attribute.
Raises:
ValueError: If the type of value doesn't match with desc.attr_type(name).
"""
if isinstance(val, Block): if isinstance(val, Block):
self.desc.set_block_attr(name, val.desc) self.desc.set_block_attr(name, val.desc)
elif isinstance(val, list) and val and all( elif isinstance(val, list) and val and all(
......
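The body of the new _update_desc_attr helper is truncated in this hunk; reconstructed from the branches deleted above, it presumably reads roughly as follows:
def _update_desc_attr(self, name, val):
    # Dispatch on the attribute value's type, mirroring the branches that
    # previously lived inline in Operator.__init__.
    if isinstance(val, Block):
        self.desc.set_block_attr(name, val.desc)
    elif isinstance(val, list) and val and \
            all(isinstance(v, Block) for v in val):
        self.desc.set_blocks_attr(name, [v.desc for v in val])
    elif isinstance(val, core.BlockDesc) or \
            isinstance(val, core.ProgramDesc):
        self.desc.set_serialized_attr(name, val.serialize_to_string())
    else:
        self.desc.set_attr(name, val)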
...@@ -469,10 +469,13 @@ def open_files(filenames, ...@@ -469,10 +469,13 @@ def open_files(filenames,
lod_levels(list): List of ints which declaring data lod_level. lod_levels(list): List of ints which declaring data lod_level.
dtypes(list): List of strs which declaring data type. dtypes(list): List of strs which declaring data type.
thread_num(int): The maximal concurrent prefetch thread number. thread_num(int): The maximal concurrent prefetch thread number.
buffer_size(int): The size of prefetch buffer. buffer_size(int|None): The size of the prefetch buffer. If it is set to None,
the buffer size will be thread_num * 3.
Default: None
pass_num(int): Number of passes to run. pass_num(int): Number of passes to run.
for_parallel(Bool): Set it as True if you are going to run for_parallel(Bool): Set it as True if you are going to run
subsequent operators in parallel. subsequent operators in parallel.
Default: True
Returns: Returns:
Variable: A Reader Variable via which we can get file data. Variable: A Reader Variable via which we can get file data.
...@@ -492,7 +495,7 @@ def open_files(filenames, ...@@ -492,7 +495,7 @@ def open_files(filenames,
image, label = fluid.layers.io.read_file(reader) image, label = fluid.layers.io.read_file(reader)
""" """
if buffer_size is None: if buffer_size is None:
buffer_size = thread_num buffer_size = thread_num * 3
if isinstance(filenames, basestring): if isinstance(filenames, basestring):
filenames = [filenames] filenames = [filenames]
dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes] dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
......
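With the new default, omitting buffer_size gives a prefetch buffer of thread_num * 3. A usage sketch in the spirit of the docstring example; the file names and shapes are illustrative:
import paddle.fluid as fluid

reader = fluid.layers.open_files(
    filenames=['./data1.recordio', './data2.recordio'],
    shapes=[(-1, 784), (-1, 1)],
    lod_levels=[0, 0],
    dtypes=['float32', 'int64'],
    thread_num=2)  # buffer_size is None, so it defaults to 2 * 3 == 6
image, label = fluid.layers.io.read_file(reader)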
...@@ -23,6 +23,7 @@ from layer_function_generator import autodoc, templatedoc ...@@ -23,6 +23,7 @@ from layer_function_generator import autodoc, templatedoc
from tensor import concat from tensor import concat
import utils import utils
import random import random
from .. import unique_name
__all__ = [ __all__ = [
'fc', 'fc',
...@@ -4266,14 +4267,18 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): ...@@ -4266,14 +4267,18 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
say :attr:`actual_shape` has a higher priority say :attr:`actual_shape` has a higher priority
than :attr:`shape`. than :attr:`shape`.
act (str): The non-linear activation to be applied to output variable. act (str): The non-linear activation to be applied to output variable.
inplace(bool): If this flag is set true, a new output tensor is created inplace(bool): If this flag is set true, the output
whose data is copied from input x, otherwise the output shares data with input without copying, otherwise
shares data with input without copying. a new output tensor is created
whose data is copied from input x.
name (str): The name of this layer. It is optional. name (str): The name of this layer. It is optional.
Returns: Returns:
Variable: The output tensor. Variable: The output tensor.
Raises:
TypeError: if actual_shape is neither Variable nor None.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -4285,6 +4290,11 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): ...@@ -4285,6 +4290,11 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
if not (isinstance(shape, list) or isinstance(shape, tuple)): if not (isinstance(shape, list) or isinstance(shape, tuple)):
raise ValueError("Input shape must be a python lsit or tuple.") raise ValueError("Input shape must be a python lsit or tuple.")
inputs = {"X": x}
if isinstance(actual_shape, Variable):
inputs["Shape"] = actual_shape
elif actual_shape is not None:
raise TypeError("actual_shape should either be Variable or None")
# Validate the shape # Validate the shape
unk_dim_idx = -1 unk_dim_idx = -1
...@@ -4305,9 +4315,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): ...@@ -4305,9 +4315,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
reshaped = helper.create_tmp_variable(dtype=x.dtype) reshaped = helper.create_tmp_variable(dtype=x.dtype)
helper.append_op( helper.append_op(
type="reshape", type="reshape",
inputs={"X": x, inputs=inputs,
"Shape": actual_shape}
if isinstance(actual_shape, Variable) else {"X": x},
attrs={"shape": shape, attrs={"shape": shape,
"inplace": inplace}, "inplace": inplace},
outputs={"Out": reshaped}) outputs={"Out": reshaped})
...@@ -4889,47 +4897,39 @@ def random_crop(x, shape, seed=None): ...@@ -4889,47 +4897,39 @@ def random_crop(x, shape, seed=None):
>>> cropped_img = fluid.layers.random_crop(img, shape=[3, 224, 224]) >>> cropped_img = fluid.layers.random_crop(img, shape=[3, 224, 224])
""" """
helper = LayerHelper("random_crop", **locals()) helper = LayerHelper("random_crop", **locals())
dtype = helper.input_dtype() dtype = x.dtype
out = helper.create_tmp_variable(dtype) out = helper.create_tmp_variable(dtype)
if seed is None: if seed is None:
seed = random.randint(-65536, 65535) seed = random.randint(-65536, 65535)
op_attrs = {"shape": shape}
if isinstance(seed, int): if isinstance(seed, int):
seed_value = seed op_attrs["startup_seed"] = seed
seed = helper.create_tmp_variable(dtype="int64") seed = helper.create_variable(
helper.append_op( name=unique_name.generate("random_crop_seed"),
type="fill_constant", dtype="int64",
inputs={}, persistable=True)
outputs={"Out": seed},
attrs={
"dtype": seed.dtype,
"shape": [1],
"value": float(seed_value),
"force_cpu": True
})
elif not isinstance(seed, Variable): elif not isinstance(seed, Variable):
raise ValueError("'seed' must be a Variable or an int.") raise ValueError("'seed' must be a Variable or an int.")
seed_out = helper.create_tmp_variable(dtype="int64")
helper.append_op( helper.append_op(
type="random_crop", type="random_crop",
inputs={"X": x, inputs={"X": x,
"Seed": seed}, "Seed": seed},
outputs={"Out": out, outputs={"Out": out,
"SeedOut": seed_out}, "SeedOut": seed},
attrs={"shape": shape}) attrs=op_attrs)
return out return out
def log(input): def log(x):
""" """
Calculates the natural log of the given input tensor, element-wise. Calculates the natural log of the given input tensor, element-wise.
.. math:: .. math::
Out = \\ln(input) Out = \\ln(x)
Args: Args:
input (Variable): Input tensor. x (Variable): Input tensor.
Returns: Returns:
Variable: The natural log of the input tensor computed element-wise. Variable: The natural log of the input tensor computed element-wise.
...@@ -4938,7 +4938,7 @@ def log(input): ...@@ -4938,7 +4938,7 @@ def log(input):
.. code-block:: python .. code-block:: python
output = fluid.layers.log(input) output = fluid.layers.log(x)
""" """
helper = LayerHelper('log', **locals()) helper = LayerHelper('log', **locals())
dtype = helper.input_dtype(input_param_name='x') dtype = helper.input_dtype(input_param_name='x')
...@@ -4947,18 +4947,18 @@ def log(input): ...@@ -4947,18 +4947,18 @@ def log(input):
return out return out
def relu(input): def relu(x):
""" """
Relu takes one input data (Tensor) and produces one output data (Tensor) Relu takes one input data (Tensor) and produces one output data (Tensor)
where the rectified linear function, y = max(0, input), is applied to where the rectified linear function, y = max(0, x), is applied to
the tensor elementwise. the tensor elementwise.
.. math:: .. math::
Out = \\max(0, input) Out = \\max(0, x)
Args: Args:
input (Variable): The input tensor. x (Variable): The input tensor.
Returns: Returns:
Variable: The output tensor with the same shape as input. Variable: The output tensor with the same shape as input.
...@@ -4967,7 +4967,7 @@ def relu(input): ...@@ -4967,7 +4967,7 @@ def relu(input):
.. code-block:: python .. code-block:: python
output = fluid.layers.relu(input) output = fluid.layers.relu(x)
""" """
helper = LayerHelper('relu', **locals()) helper = LayerHelper('relu', **locals())
dtype = helper.input_dtype(input_param_name='x') dtype = helper.input_dtype(input_param_name='x')
......
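Taken together, these layer changes mean that reshape's actual_shape must now be a Variable or None (anything else raises TypeError), and log/relu take their input under the documented name x. A short hedged sketch reusing the reshape docstring's example shapes:
import paddle.fluid as fluid

data = fluid.layers.data(name='data', shape=[2, 4, 6], dtype='float32')
# actual_shape is omitted here; if given, it must be a Variable.
reshaped = fluid.layers.reshape(x=data, shape=[-1, 0, 3, 2], act='tanh')
# log and relu now accept their input as the keyword 'x'.
activated = fluid.layers.relu(x=data)
logged = fluid.layers.log(x=activated)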
...@@ -238,7 +238,7 @@ def sums(input, out=None): ...@@ -238,7 +238,7 @@ def sums(input, out=None):
return out return out
def assign(input, output): def assign(input, output=None):
""" """
**Assign** **Assign**
...@@ -246,7 +246,7 @@ def assign(input, output): ...@@ -246,7 +246,7 @@ def assign(input, output):
Args: Args:
input(Variable|numpy.ndarray): The source variable input(Variable|numpy.ndarray): The source variable
output(Variable): The destination variable output(Variable|None): The destination variable
Returns: Returns:
Variable: The destination variable that was supplied as the *output*. Variable: The destination variable that was supplied as the *output*.
...@@ -259,6 +259,8 @@ def assign(input, output): ...@@ -259,6 +259,8 @@ def assign(input, output):
fluid.layers.assign(hidden, out) fluid.layers.assign(hidden, out)
""" """
helper = LayerHelper('assign', **locals()) helper = LayerHelper('assign', **locals())
if output is None:
output = helper.create_tmp_variable(dtype=input.dtype)
if isinstance(input, Variable): if isinstance(input, Variable):
helper.append_op( helper.append_op(
type='assign', inputs={'X': [input]}, outputs={'Out': [output]}) type='assign', inputs={'X': [input]}, outputs={'Out': [output]})
......
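With output now optional, assign can create its own destination variable. A brief hedged sketch; create_tensor and the assign calls follow the documented usage, the rest is illustrative:
import numpy as np
import paddle.fluid as fluid

hidden = fluid.layers.data(name='hidden', shape=[32], dtype='float32')
# Explicit destination, as before.
out = fluid.layers.create_tensor(dtype='float32')
fluid.layers.assign(hidden, out)
# New: omit 'output' and let assign allocate a temporary result variable.
copied = fluid.layers.assign(hidden)
const = fluid.layers.assign(np.array([[1.0, 2.0]], dtype='float32'))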
...@@ -596,12 +596,12 @@ class Auc(MetricBase): ...@@ -596,12 +596,12 @@ class Auc(MetricBase):
tp, fn, tn, fp = 0, 0, 0, 0 tp, fn, tn, fp = 0, 0, 0, 0
for i, lbl in enumerate(labels): for i, lbl in enumerate(labels):
if lbl: if lbl:
if predictions[i, 1] >= thresh: if preds[i, 1] >= thresh:
tp += 1 tp += 1
else: else:
fn += 1 fn += 1
else: else:
if predictions[i, 1] >= thresh: if preds[i, 1] >= thresh:
fp += 1 fp += 1
else: else:
tn += 1 tn += 1
......
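The fix simply makes the loop read scores from the same preds array it is given. For reference, a compact NumPy restatement of the per-threshold counts the loop accumulates; the function name is illustrative:
import numpy as np

def confusion_at(preds, labels, thresh):
    # preds[:, 1] holds the positive-class score, as in Auc.update().
    pos = preds[:, 1] >= thresh
    lbl = np.asarray(labels).reshape(-1).astype(bool)
    tp = int(np.sum(pos & lbl))
    fn = int(np.sum(~pos & lbl))
    fp = int(np.sum(pos & ~lbl))
    tn = int(np.sum(~pos & ~lbl))
    return tp, fn, tn, fp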
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import paddle.fluid.core as core
from op_test import OpTest
from test_elementwise_add_op import *
'''
Some tests differ from the tests defined in test_elementwise_add_op.py
because MKLDNN does not support 3-dimensional tensors.
Such tensors cause exceptions in the MKLDNN reorder primitive.
'''
class TestMKLDNNElementwiseAddOp(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype)
self.y = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype)
self.out = np.add(self.x, self.y)
def init_kernel_type(self):
self.use_mkldnn = True
class TestMKLDNNElementwiseAddOp_scalar(TestElementwiseAddOp_scalar):
def init_input_output(self):
self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
self.y = np.random.rand(1).astype(self.dtype)
self.out = self.x + self.y
def init_kernel_type(self):
self.use_mkldnn = True
class TestMKLDNNElementwiseAddOp_scalar2(TestElementwiseAddOp_scalar2):
def init_input_output(self):
self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
self.y = np.random.rand(1, 1).astype(self.dtype)
self.out = self.x + self.y
def init_kernel_type(self):
self.use_mkldnn = True
class TestMKLDNNElementwiseAddOp_Vector(TestElementwiseAddOp_Vector):
def init_kernel_type(self):
self.use_mkldnn = True
class TestMKLDNNElementwiseAddOp_broadcast_0(TestElementwiseAddOp_broadcast_0):
def init_input_output(self):
self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
self.y = np.random.rand(2).astype(self.dtype)
self.out = self.x + self.y.reshape(2, 1, 1, 1)
def init_kernel_type(self):
self.use_mkldnn = True
class TestMKLDNNElementwiseAddOp_broadcast_1(TestElementwiseAddOp_broadcast_1):
def init_input_output(self):
self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
self.y = np.random.rand(3).astype(self.dtype)
self.out = self.x + self.y.reshape(1, 3, 1, 1)
def init_kernel_type(self):
self.use_mkldnn = True
class TestMKLDNNElementwiseAddOp_broadcast_2(TestElementwiseAddOp_broadcast_2):
def init_input_output(self):
self.x = np.random.rand(2, 2, 3, 4).astype(self.dtype)
self.y = np.random.rand(4).astype(self.dtype)
self.out = self.x + self.y.reshape(1, 1, 1, 4)
def init_kernel_type(self):
self.use_mkldnn = True
class TestMKLDNNElementwiseAddOp_broadcast_3(TestElementwiseAddOp_broadcast_3):
def init_kernel_type(self):
self.use_mkldnn = True
class TestMKLDNNElementwiseAddOp_broadcast_4(TestElementwiseAddOp_broadcast_4):
def init_kernel_type(self):
self.use_mkldnn = True
class TestMKLDNNElementwiseAddOp_rowwise_add_0(
TestElementwiseAddOp_rowwise_add_0):
def init_input_output(self):
self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
self.y = np.random.rand(3, 4).astype(self.dtype)
self.out = self.x + self.y.reshape(1, 3, 4, 1)
def init_kernel_type(self):
self.use_mkldnn = True
class TestMKLDNNElementwiseAddOp_rowwise_add_1(
TestElementwiseAddOp_rowwise_add_1):
def init_kernel_type(self):
self.use_mkldnn = True
class TestMKLDNNElementwiseAddOp_channelwise_add(
TestElementwiseAddOp_channelwise_add):
def init_input_output(self):
self.x = np.random.rand(3, 5, 20, 20).astype(self.dtype)
self.y = np.random.rand(3, 1, 1, 1).astype(self.dtype)
self.out = self.x + self.y
def init_kernel_type(self):
self.use_mkldnn = True
if __name__ == '__main__':
unittest.main()
...@@ -18,19 +18,23 @@ from op_test import OpTest ...@@ -18,19 +18,23 @@ from op_test import OpTest
class TestElementwiseAddOp(OpTest): class TestElementwiseAddOp(OpTest):
def init_kernel_type(self):
self.use_mkldnn = False
def setUp(self): def setUp(self):
self.op_type = "elementwise_add" self.op_type = "elementwise_add"
self.dtype = np.float32 self.dtype = np.float32
self.axis = -1 self.axis = -1
self.init_dtype() self.init_dtype()
self.init_input_output() self.init_input_output()
self.init_kernel_type()
self.init_axis() self.init_axis()
self.inputs = { self.inputs = {
'X': OpTest.np_dtype_to_fluid_dtype(self.x), 'X': OpTest.np_dtype_to_fluid_dtype(self.x),
'Y': OpTest.np_dtype_to_fluid_dtype(self.y) 'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
} }
self.attrs = {'axis': self.axis} self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn}
self.outputs = {'Out': self.out} self.outputs = {'Out': self.out}
def test_check_output(self): def test_check_output(self):
......
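Because setUp now copies self.use_mkldnn into the op attributes, an MKLDNN variant of any case only needs to override the two init hooks. A sketch of one more such case; the class name and shapes are illustrative:
class TestMKLDNNElementwiseAddOp_4D(TestElementwiseAddOp):
    def init_input_output(self):
        self.x = np.random.rand(4, 3, 8, 8).astype(self.dtype)
        self.y = np.random.rand(4, 3, 8, 8).astype(self.dtype)
        self.out = self.x + self.y

    def init_kernel_type(self):
        # Picked up by setUp() and forwarded as the 'use_mkldnn' attribute.
        self.use_mkldnn = True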
...@@ -1323,16 +1323,6 @@ class DistributeTranspiler(object): ...@@ -1323,16 +1323,6 @@ class DistributeTranspiler(object):
ufind.union(op1, op2) ufind.union(op1, op2)
return ufind return ufind
def _is_opt_role_op(self, op):
# NOTE: depend on oprole to find out whether this op is for
# optimize
op_maker = core.op_proto_and_checker_maker
optimize_role = core.op_proto_and_checker_maker.OpRole.Optimize
if op_maker.kOpRoleAttrName() in op.attrs and \
int(op.attrs[op_maker.kOpRoleAttrName()]) == int(optimize_role):
return True
return False
def _is_optimizer_op(self, op): def _is_optimizer_op(self, op):
if "Param" in op.input_names and \ if "Param" in op.input_names and \
"LearningRate" in op.input_names: "LearningRate" in op.input_names:
...@@ -1423,7 +1413,10 @@ class DistributeTranspiler(object): ...@@ -1423,7 +1413,10 @@ class DistributeTranspiler(object):
params_grads = [] params_grads = []
origin_var_dict = self.origin_program.global_block().vars origin_var_dict = self.origin_program.global_block().vars
for op in block.ops: for op in block.ops:
if self._is_opt_role_op(op): # NOTE(Yancey1989): we cannot use the op role to tell whether an op
# is an optimizer op, because every op in the optimizer sub-graph
# is tagged with the optimizer op role
if self._is_optimizer_op(op):
opt_ops.append(op) opt_ops.append(op)
# HACK(wuyi): if we find grad vars from input of optimize # HACK(wuyi): if we find grad vars from input of optimize
# ops, we may get the output of clip op. Use syntax "@GRAD" # ops, we may get the output of clip op. Use syntax "@GRAD"
......
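After this change an optimizer op is recognized purely by its inputs. A standalone restatement of the predicate, plus how the loop above uses it; the variable names are illustrative:
def _is_optimizer_op(op):
    # sgd, momentum, adam, ... all take both a parameter and a learning rate.
    return "Param" in op.input_names and "LearningRate" in op.input_names

opt_ops = [op for op in block.ops if _is_optimizer_op(op)]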